In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from sklearn import metrics
from tqdm import tqdm

In [2]:
# Read the labelled training CSV and the unlabelled test CSV.
# Both files are decoded as ISO-8859-1.
train_csv = 'C:/Users/91898/Code/fibe/dataset_fibe/train.csv'
test_csv = 'C:/Users/91898/Code/fibe/dataset_fibe/test.csv'
csv_encoding = 'ISO-8859-1'

train_df = pd.read_csv(train_csv, encoding=csv_encoding)
test_df = pd.read_csv(test_csv, encoding=csv_encoding)

In [3]:
# Preprocess the text data
def preprocess_text(text):
    """Normalise one raw value into lower-cased, whitespace-trimmed text.

    The value is first coerced with ``str()``, so non-string inputs
    (numbers, ``None``, NaN) are turned into their string form rather
    than raising.
    """
    lowered = str(text).lower()
    return lowered.strip()


# Apply preprocessing through the shared helper defined above.
# The previous `.str.lower().str.strip()` chain bypassed `preprocess_text`
# and left missing / non-string cells as NaN, which the tokenizer cannot
# handle; the helper coerces every cell with str() first. This also keeps
# this cell consistent with the later fine-tuning cell, which uses the helper.
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

# Encode target labels as integer class ids; `label_encoder` is kept so the
# predicted ids can be mapped back to category names later.
label_encoder = LabelEncoder()
train_df['target'] = label_encoder.fit_transform(train_df['target'])

# Hold out 10% of the labelled rows for validation (fixed seed for repeatability).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'], train_df['target'], test_size=0.1, random_state=42
)

# Check the results of preprocessing and encoding
print(train_df.head())
print("Classes:", label_encoder.classes_)

                                                text  target  Word Count
0  python courses python courses, python exercise...       0         125
1  the learning point open digital education. a r...       0         147
2  tech news, latest technology, mobiles, laptops...       0         143
3  the best it certification materials in usa | k...       0         364
4  bioland scientific, for your research needs bi...       0         176
Classes: ['academic interests' 'arts and culture' 'automotives'
 'books and literature' 'business and finance' 'careers'
 'family and relationships' 'food and drinks' 'health' 'healthy living'
 'hobbies and interests' 'home and garden' 'movies' 'music and audio'
 'news and politics' 'personal finance' 'pets'
 'pharmaceuticals, conditions, and symptoms' 'real estate' 'shopping'
 'sports' 'style and fashion' 'technology and computing' 'television'
 'travel' 'video gaming']


In [4]:

# Convert pandas DataFrames to Hugging Face datasets.
# The train/validation Series come out of train_test_split with a shuffled,
# non-default index; without preserve_index=False, Dataset.from_pandas would
# carry that index along as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': train_texts, 'labels': train_labels}),
    preserve_index=False)
val_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': val_texts, 'labels': val_labels}),
    preserve_index=False)
test_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': test_df['text']}),
    preserve_index=False)

# Display first few entries to verify datasets
print(train_dataset[0])
print(val_dataset[0])
print(test_dataset[0])

{'text': 'lockdown perfect time tollywood stars introspect line up outside profession actor actress began drive back first love music shruti hasaan others like pranitha subhash turned help others tough times various opening samantha akkineni find out foretell horticulture growing bring about said last found something passionate break job starting get tired answering people asked hobby reply represent counterargument job hobby oh baby actress began journey first harvest cabbage microgreens lockdown carry insta explained one need declamatory lawn backyard gardening using space uncommitted domicile initially used window sill bedroom grow microgreens said interested growing require tray cocopeat seeds cool room used bedroom window lets sunlight partly also gave guide fans farsighted takes cum sprout number days tray needs covered use lamp case one enough sunlight room inspired tollywood diva take gardening first piazza pandemic fear able-bodied feed oneself cut back houses said often hear 

In [5]:
# Load the tokenizer and the pretrained IAB-category classifier.
checkpoint_name = 'PavanDeepak/text-classification-model-iab-categories-mixed-bert-base-uncased'
num_classes = len(label_encoder.classes_)

tokenizer = BertTokenizer.from_pretrained(checkpoint_name, use_fast=True)
model = BertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=num_classes,
    # The checkpoint's classifier head has a different output size, so it is
    # re-initialised instead of raising on the shape mismatch.
    ignore_mismatched_sizes=True,
)

# Freeze every parameter of the BERT backbone, then re-enable gradients for
# the last four encoder layers only. Parameters outside `base_model` are not
# touched by this step.
model.base_model.requires_grad_(False)
for encoder_layer in model.bert.encoder.layer[-4:]:
    encoder_layer.requires_grad_(True)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at PavanDeepak/text-classification-model-iab-categories-mixed-bert-base-uncased and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([364]) in the checkpoint and torch.Size([26]) in the model instantiated
- classifier.weight: found shape torch.Size([364, 768]) in the checkpoint and torch.Size([26, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
import os
from datasets import load_from_disk, Dataset

# Tokenized copies of the three splits are cached on disk under 'nofin/'.
train_dataset_path = 'nofin/tokenized_train_dataset'
val_dataset_path = 'nofin/tokenized_val_dataset'
test_dataset_path = 'nofin/tokenized_test_dataset'
cache_paths = (train_dataset_path, val_dataset_path, test_dataset_path)

if all(os.path.exists(cache_path) for cache_path in cache_paths):
    # Every cache directory is present: reuse it instead of tokenizing again.
    train_dataset = load_from_disk(train_dataset_path)
    val_dataset = load_from_disk(val_dataset_path)
    test_dataset = load_from_disk(test_dataset_path)
    print("Tokenized datasets loaded successfully from disk.")
else:
    def tokenize_function(examples):
        """Tokenize a batch of rows, truncating and padding each text to 512 tokens."""
        return tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=512,
        )

    # Tokenize each split in the main process (batched, no multiprocessing).
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Expose only the model inputs (plus labels where they exist) as torch tensors.
    model_inputs = ['input_ids', 'attention_mask']
    train_dataset.set_format(type='torch', columns=model_inputs + ['labels'])
    val_dataset.set_format(type='torch', columns=model_inputs + ['labels'])
    test_dataset.set_format(type='torch', columns=model_inputs)

    # Persist the tokenized splits so later runs take the fast path above.
    os.makedirs('nofin', exist_ok=True)
    for tokenized_split, cache_path in zip(
            (train_dataset, val_dataset, test_dataset), cache_paths):
        tokenized_split.save_to_disk(cache_path)
    print("Tokenized datasets saved successfully to disk.")

# Check the results of tokenization
print(train_dataset[0])
print(val_dataset[0])

Tokenized datasets loaded successfully from disk.
{'labels': tensor(10), 'input_ids': tensor([  101,  5843,  7698,  3819,  2051,  9565, 26985,  3340, 17174, 13102,
        22471,  2240,  2039,  2648,  9518,  3364,  3883,  2211,  3298,  2067,
         2034,  2293,  2189, 14021, 22134,  2072,  2038, 14634,  2500,  2066,
        10975,  7088,  8322,  4942, 14949,  2232,  2357,  2393,  2500,  7823,
         2335,  2536,  3098, 11415, 17712,  4939, 18595,  2424,  2041, 18921,
        23567,  7570, 28228, 14561,  3652,  3288,  2055,  2056,  2197,  2179,
         2242, 13459,  3338,  3105,  3225,  2131,  5458, 10739,  2111,  2356,
        17792,  7514,  5050,  4675,  2906, 22850,  4765,  3105, 17792,  2821,
         3336,  3883,  2211,  4990,  2034, 11203, 28540, 12702, 28637,  3619,
         5843,  7698,  4287, 16021,  2696,  4541,  2028,  2342, 11703, 10278,
        14049, 10168, 16125, 21529,  2478,  2686,  4895,  9006, 22930,  3064,
        14383, 28775,  2571,  3322,  2109,  3332,  9033,

In [7]:
# Training configuration for the first fine-tuning run.
# Effective train batch size per device is 32 * 4 = 128 because of gradient
# accumulation. Evaluation and checkpointing both happen every 1000 steps, so
# load_best_model_at_end can pick among the saved checkpoints.
training_args = TrainingArguments(
    dataloader_num_workers=8,
    output_dir='./nofin/results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='none',
    logging_steps=5000,
    evaluation_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
    load_best_model_at_end=True,
    fp16=True,
    report_to="none",
    # NOTE: `resume_from_checkpoint=True` was removed from here. That field is
    # not read by Trainer; resuming is requested by passing
    # `resume_from_checkpoint=...` to `trainer.train()` directly.
)

# Display model details
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e



In [8]:
# Define a function for computing metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the arg-max over the last axis of the logits.

    Returns a dict with the weighted F1 score under ``'f1'`` and the plain
    accuracy under ``'accuracy'``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        'f1': metrics.f1_score(gold, predicted, average='weighted'),
        'accuracy': metrics.accuracy_score(gold, predicted),
    }


# Build the Trainer. Training stops early after five consecutive evaluations
# without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Verify trainer configuration
print(trainer)

<transformers.trainer.Trainer object at 0x0000027C23895370>


In [9]:
import torch
# Sanity check that PyTorch can see a CUDA device before training.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)  # Should output True

True


In [10]:
# trainer.train()

In [11]:
# Resume from a specific saved checkpoint rather than training from scratch.
checkpoint_dir = './nofin/results/checkpoint-14712'
trainer.train(resume_from_checkpoint=checkpoint_dir)

  0%|          | 0/14712 [00:00<?, ?it/s]

{'train_runtime': 0.5878, 'train_samples_per_second': 3203897.375, 'train_steps_per_second': 25027.976, 'train_loss': 0.0, 'epoch': 3.0}


TrainOutput(global_step=14712, training_loss=0.0, metrics={'train_runtime': 0.5878, 'train_samples_per_second': 3203897.375, 'train_steps_per_second': 25027.976, 'total_flos': 4.9557961552082534e+17, 'train_loss': 0.0, 'epoch': 2.999898052808645})

In [12]:


# Make predictions on the test set. This single inference pass is reused for
# the submission file, the saved probabilities and the saved label ids.
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Decode the predictions back to original labels
pred_labels_decoded = label_encoder.inverse_transform(pred_labels)

# Create the submission file
submission_path = 'nofine_submission.csv'
submission_df = pd.DataFrame({
    'Index': 'Article_' + test_df.index.astype(str),
    'target': pred_labels_decoded
})
submission_df.to_csv(submission_path, index=False)
# Report the file that was actually written (the old message named
# 'final_submission.csv', which this cell never creates).
print(f"Submission file '{submission_path}' generated successfully.")

# The original cell ran trainer.predict(test_dataset) a second time here and
# then ignored the result; reuse the first pass instead of repeating inference.
nofin_predictions = predictions

# Save predicted probabilities for later ensemble use
nofin_probabilities = torch.nn.functional.softmax(
    torch.tensor(nofin_predictions.predictions), dim=-1).numpy()
np.save('nofine_model_probabilities.npy', nofin_probabilities)

nofin_pred_labels = np.argmax(nofin_probabilities, axis=1)
np.save('nofin_pred_labels.npy', nofin_pred_labels)

# Save the model and tokenizer
model.save_pretrained('nofine_model')
tokenizer.save_pretrained('nofine_tokenizer')
print("Model and tokenizer saved successfully.")

  0%|          | 0/2725 [00:00<?, ?it/s]

Submission file 'final_submission.csv' generated successfully.


  0%|          | 0/2725 [00:00<?, ?it/s]

Model and tokenizer saved successfully.


In [12]:
# Score the training split with the current model.
nofine_train_predictions = trainer.predict(train_dataset)

# Turn the raw logits into per-class probabilities and store them.
train_logits = torch.tensor(nofine_train_predictions.predictions)
nofine_train_probabilities = torch.nn.functional.softmax(
    train_logits, dim=-1).numpy()
np.save('nofine_train_probabilities.npy', nofine_train_probabilities)

# Store the most probable class id for every training row as well.
nofine_train_pred_labels = np.argmax(nofine_train_probabilities, axis=1)
np.save('nofine_train_pred_labels.npy', nofine_train_pred_labels)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


  0%|          | 0/9809 [00:00<?, ?it/s]

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import os

# Load the previously saved model and tokenizer
model_path = 'nofine_model'
tokenizer_path = 'nofine_tokenizer'

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

# Freeze the whole BERT backbone, then unfreeze the last six encoder layers.
# The previous range(-6, -4) only unfroze layers -6 and -5 and left the last
# four layers frozen, which contradicts the "6 layers unfrozen" artefacts this
# cell writes at the end.
for param in model.base_model.parameters():
    param.requires_grad = False
for i in range(-6, 0):
    for param in model.bert.encoder.layer[i].parameters():
        param.requires_grad = True

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Read the labelled training CSV and the unlabelled test CSV (ISO-8859-1 encoded).
csv_encoding = 'ISO-8859-1'
train_csv = 'C:/Users/91898/Code/fibe/dataset_fibe/train.csv'
test_csv = 'C:/Users/91898/Code/fibe/dataset_fibe/test.csv'

train_df = pd.read_csv(train_csv, encoding=csv_encoding)
test_df = pd.read_csv(test_csv, encoding=csv_encoding)

# Preprocess the text data


def preprocess_text(text):
    """Return ``text`` coerced to ``str``, lower-cased and stripped of outer whitespace."""
    lowered = str(text).lower()
    return lowered.strip()


# Normalise the text column of both frames with the shared helper.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(preprocess_text)

# Fit the encoder on the category names, then replace them with integer ids.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['target'])
train_df['target'] = label_encoder.transform(train_df['target'])

# Convert pandas DataFrames to Hugging Face datasets
train_dataset = Dataset.from_pandas(
    train_df[['text', 'target']].rename(columns={'target': 'labels'}))
test_dataset = Dataset.from_pandas(test_df[['text']])

# Load previously tokenized datasets if they exist or perform tokenization again
train_dataset_path = 'nofin/tokenized_train_dataset'
val_dataset_path = 'nofin/tokenized_val_dataset'
test_dataset_path = 'nofin/tokenized_test_dataset'

if os.path.exists(train_dataset_path) and os.path.exists(val_dataset_path) and os.path.exists(test_dataset_path):
    train_dataset = Dataset.load_from_disk(train_dataset_path)
    val_dataset = Dataset.load_from_disk(val_dataset_path)
    test_dataset = Dataset.load_from_disk(test_dataset_path)
else:
    # No usable cache: carve a 10% validation split out of the labelled data.
    # The original fallback never defined `val_dataset`, so building the
    # Trainer raised NameError, and the validation cache was never written,
    # so the three-path check above could never succeed afterwards.
    # NOTE: this seeded split is not row-for-row identical to a
    # scikit-learn train_test_split with random_state=42.
    holdout = train_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = holdout['train']
    val_dataset = holdout['test']

    # Define the tokenization function
    def tokenize_function(examples):
        """Tokenize a batch of rows, truncating and padding each text to 512 tokens."""
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

    # Perform tokenization without multiprocessing
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Set the format for PyTorch
    train_dataset.set_format(type='torch', columns=[
                             'input_ids', 'attention_mask', 'labels'])
    val_dataset.set_format(type='torch', columns=[
                           'input_ids', 'attention_mask', 'labels'])
    test_dataset.set_format(type='torch', columns=[
                            'input_ids', 'attention_mask'])

    # Save all three tokenized datasets to disk so the cache check passes next time
    os.makedirs('nofin', exist_ok=True)
    train_dataset.save_to_disk(train_dataset_path)
    val_dataset.save_to_disk(val_dataset_path)
    test_dataset.save_to_disk(test_dataset_path)

# Training arguments for the second fine-tuning stage.
# Effective train batch size per device is 16 * 4 = 64 because of gradient
# accumulation. Evaluation and checkpointing both happen every 1000 steps.
training_args = TrainingArguments(
    output_dir='./nofin/results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='none',
    logging_steps=500,
    evaluation_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
    load_best_model_at_end=True,
    fp16=True,
    report_to="none",
    # NOTE: `resume_from_checkpoint=True` was removed from here. That field is
    # not read by Trainer, so it never caused a resume; pass
    # `resume_from_checkpoint=...` to `trainer.train()` if resuming is wanted.
)



def compute_metrics(eval_pred):
    """Return weighted F1 and accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the arg-max over the last axis of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    weighted_f1 = metrics.f1_score(gold, predicted, average='weighted')
    plain_accuracy = metrics.accuracy_score(gold, predicted)
    return {'f1': weighted_f1, 'accuracy': plain_accuracy}



# Build the Trainer; stop early after five evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Run the fine-tuning loop.
trainer.train()


# Run inference on the test split and take the highest-scoring class per row.
predictions = trainer.predict(test_dataset)
test_logits = predictions.predictions
pred_labels = np.argmax(test_logits, axis=1)

# Map the integer class ids back to the original category names.
pred_labels_decoded = label_encoder.inverse_transform(pred_labels)

# Write the submission file: one 'Article_<row index>' id per test row.
article_ids = 'Article_' + test_df.index.astype(str)
submission_df = pd.DataFrame({'Index': article_ids, 'target': pred_labels_decoded})
submission_df.to_csv('6layer_submission.csv', index=False)
print("Submission file '6layer_submission.csv' generated successfully.")

# Store softmax probabilities and the derived label ids alongside the submission.
nofin_probabilities = torch.nn.functional.softmax(
    torch.tensor(test_logits), dim=-1).numpy()
np.save('nofine_model_probabilities1.npy', nofin_probabilities)

nofin_pred_labels = np.argmax(nofin_probabilities, axis=1)
np.save('nofin_pred_labels1.npy', nofin_pred_labels)

# Persist the fine-tuned weights and the tokenizer files.
model.save_pretrained('nofine_model_updated')
tokenizer.save_pretrained('nofine_tokenizer_updated')
print("Model and tokenizer with 6 layers unfrozen saved successfully.")



  0%|          | 0/29427 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 