**Data Loading**

In [None]:
!pip install simpletransformers



In [None]:
#TASK1
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw utterance/category table.
data = pd.read_csv('/content/balanced_dataset.csv')

print(data.columns)

# simpletransformers expects the input columns to be called 'text' and 'labels'.
data = data.rename(columns={'category': 'labels', 'utterance': 'text'})

# Rows with a missing utterance or category cannot be trained on: a missing
# category would be encoded as -1, which is not a valid class id.
data = data.dropna(subset=['text', 'labels'])

# Encode the category names as integer ids, but keep the id -> name mapping so
# that numeric predictions can be translated back into category names later.
label_categories = data['labels'].astype('category')
label_names = dict(enumerate(label_categories.cat.categories))
data['labels'] = label_categories.cat.codes
print(label_names)

# Hold out 20% for validation; stratifying keeps every class at the same share
# in both splits.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['labels'],
)

# Keep only the two columns the models consume; copy so later edits do not
# write into views of `data`.
train_df = train_data[['text', 'labels']].copy()
val_df = val_data[['text', 'labels']].copy()

print(train_df.head())


Index(['utterance', 'category'], dtype='object')
                                                   text  labels
1567  I do not know what I need to do to buy several...       2
2292  how do I set a new shipping address up? My pac...       4
1703  I have a problem with removing something Could...       2
1987  I don't know how to check the status of my ref...       3
1271  I need help to check when my item is going to ...       1


**Text Processing**

In [None]:
#TASK2
import re


def clean_text(text):
    """Normalize an utterance before it is handed to the models.

    Lowercases the text, removes every character that is not ``a-z`` or
    whitespace, collapses whitespace runs into a single space and trims
    the ends.

    Args:
        text: Raw utterance. Anything that is not a ``str`` (for example a
            missing value) is treated as empty.

    Returns:
        The cleaned string; may be empty.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Drop digits, punctuation and any other non-letter character.
    text = re.sub(r'[^a-z\s]', '', text)

    text = re.sub(r'\s+', ' ', text)

    # Trim last: removing punctuation can expose new leading/trailing spaces.
    return text.strip()


# Run the same normalization over the text column of both splits.
for split_df in (train_df, val_df):
    split_df['text'] = split_df['text'].map(clean_text)

# Quick look at the cleaned training rows.
print(train_df.head())


                                                   text  labels
1567  i do not know what i need to do to buy several...       2
2292  how do i set a new shipping address up my pack...       4
1703  i have a problem with removing something could...       2
1987  i dont know how to check the status of my refu...       3
1271  i need help to check when my item is going to ...       1


**Text Embedding using BERT and RoBERTa**

In [None]:
#TASK3

from simpletransformers.classification import ClassificationModel

# The classification head needs one output per distinct label id in the training split.
num_labels = train_df['labels'].nunique()

# Instantiate both pretrained checkpoints on CPU.
bert_model = ClassificationModel(
    model_type='bert',
    model_name='bert-base-uncased',
    num_labels=num_labels,
    use_cuda=False,
)
roberta_model = ClassificationModel(
    model_type='roberta',
    model_name='roberta-base',
    num_labels=num_labels,
    use_cuda=False,
)

print("BERT and RoBERTa models initialized successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

BERT and RoBERTa models initialized successfully!


 **Model Training with BERT and RoBERTa**

In [None]:
#TASK4 - I
from simpletransformers.classification import ClassificationArgs


# Fine-tuning hyperparameters used for both BERT and RoBERTa.
model_args = ClassificationArgs(
    num_train_epochs=3,
    train_batch_size=8,
    eval_batch_size=8,
    learning_rate=3e-5,
    max_seq_length=128,
    weight_decay=0.01,
    warmup_steps=0,
    logging_steps=50,
    save_steps=200,
    overwrite_output_dir=True,
    evaluate_during_training=True,
    use_multiprocessing=False,
    # Fix the RNG seed (same value as the train/validation split) so repeated
    # runs are comparable.
    manual_seed=42,
)


In [None]:
#TASK 4 - II
import copy

import torch
from simpletransformers.classification import ClassificationModel


# Give BERT its own copy of the shared arguments so the two models do not share
# one mutable args object, and point it at a dedicated output directory so a
# later training run cannot overwrite BERT's checkpoints.
bert_args = copy.deepcopy(model_args)
bert_args.output_dir = 'outputs/bert/'
bert_args.best_model_dir = 'outputs/bert/best_model'

bert_model = ClassificationModel(
    'bert', 'bert-base-uncased',
    num_labels=train_df['labels'].nunique(),
    args=bert_args,
    # A hard-coded use_cuda=True fails on a runtime without a GPU; use the GPU
    # only when one is actually available.
    use_cuda=torch.cuda.is_available()
)

# Fine-tune on the training split, evaluating on the validation split as it goes.
bert_model.train_model(train_df, eval_df=val_df)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 3:   0%|          | 0/300 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 3:   0%|          | 0/300 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 3 of 3:   0%|          | 0/300 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


(900,
 defaultdict(list,
             {'global_step': [300, 600, 900],
              'train_loss': [0.003695964813232422,
               0.0012881755828857422,
               0.0007731914520263672],
              'mcc': [np.float64(0.9979160206567347),
               np.float64(0.9979160206567347),
               np.float64(0.9979160206567347)],
              'eval_loss': [0.014784830637897055,
               0.014666608969370524,
               0.014791154861450195]}))

In [None]:
#TASK4-III
import copy

import torch

# Give RoBERTa its own copy of the shared arguments so the two models do not
# share one mutable args object, and point it at a dedicated output directory so
# its checkpoints do not land on top of another model's.
roberta_args = copy.deepcopy(model_args)
roberta_args.output_dir = 'outputs/roberta/'
roberta_args.best_model_dir = 'outputs/roberta/best_model'

roberta_model = ClassificationModel(
    'roberta', 'roberta-base',
    num_labels=train_df['labels'].nunique(),
    args=roberta_args,
    # A hard-coded use_cuda=True fails on a runtime without a GPU; use the GPU
    # only when one is actually available.
    use_cuda=torch.cuda.is_available()
)


# Fine-tune on the training split, evaluating on the validation split as it goes.
roberta_model.train_model(train_df, eval_df=val_df)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

(900,
 defaultdict(list,
             {'global_step': [300, 600, 900],
              'train_loss': [0.0009003040613606572,
               0.0006755351205356419,
               0.0004987838328815997],
              'mcc': [np.float64(0.9979160206567347),
               np.float64(0.9979160206567347),
               np.float64(0.9979160206567347)],
              'eval_loss': [0.014559695296144733,
               0.014900982048905765,
               0.015219595825959307]}))

**Evaluation on Validation Set**

In [None]:
#TASK5 - I

# Score the fine-tuned BERT model on the held-out validation split.
(
    result_bert,
    model_outputs_bert,
    wrong_predictions_bert,
) = bert_model.eval_model(val_df)

# Header and metrics dict on separate lines.
print("🔹 BERT Evaluation Results:", result_bert, sep="\n")


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/75 [00:00<?, ?it/s]

🔹 BERT Evaluation Results:
{'mcc': np.float64(0.9979160206567347), 'eval_loss': 0.014790678399149328}


In [None]:
#TASK5-II

# Score the fine-tuned RoBERTa model on the held-out validation split.
(
    result_roberta,
    model_outputs_roberta,
    wrong_predictions_roberta,
) = roberta_model.eval_model(val_df)

# Header and metrics dict on separate lines.
print("🔹 RoBERTa Evaluation Results:", result_roberta, sep="\n")


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/75 [00:00<?, ?it/s]

🔹 RoBERTa Evaluation Results:
{'mcc': np.float64(0.9979160206567347), 'eval_loss': 0.015219595825959307}


 **Saving the Best Model**

In [None]:
#TASK 6 - I
# save_model only writes the weights and tokenizer when the model object is
# passed via `model=`; given just a path it merely creates the directory.
bert_model.save_model("bert_best_model", model=bert_model.model)
print("✅ BERT Model Saved Successfully!")


✅ BERT Model Saved Successfully!


In [None]:
#TASK 6 - II
# save_model only writes the weights and tokenizer when the model object is
# passed via `model=`; given just a path it merely creates the directory.
roberta_model.save_model("roberta_best_model", model=roberta_model.model)
print("✅ RoBERTa Model Saved Successfully!")


✅ RoBERTa Model Saved Successfully!


**Prediction on Real-World Input**

In [None]:
import os


# Make sure the export directories exist before writing into them.
os.makedirs("bert_best_model", exist_ok=True)
os.makedirs("roberta_best_model", exist_ok=True)


# Pass the underlying model explicitly: without `model=` save_model does not
# write the weights or the tokenizer files.
bert_model.save_model("bert_best_model", model=bert_model.model)
roberta_model.save_model("roberta_best_model", model=roberta_model.model)


In [None]:
from google.colab import drive

# Mount Google Drive so the exported models outlive the Colab session.
drive.mount('/content/drive')

# Pass the underlying model explicitly: without `model=` save_model does not
# write the weights or the tokenizer files.
bert_model.save_model("/content/drive/MyDrive/bert_best_model", model=bert_model.model)
roberta_model.save_model("/content/drive/MyDrive/roberta_best_model", model=roberta_model.model)


Mounted at /content/drive


In [None]:

# Repeat of the Drive export. `model=` is required for save_model to actually
# write the weights and tokenizer files.
bert_model.save_model("/content/drive/MyDrive/bert_best_model", model=bert_model.model)
roberta_model.save_model("/content/drive/MyDrive/roberta_best_model", model=roberta_model.model)


In [None]:

# Repeat of the Drive export. `model=` is required for save_model to actually
# write the weights and tokenizer files.
bert_model.save_model("/content/drive/MyDrive/bert_best_model", model=bert_model.model)


roberta_model.save_model("/content/drive/MyDrive/roberta_best_model", model=roberta_model.model)


In [None]:
import os

# Show which files ended up in each local export directory.
for title, export_dir in (
    ("BERT Model Files:", "/content/bert_best_model"),
    ("RoBERTa Model Files:", "/content/roberta_best_model"),
):
    print(title, os.listdir(export_dir))


BERT Model Files: ['config.json', 'special_tokens_map.json', 'tokenizer.json', 'vocab.txt', 'tokenizer_config.json', 'model.safetensors']
RoBERTa Model Files: ['config.json', 'special_tokens_map.json', 'merges.txt', 'tokenizer.json', 'tokenizer_config.json', 'model.safetensors', 'vocab.json']


In [None]:
#TASK 7
from simpletransformers.classification import ClassificationModel


# Reload the fine-tuned BERT checkpoint from disk; inference runs on CPU.
bert_model = ClassificationModel(
    "bert",
    "/content/bert_best_model",
    use_cuda=False
)


# Reload the fine-tuned RoBERTa checkpoint from disk; inference runs on CPU.
roberta_model = ClassificationModel(
    "roberta",
    "/content/roberta_best_model",
    use_cuda=False
)


real_world_text = ["This is a great product!", "I didn't like the service."]

# The models were fine-tuned on text normalized by clean_text, so the inference
# inputs must go through the same normalization before prediction.
cleaned_text = [clean_text(sentence) for sentence in real_world_text]


# predict returns (predicted label ids, raw model outputs); only the ids are used.
predictions_bert, _ = bert_model.predict(cleaned_text)
print(f"BERT Predictions: {predictions_bert}")


predictions_roberta, _ = roberta_model.predict(cleaned_text)
print(f"RoBERTa Predictions: {predictions_roberta}")


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

BERT Predictions: [3 2]


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

RoBERTa Predictions: [2 0]
