<a href="https://colab.research.google.com/github/Riniii09/NLP-Mini-Project/blob/main/Task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install simpletransformers pandas scikit-learn



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = "./NLP dataset - Sheet1.csv"
df = pd.read_csv(file_path)

# Drop 'Sr.no' column since it's not needed
df = df.drop(columns=["Sr.no"])

# Rename columns to match SimpleTransformers expected format
df = df.rename(columns={"Statements": "text", "Category": "labels"})

# Encode labels numerically
label_mapping = {"Beneficial": 0, "High-Risk": 1, "Standard": 2}
encoded_labels = df["labels"].map(label_mapping)

# Series.map turns every category that is not a key of label_mapping into NaN.
# Fail here with the offending values instead of passing NaN labels on to training.
unmapped = df.loc[encoded_labels.isna(), "labels"]
if not unmapped.empty:
    raise ValueError(
        f"Categories without an entry in label_mapping: {sorted(unmapped.astype(str).unique())}"
    )
df["labels"] = encoded_labels.astype(int)

# Train/validation split (80% train, 20% validation); fixed seed keeps the split stable
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Print basic information
print("Training Data:", train_df.shape)
print("Validation Data:", val_df.shape)
print("Label Mapping:", label_mapping)

Training Data: (1312, 2)
Validation Data: (328, 2)
Label Mapping: {'Beneficial': 0, 'High-Risk': 1, 'Standard': 2}


In [None]:
import re

# Define a function to clean text data
def clean_text(text):
    """Normalise one statement before it is handed to a model.

    Lower-cases the input, removes every character that is not an ASCII letter
    or whitespace, collapses each run of whitespace into a single space and
    trims both ends.

    Args:
        text: The raw statement. ``None`` and NaN (what pandas yields for an
            empty cell) are treated as an empty string; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string, which may be empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    # Deleting digits/punctuation leaves gaps such as "pay   now"; squeeze them.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean the text column of both splits.
# `assign` builds a new frame and the name is rebound, so nothing is written into
# frames that were sliced out of `df` (the pattern pandas may flag with
# SettingWithCopyWarning).
train_df = train_df.assign(text=train_df["text"].apply(clean_text))
val_df = val_df.assign(text=val_df["text"].apply(clean_text))

# Print sample data
train_df.head()

Unnamed: 0,text,labels
63,the borrower may request a temporary reduction...,0
1308,the indemnified party shall cooperate in good ...,2
1018,violation of exclusivity agreements will resul...,1
1046,if a company misrepresents financial statement...,1
1202,all obligations under this agreement shall ext...,2


In [None]:
import torch
# Report whether PyTorch can see a CUDA GPU. The models in this notebook are built
# with use_cuda=True, so this is expected to print True.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should return True
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Should display 'Tesla T4'
else:
    # get_device_name(0) raises when there is no CUDA device; explain instead of crashing.
    print("No CUDA device found - switch to a GPU runtime before running the training cells.")

True
Tesla T4


In [None]:
from simpletransformers.classification import ClassificationModel

# Build one 3-class classifier (Beneficial, High-Risk, Standard) per architecture,
# BERT first and RoBERTa second, each with use_cuda=True and no custom args.
# NOTE(review): both names are bound again by the training cells further down, which
# construct fresh models with `args=model_args` - confirm these instances are still needed.
bert_model, roberta_model = [
    ClassificationModel(model_type, checkpoint, num_labels=3, use_cuda=True)
    for model_type, checkpoint in (
        ("bert", "bert-base-uncased"),
        ("roberta", "roberta-base"),
    )
]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a dow

In [None]:
from simpletransformers.classification import ClassificationArgs

# Define training arguments (used by both the RoBERTa and the BERT training cells)
model_args = ClassificationArgs(
    num_train_epochs=3,
    train_batch_size=8,
    eval_batch_size=8,
    learning_rate=3e-5,
    max_seq_length=128,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=200,
    manual_seed=42,  # fix the training RNG (same value as the split's random_state) for reproducible runs
    overwrite_output_dir=True  # ✅ Allows overwriting existing outputs
)

import copy

# Train RoBERTa with overwrite enabled, on a private copy of the arguments and in its
# own output directory. `model_args` is also handed to the BERT model; sharing one
# args object means both runs use the same output directory (with
# overwrite_output_dir=True) and any field one model writes onto it is seen by the other.
roberta_args = copy.deepcopy(model_args)
roberta_args.output_dir = "outputs/roberta"
roberta_model = ClassificationModel("roberta", "roberta-base", num_labels=3, args=roberta_args, use_cuda=True)
roberta_model.train_model(train_df)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 3:   0%|          | 0/164 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 3:   0%|          | 0/164 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/164 [00:00<?, ?it/s]

(492, 0.16781625799774155)

In [None]:
import copy

# Train BERT on a private copy of the arguments and in its own output directory, so
# this run neither reuses the args object given to another model nor overwrites
# checkpoints written under the shared default output directory.
bert_args = copy.deepcopy(model_args)
bert_args.output_dir = "outputs/bert"
bert_model = ClassificationModel("bert", "bert-base-uncased", num_labels=3, args=bert_args, use_cuda=True)
bert_model.train_model(train_df)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 3:   0%|          | 0/164 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 3:   0%|          | 0/164 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/164 [00:00<?, ?it/s]

(492, 0.12897740546765366)

In [None]:
# Score both fine-tuned models on the validation split. eval_model returns a 3-tuple;
# only its first item (the metrics dict) is kept and printed.

# BERT
result_bert = bert_model.eval_model(val_df)[0]
print("BERT Evaluation Results:", result_bert)

# RoBERTa
result_roberta = roberta_model.eval_model(val_df)[0]
print("RoBERTa Evaluation Results:", result_roberta)


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/41 [00:00<?, ?it/s]

  with amp.autocast():


BERT Evaluation Results: {'mcc': np.float64(0.9770137037968126), 'eval_loss': 0.0931173439432935}


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/41 [00:00<?, ?it/s]

  with amp.autocast():


RoBERTa Evaluation Results: {'mcc': np.float64(0.9816366398178626), 'eval_loss': 0.09619835527931772}


In [None]:
# List what currently sits under the Drive folder path.
# NOTE(review): no drive.mount() call is visible in this notebook - confirm Drive is
# mounted, otherwise this path is just a local directory.
!ls /content/drive/MyDrive

bert_best_model  roberta_best_model


In [None]:
# Same listing in long format with human-readable sizes (permissions, owner, size, timestamp).
!ls -lh /content/drive/MyDrive/


total 8.0K
drwxr-xr-x 2 root root 4.0K Mar 26 15:44 bert_best_model
drwxr-xr-x 2 root root 4.0K Mar 26 15:44 roberta_best_model


In [None]:
# Sanity check: are both model variables still bound in this session?
# Each getter raises NameError when its variable is unbound; that case is reported
# with the "missing" message instead of stopping the cell.
_memory_checks = (
    (
        lambda: bert_model,
        "✅ BERT model is still in memory!",
        "❌ BERT model is missing, you may need to reload or retrain.",
    ),
    (
        lambda: roberta_model,
        "✅ RoBERTa model is still in memory!",
        "❌ RoBERTa model is missing, you may need to reload or retrain.",
    ),
)

for _get_model, _present_message, _missing_message in _memory_checks:
    try:
        print(_get_model())
        print(_present_message)
    except NameError:
        print(_missing_message)


<simpletransformers.classification.classification_model.ClassificationModel object at 0x7d0c6864f2d0>
✅ BERT model is still in memory!
<simpletransformers.classification.classification_model.ClassificationModel object at 0x7d0c685cd2d0>
✅ RoBERTa model is still in memory!


In [None]:
# Save both fine-tuned models (model weights and tokenizer files) under the Drive folder.
# save_model() only writes files when the model object is passed via `model=`; called
# with just a path it creates the directory and leaves it empty.
# NOTE(review): assumes Drive is mounted at /content/drive - no mount call is visible
# in this notebook, confirm before relying on these copies.
bert_model.save_model("/content/drive/MyDrive/bert_best_model", model=bert_model.model)
roberta_model.save_model("/content/drive/MyDrive/roberta_best_model", model=roberta_model.model)

In [None]:
import os

# Make sure the BERT export folder exists and report which case applied.
save_path = "/content/drive/MyDrive/bert_best_model"
if os.path.exists(save_path):
    print("📂 Save directory already exists.")
else:
    # Creates any missing parent folders as well.
    os.makedirs(save_path)
    print("✅ Created save directory!")


📂 Save directory already exists.


In [None]:
# Save the BERT model and tokenizer into the Drive folder. The model object must be
# passed via `model=`; save_model() with only a path creates the directory but writes
# no files into it.
bert_model.save_model("/content/drive/MyDrive/bert_best_model", model=bert_model.model)

In [None]:
import os

# Folders the models were saved to under the Drive path.
bert_path = "/content/drive/MyDrive/bert_best_model"
roberta_path = "/content/drive/MyDrive/roberta_best_model"

# Print each folder's file list, or a "not found" marker when the folder is absent.
if os.path.exists(bert_path):
    bert_listing = os.listdir(bert_path)
else:
    bert_listing = "❌ BERT model not found!"
print("BERT Files:", bert_listing)

if os.path.exists(roberta_path):
    roberta_listing = os.listdir(roberta_path)
else:
    roberta_listing = "❌ RoBERTa model not found!"
print("RoBERTa Files:", roberta_listing)


BERT Files: []
RoBERTa Files: []


In [None]:
# Save both models to local folders under /content. The model object has to be passed
# via `model=`; with only a path save_model() creates an empty directory, so the
# success message below would be printed without any file having been written.
bert_model.save_model("/content/bert_best_model", model=bert_model.model)
roberta_model.save_model("/content/roberta_best_model", model=roberta_model.model)

print("✅ Models saved in Colab successfully!")


✅ Models saved in Colab successfully!


In [None]:
# Re-run the in-memory check for both model variables. A getter raises NameError when
# its variable is unbound, and the matching "missing" message is printed instead.
_recheck_plan = (
    (
        lambda: bert_model,
        "✅ BERT model is still in memory!",
        "❌ BERT model is missing, you may need to retrain.",
    ),
    (
        lambda: roberta_model,
        "✅ RoBERTa model is still in memory!",
        "❌ RoBERTa model is missing, you may need to retrain.",
    ),
)

for _fetch, _found_text, _lost_text in _recheck_plan:
    try:
        print(_fetch())
        print(_found_text)
    except NameError:
        print(_lost_text)


<simpletransformers.classification.classification_model.ClassificationModel object at 0x7d0c6864f2d0>
✅ BERT model is still in memory!
<simpletransformers.classification.classification_model.ClassificationModel object at 0x7d0c685cd2d0>
✅ RoBERTa model is still in memory!


In [None]:
# Write each model's weights and tokenizer straight through the underlying
# save_pretrained methods: BERT first, then RoBERTa, model before tokenizer.
for _wrapper, _export_dir in (
    (bert_model, "/content/bert_best_model"),
    (roberta_model, "/content/roberta_best_model"),
):
    _wrapper.model.save_pretrained(_export_dir)
    _wrapper.tokenizer.save_pretrained(_export_dir)

print("✅ Models saved in Colab successfully!")


✅ Models saved in Colab successfully!


In [None]:
import os

# Show the files written to each local export folder (os.listdir raises if one is missing).
for _heading, _folder in (
    ("BERT Files:", "/content/bert_best_model"),
    ("RoBERTa Files:", "/content/roberta_best_model"),
):
    print(_heading, os.listdir(_folder))


BERT Files: ['special_tokens_map.json', 'tokenizer_config.json', 'vocab.txt', 'model.safetensors', 'tokenizer.json', 'config.json']
RoBERTa Files: ['merges.txt', 'special_tokens_map.json', 'vocab.json', 'tokenizer_config.json', 'model.safetensors', 'tokenizer.json', 'config.json']


In [None]:
import os
import shutil

# Copy both export folders into Drive. A failed copy raises, so the success message
# is only printed when the files were actually written.
# Without a mount at /content/drive the target would be an ordinary local directory,
# so refuse to report a Drive copy in that case.
if not os.path.ismount("/content/drive"):
    raise RuntimeError(
        "Google Drive is not mounted at /content/drive; mount it before copying the models."
    )

for folder_name in ("bert_best_model", "roberta_best_model"):
    shutil.copytree(
        os.path.join("/content", folder_name),
        os.path.join("/content/drive/MyDrive", folder_name),
        dirs_exist_ok=True,  # merge into an already existing folder, as `cp -r` does
    )

print("✅ Models successfully copied to Google Drive!")


✅ Models successfully copied to Google Drive!


In [None]:
# Bundle each export folder (recursively, -r) into a zip in the current working directory.
# The source path is kept inside the archive: entries are stored as content/<folder>/...
!zip -r bert_best_model.zip /content/bert_best_model
!zip -r roberta_best_model.zip /content/roberta_best_model


  adding: content/bert_best_model/ (stored 0%)
  adding: content/bert_best_model/special_tokens_map.json (deflated 42%)
  adding: content/bert_best_model/tokenizer_config.json (deflated 75%)
  adding: content/bert_best_model/vocab.txt (deflated 53%)
  adding: content/bert_best_model/model.safetensors (deflated 7%)
  adding: content/bert_best_model/tokenizer.json (deflated 71%)
  adding: content/bert_best_model/config.json (deflated 51%)
  adding: content/roberta_best_model/ (stored 0%)
  adding: content/roberta_best_model/merges.txt (deflated 53%)
  adding: content/roberta_best_model/special_tokens_map.json (deflated 52%)
  adding: content/roberta_best_model/vocab.json (deflated 59%)
  adding: content/roberta_best_model/tokenizer_config.json (deflated 75%)
  adding: content/roberta_best_model/model.safetensors (deflated 16%)
  adding: content/roberta_best_model/tokenizer.json (deflated 82%)
  adding: content/roberta_best_model/config.json (deflated 52%)


In [None]:
from google.colab import files

# Hand both zip archives to the browser for download, BERT first.
for _archive_name in ("bert_best_model.zip", "roberta_best_model.zip"):
    files.download(_archive_name)

print("✅ Models are now downloading!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Models are now downloading!


In [None]:
import os

# Local export folders to inspect.
bert_path = "/content/bert_best_model"
roberta_path = "/content/roberta_best_model"

# For each folder print its file list, or a "not found" marker if it does not exist.
bert_contents = "❌ BERT model not found!"
if os.path.exists(bert_path):
    bert_contents = os.listdir(bert_path)
print("BERT Files:", bert_contents)

roberta_contents = "❌ RoBERTa model not found!"
if os.path.exists(roberta_path):
    roberta_contents = os.listdir(roberta_path)
print("RoBERTa Files:", roberta_contents)


BERT Files: ['special_tokens_map.json', 'tokenizer_config.json', 'vocab.txt', 'model.safetensors', 'tokenizer.json', 'config.json']
RoBERTa Files: ['merges.txt', 'special_tokens_map.json', 'vocab.json', 'tokenizer_config.json', 'model.safetensors', 'tokenizer.json', 'config.json']


In [None]:
# Reload both models from the local export folders to confirm the saved files are usable.
bert_model = ClassificationModel("bert", "/content/bert_best_model", use_cuda=True)
roberta_model = ClassificationModel("roberta", "/content/roberta_best_model", use_cuda=True)

# Test predictions
sample_texts = ["The lender shall provide a grace period.", "This agreement is risky."]
# The training and validation text went through clean_text, so inference inputs get the
# same preprocessing; otherwise the models see casing/punctuation absent from training.
cleaned_texts = [clean_text(text) for text in sample_texts]
predictions_bert, _ = bert_model.predict(cleaned_texts)
predictions_roberta, _ = roberta_model.predict(cleaned_texts)

# Predictions are the integer class ids defined by label_mapping.
print("BERT Predictions:", predictions_bert)
print("RoBERTa Predictions:", predictions_roberta)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

BERT Predictions: [0 1]
RoBERTa Predictions: [0 1]


In [None]:
# A second batch of hand-written clauses to compare the two models on.
sample_texts = [
    "The insured has the right to cancel the policy anytime.",
    "This contract involves significant financial risks.",
    "The borrower can repay the loan without penalties.",
    "This clause introduces uncertainty in legal obligations."
]

# Apply the same clean_text preprocessing that was used on the training data before
# predicting, so the inputs match what the models were fine-tuned on.
cleaned_texts = [clean_text(text) for text in sample_texts]

predictions_bert, _ = bert_model.predict(cleaned_texts)
predictions_roberta, _ = roberta_model.predict(cleaned_texts)

print("BERT Predictions:", predictions_bert)
print("RoBERTa Predictions:", predictions_roberta)


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

BERT Predictions: [0 1 0 2]
RoBERTa Predictions: [0 1 0 1]
