In [None]:
!pip install datasets

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [35]:
import torch
import tensorflow as tf

In [36]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification , Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("NepBERTa/NepBERTa")
model = AutoModelForSequenceClassification .from_pretrained("NepBERTa/NepBERTa", from_tf=True,num_labels=3)

All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


In [37]:
df_train = pd.read_csv('/content/drive/MyDrive/Sentiment/train_clean.csv')

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X = list(df_train['text'])
y = list(df_train['label'])

In [40]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0,stratify=y)

In [42]:
# Verify all the rows are of type str
for idx,text in enumerate(X):
  if not isinstance(text,str):
    print(text)
    print(idx)

In [43]:
# Convert text into tokens using the tokenizer from NepBERTa
train_encodings = tokenizer(X_train,truncation=True,padding=True,max_length=256)
test_encodings = tokenizer(X_test,truncation=True,padding=True,max_length=256)

In [44]:
# Convert the dataset into a dict to convert into Dataset object from huggingface
train_data = {
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": y_train
}
test_data = {
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
    "labels": y_test
}

In [46]:
from datasets import Dataset
# Convert the dict into Dataset object
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)

In [47]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4792
})

In [147]:
print(np.array(train_dataset[0]['input_ids']).shape)

(128,)
0


In [51]:
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs=6,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

In [49]:
import os
# Unset the WANDB_API_KEY
os.environ["WANDB_MODE"] = "disabled"

In [52]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()

Step,Training Loss
10,1.0849
20,1.0749
30,1.0575
40,1.0383
50,1.0114
60,1.0052
70,0.9742
80,0.9282
90,0.898
100,0.8444


TrainOutput(global_step=450, training_loss=0.6877057467566596, metrics={'train_runtime': 1183.3479, 'train_samples_per_second': 24.297, 'train_steps_per_second': 0.38, 'total_flos': 3782518493257728.0, 'train_loss': 0.6877057467566596, 'epoch': 6.0})

In [53]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.7286009192466736,
 'eval_runtime': 17.2865,
 'eval_samples_per_second': 69.303,
 'eval_steps_per_second': 1.099,
 'epoch': 6.0}

In [54]:
output = trainer.predict(test_dataset)

In [64]:
preds = output[0]

In [68]:
out_preds =[out.argmax() for out in preds]

In [69]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test,out_preds)
cm

array([[329,  66,  80],
       [ 57, 394,  25],
       [ 54,  35, 158]])

In [70]:
cm.diagonal().sum() / cm.sum()

0.7353923205342237

In [71]:
trainer.save_model('sentiment_modelv2')

In [5]:
from transformers import AutoModelForSequenceClassification

# Load the model from the saved folder
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/Sentiment/sentiment_model', num_labels=3)


In [72]:
df_test = pd.read_csv('/content/drive/MyDrive/Sentiment/test_clean.csv')

In [12]:
X = list(df_test['text'])[:10]
y = list(df_test['label'])[:10]

In [13]:
encodings = tokenizer(X,return_tensors="pt",truncation=True,padding=True,max_length=128)

In [14]:
with torch.no_grad():  # Disable gradient calculation
    outputs = model(**encodings)

In [15]:
logits = outputs.logits

In [17]:
logits

tensor([[-0.4558,  2.9378, -2.4275],
        [-0.5326,  2.9708, -2.3609],
        [-0.3759,  2.6379, -2.2442],
        [-0.7380,  3.1850, -2.3408],
        [-0.8793,  0.6530, -0.0384],
        [ 2.1670, -1.4592, -0.8833],
        [-1.1336,  3.3648, -2.1217],
        [-0.4942,  2.5403, -2.0024],
        [ 2.6898, -1.2663, -1.4416],
        [-0.4811,  1.0630, -0.5874]])

In [18]:
# Convert logits to predicted class
predicted_class = [torch.argmax(logit, dim=-1).item() for logit in logits]
predicted_class

[1, 1, 1, 1, 1, 0, 1, 1, 0, 1]

In [19]:
from sklearn.metrics import accuracy_score

In [20]:
accuracy = accuracy_score(predicted_class, y)

In [21]:
accuracy

0.9

In [73]:
X = list(df_test['text'])
y = list(df_test['label'])

In [29]:
batch_size = 32
all_predictions = []

for i in range(0, len(X), batch_size):
    batch_texts = X[i:i + batch_size]
    inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=128)

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    batch_predictions = [torch.argmax(logit, dim=-1).item() for logit in logits]
    all_predictions.extend(batch_predictions)
    print(f'Batch no: {i}')

Batch no: 0
Batch no: 32
Batch no: 64
Batch no: 96
Batch no: 128
Batch no: 160
Batch no: 192
Batch no: 224
Batch no: 256
Batch no: 288
Batch no: 320
Batch no: 352
Batch no: 384
Batch no: 416
Batch no: 448
Batch no: 480
Batch no: 512
Batch no: 544
Batch no: 576
Batch no: 608
Batch no: 640
Batch no: 672
Batch no: 704
Batch no: 736
Batch no: 768
Batch no: 800
Batch no: 832
Batch no: 864
Batch no: 896
Batch no: 928
Batch no: 960
Batch no: 992
Batch no: 1024
Batch no: 1056
Batch no: 1088
Batch no: 1120
Batch no: 1152
Batch no: 1184
Batch no: 1216
Batch no: 1248
Batch no: 1280
Batch no: 1312
Batch no: 1344
Batch no: 1376
Batch no: 1408
Batch no: 1440
Batch no: 1472
Batch no: 1504
Batch no: 1536
Batch no: 1568
Batch no: 1600
Batch no: 1632
Batch no: 1664
Batch no: 1696
Batch no: 1728
Batch no: 1760
Batch no: 1792
Batch no: 1824
Batch no: 1856
Batch no: 1888
Batch no: 1920
Batch no: 1952
Batch no: 1984


In [30]:
accuracy = accuracy_score(all_predictions, y)

In [31]:
accuracy

0.6531124497991968

In [74]:
actual_test_encodings = tokenizer(X,truncation=True,padding=True,max_length=256)

In [76]:
actual_test_data = {
    "input_ids": actual_test_encodings["input_ids"],
    "attention_mask": actual_test_encodings["attention_mask"],
    "labels": y
}

In [78]:
actual_test_dataset = Dataset.from_dict(actual_test_data)

In [79]:
predictions = trainer.predict(actual_test_dataset)

In [80]:
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)
true_labels = predictions.label_ids

In [82]:
accuracy = accuracy_score(true_labels, predicted_labels)

In [83]:
accuracy

0.6450803212851406