In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel,  get_linear_schedule_with_warmup
from sklearn.model_selection  import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [2]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [5]:
data_file = r"C:\Users\ganes\OneDrive\Desktop\Edu\Datasets\imdb_Dataset\IMDB Dataset.csv"


In [6]:
data_file

'C:\\Users\\ganes\\OneDrive\\Desktop\\Edu\\Datasets\\imdb_Dataset\\IMDB Dataset.csv'

In [7]:
def data_loader(data_file):
    df = pd.read_csv(data_file)
    text = df['review'].tolist()
    labels = [1 if sentiment == 'positive' else 0 for sentiment in df['sentiment'].tolist()]
    return text , labels

In [8]:
text, labels = data_loader(data_file)

In [9]:
#text

In [None]:
size(text)

In [38]:
class Bert_TextClassification(Dataset):
    def __init__(self , texts, lengths, tokenizer , max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return(len(self.texts))
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text , return_tensors = 'pt' , max_length = self.max_length , padding = 'max_length',truncation = True)
        return {'input_ids':encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [39]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier,self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids = input_ids, attention_mask =attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits
        

In [40]:
def train(model , data_loader , optimizer , scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask= attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [41]:
# Building evalution model

In [82]:
def evalute(model, data_loader , device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids = input_ids , attention_mask = attention_mask)
            _, preds = torch.max(outputs, dim = 1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels , predictions) , classification_report(actual_labels, predictions)

In [83]:
#Building prediction model

In [96]:
def predict_sentiment(text, model, tokenizer, device, max_length = 128):
    model.eval()
    encoding = tokenizer(text , return_tensors = 'pt' , max_length = max_length, padding = 'max_length', truncation = True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        _,preds = torch.max(outputs, dim =1)
        return "positive" if preds.item() == 1 else "negative"
    

In [85]:
# Model Parameters
bert_model_name ='bert_base_uncased'
num_classes = 2
max_length = 2
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

In [86]:
train_texts , val_texts , train_labels , val_labels = train_test_split(text, labels, test_size = 0.2 , random_state = 42)

In [87]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataset = Bert_TextClassification(train_texts, train_labels, tokenizer , max_length)
val_dataset = Bert_TextClassification(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset , batch_size = batch_size , shuffle = True)
val_dataloader = DataLoader(val_dataset , batch_size=batch_size)



In [88]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = "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"

In [89]:
from transformers import AutoTokenizer


In [90]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

model = BERTClassifier(bert_model_name, num_classes).to(device)

In [91]:
from transformers import AdamW

In [92]:
optimizer = AdamW(model.parameters(), lr = learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




In [93]:
for epoch in range(num_epochs):
    print(f"Epoch  {epoch +1} / {num_epochs}")
    train(model, train_dataloader , optimizer, scheduler, device)
    accuracy , report = evalute(model, val_dataloader,device)
    print(f"validation score : {accuracy :.4f}")
    print(report)

Epoch  1 / 4


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


validation score : 0.4972
              precision    recall  f1-score   support

           0       0.50      1.00      0.66      4972
           1       0.00      0.00      0.00      5028

    accuracy                           0.50     10000
   macro avg       0.25      0.50      0.33     10000
weighted avg       0.25      0.50      0.33     10000

Epoch  2 / 4


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


validation score : 0.5028
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4972
           1       0.50      1.00      0.67      5028

    accuracy                           0.50     10000
   macro avg       0.25      0.50      0.33     10000
weighted avg       0.25      0.50      0.34     10000

Epoch  3 / 4


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


validation score : 0.4972
              precision    recall  f1-score   support

           0       0.50      1.00      0.66      4972
           1       0.00      0.00      0.00      5028

    accuracy                           0.50     10000
   macro avg       0.25      0.50      0.33     10000
weighted avg       0.25      0.50      0.33     10000

Epoch  4 / 4
validation score : 0.5028
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4972
           1       0.50      1.00      0.67      5028

    accuracy                           0.50     10000
   macro avg       0.25      0.50      0.33     10000
weighted avg       0.25      0.50      0.34     10000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [94]:
torch.save(model.state_dict(), "bert_classifier.pth"

SyntaxError: incomplete input (4087485091.py, line 1)

In [99]:
test_text   =  " The movie Devara is so not good, but people are liking it  and it also has some mixed opinion and I don't like that movie"
sentiment = predict_sentiment(test_text , model , tokenizer , device)
print(test_text)
print(f'Predicted Sentiment :')
sentiment

 The movie Devara is so not good, but people are liking it  and it also has some mixed opinion and I don't like that movie
Predicted Sentiment :


'negative'

In [100]:
test_text   =  " The movie Devara is way better than the old films , but people are liking it  and it also has some mixed opinion and I like that movie"
sentiment = predict_sentiment(test_text , model , tokenizer , device)
print(test_text)
print(f'Predicted Sentiment :')
sentiment

 The movie Devara is way better than the old films , but people are liking it  and it also has some mixed opinion and I like that movie
Predicted Sentiment :


'negative'

In [103]:
type(labels)

list

In [104]:
labels.count(0)

25000

In [105]:
labels.count(1)

25000