In [20]:
!pip install transformers



In [21]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import torch.optim as optim

# Seed PyTorch's random number generators so that weight initialisation of the
# new classification head and DataLoader shuffling are repeatable across runs.
# The value matches the random_state used for the train/validation/test splits.
SEED = 42
torch.manual_seed(SEED)

# Set the device to GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [22]:
# Load the labelled tweets from the Kaggle input dataset.
train_csv_path = '/kaggle/input/sentiment-analysis/train.csv'
df = pd.read_csv(train_csv_path)

In [23]:
# Peek at the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,label,tweet
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [24]:
# Encode the four sentiment classes as integer ids.
# The ids are contiguous and start at 0 because a cross-entropy loss treats
# each label as an index into the model's logits; a classifier for this data
# therefore needs exactly len(label_to_id) outputs.
label_to_id = {'Negative': 0, 'Positive': 1, 'Neutral': 2, 'Irrelevant': 3}
df['label'] = df['label'].replace(label_to_id)

In [25]:
# Show the frame again to confirm the label column is now integer-coded.
df

Unnamed: 0,label,tweet
0,1,im getting on borderlands and i will murder yo...
1,1,I am coming to the borders and I will kill you...
2,1,im getting on borderlands and i will kill you ...
3,1,im coming on borderlands and i will murder you...
4,1,im getting on borderlands 2 and i will murder ...
...,...,...
73991,1,Just realized that the Windows partition of my...
73992,1,Just realized that my Mac window partition is ...
73993,1,Just realized the windows partition of my Mac ...
73994,1,Just realized between the windows partition of...


In [26]:
# Stratified 70/30 split: 70% of the tweets are used for training and the
# remaining 30% are held out to be divided into validation and test sets.
train_X, temp_X, train_y, temp_y = train_test_split(
    df['tweet'],
    df['label'],
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)



In [27]:
# Halve the held-out portion into equally sized validation and test sets,
# again stratified by label.
val_X, test_X, val_y, test_y = train_test_split(
    temp_X,
    temp_y,
    test_size=0.5,
    random_state=42,
    stratify=temp_y,
)

In [28]:

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

In [29]:
# Tokenize every split with the same settings: truncate to at most 100 tokens
# and pad the sequences of a split to a common length.
encode_options = {'truncation': True, 'padding': True, 'max_length': 100}
train_encodings = tokenizer(list(train_X), **encode_options)
val_encodings = tokenizer(list(val_X), **encode_options)
test_encodings = tokenizer(list(test_X), **encode_options)


In [30]:
# Convert labels to tensors.
# np.asarray accepts both the pandas Series produced by train_test_split and
# an already-converted CPU tensor, so re-running this cell is harmless
# (on a tensor, `.values` is a method rather than the underlying label array).
train_y = torch.tensor(np.asarray(train_y))
val_y = torch.tensor(np.asarray(val_y))
test_y = torch.tensor(np.asarray(test_y))

In [31]:
def _as_tensor_dataset(encodings, labels):
    """Bundle token ids, attention masks and labels into one TensorDataset."""
    return torch.utils.data.TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        labels,
    )


# Each dataset item is an (input_ids, attention_mask, label) triple.
train_X = _as_tensor_dataset(train_encodings, train_y)
val_X = _as_tensor_dataset(val_encodings, val_y)
test_X = _as_tensor_dataset(test_encodings, test_y)

In [32]:
batch_size = 32
# Only the training data is reshuffled every epoch. Shuffling brings no benefit
# for evaluation, and a fixed order keeps the sample predictions printed by
# validate()/test_() comparable between epochs and runs.
train_loader = DataLoader(train_X, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_X, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_X, batch_size=batch_size, shuffle=False)

In [33]:
# Sanity check: echo the loader object to confirm it was constructed.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x77fef3c72e90>

In [40]:
import torch
import torch.nn.functional as F

def train(model, optimizer, criterion, train_loader, device):
    """Run one training epoch over `train_loader`.

    Args:
        model: module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes ``.loss`` and ``.logits``.
        optimizer: optimizer updating the model's parameters.
        criterion: accepted for interface compatibility; not used here because
            the loss is read from the model output.
        train_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: device the batch tensors are moved to.

    Returns:
        ``(train_loss, train_acc)``: the mean of the per-batch losses and the
        fraction of correctly classified samples over the epoch.
    """
    model.train()
    running_loss = 0.0
    n_correct = 0
    n_seen = 0

    for step, (input_ids, attention_mask, labels) in enumerate(train_loader, start=1):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        batch_loss = outputs.loss
        running_loss += batch_loss.item()

        # The predicted class is the index of the largest logit per sample.
        predicted = outputs.logits.argmax(dim=1)
        n_correct += (predicted == labels).sum().item()
        n_seen += labels.size(0)

        # Progress report every 250 batches (reported with the 0-based index).
        if step % 250 == 0:
            print('Loss at ', step - 1, "th batch is: ", batch_loss.item())

        batch_loss.backward()
        optimizer.step()

    train_acc = n_correct / n_seen
    train_loss = running_loss / len(train_loader)
    return train_loss, train_acc


def validate(model, criterion, val_loader, device):
    """Evaluate `model` on `val_loader` without computing gradients.

    Args:
        model: module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes ``.loss`` and ``.logits``.
        criterion: accepted for interface compatibility; not used here because
            the loss is read from the model output.
        val_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: device the batch tensors are moved to.

    Returns:
        ``(val_loss, val_acc)``: the mean of the per-batch losses and the
        fraction of correctly classified samples.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_idx, (input_ids, attention_mask, labels) in enumerate(val_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss_sum += outputs.loss.item()

            # The predicted class is the index of the largest logit per sample.
            predicted = outputs.logits.argmax(dim=1)
            n_correct += (predicted == labels).sum().item()
            n_seen += labels.size(0)

            # Print predictions next to the ground truth for every tenth batch.
            if batch_idx % 10 == 0:
                print("Preds: ", predicted)
                print("Truth: ", labels)

    val_acc = n_correct / n_seen
    val_loss = loss_sum / len(val_loader)
    print("VAL accuracy at epoch: ", val_acc)
    return val_loss, val_acc

In [35]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

def test_(model, criterion, test_loader, device):
    """Evaluate `model` on `test_loader` and print test-set metrics.

    Prints accuracy plus macro-averaged precision, recall and F1 computed over
    all test samples, and echoes predictions next to the ground truth for
    every tenth batch.

    Args:
        model: module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes ``.loss`` and ``.logits``.
        criterion: accepted for interface compatibility; not used here because
            the loss is read from the model output.
        test_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: device the batch tensors are moved to.

    Returns:
        ``(test_loss, test_acc)``: the mean of the per-batch losses and the
        fraction of correctly classified samples.
    """
    model.eval()
    test_loss = 0.0
    total_correct = 0
    total_samples = 0
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            test_loss += outputs.loss.item()

            # The predicted class is the index of the largest logit per sample.
            preds = torch.argmax(outputs.logits, dim=1)

            # Collect plain Python ints so scikit-learn can score them later.
            true_labels.extend(labels.tolist())
            predicted_labels.extend(preds.tolist())

            total_correct += (preds == labels).sum().item()
            total_samples += len(labels)

            if i % 10 == 0:
                print("Preds: ", preds)
                print("Truth: ", labels)

    test_acc = total_correct / total_samples
    test_loss /= len(test_loader)

    # Calculate accuracy, precision, recall and F1 score.
    # Macro averaging weights every class equally; zero_division=0 scores a
    # class that was never predicted as 0 instead of emitting a warning.
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predicted_labels, average='macro', zero_division=0
    )

    print("TEST Accuracy: {:.4f}".format(accuracy))
    print("TEST Precision (macro): {:.4f}".format(precision))
    print("TEST Recall (macro): {:.4f}".format(recall))
    print("TEST F1 (macro): {:.4f}".format(f1))

    return test_loss, test_acc

In [36]:
num_epochs = 3
# Passed to train()/validate() to satisfy their signatures; those functions
# read the loss from the model output and never call this criterion.
criterion = nn.CrossEntropyLoss()

# The classification head needs one output per class id. Labels are used as
# indices into the logits, so the head must cover every id from 0 up to the
# largest one present in the encoded label column.
num_labels = int(df['label'].max()) + 1
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# NOTE(review): lr=5e-7 is far below the 2e-5 to 5e-5 range commonly used for
# fine-tuning BERT; with only 3 epochs the model may barely move - confirm
# this value is intentional.
opt = optim.Adam(model.parameters(), lr=5e-7, eps=1e-8)
model.to(device)

# Per-epoch metric histories, appended to by the training loop.
TRAIN_LOSS = []
VAL_LOSS = []
TRAIN_ACC = []
VAL_ACC = []

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [41]:
# Fine-tune for `num_epochs` passes over the training data, validating after
# each pass and recording the per-epoch metrics.
for i in range(num_epochs):
    train_loss, train_acc = train(model, opt, criterion, train_loader, device)
    val_loss, val_acc = validate(model, criterion, val_loader, device)

    # Pair every history list with the value measured in this epoch.
    epoch_metrics = (
        (TRAIN_LOSS, train_loss),
        (VAL_LOSS, val_loss),
        (TRAIN_ACC, train_acc),
        (VAL_ACC, val_acc),
    )
    for history, value in epoch_metrics:
        history.append(value)