In [15]:
!pip install torch

Collecting torch
  Downloading torch-2.0.1-cp310-cp310-win_amd64.whl (172.3 MB)
     -------------------------------------- 172.3/172.3 MB 1.7 MB/s eta 0:00:00
Collecting sympy (from torch)
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
     ---------------------------------------- 5.7/5.7 MB 10.5 MB/s eta 0:00:00
Collecting mpmath>=0.19 (from sympy->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ------------------------------------- 536.2/536.2 kB 11.4 MB/s eta 0:00:00
Installing collected packages: mpmath, sympy, torch
Successfully installed mpmath-1.3.0 sympy-1.12 torch-2.0.1



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import torch.optim as optim

# Set the device to GPU if available, otherwise CPU
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

In [26]:
df = pd.read_csv('train.csv')

In [27]:
df.head()

Unnamed: 0,label,tweet
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [28]:
df['label'] = df['label'].replace({'Positive': 1, 'Negative': 0, 'Neutral':3, 'Irrelevant':4})

In [29]:
df

Unnamed: 0,label,tweet
0,1,im getting on borderlands and i will murder yo...
1,1,I am coming to the borders and I will kill you...
2,1,im getting on borderlands and i will kill you ...
3,1,im coming on borderlands and i will murder you...
4,1,im getting on borderlands 2 and i will murder ...
...,...,...
73991,1,Just realized that the Windows partition of my...
73992,1,Just realized that my Mac window partition is ...
73993,1,Just realized the windows partition of my Mac ...
73994,1,Just realized between the windows partition of...


In [30]:
train_X, temp_X, train_y, temp_y = train_test_split(df['tweet'], df['label'],random_state=42, test_size=0.3, stratify=df['label'])



In [31]:
val_X, test_X, val_y, test_y = train_test_split(temp_X, temp_y,random_state=42, test_size=0.5, stratify=temp_y)

In [None]:
class SentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):
        tweet = self.dataframe.loc[index, 'tweet']
        label = self.dataframe.loc[index, 'label']
        encoding = self.tokenizer.encode_plus(text, add_special_tokens=True, max_length=32, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]
        return input_ids, attention_mask, label

In [32]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████| 232k/232k [00:00<00:00, 559kB/s]
Downloading (…)okenizer_config.json: 100%|██████████████████████████████████████████████████| 28.0/28.0 [00:00<?, ?B/s]


In [33]:
train_encodings = tokenizer(list(train_X), truncation=True, padding=True, max_length=100)
val_encodings = tokenizer(list(val_X), truncation=True, padding=True,max_length=100)
test_encodings = tokenizer(list(test_X), truncation=True, padding=True,max_length=100)


In [34]:
# Convert labels to tensors
train_y = torch.tensor(train_y.values)
val_y = torch.tensor(val_y.values)
test_y = torch.tensor(test_y.values)

In [35]:
train_X = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']),torch.tensor(train_encodings['attention_mask']),train_y)
val_X = torch.utils.data.TensorDataset(torch.tensor(val_encodings['input_ids']),torch.tensor(val_encodings['attention_mask']),val_y)
test_X = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']),torch.tensor(test_encodings['attention_mask']),test_y)

In [36]:
batch_size = 64
train_loader = DataLoader(train_X, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_X, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_X, batch_size=batch_size, shuffle=True)

In [37]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x219686caa40>

In [38]:
import torch
import torch.nn.functional as F

def train(model, optimizer, criterion, train_loader, device):
    model.train()
    train_loss = 0.0
    total_correct = 0
    total_samples = 0
    for i, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids,attention_mask=attention_mask,labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        preds = outputs.logits
        preds = torch.argmax(preds, dim=1)
        total_correct += (preds == labels).sum().item()
        total_samples += len(labels)

        if (i+1) % 250 == 0:
            print('Loss at ',i, "th batch is: ",loss.item())
            
        loss.backward()
        optimizer.step()
    
    train_acc = total_correct / total_samples
    train_loss = train_loss/len(train_loader)    
    return train_loss,train_acc


def validate(model, criterion, val_loader, device):
    model.eval()
    val_loss = 0.0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask = attention_mask,labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            preds = outputs.logits
            preds = torch.argmax(preds, dim=1)
            total_correct += (preds == labels).sum().item()
            total_samples += len(labels)
            
            if(i%10 == 0):
                print("Preds: ",preds)
                print("Truth: ",labels)
            
    val_acc = total_correct / total_samples
    val_loss /= len(val_loader)
    print("VAL accuracy at epoch: ", val_acc)
    return val_loss, val_acc

In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

def test_(model, criterion, test_loader, device):
    model.eval()
    test_loss = 0.0
    total_correct = 0
    total_samples = 0
    true_labels = []
    predicted_labels = []
    
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            test_loss += loss.item()
            
            preds = outputs.logits
            preds = torch.argmax(preds, dim=1)
            
            true_labels.extend(labels.tolist())
            predicted_labels.extend(preds.tolist())
            
            total_correct += (preds == labels).sum().item()
            total_samples += len(labels)
            
            if(i%10 == 0):
                print("Preds: ",preds)
                print("Truth: ",labels)
            
    test_acc = total_correct / total_samples
    test_loss /= len(test_loader)
    
    # Calculate accuracy, precision, recall and F1 score
    accuracy = accuracy_score(true_labels, predicted_labels)
    
    print("TEST Accuracy: {:.4f}".format(accuracy))
   
    
    
    
    return test_loss, test_acc

In [40]:
num_epochs = 3
criterion = nn.CrossEntropyLoss()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

opt = optim.Adam(model.parameters(), lr=5e-7, eps=1e-8)
model.to('cpu')
TRAIN_LOSS = []
VAL_LOSS = []
TRAIN_ACC = []
VAL_ACC = []

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
for i in range(num_epochs):
    train_loss,train_acc = train(model,opt,criterion,train_loader,'cpu')
    val_loss,val_acc = validate(model,criterion,val_loader,'cpu')
    TRAIN_LOSS.append(train_loss)
    VAL_LOSS.append(val_loss)
    TRAIN_ACC.append(train_acc)
    VAL_ACC.append(val_acc)

IndexError: Target 4 is out of bounds.