## Imports

In [1]:
import torch
import matplotlib.pyplot as plt
import wandb
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.optim as optim
import time 
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, RobertaConfig, RobertaForSequenceClassification, XLNetForSequenceClassification, BertForSequenceClassification, ConvBertForSequenceClassification, XLMRobertaForSequenceClassification
from sklearn.model_selection import train_test_split



## Initialisations

In [2]:
# Notebook-wide configuration.
SEED = 10
MAX_LENGTH = 190
BATCH_SIZE = 32

# Seed both random number generators used in this notebook.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda:0


## Data Import

In [3]:
# Read the processed review table and pull out the raw review texts.
df = pd.read_csv('/kaggle/input/opinrank-dataset-processed/ModelTrain.csv')
data = df['Review'].values.tolist()

# Binary targets: 'POSITIVE' maps to 1, every other value maps to 0.
labels = [
    1 if sentiment == 'POSITIVE' else 0
    for sentiment in df['Sentiment'].values.tolist()
]

# Hold out 20% of the samples for validation, seeded for repeatability.
x_train, x_val, y_train, y_val = train_test_split(
    data,
    labels,
    random_state=SEED,
    test_size=0.2,
)
data_size = dict(train=len(x_train), val=len(x_val))

In [None]:
# Display how many samples ended up in the train and validation splits.
data_size

In [4]:
class HotelsDataset(Dataset):
  """Dataset over pre-tokenized inputs and their labels.

  ``X`` must support ``X["input_ids"][i]`` and ``X["attention_mask"][i]``;
  ``Y`` is an indexable sequence of labels whose length defines the dataset size.
  """

  def __init__(self, X, Y):
    super().__init__()
    self.data, self.labels = X, Y

  def __len__(self):
    # The label sequence determines the number of samples.
    return len(self.labels)

  def __getitem__(self, index):
    # Gather the token ids, attention mask and label for one sample.
    encoded = self.data
    return dict(
      ids=encoded["input_ids"][index],
      mask=encoded["attention_mask"][index],
      labels=self.labels[index],
    )

## Train Function

In [5]:
def train(model, dataloader, opt, epochs, crit, sch=None, wb=True):
    """Fine-tune ``model`` and return it loaded with its best validation weights.

    Every epoch runs a 'train' phase followed by a 'val' phase. The loss is
    taken from the model output (``outputs.loss``), and accuracy is the share
    of samples whose arg-max logit equals the label. The state dict of the
    epoch with the highest validation accuracy is restored before returning.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        dataloader: Mapping with 'train' and 'val' keys. Each value yields
            batches that are dicts holding 'ids', 'mask' and 'labels' tensors.
        opt: Optimizer stepped once per training batch.
        epochs: Number of passes over both phases.
        crit: Unused. The loss comes from ``outputs.loss``; the parameter is
            kept so existing callers keep working.
        sch: Optional learning-rate scheduler. When given, ``sch.step()`` is
            called with no arguments once after each training phase.
        wb: When True, the model is watched and metrics are logged via wandb.

    Returns:
        The same ``model`` object, with the best validation weights loaded.
    """
    best_model_wts = copy.deepcopy(model.state_dict())
    if wb:
        wandb.watch(model)
    epoch_loss = {'train': 0.0, 'val': 0.0}
    epoch_acc = epoch_loss.copy()
    best_accuracy = 0.0
    beg = time.time()
    for epoch in range(epochs):
        print(f'Epoch {epoch}/{epochs - 1}')
        print('-' * 10)
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # Switch mode explicitly: Hugging Face `from_pretrained` hands back
            # models in eval mode, so without this dropout stays disabled while
            # training (and would stay enabled during validation for a model
            # that arrived in train mode).
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0
            total_items = 0
            pbar = tqdm(dataloader[phase])
            for inputs in pbar:
                ids = inputs['ids'].to(device)
                mask = inputs['mask'].to(device)
                labels = inputs['labels'].to(device)

                # Clear gradients left over from the previous batch.
                opt.zero_grad()
                model.zero_grad()
                with torch.set_grad_enabled(is_train):
                    outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
                    loss = outputs.loss
                    preds = outputs.logits.argmax(dim=1)

                    # Backward pass and optimizer step only while training.
                    if is_train:
                        loss.backward()
                        opt.step()

                # Accumulate plain Python numbers so metrics do not hold on to
                # device tensors and can be logged/formatted directly.
                batch_items = labels.shape[0]
                running_loss += loss.item() * batch_items
                running_corrects += (preds == labels).sum().item()
                total_items += batch_items
                pbar.set_postfix({'Accuracy': running_corrects / total_items})

            if is_train and sch is not None:
                sch.step()

            # Normalise by the number of samples actually iterated instead of a
            # notebook-level size table; guard against an empty loader.
            seen = max(total_items, 1)
            epoch_loss[phase] = running_loss / seen
            epoch_acc[phase] = running_corrects / seen

            print(f'{phase} Loss: {epoch_loss[phase]:.4f} Acc: {epoch_acc[phase]:.4f}')

            # Snapshot the weights whenever validation accuracy improves.
            if phase == 'val' and epoch_acc['val'] > best_accuracy:
                best_accuracy = epoch_acc['val']
                best_model_wts = copy.deepcopy(model.state_dict())
        if wb:
            wandb.log({'Training Loss': epoch_loss['train'], 'Validation Loss': epoch_loss['val'], 'Training Accuracy': epoch_acc['train'], 'Validation Accuracy': epoch_acc['val']})
        print()
    print('Finished Training')
    print(f'Best val Acc: {best_accuracy:.4f}')
    end = time.time()
    if wb:
        wandb.log({'Training Time': end - beg})
    print(f"Training Time: {end - beg} seconds")
    # Restore the best-performing weights before handing the model back.
    model.load_state_dict(best_model_wts)
    return model

## RoBERTa

### Init

In [16]:
# Hyperparameters for the RoBERTa run. BATCH_SIZE and MAX_LENGTH rebind the
# values assigned in the initialisation cell.
LR = 2e-5
EPOCH = 3
MAX_LENGTH = 400
BATCH_SIZE = 32

In [7]:
# Checkpoint identifier passed to both the tokenizer and the model `from_pretrained` calls.
model_path = 'roberta-base'

### Training

In [None]:
# Tokenize both splits up front, padding/truncating every review to MAX_LENGTH tokens.
tokenizer = AutoTokenizer.from_pretrained(model_path)
data_train = tokenizer(x_train, truncation=True, padding='max_length', return_tensors='pt', max_length=MAX_LENGTH)
data_val = tokenizer(x_val, truncation=True, padding='max_length', return_tensors='pt', max_length=MAX_LENGTH)

DataTrain = HotelsDataset(data_train, y_train)
DataVal = HotelsDataset(data_val, y_val)

# Reshuffle the training samples every epoch so batches differ between epochs.
# Validation order stays fixed so predictions line up with x_val / y_val.
train_dataloader = DataLoader(DataTrain, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(DataVal, batch_size=BATCH_SIZE, shuffle=False)
trainLoader = {'train': train_dataloader, 'val': val_dataloader}


model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=2).to(device)
# train() takes its loss from the model output; the criterion is passed only
# because the function signature requires the argument.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)
model = train(model, trainLoader, optimizer, EPOCH, criterion, wb=False)

In [None]:
# Serialise the whole model object (not only its state_dict) to Roberta.pt;
# the Predictions section reads this file back with torch.load.
torch.save(model, "Roberta.pt")

## Predictions

In [27]:
def val(model, loader):
    """Return predicted class indices for every sample in ``loader['val']``.

    The model is switched to eval mode for the duration of the call so that
    dropout does not perturb the predictions, and its previous mode is
    restored afterwards. Labels are not needed for prediction and are not
    passed to the model.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        loader: Mapping with a 'val' key whose value yields batches that are
            dicts holding 'ids' and 'mask' tensors.

    Returns:
        A flat list of arg-max class indices, in loader order.
    """
    was_training = model.training
    model.eval()
    y_pred = []
    try:
        with torch.no_grad():
            for inputs in tqdm(loader['val']):
                ids = inputs['ids'].to(device)
                mask = inputs['mask'].to(device)
                logits = model(input_ids=ids, attention_mask=mask).logits
                y_pred.extend(logits.argmax(dim=1).cpu().tolist())
    finally:
        # Leave the model in the mode the caller handed it over in.
        model.train(was_training)
    return y_pred


In [15]:
# Download the saved Roberta.pt checkpoint from an external file host.
# NOTE(review): this file is later read with torch.load, which unpickles it — only use a trusted source.
!wget https://bashupload.com/kdkcm/Roberta.pt

--2023-11-21 14:12:33--  https://bashupload.com/kdkcm/Roberta.pt
Resolving bashupload.com (bashupload.com)... 116.203.186.178
Connecting to bashupload.com (bashupload.com)|116.203.186.178|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 498712875 (476M) [application/octet-stream]
Saving to: ‘Roberta.pt’


2023-11-21 14:13:01 (17.7 MB/s) - ‘Roberta.pt’ saved [498712875/498712875]



In [17]:
# NOTE(security): torch.load unpickles the file and can execute arbitrary code;
# Roberta.pt is downloaded above, so load it only if that source is trusted.
# On PyTorch versions where torch.load defaults to weights_only=True, a fully
# pickled model additionally needs weights_only=False.
# map_location lets a checkpoint saved in a GPU session load on whichever
# device this runtime has; .eval() makes sure dropout is off for prediction.
model = torch.load("Roberta.pt", map_location=device).to(device).eval()

In [18]:
# Tokenize the validation split and wrap it in an unshuffled DataLoader.
# Only the 'val' key of the loader mapping is populated here.
tokenizer = AutoTokenizer.from_pretrained(model_path)
data_val = tokenizer(
    x_val,
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
DataVal = HotelsDataset(data_val, y_val)
val_dataloader = DataLoader(DataVal, shuffle=False, batch_size=BATCH_SIZE)
trainLoader = dict(val=val_dataloader)

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [28]:
# Run inference over the 'val' loader; val() returns a flat list of predicted class indices.
preds = val(model, trainLoader)

  0%|          | 0/772 [00:00<?, ?it/s]

In [29]:
# Number of predictions produced.
len(preds)

24675

In [32]:
# Peek at the first ten predicted labels.
preds[:10]

[1, 1, 1, 0, 0, 1, 0, 0, 0, 0]

In [33]:
import pickle

# Persist the list of predicted labels to the file "preds_Rob" in pickle format.
with open("preds_Rob", mode="wb") as handle:
    pickle.dump(preds, handle)