## Install required packages and import them

In [None]:
# !pip install transformers
# !pip install datasets
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, RobertaForSequenceClassification, RobertaTokenizer
# Sanity check: as the cell's last expression, this displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

### In case the data is stored in Google Drive, you can mount the drive and give the path

---



In [None]:
# Colab-only step: mount Google Drive into the runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd '/content/drive/MyDrive/text/'

/content/drive/MyDrive/text


### Load the data with the Hugging Face `datasets` package
### Also load the pre-trained `roberta-base` model

In [None]:
# Download the tweet sentiment dataset and turn each split into a pandas DataFrame.
dataset = load_dataset('mteb/tweet_sentiment_extraction')
train = dataset['train'].to_pandas()
test = dataset['test'].to_pandas()

# Separate the raw tweet text from its label, split by split.
train_tweets, train_labels = train['text'].values, train['label'].values
test_tweets, test_labels = test['text'].values, test['label'].values

# Tokenizer and classifier are loaded from the same checkpoint; the classifier
# is configured with three output labels.
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=3)



  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

### Define the torch device, tokenize the train and test data, and create batched DataLoaders

In [None]:
# Let's see if Colab is generous to give us GPU :)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'The device used is {device}.........')
model.to(device)

BATCH_SIZE = 32


def _encode_split(texts, labels, max_length=128):
    """Tokenize one data split and pack it into a TensorDataset.

    Args:
        texts: Sequence of raw strings to tokenize.
        labels: Array-like of labels aligned with ``texts``.
        max_length: Sequences longer than this many tokens are truncated.

    Returns:
        A ``(encodings, tensor_dataset)`` pair, where ``tensor_dataset`` yields
        ``(input_ids, attention_mask, label)`` triples.
    """
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_length)
    tensors = TensorDataset(torch.tensor(encodings['input_ids']),
                            torch.tensor(encodings['attention_mask']),
                            torch.tensor(labels))
    return encodings, tensors


# Convert the data to tensors and create dataloaders
train_encodings, train_dataset = _encode_split(train_tweets, train_labels)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation data is deliberately NOT shuffled: ordering has no effect on the
# metrics, and a fixed order keeps collected predictions aligned with the rows
# of the test split.
test_encodings, test_dataset = _encode_split(test_tweets, test_labels)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Set up optimizer, scheduler, and loss function
# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW; the torch version has no `correct_bias` switch, so the
# original optimizer is kept here to avoid silently changing the training recipe.
optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)
# StepLR with step_size=1 and gamma=0.1 multiplies the learning rate by 0.1 on
# every scheduler.step() call.
# NOTE(review): confirm such an aggressive decay is intended.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
loss_fn = nn.CrossEntropyLoss()

## Train the Model (Roberta)

In [None]:
# Train the model
# One constant drives both the logging condition and the loss averaging, so the
# printed loss is a true mean over the steps since the previous report.
LOG_EVERY = 100

model.train()
epochs = 5
for epoch in range(epochs):
    running_loss = 0.0          # summed loss since the last report
    correct_predictions = 0     # cumulative over the current epoch
    total_predictions = 0       # cumulative over the current epoch
    for step, batch in enumerate(train_loader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        predicted_labels = logits.argmax(dim=1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted_labels == labels).sum().item()

        running_loss += loss.item()

        if (step + 1) % LOG_EVERY == 0:
            accuracy = correct_predictions / total_predictions
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.4f}'.format(
                epoch + 1, epochs, step + 1, len(train_loader),
                running_loss / LOG_EVERY, accuracy))
            running_loss = 0.0

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

print('Training complete.')

### Evaluate the model on the test data

In [None]:
# Evaluate on the test loader.
# The counters are reset first: otherwise they still hold the totals accumulated
# during the last training epoch and the reported accuracy would mix train and
# test examples.
preds = []        # one list of predicted class ids per batch
y_labels = []     # one list of true class ids per batch
correct_predictions = 0
total_predictions = 0
running_loss = 0.0

# Switch off dropout for inference; the training cell calls model.train() again
# before it starts, so re-running training afterwards is unaffected.
model.eval()

# No gradients are needed for evaluation, so skip building the autograd graph.
with torch.no_grad():
    for step2, batch in enumerate(test_loader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        logits = outputs.logits

        predicted_labels = logits.argmax(dim=1)
        total_predictions += labels.size(0)
        preds.append(predicted_labels.cpu().numpy().tolist())
        y_labels.append(labels.cpu().numpy().tolist())

        correct_predictions += (predicted_labels == labels).sum().item()

        running_loss += loss.item()

In [None]:
# Fraction of counted predictions that matched their labels.
overall_accuracy = correct_predictions / total_predictions
print(overall_accuracy)