In [None]:
# Importing dependencies
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from ml_things import plot_dict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import (get_linear_schedule_with_warmup,set_seed,)

# Progress bar
from tqdm.notebook import tqdm


# Fix the random seed first so every later stochastic step is repeatable.
set_seed(468445)

# Pretrained checkpoint to fine-tune and the device everything runs on
# (GPU when one is visible to PyTorch, otherwise CPU).
model_name = "vinai/bertweet-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper parameters
n_classes = 2          # number of target classes
max_len = 128          # tokens per tweet after padding / truncation
batch_size = 32
epochs = 2
learning_rate = 1e-5

# Module-level loss object (cross entropy over the class logits).
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Read the training CSV and discard the columns that are not used for modelling.
dataset = pd.read_csv('data/train.csv').drop(columns=['id', 'keyword', 'location'])

# The two remaining columns are renamed positionally: text first, label second.
dataset.columns = ['tweet', 'target']

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per requested item.

    Args:
        tweets: Iterable of tweet texts (list, array or pandas Series).
        targets: Iterable of class labels, aligned with ``tweets`` by position.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors='pt')`` that returns a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_lenght: Length every encoded tweet is padded / truncated to.
            (The misspelled name is kept because callers pass it by keyword.)
    """

    def __init__(self, tweets, targets, tokenizer, max_lenght):
        # Materialise both inputs as plain lists so that __getitem__ always
        # indexes by POSITION. A pandas Series indexes integers by LABEL, which
        # raises KeyError whenever the index is not exactly 0..n-1 (for example
        # right after a shuffle/split that was not followed by reset_index).
        self.tweets = list(tweets)
        self.targets = list(targets)
        self.tokenizer = tokenizer
        self.max_lenght = max_lenght

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the encoded tweet at position ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and a
            scalar ``'labels'`` tensor of dtype ``torch.long``.
        """
        tweet = self.tweets[idx]
        target = self.targets[idx]

        encoding = self.tokenizer(tweet, padding='max_length',
                                  truncation=True, max_length=self.max_lenght,
                                  return_tensors='pt')

        # The tokenizer is asked for 'pt' tensors for a single text; flatten()
        # drops the leading batch dimension so the DataLoader can stack items.
        return {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'labels': torch.tensor(target, dtype=torch.long)}


In [None]:
def train(model, train_loader, optimizer, scheduler, device):
    """Run one training pass over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes a ``.logits`` tensor.
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
            It does not need to define ``__len__``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, avg_loss)`` as Python floats: the fraction of correctly
        classified samples and the mean of the per-batch losses.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    # Batches are counted while iterating instead of calling len(train_loader),
    # so loaders without __len__ (generators, iterable-style datasets) work too.
    num_batches = 0

    for batch in tqdm(train_loader):
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=inputs, attention_mask=attention_mask)
        logits = outputs.logits  # Extract logits from SequenceClassifierOutput

        loss = torch.nn.functional.cross_entropy(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        _, preds = torch.max(logits, dim=1)
        # .item() keeps the running counters as plain Python ints, so the final
        # division never depends on the counter having turned into a tensor.
        correct_predictions += torch.sum(preds == labels).item()
        total_predictions += labels.size(0)
        num_batches += 1

    if num_batches == 0 or total_predictions == 0:
        raise ValueError("train_loader yielded no samples; cannot compute training metrics")

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_predictions
    return accuracy, avg_loss

def evaluate(model, val_loader, device):
    """Score ``model`` on ``val_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes a ``.logits`` tensor.
        val_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
            It does not need to define ``__len__``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, avg_loss, report)``: accuracy and mean per-batch loss as
        Python floats, plus the value returned by ``classification_report``
        for all collected labels and predictions.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    # Counted while iterating so loaders without __len__ are supported.
    num_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=inputs, attention_mask=attention_mask)
            logits = outputs.logits  # Extract logits from SequenceClassifierOutput

            loss = torch.nn.functional.cross_entropy(logits, labels)
            total_loss += loss.item()

            _, preds = torch.max(logits, dim=1)
            # Plain Python ints: no reliance on the counter becoming a tensor.
            correct_predictions += torch.sum(preds == labels).item()
            total_predictions += labels.size(0)
            num_batches += 1

            # Moved to CPU so the report can be built from host-side values.
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if num_batches == 0 or total_predictions == 0:
        raise ValueError("val_loader yielded no samples; cannot compute evaluation metrics")

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_predictions
    report = classification_report(all_labels, all_preds)
    return accuracy, avg_loss, report

In [None]:
# Setting up the model
# The size of the classification head is taken from the `n_classes` hyper
# parameter so the number of labels is configured in a single place.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_classes)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.to(device)

# Optional partial fine-tuning (disabled): uncomment both loops below to train
# only the last encoder layer instead of the whole network.
# freeze all the parameters
#for param in model.parameters():
#    param.requires_grad = False

#unfreeze the last layer
#for param in model.roberta.encoder.layer[-1].parameters():
#    param.requires_grad = True

# Splitting data into training (80%) and validation (20%); random_state is
# fixed so the split is the same on every run.
trainset, evalset = train_test_split(dataset, test_size=0.2, random_state = 69)

# Resetting indexes so each split is labelled 0..n-1 again after the shuffle
# (avoids KeyErrors on integer lookups).
trainset = trainset.reset_index(drop=True)
evalset = evalset.reset_index(drop=True)

# Creating datasets
train_dataset = TextDataset(tweets = trainset['tweet'], targets = trainset['target'], tokenizer = tokenizer, max_lenght = max_len)
val_dataset = TextDataset(tweets = evalset['tweet'], targets = evalset['target'], tokenizer = tokenizer, max_lenght = max_len)

In [None]:
# Batch the datasets: training batches are reshuffled on every pass, while the
# validation order stays fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# AdamW over every model parameter.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# The scheduler is stepped once per batch, so its horizon is the total number
# of batches across all epochs; no warm-up steps are used.
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tune for `epochs` passes, recording loss and accuracy after each one.
all_loss = {'train_loss': [], 'val_loss': []}
all_acc = {'train_acc': [], 'val_acc': []}

print('Training')
for epoch in tqdm(range(epochs)):
    print()
    print(f'Epoch {epoch+1}/{epochs}')

    # --- optimisation pass over the training split ---
    print('Training on batches')
    train_acc, train_loss = train(model, train_loader, optimizer, scheduler, device)
    all_acc['train_acc'].append(train_acc)
    all_loss['train_loss'].append(train_loss)
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")

    # --- scoring pass over the validation split ---
    print('Evaluating on batches')
    val_acc, val_loss, report = evaluate(model, val_loader, device)
    all_acc['val_acc'].append(val_acc)
    all_loss['val_loss'].append(val_loss)
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    print(report)


# Learning curves: accuracy first, then loss (train solid, validation dashed).
plot_dict(
    all_acc,
    use_xlabel='Epochs',
    use_ylabel='Value',
    use_linestyles=['-', '--'],
)
plot_dict(
    all_loss,
    use_xlabel='Epochs',
    use_ylabel='Value',
    use_linestyles=['-', '--'],
)

In [None]:
#torch.save(model.state_dict(), 'BerTweetModel.pt')

In [None]:
# Predicting on the test set and writing the submission file
test_data = pd.read_csv('data/test.csv')
test_data = test_data.drop(columns=['keyword', 'location'])

# Renaming the columns (positional: identifier first, text second)
test_data.columns = ['id', 'tweet']

# Creating a dataset. TextDataset requires targets, so zeros are passed as
# placeholders; the 'labels' entry of each batch is never read below.
test_dataset = TextDataset(tweets = test_data['tweet'], targets = np.zeros(len(test_data)), tokenizer = tokenizer, max_lenght = max_len)
# shuffle=False keeps predictions in the same order as the ids in test_data.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Running the model on the test set
model.eval()
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=inputs, attention_mask=attention_mask)
        logits = outputs.logits  # Extract logits from SequenceClassifierOutput

        _, preds = torch.max(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())

# Saving the predictions with corresponding ids.
# The file goes to a project-relative directory (like the 'data/' inputs) that
# is created on demand; the previous absolute '/submissions/...' path pointed at
# the filesystem root and failed whenever that directory did not exist.
submission = pd.DataFrame({'id': test_data['id'], 'target': all_preds})
submission_path = Path('submissions') / 'submission1.csv'
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)