In [None]:
# Importing dependencies
import pandas as pd
import torch

from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.optim as optim
import re

from sklearn.metrics import classification_report, accuracy_score
from transformers import (BertTokenizer,
                          get_linear_schedule_with_warmup,
                          set_seed,
                          AutoModelForSequenceClassification)

# Progress bar
from tqdm.notebook import tqdm


# Hyperparameters and run configuration shared by the cells below
set_seed(1234)  # fix the random seed before anything stochastic runs
max_len = 128  # max_length given to the tokenizer; inputs are padded/truncated to it
batch_size = 32  # batch size used for both the training and validation DataLoader
epochs = 5  # upper bound on the number of training epochs
n_classes = 2  # passed as num_labels when the classification model is created
learning_rate = 1e-5  # learning rate handed to AdamW
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available
model_name = "bert-base-uncased"  # checkpoint name used for both the model and the tokenizer


In [None]:
# Read the training CSV, discard the two columns that are not used, and give
# the three remaining columns fixed names.
dataset = pd.read_csv('data/train.csv').drop(columns=['keyword', 'location'])
dataset.columns = ['id', 'text', 'target']

# Normalise a tweet: lower-case it, then strip Twitter handles and links.
def clean_text(text):
    """Return a lower-cased copy of ``text`` with @handles and URLs removed.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned string. Only the matched handles and URLs are deleted, so
        the whitespace that surrounded them is left untouched.
    """
    text = text.lower()
    # Handles: '@' followed by letters, digits or underscores.
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
    # Links: consume everything up to the next whitespace. A narrower character
    # class would stop at '-', '?', '=', '_', '%', ... and leave the tail of
    # the URL behind in the text.
    text = re.sub(r'https?://\S+', '', text)
    return text

# Clean every tweet; the 'text' column is overwritten with the cleaned strings.
dataset['text'] = dataset['text'].apply(clean_text)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per requested item.

    Args:
        tweets: Iterable of tweet strings (list, pandas Series, ...).
        targets: Iterable of integer class labels, in the same order as
            ``tweets``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, tweets, targets, tokenizer, max_length):
        # Materialise both inputs as plain lists so __getitem__ always indexes
        # by position. A pandas Series is indexed by label instead, which
        # raises KeyError whenever its index is not exactly 0..n-1 (for
        # example straight after a train/test split).
        self.tweets = list(tweets)
        self.targets = list(targets)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th tweet and return it with its label.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs flattened to 1-D) and ``'label'`` (a ``torch.long`` scalar
            tensor).
        """
        tweet = self.tweets[idx]
        target = self.targets[idx]

        encoding = self.tokenizer(tweet, padding='max_length',
                                  truncation=True, max_length=self.max_length,
                                  return_tensors='pt')

        # The tokenizer output is flattened so each item is 1-D.
        return {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'label': torch.tensor(target, dtype=torch.long)}



In [None]:
# Defining training function and evaluation function

def train(model, train_loader, optimizer, scheduler, device):
    """Run one training epoch and report its accuracy and mean batch loss.

    Args:
        model: Called as ``model(input_ids, attention_mask=...)``; its output
            must expose ``.logits``.
        train_loader: Iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Scheduler stepped once per batch, right after the optimizer.
        device: Device every batch tensor is moved to.

    Returns:
        ``(accuracy, avg_loss)``: accuracy over all examples seen during the
        epoch, and the sum of per-batch losses divided by the batch count.
    """
    model.train()

    running_loss = 0
    predicted = []
    expected = []

    for batch in tqdm(train_loader, total = len(train_loader)):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        gold = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(ids, attention_mask=mask).logits

        batch_loss = torch.nn.functional.cross_entropy(logits, gold)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()
        # Index of the largest logit along dim 1 is the predicted class.
        predicted.extend(torch.max(logits, 1)[1].tolist())
        expected.extend(gold.tolist())

    mean_loss = running_loss / len(train_loader)
    return accuracy_score(expected, predicted), mean_loss

def evaluate(model, val_loader, device):
    """Score the model on a validation loader without computing gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask=...)``; its output
            must expose ``.logits``.
        val_loader: Iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device every batch tensor is moved to.

    Returns:
        ``(accuracy, report, avg_loss)``: accuracy over all examples, the
        result of ``classification_report`` on the same labels, and the sum of
        per-batch losses divided by the batch count.
    """
    model.eval()

    predicted = []
    expected = []
    running_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, total = len(val_loader)):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['label'].to(device)

            logits = model(ids, attention_mask=mask).logits
            running_loss += torch.nn.functional.cross_entropy(logits, gold).item()

            # Index of the largest logit along dim 1 is the predicted class.
            predicted.extend(torch.max(logits, 1)[1].tolist())
            expected.extend(gold.tolist())

    mean_loss = running_loss / len(val_loader)
    acc = accuracy_score(expected, predicted)
    summary = classification_report(expected, predicted)
    return acc, summary, mean_loss

In [None]:
# Setting up the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_classes)
model.to(device)

tokenizer = BertTokenizer.from_pretrained(model_name)

# Splitting data into training and validation (20% held out, fixed random_state).
trainset, evalset = train_test_split(dataset, test_size=0.2, random_state = 69)

# Reset both frames to a 0..n-1 index so the integer indices requested by the
# DataLoader line up with the rows.
trainset = trainset.reset_index(drop=True)
evalset = evalset.reset_index(drop=True)

# Creating datasets. The tweet text is stored in the 'text' column: the frame's
# columns are named 'id', 'text' and 'target', so there is no 'tweet' column.
train_dataset = TextDataset(tweets = trainset['text'], targets = trainset['target'], tokenizer = tokenizer, max_length = max_len)
val_dataset = TextDataset(tweets = evalset['text'], targets = evalset['target'], tokenizer = tokenizer, max_length = max_len)

In [None]:
# Creating dataLoaders
# Training batches are drawn in shuffled order; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# AdamW over all model parameters at the configured learning rate.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
# The scheduler is stepped once per batch inside train(), so its horizon is
# (batches per epoch) * (number of epochs).
total_steps = len(train_loader) * epochs
# Linear schedule with zero warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Training the model
# Per-epoch history of losses and accuracies for both splits.
all_loss = {'train_loss': [], 'val_loss': []}
all_acc = {'train_acc': [], 'val_acc': []}

# Early-stopping state: stop once the validation loss has failed to beat its
# best value so far for `patience` consecutive epochs.
epochs_with_no_improve = 0
patience = 2
best_val_loss = float('inf')


print('Training')
for epoch in tqdm(range(epochs)):
    print()
    print(f'Epoch {epoch+1}/{epochs}')

    print('Training on batches')
    train_acc, train_loss = train(model, train_loader, optimizer, scheduler, device)
    all_loss['train_loss'].append(train_loss)
    all_acc['train_acc'].append(train_acc)
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")

    print('Evaluating on batches')
    val_acc, report, val_loss = evaluate(model, val_loader, device)
    all_loss['val_loss'].append(val_loss)
    all_acc['val_acc'].append(val_acc)
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    print(report)

    # Early stopping. Compare against the best loss of the *previous* epochs:
    # val_loss has already been appended to all_loss['val_loss'], so testing
    # it against min() of that list could never be strictly smaller and every
    # epoch would be counted as "no improvement".
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_with_no_improve = 0
    else:
        epochs_with_no_improve += 1
        if epochs_with_no_improve >= patience:
            print('Early stopping')
            break

# Saving
# NOTE(review): this stores the weights from the last epoch that ran, which is
# not necessarily the epoch with the best validation loss. The target is an
# absolute path at the filesystem root and torch.save does not create missing
# directories — confirm the location is intended and exists before training.
torch.save(model.state_dict(), '/Trained models/BertModel.pt')