In [None]:
!pip install SentencePiece transformers

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss

import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold

In [4]:
# df3 = pd.read_csv('/tweets_prepared.csv')
df2 = pd.read_csv('/reddit_prepared_new.csv')
df1 = pd.read_csv('/gab_prepared_new.csv')

# only for hasoc
# df1 = pd.read_csv('/hasoc_train_emoji_removed.csv')
# df2 = pd.read_csv('/hasoc_test_emoji_removed.csv')

In [5]:
df1.dropna(axis=0, inplace=True)
df1.reset_index(drop=True, inplace=True)

df2.dropna(axis=0, inplace=True)
df2.reset_index(drop=True, inplace=True)

# df3.dropna(axis=0, inplace=True)
# df3.reset_index(drop=True, inplace=True)

In [None]:
df1.head()

Unnamed: 0,text,label
0,Trump was very prescient tonight when he warne...,0
1,Zuckerworm backs down again This is the jew wh...,0
2,The nigger says Reeeeeeeee Trump be 's Hitler ...,1
3,No problem Seem pissy No 100 no people who are...,1
4,What does make a black person black I know so ...,0


In [None]:
df1.drop(columns={'Unnamed: 0'}, inplace=True)
df2.drop(columns={'Unnamed: 0'}, inplace=True)

In [None]:
train_text, temp_text, train_label, temp_label = train_test_split(df3['text'], df3['label'], test_size=0.15, shuffle=True, random_state=697, stratify=df3['label'])
test_text, val_text, test_label, val_label = train_test_split(temp_text, temp_label, test_size = 0.5, shuffle=True, random_state=741, stratify=temp_label)

In [None]:
# only for HASOC data

# train_text, train_label = df1['text'], df1['label']
# test_text, test_label = df2['text'], df2['label']

In [16]:
 tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
 
 def create_inputs(text):   
    inputs = [tokenizer(str(x), add_special_tokens=True, max_length=, padding='max_length', truncation=True) for x in text ]
    # inputs
    input_ids = []
    attn_masks = []
    for input in inputs:
        input_ids.append(input['input_ids'])
        attn_masks.append(input['attention_mask'])

    return torch.tensor(input_ids), torch.tensor(attn_masks)
    # print(input_ids)
    # print(attn_masks)

In [None]:
train_inputs, train_masks = create_inputs(train_text)
train_labels = torch.tensor(train_label.values)

val_inputs, val_masks = create_inputs(val_text)
val_labels = torch.tensor(val_label.values)

test_inputs, test_masks = create_inputs(test_text)
test_labels = torch.tensor(test_label.values)

In [None]:
batch_size = 32

#preparing train data
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

#preparing validation data
validation_data = TensorDataset(val_inputs, val_masks, val_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

#preparing test data
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
weights = sklearn.utils.class_weight.compute_class_weight('balanced', np.unique(df3['label']), df3['label'])
class_weights = torch.from_numpy(weights)
class_weights.to('cuda')
class_weights.cuda()

tensor([5.7758, 0.4304, 1.9854], device='cuda:0', dtype=torch.float64)

In [None]:
print(weights)
print(class_weights)
print(np.unique(df3['label']))

[5.77575758 0.43044264 1.98541667]
tensor([5.7758, 0.4304, 1.9854], dtype=torch.float64)
[0 1 2]


## **Experiment #1: With roberta-base model and 5 epochs** 

In [7]:
def get_model(num_labels) :
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)
    model.cuda()
    return model

In [8]:
model = get_model(num_labels=2)
# print(model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [None]:
# freezing all transformer layers of the model except classification layer

# for param in model.bert.encoder.layer.parameters():
#     param.requires_grad = False

In [None]:
# freezing first few transformer layers of the model 

for param in model.roberta.encoder.layer.parameters():
    param.requires_grad = False

In [None]:
# freezing all layers of the model except the last hidden layer and the classification layer

# for param in model.transformer.layer.parameters():
#     param.requires_grad = False

In [9]:
# optimizing all model paramaters with weight decay except bias terms

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.1},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}]

In [None]:
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
# scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,num_warmup_steps=100, num_training_steps=(len(train_text)//32)*1)

In [10]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [11]:
device = 'cuda'

In [None]:
loss_fn = CrossEntropyLoss(weight=class_weights.float()).to('cuda')

In [12]:
def training( train_dataloader, epochs=5):

    train_loss_set = []

    # Number of training epochs (authors recommend between 2 and 4)
    epochs = epochs

    # trange is a tqdm wrapper around the normal python range
    for _ in range(epochs):
    
    
        # Training
        
        # Set our model to training mode (as opposed to evaluation mode)
        model.train()
        
        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        
        # Train the df1 for one epoch
        for step, batch in enumerate(train_dataloader):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our df1loader
            b_input_ids, b_input_mask, b_labels = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]
            # loss = loss_fn(logits, b_labels)
            train_loss_set.append(loss.item())    
            # Backward pass
            loss.backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            scheduler.step()
            
            # Update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss/nb_tr_steps))

In [13]:
def evaluation(validation_dataloader):

    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate df1 for one epoch
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our df1loader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
        # Forward pass, calculate logit predictions
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = output[0]
        
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    return (eval_accuracy/nb_eval_steps)

In [14]:
def get_f1_score(test_dataloader):

    # Prediction on test set

    # Put model in evaluation mode
    model.eval()

    # Tracking variables 
    predictions , true_labels = [], []

    # Predict 
    for batch in test_dataloader:
    # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our df1loader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    # calculating f1 score on test set 
    from sklearn.metrics import f1_score

    pred_list = []
    labels = []
    for i in true_labels:
        labels.extend(i)

    for i in range(len(predictions)):
        pred_list.append(np.argmax(predictions[i], axis=1).flatten())

    preds = []
    for i in pred_list:
        preds.extend(i)

    score = f1_score(labels, preds, average='weighted')
    print ("F1 score: {}".format(score))
    return score, labels, preds

In [None]:
training(train_dataloader, epochs=1)

Train loss: 0.2944481255817305


In [None]:
np.mean([evaluation(validation_dataloader)*100, evaluation(test_dataloader)*100])

# only for hasoc
# evaluation(test_dataloader)*100

Validation Accuracy: 0.8961864406779662
Validation Accuracy: 0.8977754237288136


89.69809322033899

In [None]:
test_score, labels, preds= get_f1_score(test_dataloader)
valid_score = get_f1_score(validation_dataloader)[0]
np.mean([test_score*100, valid_score*100])

#only for HASOC
# test_score= get_f1_score(test_dataloader)[0]

F1 score: 0.870098110576582
F1 score: 0.8669239549734803


86.85110327750311

In [22]:
from sklearn.metrics import classification_report, plot_confusion_matrix
def get_report(labels, preds):
    report = classification_report(labels, preds)
    print(report)

# get_report(labels, preds)

In [None]:
# plt.figure(figsize=(15,8))
# plt.title("Training loss")
# plt.xlabel("Batch")
# plt.ylabel("Loss")
# plt.plot(train_loss_set) #inside training function; hence not accessible 
# plt.show()

## Cross validation

In [None]:
from sklearn.model_selection import StratifiedKFold

X = df3['text']
y = df3['label']

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, y) 

print(skf)

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)


In [None]:
acc_list = []
f1_score_list = []

for train_index, test_index in skf.split(X, y):
#   print("TRAIN:", train_index, "TEST:", test_index)
    train_text, test_text = X[train_index], X[test_index]
    train_label, test_label = y[train_index], y[test_index]
        
    train_inputs, train_masks = create_inputs(train_text)
    train_labels = torch.tensor(train_label.values)
    
    test_inputs, test_masks = create_inputs(test_text)
    test_labels = torch.tensor(test_label.values)
    
    batch_size = 32
    
    #preparing train data
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    
    #preparing test data
    test_data = TensorDataset(test_inputs, test_masks, test_labels)
    test_sampler = RandomSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
    
    model = get_model(num_labels=3)
    
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.1},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}]
    
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=100, num_training_steps=(len(train_text)//32)*2)
    
    training(train_dataloader, epochs=2)
    
    acc = evaluation(test_dataloader)
    acc_list.append(acc)
    
    f1_score = get_f1_score(test_dataloader)[0]
    f1_score_list.append(f1_score)
    
    print(acc, f1_score)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

0.9200367647058824 0.9197780865708056


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

0.9151858660130718 0.9163577069620717


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [None]:
print(acc_list, f1_score_list)
print(np.mean(acc_list))
print(np.mean(f1_score_list))

In [None]:
# only for hasoc

# hasoc_test_text, hasoc_test_label = df2['text'], df2['label']

# hasoc_test_inputs, hasoc_test_masks = create_inputs(hasoc_test_text)
# hasoc_test_labels = to)rch.tensor(hasoc_test_label.values)

# hasoc_test_data = TensorDataset(hasoc_test_inputs, hasoc_test_masks, hasoc_test_labels)
# hasoc_test_sampler = RandomSampler(hasoc_test_data)
# hasoc_test_dataloader = DataLoader(hasoc_test_data, sampler=hasoc_test_sampler, batch_size=batch_size)

# print('accuracy:', evaluation(hasoc_test_dataloader))
# print('f1 score:', get_f1_score(hasoc_test_dataloader)

# Training on one dataset and testing on another

In [15]:
batch_size = 32

train_text = df1['text']
train_label = df1['label']

test_text = df2['text']
test_label = df2['label']

train_inputs, train_masks = create_inputs(train_text)
train_labels = torch.tensor(train_label.values)
    
#preparing train data
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) 

In [17]:
test_inputs, test_masks = create_inputs(test_text)
test_labels = torch.tensor(test_label.values)

#preparing test data
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [18]:
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,num_warmup_steps=50, num_training_steps=(len(train_text)//32)*2)

In [19]:
training(train_dataloader, epochs=2)

Train loss: 0.2622342546705127
Train loss: 0.18564267825210598


In [20]:
print(evaluation(test_dataloader))
score, labels, preds = get_f1_score(test_dataloader)
print(score)

Validation Accuracy: 0.9114128075253256
0.9114128075253256
F1 score: 0.9131267503945883
0.9131267503945883


In [23]:
get_report(labels, preds)

              precision    recall  f1-score   support

           0       0.96      0.92      0.94     16869
           1       0.78      0.87      0.82      5236

    accuracy                           0.91     22105
   macro avg       0.87      0.90      0.88     22105
weighted avg       0.92      0.91      0.91     22105

