# Tropes detection

In [None]:
import numpy as np
import pandas as pd
from collections import deque
import random
import copy
import json
import io

import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, BertModel, BertForPreTraining, BertForSequenceClassification, RobertaForSequenceClassification,  AlbertForSequenceClassification
from torch.optim.lr_scheduler import ReduceLROnPlateau, LambdaLR
from torch.optim import AdamW

from tqdm.notebook import tqdm, trange

# Seed PyTorch, the stdlib `random` module and NumPy with one shared value so
# that repeated runs of the notebook draw the same random numbers.
random_seed = 0
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(random_seed)

In [None]:
# Trope names. Later cells loop over range(len(TROPES)) and address label
# columns and logit pairs by position, so the order here matters.
TROPES = [
    "Time Will Tell",
    "Distrust Experts",
    "Too Fast",
    "Natural is Better",
    "Liberty, Freedom",
    "Hidden Motives",
    "Scapegoat",
    "Defend the Weak",
    "Wicked Fairness",
]


# Data

In [None]:
from skmultilearn.model_selection import iterative_train_test_split

In [None]:
# Relative location of the directory that holds the dataset.
path = "../../"

# Read the CSV into a DataFrame.
df = pd.read_csv(f"{path}tropes_data.csv")

# Bare expression: shown as the cell output in the notebook.
df

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np

# X holds the row positions of df; y is the label matrix built from columns
# 2..10 of df, one row per sample.
X = np.array(list(range(len(df))))
y = np.array([df.iloc[row, 2:11].tolist() for row in range(len(df))])

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Every fold is printed. The four split variables are overwritten on each
# pass, so after the loop they hold the split produced by the last fold.
for train_index, test_index in mskf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

In [None]:
# X_train / X_test contain integer row positions, so positional indexing
# with .iloc picks the matching rows (all columns) of df.
df_train = df.iloc[X_train]
df_test = df.iloc[X_test]


In [None]:
# For each trope, count the training examples whose label for it is > 0.
weights_tmp = [
    sum(1 for j in range(len(train_labels)) if train_labels[j][i] > 0)
    for i in range(len(TROPES))
]

# One [weight for class 0, weight for class 1] pair per trope, each computed
# as (number of examples) / (number of examples in that class).
n_train = len(train_labels)
weights_tropes = torch.Tensor(
    [[n_train / (n_train - w), n_train / w] for w in weights_tmp]
).to('cuda')


In [None]:
tokenizer = AutoTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# Tokenise the training texts without padding or truncation to inspect lengths.
tokenized_input = tokenizer(train_text)

# Length of the longest tokenised text (0 if there are none); the bare `m`
# below shows it as the cell output.
m = max((len(tokens) for tokens in tokenized_input['input_ids']), default=0)
m

In [None]:
# Every text is padded or truncated to exactly this many tokens.
MAX_LEN = 128

tokenized_train = tokenizer(train_text, max_length=MAX_LEN, padding='max_length', truncation=True)
tokenized_validation = tokenizer(validation_text, max_length=MAX_LEN, padding='max_length', truncation=True)

# Convert to torch tensors. Each list is converted exactly once: calling
# torch.tensor() again on an existing tensor only makes a needless copy and
# triggers PyTorch's copy-construct warning.
train_input_ids = torch.tensor(tokenized_train['input_ids'])
train_token_type_ids = torch.tensor(tokenized_train['token_type_ids'])
train_attention_mask = torch.tensor(tokenized_train['attention_mask'])
train_labels = torch.tensor(train_labels)

validation_input_ids = torch.tensor(tokenized_validation['input_ids'])
validation_token_type_ids = torch.tensor(tokenized_validation['token_type_ids'])
validation_attention_mask = torch.tensor(tokenized_validation['attention_mask'])
validation_labels = torch.tensor(validation_labels)

In [None]:
batch_size = 12 #

# Each dataset item is the tuple (input ids, attention mask, labels, token type ids).
# Training batches are drawn through a RandomSampler.
train_data = TensorDataset(train_input_ids, train_attention_mask, train_labels, train_token_type_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches are read through a SequentialSampler.
validation_data = TensorDataset(validation_input_ids, validation_attention_mask, validation_labels, validation_token_type_ids)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)


# Model

In [None]:
class bert(nn.Module):
    """Thin wrapper around ``bert-large-uncased`` with a classification head.

    The wrapped ``BertForSequenceClassification`` is created with
    ``num_labels=n_classes``; ``forward`` returns only its logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.n_classes = n_classes
        self.bert = BertForSequenceClassification.from_pretrained(
            "bert-large-uncased",
            num_labels=n_classes,
        )

    def forward(self, input_ids, token_type_ids, input_mask):
        """Run the wrapped model and return its ``logits``.

        ``input_mask`` is forwarded as the model's ``attention_mask``.
        """
        return self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=input_mask,
        ).logits
    
class CTbert(nn.Module):
    """COVID-Twitter-BERT v2 with its sequence-relationship head replaced.

    The pre-training checkpoint is loaded through ``BertForPreTraining`` and
    its ``cls.seq_relationship`` layer is swapped for a fresh linear layer
    with ``n_classes`` outputs. ``forward`` returns that layer's output.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.n_classes = n_classes
        self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
        # Take the head's input width from the loaded config instead of a
        # literal 1024, so the layer always matches the checkpoint.
        self.bert.cls.seq_relationship = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, token_type_ids, input_mask):
        """Return the replaced head's logits for a batch.

        ``input_mask`` is forwarded as the model's ``attention_mask``.
        """
        outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)

        # No labels are passed, so the output carries no loss entry: index 0
        # is the masked-LM prediction logits and index 1 is the output of the
        # (replaced) sequence-relationship head.
        logits = outputs[1]

        return logits

In [None]:
device = "cuda"

# The network emits two logits per trope, i.e. 2 * len(TROPES) outputs.
n_outputs = 2 * len(TROPES)
model = CTbert(n_outputs)
# Kept as the last expression so the notebook shows the returned module.
model.to(device)

In [None]:
# AdamW over every model parameter; the learning rate is the one noted for CTBert.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,  # CTBert
    weight_decay=0.01,
)

# Plateau scheduler; it is stepped with the mean validation loss once per epoch.
scheduler = ReduceLROnPlateau(optimizer, factor=0.3, patience=4)


In [None]:
# One weighted cross-entropy loss per trope; criterions[i] uses the class
# weights stored in weights_tropes[i].
criterions = [
    nn.CrossEntropyLoss(weight=weights_tropes[i])
    for i in range(len(TROPES))
]



# Training

In [None]:
epochs = 25

best_loss = 999
best_state_dict = model.state_dict()
best_epoch = 0
METRICS = []  # one [mean precision, mean recall, mean F1, per-trope F1 list] entry per epoch
best_mean_f1 = 0
sig = nn.Sigmoid()  # not used in this cell; left defined in case another cell relies on it


def _summed_trope_loss(logits, labels):
    """Return the sum over tropes of the per-trope two-class cross-entropy.

    Columns ``2*i`` and ``2*i + 1`` of ``logits`` are the two class scores of
    trope ``i``; column ``i`` of ``labels`` holds its integer class index.
    """
    return sum(
        criterions[i](logits[:, 2 * i:2 * i + 2], labels[:, i])
        for i in range(len(TROPES))
    )


for e in trange(0, epochs):

    # Training
    print('Starting epoch ', e)
    model.train()

    tr_loss = 0
    nb_tr_steps = 0

    for batch in train_dataloader:
        # Batch layout is fixed by the TensorDataset: ids, mask, labels, token types.
        b_input_ids, b_input_mask, b_labels, b_token_type_ids = (t.to(device) for t in batch)

        optimizer.zero_grad()

        logits = model(b_input_ids, b_token_type_ids, b_input_mask)
        loss = _summed_trope_loss(logits, b_labels.long())

        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # Testing
    model.eval()

    # One list of predictions / ground-truth labels per trope, rebuilt every epoch.
    predictions_sep = [[] for _ in TROPES]
    labels_sep = [[] for _ in TROPES]

    eval_loss = 0
    steps = 0
    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels, b_token_type_ids = (t.to(device) for t in batch)
        b_labels = b_labels.long()

        with torch.no_grad():
            logits = model(b_input_ids, b_token_type_ids, b_input_mask)
            loss = _summed_trope_loss(logits, b_labels)

        steps += 1
        eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        ground_truth = b_labels.cpu().numpy()

        for i in range(len(TROPES)):
            # Predicted class of trope i = arg-max over its pair of logits.
            predictions_sep[i].extend(np.argmax(logits[:, 2 * i:2 * i + 2], axis=1).tolist())
            labels_sep[i].extend(ground_truth[:, i].tolist())

    # Mean validation loss of the epoch; also drives the plateau scheduler.
    LOSS = eval_loss/steps
    scheduler.step(LOSS)

    F1s = []
    PREs = []
    RECs = []
    for i in range(len(TROPES)):
        # NOTE(review): `i_to_skip` is not defined in the cells visible here;
        # it has to be set before this cell runs, otherwise this line raises
        # NameError.
        if i != i_to_skip:
            precision_i, recall_i, f1_i, _ = precision_recall_fscore_support(labels_sep[i], predictions_sep[i], average='binary')
            F1s.append(round(f1_i, 3))
            PREs.append(round(precision_i, 3))
            RECs.append(round(recall_i, 3))

    mean_pre, mean_rec, mean_f1 = np.mean(PREs), np.mean(RECs), np.mean(F1s)
    METRICS.append([mean_pre, mean_rec, mean_f1, F1s])

    # Keep a copy of the weights from the epoch with the highest mean F1.
    if mean_f1 > best_mean_f1:
        best_loss = LOSS
        best_state_dict = copy.deepcopy(model.state_dict())
        best_epoch = e
        best_mean_f1 = mean_f1

    print("\t Eval loss: {}".format(LOSS))
    print("\t Eval F1: {}, PRE: {}, REC: {}".format(round(mean_f1, 3), round(mean_pre, 3), round(mean_rec, 3)))
    print("\t Eval F1s: {}".format(F1s))
    print("---"*25)
    print("\n")


In [None]:
# Save the best weights; the file name records the epoch and its mean F1.
# The name is built with an f-string because concatenating the float from
# round() to a str raises TypeError.
torch.save(best_state_dict, f"best_model_e{best_epoch}_f{round(best_mean_f1, 3)}.pth")
