In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset,DataLoader,RandomSampler,SequentialSampler,TensorDataset,WeightedRandomSampler
import torch.nn as nn
from torch.nn import functional as F
from tqdm.notebook import trange, tqdm
import os,math
from transformers import AdamW
from transformers.optimization import get_linear_schedule_with_warmup
import pickle
from sklearn import dummy
from collections import Counter

In [3]:
from transformers import XLMRobertaTokenizer,\
    XLMRobertaForSequenceClassification,XLMRobertaConfig,XLMRobertaModel
from transformers import AutoModelForMaskedLM,AutoConfig,AutoTokenizer,\
        AutoModelForSequenceClassification

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
import logging
# Write INFO-level log records (the validation / test reports) to a local file.
logging.basicConfig(filename='example10.log',level=logging.INFO)
# Shared label encoder; createDataset() calls Le.fit() on every DataFrame it is given.
Le = LabelEncoder()
# Stratified random baseline; fitted and scored inside prepareDataLoaders().
dm = dummy.DummyClassifier(strategy="stratified")

In [None]:
# Number of passes over the training loader.
EPOCHES = 5
# Mini-batch size used for every DataLoader.
BATCH_SZ = 32
# Tokenized sequences are padded / truncated to this many tokens.
MAX_LENGTH = 128
LEARNING_RATE = 3e-5   #for finetuning on German Tweets
#LEARNING_RATE = 2e-5
# Preferred device string; falls back to CPU when CUDA is not available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Seeds torch's global RNG only (numpy / random are not seeded here).
torch.manual_seed(0)

### Insert the pretrained language model (it should already be fine-tuned on English)

In [None]:
#config = AutoConfig.from_pretrained("../results/config.json")
#tokenizer = XLMRobertaTokenizer.from_pretrained("../results/")
#trg_model = AutoModelForSequenceClassification.from_pretrained("../results/")




#config = AutoConfig.from_pretrained("Spanish_LangModel/config.json")
#tokenizer = XLMRobertaTokenizer.from_pretrained("Spanish_LangModel/")
#trg_model = AutoModelForSequenceClassification.from_pretrained("Spanish_LangModel/")



#config = AutoConfig.from_pretrained("../Italian_LangModel/config.json")
#tokenizer = XLMRobertaTokenizer.from_pretrained("../Italian_LangModel/")
#trg_model = AutoModelForSequenceClassification.from_pretrained("../Italian_LangModel/")

In [None]:
# Tokenizer of the "xlm-roberta-base" checkpoint; passed to createDataset() by prepareDataLoaders().
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

In [None]:
def createLangDataset(lang):
    """Merge the per-language CSV shards of every data directory into one CSV each.

    Every non-hidden sub-directory of the current working directory whose name
    does not end in ``_LangModel`` is scanned for files whose name starts with
    ``lang``. The matching files are read as CSV, concatenated in sorted
    file-name order, and written to ``<lang>_<directory>.csv`` in the current
    working directory (without the index column).

    Args:
        lang: File-name prefix selecting the shards, e.g. ``"English"``.

    Notes:
        * Directories that contain no matching file are skipped, because
          ``pd.concat`` raises ``ValueError`` on an empty list.
        * The ``"Unnamed: 0"`` column (a saved index) is dropped only when it
          is present, so shards written with ``index=False`` are accepted too.
    """
    for split_dir in sorted(os.listdir(".")):
        is_data_dir = (os.path.isdir(split_dir)
                       and not split_dir.startswith(".")
                       and not split_dir.endswith("_LangModel"))
        if not is_data_dir:
            continue
        print(split_dir)
        # Sorted so that the row order of the merged file is reproducible.
        shard_names = sorted(name for name in os.listdir(split_dir)
                             if name.startswith(lang))
        print(shard_names)
        if not shard_names:
            continue
        frames = [pd.read_csv(os.path.join(split_dir, name)) for name in shard_names]
        merged = pd.concat(frames, ignore_index=True)
        merged = merged.drop(columns="Unnamed: 0", errors="ignore")
        merged.to_csv(lang+"_"+split_dir+".csv",index=False)

In [None]:
# Writes one English_<directory>.csv per data directory (see createLangDataset above).
createLangDataset("English")

In [None]:
#df = pd.read_csv("English_train.csv")
#text = [df.iloc[0]["text"],df.iloc[1]["text"]]
#label = torch.tensor([df.iloc[0]["label"],df.iloc[1]["label"]],dtype=torch.long)
#inputs = tokenizer(text,return_attention_mask=True,
#                   return_token_type_ids=True,return_tensors="pt",padding="max_length",
#                  max_length=MAX_LENGTH,truncation=True)
#loss , logits = model(**inputs,labels=label)
#print(classification_report(label,torch.argmax(logits,dim=1)))

In [None]:
def createDataset(tokenizer,df):
    """Turn a DataFrame with ``text`` and ``label`` columns into a TensorDataset.

    The module-level encoder ``Le`` is (re)fitted on ``df["label"]`` and used to
    map the labels to integer class ids. The texts are tokenized with padding
    and truncation to ``MAX_LENGTH`` tokens.

    Args:
        tokenizer: Callable tokenizer returning ``input_ids``, ``attention_mask``
            and ``token_type_ids`` tensors.
        df: DataFrame providing the ``text`` and ``label`` columns.

    Returns:
        ``TensorDataset(input_ids, attention_mask, token_type_ids, labels)``
        with ``labels`` stored as ``torch.long``.
    """
    Le.fit(df["label"])
    encoded = tokenizer(
        df["text"].tolist(),
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="pt",
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
    )
    # Change point: use dtype=torch.float here when training with BCEWithLogitsLoss.
    targets = torch.tensor(Le.transform(df["label"]),dtype=torch.long)
    return TensorDataset(
        encoded["input_ids"],
        encoded["attention_mask"],
        encoded["token_type_ids"],
        targets,
    )

In [None]:
def prepareDataLoaders(lang):
    """Build the train / validation / test DataLoaders for one language.

    Reads ``<lang>_train.csv``, ``<lang>_val.csv`` and ``<lang>_test.csv`` from
    the working directory (``text`` and ``label`` columns are used), tokenizes
    them through ``createDataset`` and wraps them in DataLoaders of batch size
    ``BATCH_SZ``.

    The training loader draws samples with replacement through a
    ``WeightedRandomSampler`` whose per-sample weight is
    ``total_count / class_count`` of the sample's class, so classes are drawn
    with roughly equal frequency. Validation and test loaders keep file order.

    As a side effect the module-level stratified dummy classifier ``dm`` is
    fitted on the training split and its ROC-AUC on the test split is printed
    as a random baseline.

    Args:
        lang: File-name prefix of the CSV files, e.g. ``"Italian"``.

    Returns:
        Tuple ``(train_loader, eval_loader, test_loader)``.
    """
    df_train = pd.read_csv(lang+"_train.csv")
    labels = torch.tensor(df_train["label"].tolist(),dtype=torch.long)

    # One pass gives the distinct classes, each sample's class position and the
    # class counts; indexing by position (not by raw label value) keeps the
    # weighting correct even when the labels are not exactly 0..K-1.
    classes, class_index, class_count = torch.unique(
        labels, sorted=True, return_inverse=True, return_counts=True)
    print((classes, class_count))
    weight = class_count.sum().float() / class_count.float()
    print(weight)
    sample_wts = weight[class_index]
    sampler = WeightedRandomSampler(sample_wts,len(sample_wts),replacement=True)

    dm.fit(df_train["text"],df_train["label"])

    train_dataset = createDataset(tokenizer,df_train)
    # shuffle must stay False: a DataLoader accepts either a sampler or shuffling.
    train_loader = DataLoader(train_dataset,batch_size=BATCH_SZ,sampler=sampler,shuffle=False)

    df_eval = pd.read_csv(lang+"_val.csv")
    eval_dataset = createDataset(tokenizer,df_eval)
    eval_loader = DataLoader(eval_dataset,batch_size=BATCH_SZ,shuffle=False)

    df_test = pd.read_csv(lang+"_test.csv")
    test_dataset = createDataset(tokenizer,df_test)
    test_loader = DataLoader(test_dataset,batch_size=BATCH_SZ,shuffle=False)

    # Random baseline for reference next to the model's test ROC-AUC.
    probs = dm.predict_proba(df_test["text"])
    print("Random Classifier Result: ",
          roc_auc_score(df_test["label"],probs[:,1]))

    return (train_loader,eval_loader,test_loader)

In [None]:
# Build the three loaders from Italian_train.csv / Italian_val.csv / Italian_test.csv.
train_loader,eval_loader,test_loader = prepareDataLoaders("Italian")

In [None]:
# Number of batches per loader. A loop of statements prints the same three lines
# without leaving a stray (None, None, None) tuple as the cell's output.
for loader in (train_loader, eval_loader, test_loader):
    print(len(loader))

### Three model variants

In [None]:
class HateClassifier(torch.nn.Module):
    """Thin ``nn.Module`` wrapper around ``XLMRobertaForSequenceClassification``.

    The wrapped network and its config are loaded from the
    ``"xlm-roberta-base"`` checkpoint and exposed as ``self.model`` /
    ``self.config``.
    """

    def __init__(self):
        super().__init__()
        prefix = "roberta"
        self.base_model_prefix = prefix
        self.config = XLMRobertaConfig.from_pretrained(
            "xlm-roberta-base", base_model_prefix=prefix)
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            "xlm-roberta-base", config=self.config)

    def forward(self,inputs,labels):
        """Forward ``inputs`` (a dict of keyword tensors) together with ``labels``.

        Returns whatever the wrapped model returns.
        """
        return self.model(labels=labels, **inputs)

In [None]:
class HClassifier(torch.nn.Module):
    """XLM-R encoder followed by dropout, a single-output linear head and tanh.

    ``forward`` returns a tensor with one value per input sequence
    (last dimension of size 1).

    NOTE(review): tanh bounds the output to (-1, 1). If the value is meant to
    be used as a logit for a sigmoid / BCE-style loss, confirm that this
    squashing is intended.
    """

    def __init__(self):
        super(HClassifier,self).__init__()
        self.model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.drop = torch.nn.Dropout(0.5)
        # Read the width from the loaded config instead of hard-coding 768.
        self.fc = torch.nn.Linear(self.model.config.hidden_size,1)
        self.tanh = torch.nn.Tanh()

    def forward(self,inputs):
        """Run the encoder on ``inputs`` (a dict of keyword tensors) and score it.

        The pooled sentence representation is taken by position (index 1) from
        the encoder output. Positional indexing works both for a plain tuple
        and for a dict-like ``ModelOutput``; tuple-unpacking the latter would
        yield its field names instead of tensors.
        """
        encoder_out = self.model(**inputs)
        pooled = encoder_out[1]
        scores = self.fc(self.drop(pooled))
        return self.tanh(scores)

In [None]:
# Loads the bare XLM-R encoder (no classification head).
# NOTE: `model` is rebound to an XLMRobertaForSequenceClassification instance in the
# "Training from Scratch" cell further down, so this object is replaced in a top-to-bottom run.
model = XLMRobertaModel.from_pretrained("xlm-roberta-base")

### Train the model on the full data, or on a random subset (128 or 256 examples) by setting a flag

In [None]:
def _batch_to_model_inputs(batch, device):
    """Split a ``(input_ids, attention_mask, token_type_ids, labels)`` batch and move it to ``device``."""
    inputs = {"input_ids":batch[0].to(device),
              "attention_mask":batch[1].to(device),
              "token_type_ids":batch[2].to(device)}
    return inputs, batch[3].to(device)


def train(model,train_loader,eval_loader,run_type=(False,False),
          save_path="torch_model_pretrained_italian_finet_eng_adapted_italian.bin"):
    """Fine-tune ``model`` and checkpoint the weights with the lowest validation loss.

    For each of ``EPOCHES`` epochs the model is trained on ``train_loader``
    (AdamW with weight decay 0.01 on everything except bias / LayerNorm
    parameters, linear schedule with 2 warm-up steps), then evaluated on
    ``eval_loader``. Whenever the mean validation loss improves, the state
    dict is saved to ``save_path``. A classification report is printed and
    logged after every epoch.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` keyword tensors and returns an
            object exposing ``.loss`` and ``.logits``. It may be wrapped in
            ``DataParallel`` (the unwrapped ``.module`` is what gets saved).
            Batches are moved to the device the model's parameters live on.
        train_loader: Loader yielding ``(input_ids, attention_mask,
            token_type_ids, labels)`` batches.
        eval_loader: Loader with the same batch layout, used for validation.
        run_type: Two flags limiting the number of training batches per epoch:
            ``run_type[0]`` stops after 5 batches, ``run_type[1]`` after 8.
            If both are set the first one wins; if neither, the whole loader
            is used.
        save_path: File the best state dict is written to.

    Returns:
        List with the mean training loss of every epoch.
    """
    device = next(model.parameters()).device

    # Translate the flags into a per-epoch batch budget (None = no limit).
    if run_type[0]:
        max_batches = 5
    elif run_type[1]:
        max_batches = 8
    else:
        max_batches = None

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias','LayerNorm.bias','LayerNorm.weight']
    optimizer_parameters = [
        {'params': [p for n,p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay':0.01},
        {'params': [p for n,p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay':0.0}]
    # NOTE(review): the schedule length is based on the full loader even when
    # run_type truncates the epochs, so the learning rate decays only slightly
    # in truncated runs. Left as is to keep the training behaviour unchanged.
    num_train_steps = len(train_loader)*EPOCHES
    optimizer = AdamW(optimizer_parameters,lr=LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                               num_warmup_steps=2,
                                               num_training_steps=num_train_steps)

    train_loss_counter = []
    best_loss = math.inf
    for epoch in range(EPOCHES):
        model.train()
        epoch_loss_counter = []
        for ids,batch in tqdm(enumerate(train_loader),total=len(train_loader)):
            if max_batches is not None and ids >= max_batches:
                break
            optimizer.zero_grad()
            inputs, labels = _batch_to_model_inputs(batch, device)
            output = model(**inputs,labels=labels)
            # .mean() collapses the per-replica losses returned under DataParallel.
            loss = output.loss.mean()
            epoch_loss_counter.append(loss.item())
            loss.backward()
            optimizer.step()
            scheduler.step()
        train_loss_counter.append(np.mean(epoch_loss_counter))

        model.eval()
        actual = []
        predicted = []
        eval_loss_counter = []
        with torch.no_grad():
            for ids,batch in tqdm(enumerate(eval_loader),total=len(eval_loader)):
                inputs, labels = _batch_to_model_inputs(batch, device)
                output = model(**inputs,labels=labels)
                eval_loss_counter.append(output.loss.mean().item())
                predicted.append(torch.argmax(output.logits,dim=1).cpu())
                actual.append(labels.cpu())

        train_loss = np.mean(epoch_loss_counter)
        eval_loss = np.mean(eval_loss_counter)
        if best_loss > eval_loss:
            best_loss = eval_loss
            # Save the bare network so the checkpoint loads without DataParallel.
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(),save_path)

        report = classification_report(torch.cat(actual),torch.cat(predicted))
        print(epoch,train_loss,eval_loss)
        print(report)
        logging.info("Validation Report")
        logging.info('%d %f %f',epoch,train_loss,eval_loss)
        logging.info(report)

    return train_loss_counter

### Test the trained model; save test predictions and probabilities

In [None]:
def test(model,test_loader,run_id=13):
    """Evaluate ``model`` on ``test_loader`` and persist the per-batch outputs.

    Prints and logs the classification report and the ROC-AUC (computed from
    the softmax probability of class 1), then pickles three lists of per-batch
    CPU tensors to ``Actual_<run_id>.p``, ``Predicted_scores_<run_id>.p`` and
    ``Predicted_<run_id>.p`` in the working directory.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` keyword tensors and returns an
            object exposing ``.logits``. Batches are moved to the device the
            model's parameters live on.
        test_loader: Loader yielding ``(input_ids, attention_mask,
            token_type_ids, labels)`` batches.
        run_id: Suffix used in the three output file names.
    """
    model.eval()
    device = next(model.parameters()).device
    actual = []
    predicted = []
    predicted_score = []
    with torch.no_grad():
        for ids,batch in tqdm(enumerate(test_loader),total=len(test_loader)):
            inputs = {"input_ids":batch[0].to(device),
                      "attention_mask":batch[1].to(device),
                      "token_type_ids":batch[2].to(device)}
            labels = batch[3].to(device)
            output = model(**inputs,labels=labels)
            logits = output.logits
            p_score = F.softmax(logits,dim=1)
            predicted.append(torch.argmax(logits,dim=1).cpu())
            # Probability of class 1, used as the ranking score for ROC-AUC.
            predicted_score.append(p_score[:,1].cpu())
            actual.append(labels.cpu())

    y_true = torch.cat(actual)
    report = classification_report(y_true,torch.cat(predicted))
    print(report)
    auc = roc_auc_score(y_true,torch.cat(predicted_score))
    print(auc)
    logging.info("Test Report")
    logging.info(report)
    logging.info(auc)

    # The lists of per-batch tensors are stored as they are; readers torch.cat them.
    outputs = (("Actual", actual),
               ("Predicted_scores", predicted_score),
               ("Predicted", predicted))
    for stem, payload in outputs:
        with open(f"{stem}_{run_id}.p","wb") as fh:
            pickle.dump(payload,fh)

In [None]:
# How outputs of different languages are saved
# Spanish from Actual_6,7,8,9 
# Italian from Actual_10,11,12,13 

### Dummy example of model outputs

In [None]:
# models = HClassifier()
# inputs = tokenizer(["Hello, my dog is cute","Pluto is Mad"], return_tensors="pt",return_token_type_ids=True,
#                   padding="max_length",max_length=10,truncation=True)
# inputs = {"input_ids":inputs["input_ids"],"attention_mask":inputs["attention_mask"],"token_type_ids":inputs["token_type_ids"]}
# outputs = models(inputs)
# print(outputs.squeeze(1).shape)
# print(torch.round(torch.sigmoid(outputs.squeeze(1).cpu())))

In [None]:
# models = HateClassifier()
# inputs = tokenizer(["Hello, my dog is cute","Pluto is Mad"], return_tensors="pt",return_token_type_ids=True,
#                    padding="max_length",max_length=10,truncation=True)
# inputs = {"input_ids":inputs["input_ids"],"attention_mask":inputs["attention_mask"],"token_type_ids":inputs["token_type_ids"]}
# labels = torch.tensor(torch.ones(2),dtype=torch.long)
# outputs  = models(inputs,labels)
# print(outputs.logits)

### Train on full or partial data
### load adapted model
### apply adapted model on test data

In [None]:
def main(model,train_loader,eval_loader,test_loader,run_type=(False,True)):
    """Fine-tune ``model``, reload the best checkpoint into a fresh network and test it.

    Args:
        model: Network to fine-tune; it is moved to CUDA before training.
        train_loader: Training batches, forwarded to ``train``.
        eval_loader: Validation batches, forwarded to ``train``.
        test_loader: Test batches, forwarded to ``test``.
        run_type: Flags forwarded to ``train`` to restrict the number of
            training batches per epoch. The default ``(False, True)`` is the
            setting this function previously hard-coded.

    Returns:
        The freshly built ``XLMRobertaForSequenceClassification`` carrying the
        reloaded checkpoint weights (not the object passed in as ``model``).
    """
    train(model.cuda(),train_loader,eval_loader,list(run_type))

    # File written by train() for the best validation loss. Checkpoints used for
    # other runs: torch_model_finet_eng_adapted_{italian,spanish,german}.bin,
    # torch_model_pretrained_spanish_finet_eng_adapted_spanish.bin,
    # torch_model_pretrain_germ_finet_eng_adapted_germ.bin,
    # torch_model_pretrained_germ_finet_eng_adapted.bin
    checkpoint_path = "torch_model_pretrained_italian_finet_eng_adapted_italian.bin"
    # Deserialize onto the CPU; the weights are copied into the new model, which
    # is moved to the GPU as a whole just before testing.
    model_state_dict = torch.load(checkpoint_path,map_location="cpu")

    config = XLMRobertaConfig.from_pretrained("xlm-roberta-base",base_model_prefix="roberta")
    model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base",config=config)
    model.load_state_dict(model_state_dict)
    test(model.cuda(),test_loader)
    return model

In [None]:
# Training from Scratch
# (Despite the title, this cell builds the XLM-R sequence classifier and then
# overwrites its weights with a state dict loaded from a local checkpoint file.)

config = XLMRobertaConfig.from_pretrained("xlm-roberta-base",base_model_prefix="roberta")
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base",config=config)

#model = HateClassifier()

# Alternative checkpoints; exactly one torch.load line should be active.
#model_state_dict = torch.load("torch_model1_transfer.bin") # Pretrained on German Tweets

#model_state_dict = torch.load("torch_model1.bin") # fine-tuned on English XLMSequenceClassification

#model_state_dict = torch.load("torch_model.bin")   # fine-tuned on English HateClassifier
model_state_dict = torch.load("torch_model_pretrained_italian_finet_eng.bin")
#model_state_dict = torch.load("torch_model_pretrained_germ_finet_eng.bin")
#model_state_dict = torch.load("torch_model_pretrained_spanish_finet_eng.bin")
#model_state_dict = torch.load("torch_model_pretrained_ita")

# Raises if the checkpoint's keys do not match the classifier's parameters.
model.load_state_dict(model_state_dict)

# For fine-tuning TRG-MLM

#model = trg_model

### In case of multiple GPUs, wrap the model for better utilisation

In [None]:
# Replicate the model across all visible GPUs when more than one is present.
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = torch.nn.DataParallel(model)

# Module.cuda() moves the parameters in place and returns the module itself;
# binding the result keeps the cell from echoing the full module repr as output.
model = model.cuda()

### Run train / test function

In [None]:
# Fine-tune, reload the best checkpoint and evaluate on the test loader.
# `model` is rebound to the reloaded network that main() returns.
model = main(model,train_loader,eval_loader,test_loader)

## Retrospective analysis of the predictions 

In [5]:
# Reload the pickled outputs of run 10 and concatenate each list of tensors.
# The files are opened with context managers so the handles are closed again.
with open("Actual_10.p","rb") as fh:
    actual = torch.cat(pickle.load(fh))
with open("Predicted_scores_10.p","rb") as fh:
    predicted_score = torch.cat(pickle.load(fh))
with open("Predicted_10.p","rb") as fh:
    predicted = torch.cat(pickle.load(fh))

In [7]:
print(classification_report(actual,predicted))

              precision    recall  f1-score   support

           0       0.68      0.87      0.77      1195
           1       0.60      0.32      0.42       717

    accuracy                           0.67      1912
   macro avg       0.64      0.60      0.59      1912
weighted avg       0.65      0.67      0.64      1912



In [8]:
print(roc_auc_score(actual,predicted_score))

0.6087089978583474


In [None]:
actual[0:30]

In [None]:
predicted[0:30]

In [None]:
predicted_score[:30]

In [9]:
# Reload the pickled outputs of run 11 and concatenate each list of tensors.
# The files are opened with context managers so the handles are closed again.
with open("Actual_11.p","rb") as fh:
    actual2 = torch.cat(pickle.load(fh))
with open("Predicted_scores_11.p","rb") as fh:
    predicted_score2 = torch.cat(pickle.load(fh))
with open("Predicted_11.p","rb") as fh:
    predicted2 = torch.cat(pickle.load(fh))

In [10]:
print(classification_report(actual2,predicted2))

              precision    recall  f1-score   support

           0       0.83      0.77      0.80      1195
           1       0.66      0.73      0.69       717

    accuracy                           0.76      1912
   macro avg       0.74      0.75      0.75      1912
weighted avg       0.77      0.76      0.76      1912



In [None]:
actual2[55:65]

In [None]:
predicted2[55:65]

In [11]:
# Reload the pickled outputs of run 12 and concatenate each list of tensors.
# The files are opened with context managers so the handles are closed again.
with open("Actual_12.p","rb") as fh:
    actual3 = torch.cat(pickle.load(fh))
with open("Predicted_scores_12.p","rb") as fh:
    predicted_score3 = torch.cat(pickle.load(fh))
with open("Predicted_12.p","rb") as fh:
    predicted3 = torch.cat(pickle.load(fh))

In [12]:
print(classification_report(actual3,predicted3))

              precision    recall  f1-score   support

           0       0.70      0.88      0.78      1195
           1       0.66      0.38      0.48       717

    accuracy                           0.69      1912
   macro avg       0.68      0.63      0.63      1912
weighted avg       0.69      0.69      0.67      1912



In [None]:
actual3[0:24]

In [None]:
predicted3[55:65]

In [None]:
predicted[0:24]

In [13]:
# Reload the pickled outputs of run 13 and concatenate each list of tensors.
# The files are opened with context managers so the handles are closed again.
with open("Actual_13.p","rb") as fh:
    actual4 = torch.cat(pickle.load(fh))
with open("Predicted_scores_13.p","rb") as fh:
    predicted_score4 = torch.cat(pickle.load(fh))
with open("Predicted_13.p","rb") as fh:
    predicted4 = torch.cat(pickle.load(fh))

In [14]:
print(classification_report(actual4,predicted4))

              precision    recall  f1-score   support

           0       0.87      0.77      0.81      1195
           1       0.67      0.81      0.74       717

    accuracy                           0.78      1912
   macro avg       0.77      0.79      0.77      1912
weighted avg       0.80      0.78      0.78      1912



In [119]:
a = 350
b = 365

In [120]:
actual[a:b]

tensor([1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1])

In [121]:
predicted[a:b]

tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0])

In [122]:
predicted2[a:b]

tensor([1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0])

In [123]:
predicted3[a:b]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0])

In [124]:
predicted4[a:b]

tensor([1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0])

In [66]:
print(classification_report(actual,predicted4))

              precision    recall  f1-score   support

           0       0.87      0.77      0.81      1195
           1       0.67      0.81      0.74       717

    accuracy                           0.78      1912
   macro avg       0.77      0.79      0.77      1912
weighted avg       0.80      0.78      0.78      1912



In [None]:
print(classification_report(actual,predicted3))

In [None]:
print(roc_auc_score(actual,predicted_score2))

In [15]:
df = pd.read_csv("Italian_test.csv")

In [17]:
df.head(10)

Unnamed: 0,text,label
0,Ambasciatore : Teheran spera che l'Occidente c...,0.0
1,#ultimenotizie #news #notizie: Le rom messe in...,0.0
2,I padroni della guerra non dormono mai #Trump...,0.0
3,@FabianaDeNisi @matteosalvinimi Ma perchè esse...,0.0
4,"..e i laici contro islamici e cristiani,ovviam...",0.0
5,@ClaudioDeglinn2 Cessava l'invasione di migran...,0.0
6,#Migranti e terroristi islamici quantè facile ...,1.0
7,"Andiamo al Cairo, in una scuola di suore franc...",0.0
8,#dallavostraparte Ha fatto bene il ristoratore...,1.0
9,"#papamilano2017 abbraccia gli #islamici, un ab...",1.0


In [125]:
ids = 354

In [126]:
df.iloc[ids]["text"]

'Quel figlio di Vanna marchi....se la pigliasse ne ano più profondo'

In [127]:
df.iloc[ids]["label"]

1.0