In [1]:
import numpy as np
import pandas as pd
import transformers
import torch
from torch.utils.data import Dataset, DataLoader,RandomSampler, SequentialSampler,TensorDataset
from transformers import XLMRobertaConfig,XLMRobertaTokenizer,XLMRobertaModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import scipy as sc
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import math
from tqdm.notebook import trange, tqdm
import pickle

In [2]:
le = LabelEncoder()
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
def filter_language(lang):
    if lang in ["es-AR","es-ES","es-PE"]:
        return 1
    elif lang in ["fr-CA","fr-FR"]:
        return 2
    else:
        return 3

In [4]:
df_train = pd.read_csv("dsl/DSL-TRAIN.txt",sep="\t",header=None)
df_eval = pd.read_csv("dsl/DSL-DEV.txt",sep="\t",header=None)
df_test = pd.read_csv("dsl/DSL-TEST-GOLD.txt",sep="\t",header=None)


df_train = df_train.rename(columns={0:"Text",1:"language"})
df_eval = df_eval.rename(columns={0:"Text",1:"language"})
df_test = df_eval.rename(columns={0:"Text",1:"language"})


df_train["Meta_lang_tag"] = df_train["language"].apply(filter_language)
df_eval["Meta_lang_tag"] = df_eval["language"].apply(filter_language)
df_test["Meta_lang_tag"] = df_test["language"].apply(filter_language)


df_train_spanish = df_train[df_train["Meta_lang_tag"] == 2]
df_eval_spanish = df_eval[df_eval["Meta_lang_tag"] == 2]
df_test_spanish = df_test[df_test["Meta_lang_tag"] == 2]


le.fit(df_train_spanish["language"])

df_train_spanish.insert(len(df_train_spanish.columns),"label",
                  le.transform(df_train_spanish["language"]))

df_eval_spanish.insert(len(df_eval_spanish.columns),"label",
                  le.transform(df_eval_spanish["language"]))

df_test_spanish.insert(len(df_test_spanish.columns),"label",
                  le.transform(df_test_spanish["language"]))


In [5]:
df_train_spanish = df_train_spanish.sample(frac=1,random_state=44)
df_eval_spanish = df_eval_spanish.sample(frac=1,random_state=44)
df_test_spanish = df_test_spanish.sample(frac=1,random_state=44)

In [6]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
LEARNING_RATE = 2e-05

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

In [7]:
df_test_spanish.head()

Unnamed: 0,Text,language,Meta_lang_tag,label
14267,« J'ai été lâché par les politiques » répète l...,fr-FR,2,1
14190,La contribution au résultat d'exploitation du ...,fr-FR,2,1
14020,"Le lithium, que certains appellent ""l'or blanc...",fr-FR,2,1
15354,"RACING : Kéhi - Rivieyran, Outrebon, Sikimic (...",fr-FR,2,1
13260,Après des audiences où les députés et les séna...,fr-CA,2,0


In [8]:
def DSLDataset(data,tokenizer,max_length):
    inp_ids = []
    tok_type_ids = []
    atten_mask  = []
    labels = []
    for i in range(len(data)):
      text = data.iloc[i]["Text"]
      temp = tokenizer.encode_plus(text,pad_to_max_length = True,
                                  add_special_tokens=True,
                                  max_length=max_length,
                                  return_token_type_ids=True,
                                  truncation=True)
      inp_ids.append(temp["input_ids"])
      tok_type_ids.append(temp["token_type_ids"])
      atten_mask.append(temp["attention_mask"])
      labels.append([data.iloc[i]["label"]])
    
    input_ids = torch.tensor(inp_ids,dtype=torch.long)
    attention_mask = torch.tensor(atten_mask,dtype=torch.long)
    token_type_ids = torch.tensor(tok_type_ids,dtype=torch.long)
    labels = torch.tensor(labels,dtype=torch.long)

    dataset = TensorDataset(input_ids, attention_mask,token_type_ids,labels)
    return dataset

In [9]:
training_set = DSLDataset(df_train_spanish, tokenizer, MAX_LEN)
eval_set = DSLDataset(df_eval_spanish, tokenizer, MAX_LEN)
testing_set = DSLDataset(df_test_spanish, tokenizer, MAX_LEN)


random_sampler = RandomSampler(training_set)
train_loader = DataLoader(training_set,batch_size=TRAIN_BATCH_SIZE,sampler=random_sampler)
eval_loader = DataLoader(eval_set,batch_size=VALID_BATCH_SIZE)
test_loader = DataLoader(testing_set,batch_size=TEST_BATCH_SIZE)


In [10]:
class MRoberta(torch.nn.Module):
    def __init__(self):
        super(MRoberta, self).__init__()
        self.model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.drop = torch.nn.Dropout(0.5)
        self.fc = torch.nn.Linear(768, 1)
        self.tanh = torch.nn.Tanh()
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.model(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.drop(output_1)
        output = self.fc(output_2)
        output = self.tanh(output)
        return output

#model = MRoberta()

In [11]:
def trainer(model,train_loader,eval_loader,epochs,device):
    train_loss = []
    val_loss = []
    best_loss = math.inf
    #criterion1 = torch.nn.CrossEntropyLoss()
    criterion1 = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)    
    for epoch in range(epochs):
        print(epoch)
        epoch_train_loss = 0
        model.train()
        for batch_id,batch in enumerate(tqdm(train_loader)):
            optimizer.zero_grad()
            inputs = {"input_ids": batch[0].to(device), 
                    "attention_mask": batch[1].to(device),  
                    "token_type_ids": batch[2].to(device),
                    "labels": batch[3].to(device)}
      
            output = model(inputs['input_ids'],
                         inputs['attention_mask'],
                         inputs['token_type_ids'])
            loss = criterion1(output.squeeze(1),
                              inputs["labels"].type_as(output).squeeze(1))
            epoch_train_loss = epoch_train_loss + loss.item()
            loss.backward()
            optimizer.step()

        epoch_train_loss = epoch_train_loss / (1.0 * len(train_loader))
        train_loss.append(epoch_train_loss)
        model.eval()
        with torch.no_grad():
            epoch_val_loss = 0.0
            y_pred_val = []
            y_true_val = []
            for batch_id,batch in enumerate(tqdm(eval_loader)):
                inputs = {"input_ids": batch[0].to(device),
                          "attention_mask": batch[1].to(device),
                          "token_type_ids": batch[2].to(device),
                          "labels": batch[3].to(device)}
                output = model(inputs['input_ids'],
                         inputs['attention_mask'],
                         inputs['token_type_ids'])
                loss = criterion1(output.squeeze(1),
                                  inputs["labels"].type_as(output).squeeze(1))
                epoch_val_loss = epoch_val_loss + loss.item()
                out_preds = torch.round(torch.sigmoid(output.squeeze(1)))
                print(out_preds)
                print(inputs["labels"].squeeze(1))
                y_pred_val.extend(out_preds.detach().cpu().numpy().tolist())
                y_true_val.extend(inputs["labels"].squeeze(1).detach().cpu().numpy().tolist())
            epoch_val_loss = epoch_val_loss / (len(eval_loader)*1.0)
            val_loss.append(epoch_val_loss)
            if best_loss > epoch_val_loss :
                best_loss = epoch_val_loss
                torch.save({
                    'model_state_dict':model.state_dict(),
                    'optimizer_state_dict':optimizer.state_dict(),
                    'loss':best_loss,},'french_modelv1.pth')
        
        target_name = list(le.classes_)
        print("*****************************************************************")
        print("Validation Report")
        print("*****************************************************************")
        print(classification_report(y_true_val,y_pred_val,target_names=target_name))
        print("*****************************************************************")
        print(epoch,train_loss[-1],val_loss[-1])      
    return (model,train_loss,val_loss)
        

In [12]:
def test_engine(model,test_loader,device):
  
    preds_all = []
    true_all = []
    test_loss = 0.0
    model.eval()
    #criterion1 = torch.nn.CrossEntropyLoss()
    criterion1 = torch.nn.BCEWithLogitsLoss()
    with torch.no_grad():
        for batch_id,batch in enumerate(tqdm(eval_loader)):
            inputs = {"input_ids": batch[0].to(device), 
                      "attention_mask": batch[1].to(device),  
                      "token_type_ids": batch[2].to(device),
                      "labels": batch[3].to(device)}
            output = model(inputs['input_ids'],
                         inputs['attention_mask'],
                         inputs['token_type_ids'])
            loss = criterion1(output.squeeze(1),inputs["labels"].type_as(output).squeeze(1))
            test_loss = test_loss + loss.item()
            out_preds = torch.round(torch.sigmoid(output.squeeze(1)))
            preds_all.extend(out_preds.detach().cpu().numpy().tolist())
            true_all.extend(inputs['labels'].squeeze(1).detach().cpu().numpy().tolist())
    
    test_loss = test_loss / (len(test_loader)*1.0)
    return (preds_all,true_all,test_loss)
            
            

In [13]:
def main(model,train_loader,eval_loader,device):
    
    model,train_loss,val_loss = trainer(model.to(device),train_loader,eval_loader,3,device)
    model_best = MRoberta()
    checkpoint = torch.load("french_modelv1.pth")
    model_best.load_state_dict(checkpoint["model_state_dict"])
    Y_predict , Y_true , loss = test_engine(model_best.to(device),test_loader,device)
    target_name = list(le.classes_)
    print("*****************************************************************")
    print("Test Report")
    print("*****************************************************************")
    print(classification_report(Y_true,Y_predict,target_names=target_name))
    print("*****************************************************************")
    pickle.dump(train_loss,open("train_loss.p","wb"))
    pickle.dump(val_loss,open("validation_loss.p","wb"))
    

In [None]:

model = MRoberta()
main(model.to(device),train_loader,eval_loader,device)

0


HBox(children=(FloatProgress(value=0.0, max=4397.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=499.0), HTML(value='')))

tensor([1., 1., 1., 1., 0., 1., 0., 1.], device='cuda:0')
tensor([1, 1, 1, 1, 0, 1, 0, 0], device='cuda:0')
tensor([0., 0., 0., 1., 1., 0., 1., 0.], device='cuda:0')
tensor([0, 1, 0, 1, 1, 0, 1, 0], device='cuda:0')
tensor([0., 0., 0., 1., 1., 1., 1., 0.], device='cuda:0')
tensor([0, 0, 0, 1, 1, 1, 1, 0], device='cuda:0')
tensor([1., 0., 0., 0., 1., 0., 1., 0.], device='cuda:0')
tensor([1, 0, 0, 0, 1, 0, 1, 0], device='cuda:0')
tensor([0., 1., 0., 0., 0., 1., 1., 0.], device='cuda:0')
tensor([0, 1, 0, 0, 0, 1, 1, 0], device='cuda:0')
tensor([1., 0., 1., 1., 1., 1., 0., 1.], device='cuda:0')
tensor([1, 0, 1, 0, 1, 1, 0, 1], device='cuda:0')
tensor([0., 0., 1., 1., 0., 1., 1., 1.], device='cuda:0')
tensor([0, 0, 1, 1, 0, 1, 0, 1], device='cuda:0')
tensor([1., 1., 1., 0., 0., 0., 0., 1.], device='cuda:0')
tensor([1, 1, 1, 0, 0, 0, 0, 1], device='cuda:0')
tensor([0., 1., 1., 1., 1., 1., 0., 0.], device='cuda:0')
tensor([0, 1, 1, 1, 1, 1, 0, 1], device='cuda:0')
tensor([1., 0., 0., 0., 0., 

HBox(children=(FloatProgress(value=0.0, max=4397.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=499.0), HTML(value='')))

tensor([1., 1., 1., 1., 0., 1., 0., 0.], device='cuda:0')
tensor([1, 1, 1, 1, 0, 1, 0, 0], device='cuda:0')
tensor([0., 0., 0., 1., 1., 0., 1., 0.], device='cuda:0')
tensor([0, 1, 0, 1, 1, 0, 1, 0], device='cuda:0')
tensor([0., 0., 0., 1., 1., 1., 1., 0.], device='cuda:0')
tensor([0, 0, 0, 1, 1, 1, 1, 0], device='cuda:0')
tensor([1., 0., 0., 0., 1., 0., 1., 0.], device='cuda:0')
tensor([1, 0, 0, 0, 1, 0, 1, 0], device='cuda:0')
tensor([0., 1., 0., 0., 0., 1., 1., 0.], device='cuda:0')
tensor([0, 1, 0, 0, 0, 1, 1, 0], device='cuda:0')
tensor([0., 0., 1., 0., 1., 1., 0., 0.], device='cuda:0')
tensor([1, 0, 1, 0, 1, 1, 0, 1], device='cuda:0')
tensor([0., 0., 1., 1., 0., 1., 0., 1.], device='cuda:0')
tensor([0, 0, 1, 1, 0, 1, 0, 1], device='cuda:0')
tensor([1., 1., 1., 0., 0., 0., 0., 1.], device='cuda:0')
tensor([1, 1, 1, 0, 0, 0, 0, 1], device='cuda:0')
tensor([0., 1., 1., 1., 1., 1., 0., 0.], device='cuda:0')
tensor([0, 1, 1, 1, 1, 1, 0, 1], device='cuda:0')
tensor([1., 0., 0., 0., 0., 

HBox(children=(FloatProgress(value=0.0, max=4397.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=499.0), HTML(value='')))

tensor([1., 1., 1., 1., 0., 1., 0., 0.], device='cuda:0')
tensor([1, 1, 1, 1, 0, 1, 0, 0], device='cuda:0')
tensor([0., 1., 0., 1., 1., 0., 1., 0.], device='cuda:0')
tensor([0, 1, 0, 1, 1, 0, 1, 0], device='cuda:0')
tensor([0., 0., 0., 1., 1., 1., 1., 0.], device='cuda:0')
tensor([0, 0, 0, 1, 1, 1, 1, 0], device='cuda:0')
tensor([1., 0., 0., 0., 1., 0., 1., 0.], device='cuda:0')
tensor([1, 0, 0, 0, 1, 0, 1, 0], device='cuda:0')
tensor([0., 1., 0., 0., 0., 1., 1., 0.], device='cuda:0')
tensor([0, 1, 0, 0, 0, 1, 1, 0], device='cuda:0')
tensor([1., 0., 1., 1., 1., 1., 0., 1.], device='cuda:0')
tensor([1, 0, 1, 0, 1, 1, 0, 1], device='cuda:0')
tensor([0., 0., 1., 1., 0., 1., 0., 1.], device='cuda:0')
tensor([0, 0, 1, 1, 0, 1, 0, 1], device='cuda:0')
tensor([1., 1., 1., 0., 0., 0., 0., 1.], device='cuda:0')
tensor([1, 1, 1, 0, 0, 0, 0, 1], device='cuda:0')
tensor([0., 1., 1., 1., 1., 1., 0., 1.], device='cuda:0')
tensor([0, 1, 1, 1, 1, 1, 0, 1], device='cuda:0')
tensor([1., 0., 0., 0., 0., 

HBox(children=(FloatProgress(value=0.0, max=499.0), HTML(value='')))