In [None]:
!pip install transformers



In [None]:
# --- Data handling -----------------------------------------------------------
import pandas as pd
import numpy as np

# --- Plotting ----------------------------------------------------------------
# The 'Agg' backend is selected right after importing matplotlib and before
# pyplot is imported; keep this ordering.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# --- Modelling ---------------------------------------------------------------
import torch
from torch.utils.data import (TensorDataset, DataLoader,
                              RandomSampler, SequentialSampler)
from transformers import BertTokenizer, BertConfig
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
# NOTE(review): distutils is deprecated (PEP 632) and no longer ships with
# Python 3.12+; this import will fail there.
from distutils.version import LooseVersion as LV
from sklearn.model_selection import train_test_split
import io

# Commented-out local (Windows) data path, kept for reference.
#path = r'D:\tirocinioLC\tirocinioLC'
# Mount Google Drive (Colab only); every data/model file read by this notebook
# is addressed relative to `directory`.
from google.colab import drive
drive.mount('/content/drive')
directory = '/content/drive/My Drive/Text Analytics/Data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Prefer the first visible CUDA GPU; otherwise fall back to the CPU.
# `devicename` is only used for the informational print below.
if torch.cuda.is_available():
    device = torch.device('cuda')
    devicename = '['+torch.cuda.get_device_name(0)+']'
else:
    device = torch.device('cpu')
    devicename = ""

print('Using PyTorch version:', torch.__version__,
      'Device:', device, devicename)

# Require PyTorch >= 1.0. The (major, minor) pair is parsed directly from the
# version string (local build suffixes such as "+cu111" are dropped first), so
# this check does not depend on the deprecated distutils.LooseVersion.
torch_major_minor = tuple(
    int(part) for part in torch.__version__.split('+')[0].split('.')[:2]
)
assert torch_major_minor >= (1, 0), 'PyTorch >= 1.0.0 is required'

Using PyTorch version: 1.10.0+cu111 Device: cuda [Tesla K80]


Import dataset

In [None]:
# Load the cleaned dataset from Drive.
df = pd.read_json(directory + 'cleandf3.json')

# Parse the date column and keep only rows dated before 2019.
df['Data'] = pd.to_datetime(df['Data'])
df = df[df['Data'].dt.year < 2019]

# Deterministic 85% subsample. The fraction and the seed must not change:
# presumably the cached tokenized test set loaded further down was produced
# from exactly these rows -- verify before altering.
df = df.sample(frac=0.85, random_state=42)

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32039 entries, 2840 to 29371
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  32039 non-null  int64         
 1   Titolo_Rif          32037 non-null  object        
 2   Rating              32039 non-null  object        
 3   Nome_Autore         32039 non-null  object        
 4   Data                32039 non-null  datetime64[ns]
 5   Numero_Capitoli     32039 non-null  int64         
 6   Racconto_Text_Only  32039 non-null  object        
 7   N_Tot_Rec           32039 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 2.2+ MB
None


Unnamed: 0,ID,Titolo_Rif,Rating,Nome_Autore,Data,Numero_Capitoli,Racconto_Text_Only,N_Tot_Rec
2840,59819,Tutto Per Una Meravigliosa Partita di Quidditch,verde,Gillywater,2005-11-19,1,Buon Compleanno Love! ...,12
33471,1135350,Dopotutto Babbanologia non è inutile...,verde,Gils_Malfoy,2012-06-29,1,AVVERTENZE! Prima di iniziare a leggere è bene...,7
34947,1260606,Non piangere.,verde,A n o n y m o u s Rei,2012-11-09,1,"Ti tappi un altro po’ le orecchie, Theo. Fa...",2
35908,1407567,Una finestra sul passato,verde,Ma_AiLing,2012-11-24,1,Il professor Lupin si stava dirigendo nel suo ...,2
22312,690486,Qualcuno uccida Potter,verde,lotti_,2011-06-04,1,"Un raggio di sole scarlatto, tenue e delicata ...",4


In [None]:
col         = 'N_Tot_Rec'

# Thresholds on the raw count stored in `col`: values <= UNPOPULAR_MAX become
# 'unpopular', values >= POPULAR_MIN become 'popular', and everything in
# between is discarded.
UNPOPULAR_MAX = 2
POPULAR_MIN   = 6

# Drop the intermediate rows first, then label the remaining ones. This avoids
# np.select(..., default=np.nan) with string choices, which silently produces
# the *string* 'nan' on older NumPy and is rejected with a dtype-promotion
# TypeError on newer NumPy releases.
keep = (df[col] <= UNPOPULAR_MAX) | (df[col] >= POPULAR_MIN)
df = df[keep].copy()
df[col] = np.where(df[col] >= POPULAR_MIN, 'popular', 'unpopular')

df.head()

Unnamed: 0,ID,Titolo_Rif,Rating,Nome_Autore,Data,Numero_Capitoli,Racconto_Text_Only,N_Tot_Rec
2840,59819,Tutto Per Una Meravigliosa Partita di Quidditch,verde,Gillywater,2005-11-19,1,Buon Compleanno Love! ...,popular
33471,1135350,Dopotutto Babbanologia non è inutile...,verde,Gils_Malfoy,2012-06-29,1,AVVERTENZE! Prima di iniziare a leggere è bene...,popular
34947,1260606,Non piangere.,verde,A n o n y m o u s Rei,2012-11-09,1,"Ti tappi un altro po’ le orecchie, Theo. Fa...",unpopular
35908,1407567,Una finestra sul passato,verde,Ma_AiLing,2012-11-24,1,Il professor Lupin si stava dirigendo nel suo ...,unpopular
28797,889632,Stories,verde,Viki_chan,2011-10-12,1,Tante delle mie Auror preferite hanno usato - ...,popular


In [None]:
# Detach the label column from the feature frame: pop() removes it from `df`
# in place and to_frame() keeps `target` a one-column DataFrame.
target = df.pop('N_Tot_Rec').to_frame()

In [None]:
from sklearn.model_selection import train_test_split, RepeatedKFold

# Stratified, shuffled 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.30,
    random_state=42,
    stratify=target,
    shuffle=True,
)
print(f"Train set size: {len(x_train)}\nTest set size: {len(x_test)}")

Train set size: 15080
Test set size: 6464


Text Preparation

In [None]:
import pickle

In [None]:
# Load the cached, already-tokenized test documents.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
# The context manager guarantees the file handle is closed after loading.
with open(directory+r"SUCCESStokenized_test.pkl", "rb") as b_file:
    tokenized_test = pickle.load(b_file)

In [None]:
# The cached tokens are paired positionally with the labels in y_test, so fail
# early (with a clear message) if the cache does not match the recreated split.
assert len(tokenized_test) == len(y_test), (
    f"cached tokenized test set has {len(tokenized_test)} documents, "
    f"but the test split has {len(y_test)}"
)
len(tokenized_test)

6464

In [None]:
MAX_LEN_TEST = 128

# Keep at most MAX_LEN_TEST-1 tokens per document and terminate each sequence
# with BERT's separator token. The token must be spelled '[SEP]' (with
# brackets): the bare string 'SEP' is not the special token and gets mapped to
# a different vocabulary id.
# NOTE(review): if the pickled model was fine-tuned on inputs terminated with
# the bare 'SEP' string, apply the same fix to the training preprocessing and
# re-train so that train and test inputs stay consistent.
tokenized_test  = [t[:(MAX_LEN_TEST-1)]+['[SEP]'] for t in tokenized_test]

print ("The truncated tokenized first test sentence:")
print (tokenized_test[0])

The truncated tokenized first train sentence:
['[CLS]', 'De', '##glu', '##ti', '##i', 'e', 'mi', 'sistema', '##i', 'la', 'giacca', 'sosp', '##ira', '##ndo', '.', 'Era', 'pronto', '.', 'Funziona', '##va', '.', 'Questo', 'significava', 'che', 'non', 'sarei', 'morto', '.', 'Ma', 'significava', 'che', 'avrei', 'ucciso', '.', 'Qualcosa', 'di', 'caldo', 'mi', 'sce', '##se', 'lungo', 'il', 'viso', '.', 'Erano', 'lacrime', 'di', 'paura', '.', 'Non', 'ero', 'un', 'assassino', 'e', 'non', 'volevo', 'esserlo', '.', 'Ma', 'dovevo', 'esserlo', '.', 'Non', 'volevo', 'nemmeno', 'pensare', 'al', 'dolore', 'che', 'avrei', 'provato', 'nel', 'caso', 'avessi', 'fallito', '.', 'Chiu', '##si', 'gli', 'occhi', 'e', 'mi', 'sede', '##tti', 'sul', 'pavimento', 'pol', '##vero', '##so', 'della', 'Stan', '##za', 'delle', 'Cose', 'Nas', '##cos', '##te', '.', 'Ero', 'solo', ',', 'respira', '##vo', 'in', 'silenzio', 'ascoltando', 'il', 'ti', '##cchetti', '##o', 'irregolare', 'di', 'un', 'qualche', 'agg', '##eggio', '

In [None]:
from transformers import AutoModel, AutoTokenizer

# Identifier of the pretrained checkpoint whose tokenizer is loaded below.
model_name = "dbmdz/bert-base-italian-xxl-cased"

# Tokenizer for that checkpoint; used below to map tokens to vocabulary ids.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Map every token to its vocabulary id.
ids_test = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_test]
# Right-pad each sequence with zeros up to MAX_LEN_TEST (the same constant used
# for truncation, instead of a second hard-coded 128). The attention-mask cell
# treats id 0 as padding, so the pad value must stay 0.
ids_test = np.array([np.pad(i, (0, MAX_LEN_TEST-len(i)),
                            mode='constant') for i in ids_test])

print ("The indices of the first test sentence:")
print (ids_test[0])

The indices of the first test sentence:
[  102   650 23580   116 30877   126   318  1061 30877   146 14629  3762
  6808  2017   697  1703  4441   697 24300   187   697   966 30232   158
   212  6097  2762   697   348 30232   158  3865  2901   697  8614   120
  6637   318  3006   271  1760   162 10248   697  9653 16903   120  3143
   697   313  2109   141 12408   126   212  4027 13297   697   348 12132
 13297   697   313  4027  3484  3387   157  5251   158  3865  4898   207
   995  5619 14895   697 28965   210   368  2969   126   318  2483   238
   340  9694  2470   884   289   213  3671   186   324 14196 11237  2418
   124   697  4763   484  1307 22492   209   139  8370 16846   162   364
  8916 30879 21701   120   141  1507  3668  6979 10158   158 29333   187
   203   578   731   368  4547 16223 10720   101]


In [None]:
# Attention mask per document: 1.0 for real tokens (id > 0), 0.0 for padding.
amasks_test = [[float(token_id > 0) for token_id in sequence]
               for sequence in ids_test]

In [None]:
# Binary targets: 1 for 'popular', 0 for anything else.
labels_test = [int(value == 'popular') for value in y_test.N_Tot_Rec]

# Wrap ids, labels and attention masks as tensors.
test_inputs = torch.tensor(ids_test)
test_labels = torch.tensor(labels_test)
test_masks = torch.tensor(amasks_test)

In [None]:
BATCH_SIZE = 32

print('Test: ', end="")
# Bundle (ids, mask, label) triples and iterate over them in their stored order.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    sampler=test_sampler,
    batch_size=BATCH_SIZE,
)
print(len(test_data), 'documents')

Test: 6464 documents


# MODIFICHE

In [None]:
import torch.nn as nn
from sklearn.metrics import f1_score

In [None]:
from tabulate import tabulate

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate(loader, loss_vector=None, test=False):
    """Evaluate the global ``model`` on ``loader`` and print classification metrics.

    Relies on the notebook-level globals ``model`` (a sequence classifier whose
    output exposes ``.logits``), ``device`` and ``tabulate``.

    Label encoding: 1 = 'popular' (reported as ``pop``), 0 = 'unpopular'
    (reported as ``unp``), matching how the test labels are built from
    ``y_test``. TP/FP/FN/TN are counted with class 1 as the positive class.

    Args:
        loader: iterable of ``(input_ids, attention_mask, labels)`` tensor batches.
        loss_vector: unused; accepted for backward compatibility.
        test: unused; accepted for backward compatibility.

    Returns:
        None. Prints the accuracy, a per-class precision/recall/F1 table and
        the confusion matrix. A metric whose denominator is zero (for example
        a class that is never predicted, or an empty loader) is reported as 0
        instead of raising ZeroDivisionError.
    """
    model.eval()

    n_correct, n_all = 0, 0
    TP = TN = FP = FN = 0

    for batch in loader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # Predicted class = index of the largest logit.
        predictions = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        labels = b_labels.to('cpu').numpy()

        n_correct += int(np.sum(predictions == labels))
        n_all += len(labels)

        # Confusion-matrix counts for this batch (positive class = 1).
        truth_pos, truth_neg = labels == 1, labels == 0
        pred_pos, pred_neg = predictions == 1, predictions == 0
        TP += int(np.sum(truth_pos & pred_pos))
        TN += int(np.sum(truth_neg & pred_neg))
        FN += int(np.sum(truth_pos & pred_neg))
        FP += int(np.sum(truth_neg & pred_pos))

    print('Accuracy: [{}/{}] {:.4f}'.format(n_correct, n_all,
                                          _safe_ratio(n_correct, n_all)))

    # Class 1 ('popular').
    precision_1 = _safe_ratio(TP, TP + FP)
    recall_1 = _safe_ratio(TP, TP + FN)
    f1_1 = 2 * _safe_ratio(precision_1 * recall_1, precision_1 + recall_1)

    # Class 0 ('unpopular').
    precision_0 = _safe_ratio(TN, TN + FN)
    recall_0 = _safe_ratio(TN, TN + FP)
    f1_0 = 2 * _safe_ratio(precision_0 * recall_0, precision_0 + recall_0)

    data = [['unp', precision_0, recall_0, f1_0],
            ['pop', precision_1, recall_1, f1_1]]

    # Rows are predictions, columns are ground truth.
    data2 = [['predPOP', TP, FP],
             ['predUNP', FN, TN]]

    print(tabulate(data, headers=["Class", "Precision", "Recall", "F1"]))
    print()
    print(tabulate(data2, headers=['', 'truthPOP', 'truthUNP']))



In [None]:
#with open(directory+r'MODEL_SUCCESS.pkl','wb') as outfile:
#    pickle.dump(model, outfile)

Evaluation

LOAD MODEL

In [None]:
# Load the fine-tuned classifier that was pickled as a whole object.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
# The context manager guarantees the file handle is closed after loading.
with open(directory+r"MODEL_SUCCESS3epochsFIRSTTOKENS.pkl", "rb") as c_file:
    model = pickle.load(c_file)

In [None]:
# Ask PyTorch to release cached GPU memory before running the evaluation.
torch.cuda.empty_cache()
print('Test set:')
# Prints accuracy, per-class precision/recall/F1 and the confusion matrix.
evaluate(test_dataloader, test=True)

Test set:
Accuracy: [4165/6464] 0.6443
Class      Precision    Recall        F1
-------  -----------  --------  --------
pop         0.67977   0.644495  0.661663
unp         0.607222  0.644153  0.625143

           truthUNP    truthPOP
-------  ----------  ----------
predUNP        1917        1240
predPOP        1059        2248
