In [1]:
import torch
import pandas as pd
import torch.nn as nn
import numpy as np

from Models.model import NeuralNet
from Utils.utils_Dataset import process_dataset, OneHotDataframe
from Utils.utils import TranslationDataset, make_loader
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
pkl_file = '/home/xnmaster/Language_GrauIAI_UAB.pkl'  # Location of the pickled pandas DataFrame

# Build the model-ready table in three stages: raw pickle -> preprocessed -> one-hot encoded.
Task_Data = pd.read_pickle(pkl_file)                  # Raw DataFrame stored in the pickle file
Dataset_process = process_dataset(Task_Data)          # Preprocessed dataset
One_hot_Dataframe = OneHotDataframe(Dataset_process)  # Categorical columns replaced by one-hot vectors

# Network dimensions: one class per distinct translator, one input per non-label column.
num_classes = One_hot_Dataframe["TRANSLATOR"].nunique(dropna=False)
input_size = One_hot_Dataframe.shape[1] - 1

In [3]:
# Instantiate the network with the feature count and the number of translator classes
# computed from the one-hot encoded dataframe in the previous cell.
model = NeuralNet(input_size, num_classes)

In [5]:
# Read the saved parameters (mapped onto the CPU) and copy them into the model built above.
# NOTE(review): the output recorded for this cell is a RuntimeError: the checkpoint provides
# "model.4/6/8" entries while NeuralNet expects "model.5/7/9". Presumably the architecture in
# Models/model.py changed after this checkpoint was saved -- TODO confirm and load a matching
# checkpoint; until this call succeeds the model keeps its initial (untrained) parameters.
weights = torch.load('/home/xnmaster/Synthesis_Project/CheckPoints/64_Batch_Size_with_Dropout.pth', map_location=torch.device('cpu'))
model.load_state_dict(weights)

RuntimeError: Error(s) in loading state_dict for NeuralNet:
	Missing key(s) in state_dict: "model.5.weight", "model.5.bias", "model.7.weight", "model.7.bias", "model.9.weight", "model.9.bias". 
	Unexpected key(s) in state_dict: "model.4.weight", "model.4.bias", "model.6.weight", "model.6.bias", "model.8.weight", "model.8.bias". 

In [5]:
def obtain_input_vector(X, 
                        PM,
                        TASK_TYPE, 
                        SOURCE_LANG, 
                        TARGET_LANG, 
                        FORECAST, 
                        HOURLY_RATECOST, 
                        QUALITY_EVALUATION, 
                        MANUFACTURER, 
                        MANUFACTURER_SECTOR):
    """Build a single model input vector laid out like the columns of ``X``.

    Every position starts at zero. The three numeric columns receive the given
    values and the one-hot column matching each categorical value is set to 1.

    Args:
        X: DataFrame whose columns define the order and length of the vector.
        PM, TASK_TYPE, SOURCE_LANG, TARGET_LANG, MANUFACTURER,
        MANUFACTURER_SECTOR: category values (strings); each selects the
            column named ``'<FIELD>_' + value``.
        FORECAST: value written to the ``'FORECAST'`` column.
        HOURLY_RATECOST: value written to the ``'HOURLY_RATE'`` column.
        QUALITY_EVALUATION: value written to the ``'QUALITY_EVALUATION'`` column.

    Returns:
        A 1-D ``torch.float32`` tensor with ``len(X.columns)`` elements.

    Raises:
        KeyError: if any required column is absent from ``X``.
    """
    numeric_values = {
        'FORECAST': FORECAST,
        'HOURLY_RATE': HOURLY_RATECOST,
        'QUALITY_EVALUATION': QUALITY_EVALUATION,
    }
    one_hot_columns = [
        'PM_' + PM,
        'TASK_TYPE_' + TASK_TYPE,
        'SOURCE_LANG_' + SOURCE_LANG,
        'TARGET_LANG_' + TARGET_LANG,
        'MANUFACTURER_' + MANUFACTURER,
        'MANUFACTURER_SECTOR_' + MANUFACTURER_SECTOR,
    ]

    # Assigning to a label that is not in a Series index appends a new element,
    # which would silently yield a vector longer than the model input. Reject
    # unknown columns up front instead.
    known_columns = set(X.columns)
    missing = [col for col in (*numeric_values, *one_hot_columns) if col not in known_columns]
    if missing:
        raise KeyError(f"Columns not present in the feature frame: {missing}")

    new_row = pd.Series(np.zeros(len(X.columns)), index=X.columns)

    for column, value in numeric_values.items():
        new_row[column] = value
    for column in one_hot_columns:
        new_row[column] = 1

    return torch.tensor(new_row.values, dtype=torch.float32) 

# Example query: encode one hypothetical task against the feature columns
# (every column of the one-hot dataframe except the TRANSLATOR label).
new_row = obtain_input_vector(
    One_hot_Dataframe.loc[:, One_hot_Dataframe.columns != 'TRANSLATOR'],
    PM="BMT",
    TASK_TYPE="Engineering",
    SOURCE_LANG="English",
    TARGET_LANG="Spanish (Iberian)",
    FORECAST=32,
    HOURLY_RATECOST=5,
    QUALITY_EVALUATION=1,
    MANUFACTURER='TrueConnect',
    MANUFACTURER_SECTOR='Information Technology',
)

In [6]:
# Separate the one-hot dataframe into model inputs and ground-truth labels.
X = One_hot_Dataframe.drop(columns='TRANSLATOR')  # Features: everything except the label column
Translators = One_hot_Dataframe['TRANSLATOR']     # True labels

In [7]:
# Stratified 70/30 shuffle split, seeded so the partition is reproducible.
# Three splits are configured, but only the first one is consumed below.
Splits_Train_Test  = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
Train_Test_indices = Splits_Train_Test.split(X, Translators)

# Take the first (train, test) pair of index arrays from the split generator.
train_indices, test_indices = next(Train_Test_indices)

# Wrap the test partition for batched evaluation (batch size 64, no shuffling).
# NOTE(review): presumably TranslationDataset restricts itself to the rows in test_indices;
# confirm against Utils.utils.
Dataset = TranslationDataset(One_hot_Dataframe, X.values, test_indices)
DataLoader = make_loader(Dataset, 64, shuffle=False)

In [43]:
# Run the model over the evaluation loader and collect, per batch, the true
# labels and the predicted class indices (each stored as a list of lists).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # fall back to CPU when no GPU is present
model = model.to(device)
y_true = []
y_pred = []

model.eval()
with torch.no_grad():  # inference only: skip autograd bookkeeping
    for data, label in DataLoader:
        output = model(data.to(device))
        # Index of the highest score per sample. argmax(dim=1) keeps the batch
        # dimension, so a final batch holding a single sample still yields a
        # one-element list instead of a bare int.
        pred = output.argmax(dim=1)
        y_true.append(label.tolist())
        y_pred.append(pred.tolist())
    


In [44]:
# Replace every integer class index by the translator name it stands for,
# keeping the per-batch nesting of both lists.
y_true = [
    [Dataset.Labels2Translator[int(class_idx)] for class_idx in batch_labels]
    for batch_labels in y_true
]
y_pred = [
    [Dataset.Labels2Translator[int(class_idx)] for class_idx in batch_preds]
    for batch_preds in y_pred
]

In [49]:
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix


def bio_classification_report(y_true, y_pred):
    """Produce a per-class classification report from nested label lists.

    Both arguments are iterables of label sequences. They are flattened and
    binarized with a ``LabelBinarizer`` fitted on the true labels, then passed
    to scikit-learn's ``classification_report``. The label ``'O'`` is left out
    of the report, and the remaining classes are ordered by the reversed
    result of splitting each name once on ``'-'``.

    Returns the value produced by ``classification_report``.
    """
    binarizer = LabelBinarizer()

    flat_true = list(chain.from_iterable(y_true))
    true_matrix = binarizer.fit_transform(flat_true)

    flat_pred = list(chain.from_iterable(y_pred))
    pred_matrix = binarizer.transform(flat_pred)

    def _order_key(tag):
        # Reversed pieces of a single split on '-'.
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=_order_key)

    # Position of each class in the binarized matrices.
    column_of = dict(zip(binarizer.classes_, range(len(binarizer.classes_))))
    reported_columns = [column_of[tag] for tag in reported_tags]

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=reported_columns,
        target_names=reported_tags,
    )

In [50]:
# Build the per-translator precision/recall/F1 report and print it.
TRANSLATORS = bio_classification_report(y_true=y_true, y_pred=y_pred)
print('Translators evaluation', TRANSLATORS)

Translators evaluation                      precision    recall  f1-score   support

           Abelardo       0.00      0.00      0.00       333
      Acacio Poncio       0.00      0.00      0.00       577
 Adalberto Anatolio       0.00      0.00      0.00      1026
             Agueda       0.00      0.00      0.00       953
          Alejandro       0.01      0.64      0.01       327
              Aline       0.00      0.00      0.00       635
    Almudena Fiamma       0.00      0.00      0.00       572
              Amaro       0.00      0.00      0.00       424
      Ambrosia Adon       0.00      0.00      0.00      1144
        Ana Daniela       0.00      0.00      0.00       166
    Anselma Daciano       0.00      0.00      0.00       531
    Ariadna Laurina       0.00      0.00      0.00       943
    Artur Fulgencio       0.00      0.00      0.00       960
          Ascension       0.00      0.00      0.00      2479
            Beatriz       0.00      0.00      0.00       202


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
Softmax = nn.Softmax(dim=1)
def return_top_10(input, num):
    """Return the ``num`` most probable translators for one input vector.

    Args:
        input: feature tensor for a single sample, either 1-D or shaped
            ``(1, n_features)``.
        num: how many of the highest-probability classes to return.

    Returns:
        A list of ``(translator_name, probability)`` tuples in descending
        order of probability, with probabilities rounded to two decimals.
    """
    # Run the forward pass on whatever device the model lives on; a CPU tensor
    # fed to a CUDA model raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input = input.to(first_param.device)

    # Force evaluation mode for the prediction (so dropout cannot randomize
    # it) and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(input)
    finally:
        model.train(was_training)

    # Shape the scores as one row of classes so the softmax over dim=1
    # normalizes across classes for 1-D and (1, n_features) inputs alike.
    output_softmax = Softmax(output.reshape(1, -1))
    sorted_values, sorted_indices = torch.sort(output_softmax, descending=True)
    top_values, top_indices = sorted_values[0][:num], sorted_indices[0][:num]

    return [
        (Dataset.Labels2Translator[int(class_idx)], round(float(prob), 2))
        for class_idx, prob in zip(top_indices, top_values)
    ]

# Show the ten most probable translators for the example task encoded earlier.
return_top_10(input=new_row, num=10)