In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import StratifiedShuffleSplit

from Models.model import NeuralNet
from Utils.utils import TranslationDataset, make_loader
from Utils.utils_Dataset import process_dataset, OneHotDataframe
from validation import validate

In this notebook we will show some qualitative results about our Translators Model.
We will follow these steps:
1. Loading all the necessary data and functions
2. Importing the model's weights
3. Making predictions with the model

## Data preparation

In [2]:
# Location of the pickled task dataframe. The default is the original author's
# local Windows path; set the LANGUAGE_PKL_PATH environment variable to point the
# notebook at the file on another machine (e.g. /home/xnmaster/Language_GrauIAI_UAB.pkl)
# instead of editing this cell.
pkl_file = os.environ.get(
    "LANGUAGE_PKL_PATH",
    r"C:\Users\34644\Desktop\Second Semester\Synthesis Project\Language_GrauIAI_UAB.pkl",
)

# Obtain the dataset
# NOTE(review): pd.read_pickle unpickles arbitrary objects -- only load files from a trusted source.
Task_Data          = pd.read_pickle(pkl_file)               # Read the pkl file containing the pandas dataframe object
Dataset_process    = process_dataset(Task_Data)             # Obtain the preprocessed dataset
One_hot_Dataframe  = OneHotDataframe(Dataset_process)       # Categorical columns replaced by one-hot vectors
num_classes = len(One_hot_Dataframe["TRANSLATOR"].unique()) # Number of translators
input_size  = len(One_hot_Dataframe.columns) - 1            # Every column except TRANSLATOR is a model input

In [3]:
# Instantiate the network with the feature count and the number of translators
# computed in the data-preparation cell.
model = NeuralNet(input_size, num_classes)

In [4]:
# Checkpoint holding the trained weights. The default is the original author's
# local Windows path; set TRANSLATOR_CHECKPOINT_PATH to use a checkpoint stored elsewhere.
checkpoint_path = os.environ.get(
    "TRANSLATOR_CHECKPOINT_PATH",
    r'C:\Users\34644\Desktop\Second Semester\Synthesis Project\Code_Project\CheckPoints\256_Batch_Size_30_epocs.pth',
)
# map_location="cpu" lets the checkpoint be loaded on machines without a GPU.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
weights = torch.load(checkpoint_path, map_location="cpu")
model.load_state_dict(weights)

<All keys matched successfully>

In [5]:
# Separate the model inputs from the target column
feature_mask = One_hot_Dataframe.columns != 'TRANSLATOR'
X            = One_hot_Dataframe.loc[:, feature_mask]   # Features
Translators  = One_hot_Dataframe['TRANSLATOR']          # Labels, used to stratify the split

# Stratified shuffle split (30% test, fixed seed); only the first generated split is consumed
Splits_Train_Test  = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
Train_Test_indices = Splits_Train_Test.split(X, Translators)
train_indices, test_indices = next(Train_Test_indices)

# Wrap the held-out rows in a dataset and a non-shuffling loader
# (the 64 is presumably the batch size -- confirm against make_loader)
Dataset    = TranslationDataset(One_hot_Dataframe, X.values, test_indices)
DataLoader = make_loader(Dataset, 64, shuffle=False)

In [None]:
# Validation of the model. A GPU is required to run it.
# criterion = torch.nn.CrossEntropyLoss() # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss
# validate(criterion, model, DataLoader, device = 'cuda')

Test set: Average loss: 0.0059, Accuracy: 56283/63485 (89%)



(0.005912481310931474, 88.65558793415768)

## Making Predictions

In [None]:
def obtain_input_vector(X, 
                        PM,
                        TASK_TYPE, 
                        SOURCE_LANG, 
                        TARGET_LANG, 
                        FORECAST, 
                        HOURLY_RATE,
                        COST, 
                        QUALITY_EVALUATION, 
                        MANUFACTURER, 
                        MANUFACTURER_SECTOR):
    """
    Build the model input vector for a single task description.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; only its columns are used, as the layout of the vector.
    PM, TASK_TYPE, SOURCE_LANG, TARGET_LANG, MANUFACTURER, MANUFACTURER_SECTOR : str
        Categorical values. Each one sets the one-hot column named
        ``'<FIELD>_' + value`` to 1.
    FORECAST, HOURLY_RATE, COST, QUALITY_EVALUATION : number
        Numeric values written to the columns of the same name.

    Returns
    -------
    torch.Tensor
        float32 tensor with one entry per column of ``X``, in column order.

    Raises
    ------
    KeyError
        If a required column is not present in ``X`` (for instance a category
        that has no one-hot column). Without this check the assignment
        would silently append a new entry and return a vector that is longer
        than ``X`` has columns.
    """
    numeric_values = {
        'FORECAST': FORECAST,
        'HOURLY_RATE': HOURLY_RATE,
        'QUALITY_EVALUATION': QUALITY_EVALUATION,
        'COST': COST,
    }
    one_hot_columns = [
        'PM_' + PM,
        'TASK_TYPE_' + TASK_TYPE,
        'SOURCE_LANG_' + SOURCE_LANG,
        'TARGET_LANG_' + TARGET_LANG,
        'MANUFACTURER_' + MANUFACTURER,
        'MANUFACTURER_SECTOR_' + MANUFACTURER_SECTOR,
    ]

    # Validate before writing: assigning to a missing label would enlarge the Series.
    missing = [column for column in [*numeric_values, *one_hot_columns] if column not in X.columns]
    if missing:
        raise KeyError(f"Columns not present in the feature frame: {missing}")

    new_row = pd.Series(np.zeros(len(X.columns)), index=X.columns)

    for column, value in numeric_values.items():
        new_row[column] = value
    for column in one_hot_columns:
        new_row[column] = 1

    return torch.tensor(new_row.values, dtype=torch.float32) 

In [None]:
# Example task whose most suitable translators we want to rank
example_task = dict(
    PM="PMT",
    TASK_TYPE="ProofReading",
    SOURCE_LANG="English",
    TARGET_LANG="Spanish (Iberian)",
    FORECAST=0.28,
    HOURLY_RATE=23,
    COST=6.44,
    QUALITY_EVALUATION=6,
    MANUFACTURER='MotorForge',
    MANUFACTURER_SECTOR='Consumer Discretionary',
)

# The feature frame (every column except the target) fixes the layout of the vector
new_row = obtain_input_vector(
    One_hot_Dataframe.loc[:, One_hot_Dataframe.columns != 'TRANSLATOR'],
    **example_task,
)

In [None]:
def return_top_10(input, num, net=None, label_map=None):
    """
    Rank the translators the network considers most suitable for one task.

    Parameters
    ----------
    input : torch.Tensor
        Feature vector of a single task, either 1-D ``(features,)`` or with a
        leading batch dimension ``(1, features)``. If several rows are passed,
        only the first one is ranked.
    num : int
        How many translators to return (despite the function name, this is
        not fixed to 10).
    net : torch.nn.Module, optional
        Network to query. Defaults to the notebook-level ``model``.
    label_map : mapping, optional
        Maps a class index to a translator name. Defaults to
        ``Dataset.Labels2Translator``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Translator`` and ``Suitability`` (softmax probability
        rounded to two decimals), ordered from most to least suitable.
    """
    if net is None:
        net = model
    if label_map is None:
        label_map = Dataset.Labels2Translator

    # Run on whatever device the network lives on, so a CPU tensor with a GPU
    # model (or the reverse) does not raise a device-mismatch error.
    first_param = next(net.parameters(), None)
    if first_param is not None:
        input = input.to(first_param.device)

    # Inference only: no autograd graph, and evaluation mode so that layers
    # which behave differently while training do not perturb the prediction.
    # The previous mode is restored afterwards.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            output = net(input)
    finally:
        net.train(was_training)

    # Collapse any leading batch dimensions and normalise over the class axis.
    scores = torch.softmax(output.reshape(-1, output.shape[-1])[0], dim=-1)
    sorted_values, sorted_indices = torch.sort(scores, descending=True)

    ranking = [
        (label_map[int(index)], round(float(value), 2))
        for index, value in zip(sorted_indices[:num], sorted_values[:num])
    ]

    return pd.DataFrame(ranking, columns=['Translator', 'Suitability'])

# Send the input to the device the model currently lives on. Hard-coding 'cuda' fails on a
# fresh run because the weights are loaded with map_location="cpu" and the model is only
# moved to the GPU in a later cell (and it fails outright on machines without a GPU).
return_top_10(new_row.to(next(model.parameters()).device), 10)

Unnamed: 0,Translator,Suitability
0,Christian,0.54
1,Enith,0.26
2,Maximo,0.14
3,Margarita,0.02
4,Fortunato,0.01
5,Paula,0.01
6,Abelardo,0.01
7,Rafaela,0.0
8,Luis Felipe,0.0
9,Porfirio,0.0


## Metric evaluation

In [None]:
# Use the GPU when one is available, otherwise evaluate on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
y_true = []
y_pred = []

model.eval()
with torch.no_grad():  # inference only: do not build the autograd graph
    for data, label in DataLoader:
        data, label = data.to(device), label.to(device)

        output = model(data)
        # Index of the highest score per sample. argmax(dim=1) keeps the batch
        # dimension, so a final batch holding a single sample still yields a list
        # (squeeze() would turn it into a bare int and break the per-batch
        # iteration in the next cell).
        pred = output.argmax(dim=1)
        y_true.append(label.tolist())
        y_pred.append(pred.tolist())
    

In [None]:
# Translate every class index back into its translator name, keeping the per-batch grouping.
# NOTE(review): y_true / y_pred are overwritten with names, so re-running this cell without
# re-running the prediction loop first fails (int() of a name raises ValueError).
labels_to_translator = Dataset.Labels2Translator
y_true = [[labels_to_translator[int(idx)] for idx in batch] for batch in y_true]
y_pred = [[labels_to_translator[int(idx)] for idx in batch] for batch in y_pred]

In [None]:
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix


def bio_classification_report(y_true, y_pred):
    """
    Build a per-class classification report from batched label sequences.

    ``y_true`` and ``y_pred`` are iterables of label sequences (one sequence
    per batch). They are flattened, binarized with a ``LabelBinarizer``
    fitted on the true labels, and passed to scikit-learn's
    ``classification_report``. The label ``'O'`` is left out of the report.
    The same function can be used to evaluate a baseline and a new model.
    """
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))

    # The binarizer learns its classes from the true labels only
    binarizer = LabelBinarizer()
    true_matrix = binarizer.fit_transform(flat_true)
    pred_matrix = binarizer.transform(flat_pred)

    # Column index of every class in the binarized matrices
    column_of = {label: column for column, label in enumerate(binarizer.classes_)}

    # Classes to report: everything but 'O', ordered by the part after the first '-'
    # and then by the prefix (plain alphabetical order for labels without a '-')
    reported = sorted(
        set(binarizer.classes_) - {'O'},
        key=lambda tag: tag.split('-', 1)[::-1],
    )

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=[column_of[label] for label in reported],
        target_names=reported,
    )

In [None]:
# Build the per-translator report and print it under a heading
TRANSLATORS = bio_classification_report(y_true, y_pred)
print('Translators evaluation', TRANSLATORS, sep='\n')

Translators evaluation
                     precision    recall  f1-score   support

           Abelardo       0.92      0.89      0.90       999
      Acacio Poncio       0.98      0.98      0.98      1730
 Adalberto Anatolio       0.87      0.97      0.92      3078
             Agueda       0.99      0.96      0.98      2860
          Alejandro       0.81      0.87      0.84       982
              Aline       1.00      0.99      1.00      1904
    Almudena Fiamma       0.99      0.97      0.98      1717
              Amaro       0.79      0.74      0.76      1273
      Ambrosia Adon       0.64      0.98      0.77      3432
        Ana Daniela       0.91      0.90      0.90       497
    Anselma Daciano       0.74      0.84      0.79      1592
    Ariadna Laurina       0.92      0.86      0.89      2829
    Artur Fulgencio       0.00      0.00      0.00      2880
          Ascension       0.96      0.99      0.98      7438
            Beatriz       0.99      0.71      0.82       605
