## Notebook copia dell'example_deeper per scovare il bug

In [5]:
import pandas as pd
import numpy as np
import os
import gensim.downloader as api
import models.DeepER as dp
from keras.models import load_model
from certa.local_explain import find_similarities
from certa.local_explain import dataset_local
from certa.triangles_method import explainSamples
from certa.eval import expl_eval


In [6]:
def merge_sources(table, left_prefix, right_prefix, left_source, right_source, copy_from_table, ignore_from_table):
    """Expand a table of id pairs into a table of full record pairs.

    For every row of ``table`` the left/right ids (columns ``<prefix>id``) are
    looked up in ``left_source`` / ``right_source`` via ``.loc`` (i.e. by index
    label) and the attributes of both records are copied into one output row,
    with the column names prefixed by ``left_prefix`` / ``right_prefix``.

    Parameters
    ----------
    table : pd.DataFrame
        Pairs table; must contain ``left_prefix + 'id'``, ``right_prefix + 'id'``
        and every column listed in ``copy_from_table``.
    left_prefix, right_prefix : str
        Prefixes used both to read the id columns and to name output columns.
    left_source, right_source : pd.DataFrame
        Source tables, indexed so that ``source.loc[record_id]`` returns the record.
    copy_from_table : sequence of str
        Columns copied verbatim from ``table`` (e.g. ``['label']``).
    ignore_from_table : sequence of str
        Source columns that must not be copied (e.g. ``['id']``).

    Returns
    -------
    pd.DataFrame
        One row per pair whose ids were both found in the sources. Pairs with an
        id missing from a source are skipped. When no pair survives, an empty
        frame with the ``copy_from_table`` columns is returned.
    """
    ignored_columns = set(copy_from_table) | set(ignore_from_table)
    records = []

    for _, row in table.iterrows():
        left_id = row[left_prefix + 'id']
        right_id = row[right_prefix + 'id']

        new_row = {column: row[column] for column in copy_from_table}

        try:
            for record_id, source, prefix in ((left_id, left_source, left_prefix),
                                              (right_id, right_source, right_prefix)):
                # Look the record up once per source instead of once per column.
                record = source.loc[record_id]
                for column in source.keys():
                    if column not in ignored_columns:
                        new_row[prefix + column] = record[column]
        except KeyError:
            # The id is not present in the source table: drop this pair only.
            # Any other error is a real problem and must not be hidden.
            continue

        records.append(new_row)

    if not records:
        return pd.DataFrame(columns=list(copy_from_table))
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic anyway.
    return pd.DataFrame(records)


def to_deeper_data(df: pd.DataFrame):
    """Convert a frame of record pairs into a list of per-row tuples.

    Each row yields ``(left_values, right_values)`` where the two arrays hold,
    as strings, the values of the columns starting with ``ltable_`` and
    ``rtable_`` respectively. When the row has a ``label`` entry it is
    appended as a third tuple element.
    """
    pairs = []
    for position in range(len(df)):
        record = df.iloc[position]
        left_values = record.filter(regex='^ltable_').values.astype('str')
        right_values = record.filter(regex='^rtable_').values.astype('str')
        pair = (left_values, right_values)
        if 'label' in record:
            pair = pair + (record['label'],)
        pairs.append(pair)
    return pairs


def predict_fn(x, m, ignore_columns=('ltable_id', 'rtable_id', 'label')):
    """Score every record pair in ``x`` with the DeepER model ``m``.

    Parameters
    ----------
    x : pd.DataFrame
        Record pairs with ``ltable_*`` / ``rtable_*`` columns.
    m : model
        Model forwarded to ``dp.predict``. It is the model actually used;
        previously this argument was ignored in favour of the global ``model``.
    ignore_columns : iterable of str
        Columns removed from ``x`` (when present) before building the model input.

    Returns
    -------
    pd.DataFrame
        A copy of ``x`` with two extra columns, ``nomatch_score`` and
        ``match_score``, aligned on the index of ``x``.

    Notes
    -----
    ``embeddings_model`` and ``tokenizer`` are read from the notebook globals.
    """
    present = [c for c in ignore_columns if c in x.columns]
    data = to_deeper_data(x.drop(present, axis=1))
    out = dp.predict(data, m, embeddings_model, tokenizer)
    out_df = pd.DataFrame(out, columns=['nomatch_score', 'match_score'])
    # Re-use the caller's index so the concat below aligns row by row.
    out_df.index = x.index
    return pd.concat([x.copy(), out_df], axis=1)


def get_original_prediction(r1, r2):
    """Return the ``[nomatch_score, match_score]`` values for the pair (r1, r2).

    ``r1`` and ``r2`` are single records (Series) that include an ``id`` entry.
    They are turned into a one-row frame with ``ltable_`` / ``rtable_`` prefixed
    columns, the two id columns are replaced by a combined ``id`` string, and
    the row is scored through ``predict_fn`` with the global ``model``.
    """
    left_prefix, right_prefix = 'ltable_', 'rtable_'

    def _single_row_frame(record, prefix):
        # One-row frame whose columns are the record's fields, prefixed.
        frame = pd.DataFrame(data=[record.values], columns=record.index)
        frame.columns = [prefix + name for name in frame.columns]
        return frame

    pair = pd.concat(
        [_single_row_frame(r1, left_prefix), _single_row_frame(r2, right_prefix)],
        axis=1,
    )
    left_id = str(pair[left_prefix + 'id'].values[0])
    right_id = str(pair[right_prefix + 'id'].values[0])
    pair['id'] = "0@" + left_id + "#" + "1@" + right_id
    pair = pair.drop([left_prefix + 'id', right_prefix + 'id'], axis=1)

    scored = predict_fn(pair, model)
    return scored[['nomatch_score', 'match_score']].values[0]

In [7]:
# Source tables and the labelled train/valid/test id pairs under datadir.
datadir = 'datasets/beers/'
lsource, rsource = (pd.read_csv(datadir + name) for name in ('tableA.csv', 'tableB.csv'))
gt, valid, test = (pd.read_csv(datadir + name) for name in ('train.csv', 'valid.csv', 'test.csv'))

# Expand each id-pair split into full record pairs (label kept, source id dropped).
train_df, valid_df, test_df = (
    merge_sources(pairs, 'ltable_', 'rtable_', lsource, rsource, ['label'], ['id'])
    for pairs in (gt, valid, test)
)

# Download the GloVe vectors only when the local text file is missing.
glove_path = 'models/glove.6B.50d.txt'
if not os.path.exists(glove_path):
    api.load("glove-wiki-gigaword-50").save_word2vec_format(glove_path, binary=False)

# Embedding index, its dimensionality (length of one looked-up vector) and the
# embedding model / tokenizer pair used by the later cells.
embeddings_index = dp.init_embeddings_index(glove_path)
emb_dim = len(embeddings_index['cat'])
embeddings_model, tokenizer = dp.init_embeddings_model(embeddings_index)

* Costruzione indice degli embeddings.....Fatto. 400001 embeddings totali.
* Creazione del modello per il calcolo degli embeddings....
* Inizializzo il tokenizzatore.....Fatto: 400001 parole totali.
* Preparazione della matrice di embedding.....Fatto. Dimensioni matrice embeddings: (400002, 50)

°°° EMBEDDING MODEL °°°
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Tupla_A (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
Tupla_B (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding_lookup (Embedding)    (None, None, 50)     20000100    Tupla

## ------------------- Prima modifica ------------------------------

## Se esiste già un modello allenato (`DeepER_best_model.h5`), carica quello invece di riallenarlo

In [8]:
# Train from scratch only when no saved checkpoint is available; otherwise reuse it.
if not os.path.exists("DeepER_best_model.h5"):
    model = dp.init_DeepER_model(emb_dim)
    training_pairs = to_deeper_data(train_df)
    model = dp.train_model_ER(training_pairs, model, embeddings_model, tokenizer)
else:
    model = load_model("DeepER_best_model.h5")

## ---------------- Fine prima modifica ----------------------------

In [10]:
# Compute the two thresholds from the training pairs; both are passed to dataset_local further down.
# NOTE(review): the meaning of the -2 argument is not visible in this notebook — confirm against
# certa.local_explain.find_similarities.
theta_min, theta_max = find_similarities(train_df, -2)

## ------------------------- Seconda modifica -------------------

## Con questo modello la coppia 217-217 (riga 217 di tableA e riga 217 di tableB) produce l'`InvalidArgumentError` riportato in fondo al notebook

In [11]:
# Select the record at positional row 217 of each source table (iloc is position-based).
# NOTE(review): this matches the record with id 217 only if the 'id' column equals the row
# position in both CSV files — confirm.
l_tuple = lsource.iloc[217]
r_tuple = rsource.iloc[217]

## ----------------------- Fine seconda modifica -------------------

In [12]:
# Score the selected pair and explain the class with the highest score.
prediction = get_original_prediction(l_tuple, r_tuple)
class_to_explain = np.argmax(prediction)

local_samples = dataset_local(
    l_tuple, r_tuple, model, lsource, rsource, datadir,
    theta_min, theta_max, predict_fn,
    num_triangles=8,
)

explanation, flipped_pred = explainSamples(
    local_samples, [lsource, rsource], model, predict_fn,
    class_to_explain=class_to_explain, maxLenAttributeSet=3,
)
print(explanation)

# Evaluate each explanation entry: keys are '/'-joined attribute names, values are scores.
eval_data = []
for attribute_key, attribute_score in explanation.items():
    expl_evaluation = expl_eval(
        class_to_explain, attribute_key.split('/'), attribute_score,
        lsource, l_tuple, model, prediction, rsource, r_tuple, predict_fn,
    )
    print(expl_evaluation.head())
    eval_data.append([expl_evaluation["impact"].mean(), expl_evaluation["drop"].mean()])

# Aggregate the per-explanation means into one figure per metric.
eval_data_df = pd.DataFrame(eval_data, columns=['impact-score', 'mean-drop'])
aggregated_impact = eval_data_df["impact-score"].mean()
aggregated_drop = eval_data_df["mean-drop"].mean()
print(f'aggregated impact-score:{aggregated_impact}')
print(f'aggregated mean-drop:{aggregated_drop}')

100%|██████████| 8/8 [00:02<00:00,  3.89it/s]


defaultdict(<class 'int'>, {'Beer_Name/Brew_Factory_Name/ABV': 0.875})


InvalidArgumentError:    Tried to stack elements of an empty list with non-fully-defined element_shape: [?,150]
	 [[{{node TensorArrayV2Stack/TensorListStack}}]]
	 [[model_1/Composition/forward_lstm/PartitionedCall_1]] [Op:__inference_predict_function_8682]

Function call stack:
predict_function -> predict_function -> predict_function
