## Load dataset

In [1]:
import glob
import pandas as pd

dataset_path = 'full'
#dataset_path = 'small'

files = glob.glob(f'../evaluate_results/{dataset_path}/outer/*.csv', recursive=True)

print(len(files), 'files')
files[0:3]

40 files


["../evaluate_results/full/outer/LogisticRegression-split_x_y_split_with_one_hot_encoding-{'multi_class': ['auto'], 'solver': ['liblinear']}-cross_entropy-(1 of 5).csv",
 "../evaluate_results/full/outer/SVC-split_x_y_split_with_one_hot_encoding-{'C': [0.002, 2.0, 2000.0, 2000000.0, 2000000000.0, 2000000000000.0], 'gamma': [2e-13, 2e-10, 2e-07, 0.0002, 0.2, 200.0], 'kernel': ['rbf'], 'probability': [True]}-accuracy-(3 of 5).csv",
 "../evaluate_results/full/outer/LogisticRegression-split_x_y_with_bag_of_words-{'multi_class': ['auto'], 'solver': ['liblinear']}-cross_entropy-(2 of 5).csv"]

In [2]:
# Read all data
data = pd.concat([pd.read_csv(file, index_col=[0]) for file in files])
data.head(2)

Unnamed: 0,best_params,column,i_outer,metric,model,params,refit,split_method,value
0,"{'multi_class': 'auto', 'solver': 'liblinear'}",0,0,accuracy,LogisticRegression,"{'multi_class': ['auto'], 'solver': ['liblinea...",cross_entropy,split_x_y_split_with_one_hot_encoding,0.475751
1,"{'multi_class': 'auto', 'solver': 'liblinear'}",0,0,hit@5,LogisticRegression,"{'multi_class': ['auto'], 'solver': ['liblinea...",cross_entropy,split_x_y_split_with_one_hot_encoding,0.676674


## Results

In [3]:
result = data.groupby(['model', 'split_method', 'metric'])['value'].agg(["mean", "std"])
result.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,std
model,split_method,metric,Unnamed: 3_level_1,Unnamed: 4_level_1
KNeighborsClassifier,split_x_y_split_with_one_hot_encoding,accuracy,0.356083,0.131148
KNeighborsClassifier,split_x_y_split_with_one_hot_encoding,hit@5,0.525216,0.128174
KNeighborsClassifier,split_x_y_split_with_one_hot_encoding,map@5,0.303942,0.155788


## Result as pivot table

(Beauty table bellow)

In [4]:
pivot_table = result.pivot_table(index=['model', 'split_method'], columns='metric', values=['mean', 'std'])
pivot_table

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,mean,std,std,std,std,std
Unnamed: 0_level_1,metric,accuracy,cross_entropy,hit@5,map@5,mdcg,accuracy,cross_entropy,hit@5,map@5,mdcg
model,split_method,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
KNeighborsClassifier,split_x_y_split_with_one_hot_encoding,0.356083,,0.525216,0.303942,0.544766,0.131148,,0.128174,0.155788,0.103685
KNeighborsClassifier,split_x_y_with_bag_of_words,0.342662,,0.520976,0.30025,0.539627,0.135818,,0.131489,0.156469,0.10757
LogisticRegression,split_x_y_split_with_one_hot_encoding,0.389859,2.778379,0.593544,0.325379,0.591116,0.128102,0.643164,0.113773,0.153554,0.096964
LogisticRegression,split_x_y_with_bag_of_words,0.34266,3.131092,0.51519,0.317795,0.544051,0.13713,0.695718,0.140499,0.160684,0.109393
MLPClassifier,split_x_y_split_with_one_hot_encoding,0.347435,,0.552361,0.307099,0.556736,0.127203,,0.111177,0.151721,0.09647
MLPClassifier,split_x_y_with_bag_of_words,0.352223,,0.560688,0.31976,0.563953,0.130594,,0.117527,0.149939,0.099597
SVC,split_x_y_split_with_one_hot_encoding,0.397491,,0.583678,0.335118,0.58562,0.122176,,0.123908,0.157263,0.101471
SVC,split_x_y_with_bag_of_words,0.342274,,0.553214,0.328394,0.568154,0.142473,,0.137107,0.160132,0.108311


### 'Cute' Pivot table

In [5]:
def format_percentage(x):
    return '{:2.2%}'.format(x).replace('%', '\%')

def format_std(x):
    return '{:.4}'.format(x)


new_pivot_table = pd.DataFrame({
    'Accuracy': '$' + pivot_table['mean']['accuracy'].map(format_percentage) + ' \pm ' + pivot_table['std']['accuracy'].map(format_std) + "$",
    'Hit@5':    '$' + pivot_table['mean']['hit@5'].map(format_percentage) + ' \pm ' + pivot_table['std']['hit@5'].map(format_std) + "$",
    'MDCG':     '$' + pivot_table['mean']['mdcg'].map(format_std) + ' \pm ' + pivot_table['std']['mdcg'].map(format_std) + "$",
    'MAP@5':    '$' + pivot_table['mean']['map@5'].map(format_percentage) + ' \pm ' + pivot_table['std']['map@5'].map(format_std) + "$",
})

new_pivot_table.index.set_names(['Model','Embedding'],inplace=True)
new_pivot_table.rename(index={
    'split_x_y': 'No embedding',
    'split_x_y_split_with_one_hot_encoding': 'One-hot concatenated',
    'split_x_y_with_bag_of_words': 'Bag-of-words',
    'KNeighborsClassifier': '$k$-NN',
    'LogisticRegression': 'Logistic Regression',
    'MLPClassifier': 'MLP',
}, inplace=True)

new_pivot_table.to_latex("table.tex", escape=False)
new_pivot_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Hit@5,MDCG,MAP@5
Model,Embedding,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
$k$-NN,One-hot concatenated,$35.61\% \pm 0.1311$,$52.52\% \pm 0.1282$,$0.5448 \pm 0.1037$,$30.39\% \pm 0.1558$
$k$-NN,Bag-of-words,$34.27\% \pm 0.1358$,$52.10\% \pm 0.1315$,$0.5396 \pm 0.1076$,$30.03\% \pm 0.1565$
Logistic Regression,One-hot concatenated,$38.99\% \pm 0.1281$,$59.35\% \pm 0.1138$,$0.5911 \pm 0.09696$,$32.54\% \pm 0.1536$
Logistic Regression,Bag-of-words,$34.27\% \pm 0.1371$,$51.52\% \pm 0.1405$,$0.5441 \pm 0.1094$,$31.78\% \pm 0.1607$
MLP,One-hot concatenated,$34.74\% \pm 0.1272$,$55.24\% \pm 0.1112$,$0.5567 \pm 0.09647$,$30.71\% \pm 0.1517$
MLP,Bag-of-words,$35.22\% \pm 0.1306$,$56.07\% \pm 0.1175$,$0.564 \pm 0.0996$,$31.98\% \pm 0.1499$
SVC,One-hot concatenated,$39.75\% \pm 0.1222$,$58.37\% \pm 0.1239$,$0.5856 \pm 0.1015$,$33.51\% \pm 0.1573$
SVC,Bag-of-words,$34.23\% \pm 0.1425$,$55.32\% \pm 0.1371$,$0.5682 \pm 0.1083$,$32.84\% \pm 0.1601$


## Load dataset (hide two columns)

### Two hidden columns

In [6]:
files = glob.glob(f'../evaluate_results/hide_two/outer/*.csv', recursive=True)

# Read all data
data_hide_two = pd.concat([pd.read_csv(file, index_col=[0]) for file in files])

result = data_hide_two.groupby(['model', 'split_method', 'metric', 'column'])['value'].agg(["mean", "std"])
pivot_table = result.pivot_table(index=['model', 'split_method', 'column'], columns='metric', values=['mean', 'std'])


new_pivot_table = pd.DataFrame({
    'Accuracy': '$' + pivot_table['mean']['accuracy'].map(format_percentage) + ' \pm ' + pivot_table['std']['accuracy'].map(format_std) + "$",
    'Hit@5':    '$' + pivot_table['mean']['hit@5'].map(format_percentage) + ' \pm ' + pivot_table['std']['hit@5'].map(format_std) + "$",
    'MDCG':     '$' + pivot_table['mean']['mdcg'].map(format_std) + ' \pm ' + pivot_table['std']['mdcg'].map(format_std) + "$",
    'MAP@5':    '$' + pivot_table['mean']['map@5'].map(format_percentage) + ' \pm ' + pivot_table['std']['map@5'].map(format_std) + "$",
})

new_pivot_table.index.set_names(['Model','Embedding', 'Column'],inplace=True)
new_pivot_table.rename(index={
    'split_x_y': 'No embedding',
    'split_x_y_split_with_one_hot_encoding': 'One-hot concatenated',
    'split_x_y_with_bag_of_words': 'Bag-of-words',
    'KNeighborsClassifier': '$k$-NN',
    'LogisticRegression': 'Logistic Regression',
    'MLPClassifier': 'MLP',
}, inplace=True)

print("Duas colunas são ocultadas. A indicada em 'Column' é o índice da coluna colocada como 'y'. A outra coluna ocultada segue a seguinte regra")
print("Obs: A 5º coluna é mostrada como '3' pq o índice começa como zero e pq eu tiro a 2º coluna para computar os dados")
print("  - if column = 1, then the other hidden column is 5: P(column_1 | columns \ {column_1, column_5})")
print("  - if column = 5, then the other hidden column is 1: P(column_5 | columns \ {column_1, column_5})")
new_pivot_table

Duas colunas são ocultadas. A indicada em 'Column' é o índice da coluna colocada como 'y'. A outra coluna ocultada segue a seguinte regra
Obs: A 5º coluna é mostrada como '3' pq o índice começa como zero e pq eu tiro a 2º coluna para computar os dados
  - if column = 1, then the other hidden column is 5: P(column_1 | columns \ {column_1, column_5})
  - if column = 5, then the other hidden column is 1: P(column_5 | columns \ {column_1, column_5})


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Accuracy,Hit@5,MDCG,MAP@5
Model,Embedding,Column,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
$k$-NN,No embedding,1,$20.18\% \pm 0.01349$,$39.75\% \pm 0.02071$,$0.4291 \pm 0.01767$,$14.59\% \pm 0.00535$
$k$-NN,No embedding,3,$44.05\% \pm 0.0242$,$58.91\% \pm 0.0203$,$0.6037 \pm 0.01766$,$42.51\% \pm 0.01449$
Logistic Regression,One-hot concatenated,1,$24.06\% \pm 0.02254$,$47.39\% \pm 0.01177$,$0.4815 \pm 0.01662$,$16.05\% \pm 0.005102$
Logistic Regression,One-hot concatenated,3,$46.65\% \pm 0.0167$,$66.87\% \pm 0.009236$,$0.6516 \pm 0.01039$,$44.30\% \pm 0.01327$
SVC,One-hot concatenated,1,$25.13\% \pm 0.01952$,$46.83\% \pm 0.01893$,$0.4769 \pm 0.01353$,$16.40\% \pm 0.007493$


### Compare with one hidden column

In [7]:
pivot_table = data[
        (data.split_method == 'split_x_y_split_with_one_hot_encoding') 
      & ((data.column == 1) | (data.column == 4))] \
    .groupby(['model', 'split_method', 'metric', 'column'])['value'] \
    .agg(["mean", "std"]) \
    .pivot_table(index=['model', 'split_method', 'column'], columns='metric', values=['mean', 'std'])



new_pivot_table = pd.DataFrame({
    'Accuracy': '$' + pivot_table['mean']['accuracy'].map(format_percentage) + ' \pm ' + pivot_table['std']['accuracy'].map(format_std) + "$",
    'Hit@5':    '$' + pivot_table['mean']['hit@5'].map(format_percentage) + ' \pm ' + pivot_table['std']['hit@5'].map(format_std) + "$",
    'MDCG':     '$' + pivot_table['mean']['mdcg'].map(format_std) + ' \pm ' + pivot_table['std']['mdcg'].map(format_std) + "$",
    'MAP@5':    '$' + pivot_table['mean']['map@5'].map(format_percentage) + ' \pm ' + pivot_table['std']['map@5'].map(format_std) + "$",
})

new_pivot_table.index.set_names(['Model','Embedding', 'Column'],inplace=True)
new_pivot_table.rename(index={
    'split_x_y': 'No embedding',
    'split_x_y_split_with_one_hot_encoding': 'One-hot concatenated',
    'split_x_y_with_bag_of_words': 'Bag-of-words',
    'KNeighborsClassifier': '$k$-NN',
    'LogisticRegression': 'Logistic Regression',
    'MLPClassifier': 'MLP',
}, inplace=True)

print("Somente a coluna indicada está oculta")
print("A 5º coluna é 4 pq não deletei a 2º coluna")
print("  - P(column_n | columns \ {column_n})")

new_pivot_table

Somente a coluna indicada está oculta
A 5º coluna é 4 pq não deletei a 2º coluna
  - P(column_n | columns \ {column_n})


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Accuracy,Hit@5,MDCG,MAP@5
Model,Embedding,Column,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
$k$-NN,One-hot concatenated,1,$20.50\% \pm 0.01293$,$39.43\% \pm 0.03035$,$0.4272 \pm 0.01974$,$14.50\% \pm 0.003722$
$k$-NN,One-hot concatenated,4,$44.01\% \pm 0.02206$,$60.16\% \pm 0.02256$,$0.6111 \pm 0.01183$,$43.22\% \pm 0.01341$
Logistic Regression,One-hot concatenated,1,$24.20\% \pm 0.01109$,$47.71\% \pm 0.01599$,$0.4834 \pm 0.0125$,$16.06\% \pm 0.003287$
Logistic Regression,One-hot concatenated,4,$47.57\% \pm 0.009918$,$66.96\% \pm 0.01218$,$0.6579 \pm 0.006775$,$44.39\% \pm 0.01388$
MLP,One-hot concatenated,1,$21.19\% \pm 0.02716$,$43.87\% \pm 0.02868$,$0.4537 \pm 0.01814$,$15.62\% \pm 0.003772$
MLP,One-hot concatenated,4,$42.20\% \pm 0.0204$,$62.29\% \pm 0.01188$,$0.616 \pm 0.01249$,$42.48\% \pm 0.01522$
SVC,One-hot concatenated,1,$25.40\% \pm 0.01053$,$45.03\% \pm 0.02116$,$0.4703 \pm 0.01388$,$16.35\% \pm 0.006784$
SVC,One-hot concatenated,4,$47.06\% \pm 0.01376$,$66.68\% \pm 0.01479$,$0.6518 \pm 0.01176$,$44.60\% \pm 0.01115$


Ao comparar as duas tabelas, se conclui que 

* remover a coluna 4 tira pouca informação na previsão da coluna 1;
* remover a coluna 1 tira pouca informação na previsão da coluna 4.