In [1]:
# OBS: not all print() are captured into .log

In [2]:
import sys
if('ipykernel_launcher' in sys.argv[0]):
    using_jupyter_gui = True
else:
    using_jupyter_gui = False

In [3]:
# custom lib
import pandas as pd
import numpy as np
import scipy as sp
from joblib import Parallel, delayed

# build-in lib
import time
from datetime import datetime

In [4]:
import warnings
import itertools
import statsmodels.api as sm

if(using_jupyter_gui):
    import matplotlib.pyplot as plt
    #plt.style.use('ggplot')
    plt.style.use('fivethirtyeight')

  from pandas.core import datetools


In [448]:
# multi files can be loaded if joblib and multi servers are used.
result_filename_listing = ['started_GMT_20170922_0910-distance_2ed_extended_weighted--missing_0.05-runner_1_0.npy']

In [449]:
result_all_df = pd.DataFrame()
columns = ['train_index', 'k', 'd', 'v', 'err']
for one_result_filename in result_filename_listing:
    print(one_result_filename)
    t = np.load(one_result_filename)
    t = pd.DataFrame(t.reshape(t.shape[0] * t.shape[1], t.shape[2]), columns=columns)
    t[['train_index', 'k', 'd', 'v']] = t[['train_index', 'k', 'd', 'v']].astype(int)
    result_all_df = result_all_df.append(t)
    print(result_all_df.shape)

started_GMT_20170922_0910-distance_2ed_extended_weighted--missing_0.05-runner_1_0.npy
(4291, 5)


In [450]:
np.sqrt(np.mean((result_all_df.err)**2))

78.705713830064823

In [451]:
def MAE(x_array):
    return(np.mean(np.abs(x_array)))
def RMSE(x_array):
    return(np.sqrt(np.mean(x_array ** 2)))

### below: distance_2ed_extended_weighted

In [452]:
# np.save('imputed_index_and_values_t145325.npy', result_all_df)

In [453]:
result_all_df.head()

Unnamed: 0,train_index,k,d,v,err
0,17842,32,4,8,76.96875
1,17843,32,4,8,76.84375
2,17844,32,4,8,77.78125
3,17845,32,4,8,77.40625
4,19757,32,4,8,83.625


In [454]:
tt = result_all_df.groupby(['k'])['err'].apply(RMSE)
tt

k
32    78.705714
Name: err, dtype: float64

In [455]:
tt = result_all_df.groupby(['d'])['err'].apply(RMSE)
tt

d
4    78.705714
Name: err, dtype: float64

In [456]:
tt = result_all_df.groupby(['v'])['err'].apply(RMSE)
tt

v
8    78.705714
Name: err, dtype: float64

In [457]:
tt = result_all_df.groupby(['k', 'd', 'v'])['err'].apply(RMSE).reset_index()
tt = tt.sort_values('err')
tt

Unnamed: 0,k,d,v,err
0,32,4,8,78.705714


# Save Imputed Results

In [458]:
result_all_df['err'] = result_all_df['err'] -1

In [459]:
result_all_df.head()

Unnamed: 0,train_index,k,d,v,err
0,17842,32,4,8,75.96875
1,17843,32,4,8,75.84375
2,17844,32,4,8,76.78125
3,17845,32,4,8,76.40625
4,19757,32,4,8,82.625


In [460]:
imputed_index_and_values_t145325 = result_all_df
imputed_index_and_values_t145325.to_pickle('t145325_imputed_index_and_values.pkl')

# Py to Mat

In [426]:
missing_percentage_string = result_filename_listing[0][68:70].replace("-", "0")
result_filename_listing[0][68:70]

'1-'

### for param/tuples

In [240]:
save_filename = 'gsw_knn_parameter_rmse' + missing_percentage_string
save_filename

'gsw_knn_parameter_rmse90'

In [979]:
knn_parameter = tt

In [980]:
from scipy import io # mandatory, io is a moudle and must be imported
t_dict = {col_name: knn_parameter[col_name].values for col_name in knn_parameter.columns.values}

## optional if you want to save the index as an array as well:
# a_dict[df.index.name] = df.index.values
sp.io.savemat(save_filename + '.mat', {'gsw_knn_parameter':t_dict})

In [884]:
np.save(save_filename, knn_parameter)

### for SVD

In [427]:
idxmin = tt['err'].idxmin()
k_best = tt['k'][idxmin]
d_best = tt['d'][idxmin]
v_best = tt['v'][idxmin]

In [428]:
result_best_df = result_all_df[np.logical_and(np.logical_and(result_all_df['k'] == k_best, result_all_df['d'] == d_best), result_all_df['v'] == v_best)]

In [429]:
result_best_df = result_best_df[['train_index', 'err']]

In [430]:
main_data_a = np.load('Dodgers.data_original_tailored-0.05_missing-missing_set_to_-1.npy')
ground_truth = main_data_a[result_best_df['train_index']]

In [431]:
result_best_df['ground_truth'] = ground_truth

In [432]:
result_best_df['imputed_prediction'] = ground_truth + result_best_df['err']

In [433]:
#missing_percentage_string = result_filename_listing[0][70:72].replace("-", "0")
matfilename = 'gsw_knn_best_param_raw_err_df' + missing_percentage_string
matfilename

'gsw_knn_best_param_raw_err_df10'

In [434]:
from scipy import io # mandatory, io is a moudle and must be imported
t_dict = {col_name : result_best_df[col_name].values for col_name in result_best_df.columns.values}

## optional if you want to save the index as an array as well:
# a_dict[df.index.name] = df.index.values
sp.io.savemat(matfilename + '.mat', {matfilename:t_dict})

In [327]:
from rpy2.robjects import r
from rpy2.robjects import pandas2ri as pdr
pdr.activate()

In [None]:
tt = tt.sort_values('err')
r_dataframe = pandas2ri.py2ri(tt)

### below: np.linalg.norm(a-b) 

In [105]:
tt = result_all_df.groupby(['k'])['err'].apply(RMSE)
tt

k
4      5.972108
8      5.687264
16     5.584751
32     5.585895
64     5.690218
128    5.992191
Name: err, dtype: float64

In [106]:
tt = result_all_df.groupby(['d'])['err'].apply(RMSE)
tt

d
4      5.664757
8      5.681854
16     5.730000
32     5.770308
64     5.801405
128    5.876143
Name: err, dtype: float64

In [108]:
tt = result_all_df.groupby(['v'])['err'].apply(RMSE)
tt

v
0     6.166368
4     5.640827
8     5.603701
16    5.586944
Name: err, dtype: float64

In [145]:
tt = result_all_df.groupby(['k', 'd', 'v'])['err'].apply(RMSE).reset_index()
tt.sort_values('err')

Unnamed: 0,k,d,v,err
99,64,4,16,5.367887
103,64,8,16,5.386043
123,128,4,16,5.388147
74,32,4,8,5.390318
98,64,4,8,5.397236
127,128,8,16,5.397881
75,32,4,16,5.404073
102,64,8,8,5.407285
107,64,16,16,5.409680
79,32,8,16,5.416372


In [146]:
tt = result_all_df.groupby(['k', 'd', 'v'])['err'].agg(MAE).reset_index()
tt.sort_values('err')

Unnamed: 0,k,d,v,err
99,64,4,16,3.980199
74,32,4,8,3.981778
127,128,8,16,3.985389
123,128,4,16,3.986504
103,64,8,16,3.989145
98,64,4,8,3.989505
102,64,8,8,3.991692
75,32,4,16,4.000101
107,64,16,16,4.000164
122,128,4,8,4.002363


In [107]:
result_all_df.groupby(['k', 'd', 'v']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,err,train_index
k,d,v,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,4,0,count,2473.000000,2473.000000
4,4,0,mean,-0.145168,24106.795795
4,4,0,std,5.884980,14259.779430
4,4,0,min,-32.000000,288.000000
4,4,0,25%,-3.250000,11013.000000
4,4,0,50%,0.000000,23903.000000
4,4,0,75%,3.000000,36531.000000
4,4,0,max,23.250000,49209.000000
4,4,4,count,2473.000000,2473.000000
4,4,4,mean,-0.097048,24106.795795
