In [51]:
from sklearn.datasets import load_iris
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import metrics
import sys
sys.path.append('../../src/GLFMpython/')
import GLFM
import graphviz
import pandas as pd
import numpy as np
import impyute as impy
import matplotlib.pyplot as plt

In [42]:
def imputation(X_input, C, seed=None):
    """Compare EM and GLFM imputation on artificially hidden entries.

    A row-shuffled float64 copy of ``X_input`` is built, each cell is hidden
    (set to NaN) independently with probability 1/2, and the hidden cells are
    then reconstructed twice: once with ``impy.em`` and once with GLFM
    (``GLFM.infer`` followed by ``GLFM.computeMAP``). The score for each
    method is the RMSE between its reconstructed values and the true values,
    pooled over every hidden cell of every column (no per-column scaling).

    Parameters
    ----------
    X_input : array-like, shape (N, D)
        Numeric data matrix. It is never modified; integer input is accepted
        because the working copy is cast to float64 before NaNs are written.
    C : str
        Per-column type codes forwarded to GLFM as ``data['C']``.
    seed : int or None, optional
        When given, the shuffle, the missing-value mask and the initial ``Z``
        are drawn from ``np.random.RandomState(seed)``. When ``None``
        (default) the global ``np.random`` state is used, as before.
        NOTE(review): any randomness inside ``impy.em`` / ``GLFM.infer`` is
        not controlled by this argument.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'RMSE'`` with columns ``'EM'`` and ``'GLFM'``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Fresh C-ordered float64 copy: protects the caller's array and makes the
    # NaN assignment below valid even when the input has an integer dtype.
    X = np.array(X_input, dtype='float64', order='C')
    rng.shuffle(X)

    # Builtin bool instead of the np.bool alias, which NumPy 1.24 removed.
    mask = rng.randint(0, 2, size=X.shape).astype(bool)

    original = X[mask]
    X[mask] = np.nan

    # --- EM imputation -----------------------------------------------------
    # Each imputer gets its own copy so an in-place fill by one of them
    # cannot leak imputed values into the other's input.
    X_em = impy.em(X.copy())
    recovered_em = X_em[mask]

    # --- GLFM imputation ---------------------------------------------------
    data = {'C': C, 'X': X.copy()}
    N = len(X)
    hidden = {'Z': rng.randint(0, 2, size=(N, 2)).astype('float64')}
    params = {
        'alpha': 2,     # concentration parameter for the IBP
        'Niter': 100,   # number of algorithm iterations
        'maxK': 10,
        'verbose': 0,   # do not show messages
    }
    hidden = GLFM.infer(data, hidden, params)

    X_glfm = GLFM.computeMAP(data['C'], hidden['Z'], hidden, params)
    recovered_glfm = X_glfm[mask]

    # --- Scores ------------------------------------------------------------
    score_em = np.sqrt(metrics.mean_squared_error(recovered_em, original))
    score_glfm = np.sqrt(metrics.mean_squared_error(recovered_glfm, original))

    table = pd.DataFrame(data={'EM': score_em, 'GLFM': score_glfm},
                         columns=['EM', 'GLFM'], index=['RMSE'])
    return table

    

# Iris

In [47]:
# Fetch the Iris bunch and pull out its feature matrix by key.
iris = load_iris()
X_iris = iris['data']

In [48]:
# Score EM vs. GLFM on Iris; every one of the four columns uses type code 'p'.
table_iris = imputation(X_input=X_iris, C='pppp')

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=4, Kest=8, maxR=1


In [49]:
# Display the Iris RMSE table (columns EM and GLFM) as the cell output.
table_iris

Unnamed: 0,EM,GLFM
RMSE,1.461187,1.075601


# Wine

In [52]:
# Fetch the Wine bunch and pull out its feature matrix by key.
wine = datasets.load_wine()
X_wine = wine['data']

In [54]:
# Score EM vs. GLFM on Wine; every column uses type code 'p'.
table_wine = imputation(X_input=X_wine, C='ppppppppppppp')

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=13, Kest=8, maxR=1


In [55]:
# Display the Wine RMSE table (columns EM and GLFM) as the cell output.
table_wine

Unnamed: 0,EM,GLFM
RMSE,132.02455,89.869927


# KDD Cup (10 percent subset)

In [57]:
# The CSV has no header row, so the 41 feature names plus the trailing
# 'target' label column are supplied explicitly.
kdd = pd.read_csv(
    'kddcup.data_10_percent_corrected.csv',
    header=None,
    names=[
        'duration', 'protocol_type', 'service',
        'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
        'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
        'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds',
        'is_host_login', 'is_guest_login', 'count', 'srv_count',
        'serror_rate', 'srv_serror_rate', 'rerror_rate',
        'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
        'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'target',
    ],
)

In [58]:
# Shuffle the full frame and keep the first 10,000 rows as the working subset.
# NOTE(review): the shuffle is unseeded, so the subset (and every number
# derived from it) changes between runs; pass random_state=... to sample()
# if reproducibility is required.
kdd = kdd.sample(frac=1).reset_index(drop=True)
kdd_cut = kdd.iloc[:10000, :].copy()

# Integer-encode the string-valued columns as 1..k in order of first
# appearance within kdd_cut. pd.factorize replaces the earlier chained
# `Series.replace(..., inplace=True)` calls for two reasons:
#   * the 'protocol_type' call took its keys from the full `kdd` frame but
#     sized its values from `kdd_cut`, which raises as soon as the subset is
#     missing a category;
#   * a chained in-place replace does not write back to kdd_cut under pandas
#     Copy-on-Write.
# Missing values (if any) would be encoded as 0, since factorize marks them -1.
kdd_cut = kdd_cut.assign(**{
    name: pd.factorize(kdd_cut[name])[0] + 1
    for name in ('protocol_type', 'service', 'flag', 'target')
})

In [59]:
# Keep every column except the last one ('target') as the feature frame.
X_kdd = kdd_cut.iloc[:, 0:kdd_cut.shape[1] - 1]

In [62]:
# Build the float64 feature matrix straight from kdd_cut (all columns but the
# last). Deriving it from the frame instead of from X_kdd itself keeps this
# cell safe to re-run: `X_kdd = X_kdd.values` fails the second time because an
# ndarray has no `.values`. The explicit cast matches the other dataset cells.
X_kdd = kdd_cut.iloc[:, :-1].values.astype('float64')

In [72]:
# Score EM vs. GLFM on the KDD subset; the string holds one type code per column.
table_kdd = imputation(X_input=X_kdd, C='ncccnnnnnnnnnnnnnnnnnnnnpppppppnnpppppppp')

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=41, Kest=8, maxR=44


In [73]:
# Display the KDD RMSE table (columns EM and GLFM) as the cell output.
table_kdd

Unnamed: 0,EM,GLFM
RMSE,15064.921752,13678.54505


# Cover Type

In [64]:
# Header-less CSV; work on a copy of the first 10,000 rows only.
covtype = pd.read_csv('covtype.data.csv', header=None)
covtype_cut = covtype.head(10000).copy()
# Features are all columns but the last, as a float64 array.
X_cov = covtype_cut.iloc[:, :-1].to_numpy().astype('float64')

In [65]:
# Score EM vs. GLFM on the cover-type subset; one type code per column.
table_cov = imputation(X_input=X_cov, C='ppppppnnnpoooooooooooooooooooooooooooooooooooooooooooo')

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=54, Kest=8, maxR=2


In [66]:
# Display the cover-type RMSE table (columns EM and GLFM) as the cell output.
table_cov

Unnamed: 0,EM,GLFM
RMSE,346.486144,238.630655


# Absenteeism

In [77]:
# The file is semicolon-separated.
Absenteeism = pd.read_csv(filepath_or_buffer='Absenteeism_at_work.csv', sep=';')
# Features are all columns but the last, as a float64 array.
X_abs = Absenteeism.iloc[:, :-1].to_numpy().astype('float64')

In [78]:
# Type string for the 20 feature columns: code 'p' repeated once per column.
# (Presumably GLFM's positive-real code - confirm against the GLFM docs.)
# String repetition replaces the append/join loop, so C is never a list and
# no loop variable is left behind in the notebook namespace.
C = 'p' * 20

In [79]:
# Score EM vs. GLFM on the absenteeism features using the type string C.
table_abs = imputation(X_input=X_abs, C=C)

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=20, Kest=8, maxR=1


In [81]:
# Display the absenteeism RMSE table (columns EM and GLFM) as the cell output.
table_abs

Unnamed: 0,EM,GLFM
RMSE,25.408922,26.173301


# Frogs (MFCCs)

In [82]:
# Load the frog-call MFCC table with pandas' default CSV settings.
frogs = pd.read_csv(filepath_or_buffer='Frogs_MFCCs.csv')

In [83]:
# The first 22 columns form the feature matrix; cast to float64.
X_frogs = frogs.iloc[:, 0:22].to_numpy().astype('float64')

In [84]:
# Type string for the 22 feature columns: code 'g' repeated once per column.
# (Presumably GLFM's real-valued code - confirm against the GLFM docs.)
# String repetition replaces the append/join loop, so C is never a list and
# no loop variable is left behind in the notebook namespace.
C = 'g' * 22

In [85]:
# Score EM vs. GLFM on the frog MFCC features using the type string C.
table_frogs = imputation(X_input=X_frogs, C=C)

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=22, Kest=8, maxR=1


In [86]:
# Display the frogs RMSE table (columns EM and GLFM) as the cell output.
table_frogs

Unnamed: 0,EM,GLFM
RMSE,0.209003,0.11169


# Turkiye Student Evaluation

In [67]:
# Load the student-evaluation table with pandas' default CSV settings.
turkiye = pd.read_csv(filepath_or_buffer='turkiye-student-evaluation_generic.csv')

In [68]:
# Every column is used as a feature; cast the whole frame to float64.
X_turkey = turkiye.to_numpy().astype('float64')

In [69]:
# Type string for the 33 feature columns: code 'p' repeated once per column.
# (Presumably GLFM's positive-real code - confirm against the GLFM docs.)
# String repetition replaces the append/join loop, so C is never a list and
# no loop variable is left behind in the notebook namespace.
C = 'p' * 33

In [70]:
# Score EM vs. GLFM on the student-evaluation features using the type string C.
table_turkey = imputation(X_input=X_turkey, C=C)

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=33, Kest=8, maxR=1


In [71]:
# Display the student-evaluation RMSE table (columns EM and GLFM) as the cell output.
table_turkey

Unnamed: 0,EM,GLFM
RMSE,1.969237,1.202523


# Sales Transactions (Weekly)

In [87]:
# Load the weekly sales-transactions table with pandas' default CSV settings.
transaction = pd.read_csv(filepath_or_buffer='Sales_Transactions_Dataset_Weekly.csv')

In [88]:
# Skip column 0 and keep columns 1..54 (54 features); cast to float64.
X_tran = transaction.iloc[:, 1:55].to_numpy().astype('float64')

In [89]:
# Type string for the 54 feature columns: code 'p' repeated once per column.
# (Presumably GLFM's positive-real code - confirm against the GLFM docs.)
# String repetition replaces the append/join loop, so C is never a list and
# no loop variable is left behind in the notebook namespace.
C = 'p' * 54

In [90]:
# Score EM vs. GLFM on the sales-transaction features using the type string C.
table_tran = imputation(X_input=X_tran, C=C)

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=54, Kest=8, maxR=1


In [97]:
# Display the sales-transactions RMSE table (columns EM and GLFM) as the cell output.
table_tran

Unnamed: 0,EM,GLFM
RMSE,16.154467,10.945258


# Gene data

In [91]:
# Load the gene table with pandas' default CSV settings.
gene = pd.read_csv(filepath_or_buffer='gene_data.csv')

In [92]:
# Use only the first 20 rows and drop column 0; cast to float64.
X_gene = gene.iloc[0:20, 1:].to_numpy().astype('float64')

In [93]:
# Type string for the 20531 feature columns: code 'g' repeated once per column.
# (Presumably GLFM's real-valued code - confirm against the GLFM docs.)
# String repetition replaces the append/join loop, so C is never a list and
# no loop variable is left behind in the notebook namespace.
C = 'g' * 20531

In [94]:
# Score EM vs. GLFM on the 20-row gene matrix using the type string C.
table_gene = imputation(X_input=X_gene, C=C)

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=20531, Kest=1, maxR=1


In [104]:
# Display the gene RMSE table (columns EM and GLFM) as the cell output.
table_gene

Unnamed: 0,EM,GLFM
RMSE,1.847733,1.455158


# Results

In [95]:
# Recap: Iris RMSE table (EM vs. GLFM) computed in the Iris section.
table_iris

Unnamed: 0,EM,GLFM
RMSE,1.461187,1.075601


In [96]:
# Recap: Wine RMSE table (EM vs. GLFM) computed in the Wine section.
table_wine

Unnamed: 0,EM,GLFM
RMSE,132.02455,89.869927


In [98]:
# Recap: KDD RMSE table (EM vs. GLFM) computed in the kdd section.
table_kdd

Unnamed: 0,EM,GLFM
RMSE,15064.921752,13678.54505


In [99]:
# Recap: cover-type RMSE table (EM vs. GLFM) computed in the Cover Type section.
table_cov

Unnamed: 0,EM,GLFM
RMSE,346.486144,238.630655


In [100]:
# Recap: absenteeism RMSE table (EM vs. GLFM) computed in the Absenteeism section.
table_abs

Unnamed: 0,EM,GLFM
RMSE,25.408922,26.173301


In [101]:
# Recap: frogs RMSE table (EM vs. GLFM) computed in the Frog section.
table_frogs

Unnamed: 0,EM,GLFM
RMSE,0.209003,0.11169


In [102]:
# Recap: student-evaluation RMSE table (EM vs. GLFM) computed in the Turkey section.
table_turkey

Unnamed: 0,EM,GLFM
RMSE,1.969237,1.202523


In [103]:
# Recap: sales-transactions RMSE table (EM vs. GLFM) computed in the transaction section.
table_tran

Unnamed: 0,EM,GLFM
RMSE,16.154467,10.945258


In [105]:
# Recap: gene RMSE table (EM vs. GLFM) computed in the gene section.
table_gene

Unnamed: 0,EM,GLFM
RMSE,1.847733,1.455158
