In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
pd.options.display.float_format = '{:,.3f}'.format

In [2]:
from sklearn.preprocessing import MinMaxScaler

In [3]:
from numpy import isnan

In [4]:
data = pd.read_csv('data2_droplists.csv')

In [5]:
data.shape

(1838, 668)

In [6]:
#make sure dataset has only numbers
data = data.apply(pd.to_numeric)

In [8]:
print("Number of null values in dataset: ",data.isnull().values.sum())
print("Percentage: ",(data.isnull().sum().sum()/data.size)*100) #ποσοστο

Number of null values in dataset:  97121
Percentage:  7.910267604073681


In [9]:
data['G21'].value_counts()

0.000    1543
1.000     217
2.000      78
Name: G21, dtype: int64

In [10]:
data['G21'].isnull().values.sum()

0

### scale, y = G21 

In [11]:
#load data and split to dependent and independent variables
X = data.loc[:, data.columns != 'G21'].values #select all but the desired one for y
y = data['G21'].values

#X and y are numpy ndarray type variables
print(X.shape,y.shape)
X = np.array(X)
y = np.array(y)

(1838, 667) (1838,)


In [12]:
#normalize, range on the training set, default=(0, 1)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

#### kNN

In [13]:
#scaling data is important when performing KNN imputation
from sklearn.impute import KNNImputer
# define imputer
knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

In [14]:
# fit on the dataset
knn_imputer.fit(X)

KNNImputer()

In [15]:
# transform the dataset
X_knn= knn_imputer.transform(X)

In [16]:
print('Missing: %d' % sum(isnan(X_knn).flatten()))

Missing: 0


#### Iterative Imputer

In [17]:
#scaling data not mentioned, probably after
# define imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

iter_imputer =IterativeImputer(max_iter=5,n_nearest_features=200,initial_strategy='most_frequent',imputation_order='ascending', verbose=1)

In [18]:
# fit on the dataset
iter_imputer.fit(X)

[IterativeImputer] Completing matrix with shape (1838, 667)
[IterativeImputer] Change: 40.794699092914655, scaled tolerance: 0.0010000000000000002 
[IterativeImputer] Change: 28.673757764970244, scaled tolerance: 0.0010000000000000002 
[IterativeImputer] Change: 25.67881069213383, scaled tolerance: 0.0010000000000000002 
[IterativeImputer] Change: 39.75917938649334, scaled tolerance: 0.0010000000000000002 
[IterativeImputer] Change: 44.17073123174113, scaled tolerance: 0.0010000000000000002 




IterativeImputer(initial_strategy='most_frequent', max_iter=5,
                 n_nearest_features=200, verbose=1)

In [19]:
# transform the dataset
X_iter= iter_imputer.transform(X)

[IterativeImputer] Completing matrix with shape (1838, 667)


In [20]:
print('Missing: %d' % sum(isnan(X_iter).flatten()))

Missing: 0


In [22]:
from numpy import asarray
from numpy import save

#save different X
save('X_G21knn.npy', X_knn)
save('X_G21iter.npy', X_iter)

#save y for G1
save('y_G21.npy', y)

#### Multiple impution - miceforest

In [21]:
#scaling data not mentioned, probably after
# define imputer
import miceforest as mf

# Create kernel
kds = mf.ImputationKernel(X, datasets=4, mean_match_candidates=5, data_subset=5, save_all_iterations=False,save_models=0, copy_data=False)


In [23]:
kds.mice(10)

In [216]:
print(kds)

              Class: ImputationKernel
           Datasets: 4
         Iterations: 10
  Imputed Variables: 626
save_all_iterations: False


In [24]:
X_mice = kds.complete_data(dataset=0, inplace=False)

In [25]:
print('Missing: %d' % sum(isnan(X_mice).flatten()))

Missing: 0


In [26]:
save('X_G21mice10.npy', X_mice)