In [71]:
import pandas as pd
import numpy as np

from collections import Counter
from sklearn_pandas import CategoricalImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from impyute.imputation.cs import fast_knn
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from impyute.imputation.cs import mice
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer

In [2]:
data = pd.read_pickle("files/Optima_Data_Report_Cases_9584_filled_pickle")

In [3]:
X = ['OPTIMA DIAGNOSES V 2010: PETERSEN MCI TYPE',
    'CLINICAL DEMENTIA RATING: OVERALL CDR SCORE']

In [4]:
y = ['OPTIMA DIAGNOSES V 2010: PETERSEN MCI']

In [5]:
columns = X + y

In [6]:
subset = data[columns]

In [7]:
subset = subset[subset[y[0]].notna()]

In [8]:
subset = subset[subset[y[0]].isin([8, 9])!=True]

In [9]:
subset[X].isna().sum().sum() / (subset[X].shape[0] * subset[X].shape[1])

0.3759961746891935

In [10]:
subset = subset.fillna(-1)

In [11]:
subset = subset.reset_index(drop=True)

In [12]:
train, test = train_test_split(subset, test_size=0.33, random_state=42, shuffle=True)

In [13]:
y_test = test[y[0]].copy().values

In [14]:
test[y[0]] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [78]:
subset.shape

(3137, 3)

In [79]:
Counter(subset[y].values.reshape(-1))

Counter({0.0: 2441, 1.0: 696})

In [15]:
imputer_categorical = CategoricalImputer()

In [16]:
imputer_categorical.fit(train.values)

CategoricalImputer(copy=True, fill_value='?', missing_values='NaN',
                   strategy='most_frequent')

In [17]:
imputer_categorical_y = imputer_categorical.transform(test.values)[:, -1]

In [18]:
accuracy_score(y_test, imputer_categorical_y)

0.0

In [19]:
imputer_simple_median = SimpleImputer( strategy='median') 

In [20]:
imputer_simple_median.fit(train.values)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [21]:
imputer_simple_median_y = imputer_simple_median.transform(test.values)[:, -1]

In [22]:
accuracy_score(y_test, imputer_simple_median_y)

0.7963320463320464

In [23]:
imputer_simple_most_frequent = SimpleImputer( strategy='most_frequent') 

In [24]:
imputer_simple_most_frequent.fit(train.values)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)

In [25]:
imputer_simple_most_frequent_y = imputer_simple_most_frequent.transform(test.values)[:, -1]

In [26]:
accuracy_score(y_test, imputer_simple_most_frequent_y)

0.7963320463320464

In [32]:
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [33]:
mlp.fit(train[X].values, train[y].values)

  y = column_or_1d(y, warn=True)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [34]:
mlp_y = mlp.predict(test[X].values)

In [35]:
accuracy_score(y_test, mlp_y)

0.7895752895752896

In [36]:
rf = RandomForestClassifier(max_depth=2, random_state=0)

In [37]:
rf.fit(train[X].values, train[y].values)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [38]:
rf_y = rf.predict(test[X].values)

In [39]:
accuracy_score(y_test, rf_y)

0.9083011583011583

In [42]:
lor = LogisticRegression(random_state=0).fit(train[X].values, train[y].values)

  y = column_or_1d(y, warn=True)


In [43]:
lor_y = lor.predict(test[X].values)

In [44]:
accuracy_score(y_test, lor_y)

0.9083011583011583

In [45]:
np.unique(y_test)

array([0., 1.])

In [46]:
lir = LinearRegression().fit(train[X].values, train[y].values)

In [47]:
lir_y = lir.predict(test[X].values)

In [48]:
lir_y = lir_y.reshape(-1)

In [49]:
lir_y[lir_y<0.5] = 0

In [50]:
lir_y[lir_y>=0.5] = 1

In [51]:
accuracy_score(y_test, lir_y)

0.9083011583011583

In [52]:
imputer_simple_mean = SimpleImputer( strategy='mean') 

In [53]:
imputer_simple_mean.fit(train.values)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [54]:
imputer_simple_mean_y = imputer_simple_mean.transform(test.values)[:, -1]

In [55]:
imputer_simple_mean_y[imputer_simple_mean_y<0.5] = 0

In [56]:
imputer_simple_mean_y[imputer_simple_mean_y>=0.5] = 1

In [57]:
accuracy_score(y_test, imputer_simple_mean_y)

0.7963320463320464

In [58]:
subset_concat = pd.concat([train, test])

In [59]:
fast_knn_y = fast_knn(subset_concat.values, k=30)

  weights = distances/np.sum(distances)


In [60]:
fast_knn_y = fast_knn_y[train.shape[0]:, -1]

In [61]:
fast_knn_y[np.isnan(fast_knn_y)] = -1

In [62]:
fast_knn_y[fast_knn_y<0.5] = 0

In [63]:
fast_knn_y[fast_knn_y>=0.5] = 1

In [64]:
accuracy_score(y_test, fast_knn_y)

0.8667953667953668

In [65]:
mice_y = mice(subset_concat.values)

In [66]:
mice_y = mice_y[train.shape[0]:, -1]

In [67]:
mice_y[mice_y<0.5] = 0

In [68]:
mice_y[mice_y>=0.5] = 1

In [69]:
accuracy_score(y_test, mice_y)

0.9083011583011583

In [72]:
imputer_iterative = IterativeImputer(random_state=0)

In [73]:
imputer_iterative.fit(train.values)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, tol=0.001, verbose=0)

In [74]:
imputer_iterative_y = imputer_iterative.transform(test.values)[:, -1]

In [75]:
imputer_iterative_y[imputer_iterative_y<0.5] = 0

In [76]:
imputer_iterative_y[imputer_iterative_y>=0.5] = 1

In [77]:
accuracy_score(y_test, imputer_iterative_y)

0.7963320463320464