In [1]:
#Same across all Model Selection notebooks
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

#Hyperparameter search method, metric: "roc_auc"
from sklearn.model_selection import GridSearchCV
#metric
from sklearn.metrics import roc_auc_score as auc

#Different for different Model Selection notebooks
#preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
#model
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Label for the model under evaluation; used when naming the exported result file
model_name = "KNeighborsClassifier"

# 1. Import the train and test datasets *

In [3]:
# Available feature sets; each one has a matching "<name>_train.csv" / "<name>_test.csv" pair
dataset_names = ["DL", "Morgan", "Both"]
train_files = [f"{name}_train.csv" for name in dataset_names]
test_files = [f"{name}_test.csv" for name in dataset_names]

# Position (within the lists above) of the dataset to load
data_index = 1
dataset_name = dataset_names[data_index]
# The first CSV column is read as the row index; if a dataset has no index
# column, remove the index_col=0 argument.
train_df = pd.read_csv(train_files[data_index], index_col=0)
# Separate the target column ("ACTIVE") from the feature columns
y_train = train_df["ACTIVE"]
x_train = train_df.drop(columns=["ACTIVE"])
x_test = pd.read_csv(test_files[data_index], index_col=0)

# 2. Setup Pipeline *

In [4]:
# Step 1: feature scaling -- candidate scalers, chosen by position
norm_type = [StandardScaler(), MinMaxScaler()]
norm_index = 0
# Step 2: classifier -- candidate models, chosen by position
models = [KNeighborsClassifier()]
model_index = 0
# Step 3: chain the chosen scaler and model into one estimator
chosen_scaler = norm_type[norm_index]
chosen_model = models[model_index]
pipeline = Pipeline(steps=[("norm", chosen_scaler), ("model", chosen_model)])

# 3. Setup Parameter Grid *

In [5]:
# Hyperparameter candidates; the "model__" prefix routes the value to the
# pipeline step named "model".
param_grid = dict(model__n_neighbors=[1, 4])

# 4. Setup GridSearch

In [6]:
# Rank every candidate by area under the ROC curve
score_metric = "roc_auc"
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=score_metric,
    cv=10,      # number of cross-validation folds
    n_jobs=-1,  # run on all available CPU cores
)


In [7]:
# Run the cross-validated grid search on the training data
search.fit(X=x_train, y=y_train)

In [8]:
# Report the best mean CV score, then the parameter set that achieved it
print("Best parameter (CV score=%0.3f):" % search.best_score_,
      search.best_params_,
      sep="\n")

Best parameter (CV score=0.640):
{'model__n_neighbors': 4}


In [9]:
# Tabulate the per-candidate CV results, best-ranked candidate first
search_res = pd.DataFrame(search.cv_results_).sort_values(by="rank_test_score")
search_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,1.032729,0.074167,42.334902,12.143783,4,{'model__n_neighbors': 4},0.646232,0.635566,0.638394,0.637981,0.637241,0.632498,0.661617,0.637744,0.634719,0.638405,0.64004,0.007949,1
0,1.566222,0.223954,53.927585,0.894468,1,{'model__n_neighbors': 1},0.591978,0.583405,0.586101,0.586522,0.571618,0.589088,0.622961,0.571035,0.591752,0.591946,0.588641,0.013594,2


# 5. Estimation of Model Performance based on CV in GridSearch

In [37]:
# Mean CV score of the top-ranked candidate, used as the performance estimate.
# The first matching row is selected explicitly: calling float() directly on a
# Series is deprecated in recent pandas and raises when several candidates tie
# for rank 1 (tied candidates share the same mean score, so the first is enough).
best_scores = search_res.loc[search_res["rank_test_score"] == 1, "mean_test_score"]
estimated_auc = float(best_scores.iloc[0])

# 6. Best Hyperparameter

In [11]:
# Parameter dict(s) of the top-ranked candidate, kept as a Series
is_best = search_res["rank_test_score"] == 1
parms = search_res.loc[is_best, "params"]

In [13]:
# Display the selected parameter Series (row label + parameter dict)
parms

1    {'model__n_neighbors': 4}
Name: params, dtype: object

In [14]:
# Take the winning parameter dict by position rather than by row label.
# The label is the candidate's position in the grid, so a fixed label (e.g. 1)
# only works when that particular candidate ranks first and raises a KeyError
# otherwise.
n_neighbors = parms.iloc[0]["model__n_neighbors"]

# 7. Train model on full train dataset

In [18]:
# Final model: the same scaler -> KNN chain that was cross-validated, with the
# selected n_neighbors. A bare KNeighborsClassifier would be trained on unscaled
# features, so the CV score reported for the pipeline would not describe the
# model that produces the test predictions.
# NOTE: the scaler instance is the one held by `pipeline`; GridSearchCV works on
# clones, so fitting it here does not affect the completed search.
full_model = Pipeline(steps=[
    ("norm", norm_type[norm_index]),
    ("model", KNeighborsClassifier(n_neighbors=n_neighbors)),
])

In [19]:
# Fit the final model on the complete training set
full_model.fit(X=x_train, y=y_train)

# 8. Prediction score for test dataset

In [29]:
# Keep only column 1 of the predicted probabilities (the second class in
# full_model.classes_ -- presumably ACTIVE == 1; confirm against the labels).
class_proba = full_model.predict_proba(x_test)
pred_test = pd.Series(class_proba[:, 1], name=1)


In [30]:
# Preview the first predicted scores
pred_test.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: 1, dtype: float64

# 9. Export result

In [40]:
# One-row, one-column frame holding the estimated AUC; it becomes the first
# row of the exported result.
result = pd.DataFrame(data=[estimated_auc])

In [41]:
# Preview the result frame
result.head()

Unnamed: 0,0
0,0.64004


In [42]:
# Stack the estimated AUC (first row) on top of the test predictions in a single
# column labelled 0. The frame is rebuilt from its inputs instead of extending
# the previous `result`, so re-running this cell cannot append the predictions a
# second time. The Series is converted to a one-column frame explicitly rather
# than relying on concat's implicit Series-to-frame coercion.
result = pd.concat([pd.DataFrame([estimated_auc]), pred_test.to_frame(name=0)])

In [43]:
# Preview the result frame
result.head()

Unnamed: 0,0
0,0.64004
0,0.0
1,0.0
2,0.0
3,0.0


In [46]:
# Write the result next to the notebook, named after the model and dataset
output_path = f"{model_name}_{dataset_name}_Result.csv"
result.to_csv(output_path)