# Methods and Results

In [1]:
# importing required libraries for analysis
import altair as alt
import pandas as pd
import numpy as np
import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import (FunctionTransformer, Normalizer, OneHotEncoder, StandardScaler, normalize, scale)
from sklearn.compose import make_column_transformer
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import  plot_confusion_matrix, classification_report

# setting random state to have reproducible results; every stochastic step in
# this cell reads this variable, so changing the seed here changes the split
random_state = 12

# read in data
patient_data = pd.read_csv("data/ehr_data.csv")

# splitting data into train and test splits: 20% is held out for testing, and
# the split is stratified on the "SOURCE" column so both partitions keep the
# same class proportions
patient_train, patient_test = train_test_split(
    patient_data,
    test_size=0.2,
    random_state=random_state,
    stratify=patient_data["SOURCE"],
)

In [2]:
# split each partition into its feature frame (everything except "SOURCE")
# and its target column ("SOURCE")
X_train, y_train = patient_train.drop(columns=["SOURCE"]), patient_train["SOURCE"]
X_test, y_test = patient_test.drop(columns=["SOURCE"]), patient_test["SOURCE"]

In [3]:
# preprocess features
# Build the raw feature frames directly from the untouched train/test splits.
# X_train / X_test are rebound to the transformed matrices at the end of this
# cell, so reading them here would make the cell fail when it is run twice.
X_train_raw = patient_train.drop(columns=["SOURCE"])
X_test_raw = patient_test.drop(columns=["SOURCE"])

# numeric columns are standardised
numeric_feats = X_train_raw.select_dtypes('number').columns.to_list()
numeric_transformer = make_pipeline(StandardScaler())

# "SEX" is one-hot encoded; drop="if_binary" keeps a single indicator column
# when the feature has exactly two categories
categorical_binary_feats = ["SEX"]
categorical_binary_transformer = make_pipeline(OneHotEncoder(drop="if_binary", dtype=int))

preprocessor = make_column_transformer((numeric_transformer, numeric_feats),
                                       (categorical_binary_transformer, categorical_binary_feats)
                                       )

# fit on the training split only, then apply the fitted transform to the test split
# NOTE(review): the preprocessor is fit on the whole training split before the
# cross-validation cells run, so each CV validation fold has already influenced
# the scaling statistics; wrapping preprocessor + estimator in one pipeline
# would avoid that.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

In [5]:
# metrics collected during cross-validation; precision, recall and F1 treat
# the label 'in' as the positive class
scoring = {'accuracy': 'accuracy',
           'precision': make_scorer(precision_score, pos_label='in'),
           'recall': make_scorer(recall_score, pos_label='in'),
           'f1': make_scorer(f1_score, pos_label='in') }

# creating DummyClassifier as a baseline model to compare estimators to;
# it is seeded with the notebook-wide random_state instead of a repeated literal
dummy_classifier = DummyClassifier(strategy = "stratified", random_state = random_state)

# 5-fold cross-validation, keeping the train scores alongside the validation scores
dummy_scores = pd.DataFrame(
    cross_validate(
        dummy_classifier, X_train, y_train, cv = 5, return_train_score = True, scoring = scoring
    )
)

# average every timing / score column across the folds
dummy_mean = dummy_scores.mean()
dummy_mean

fit_time           0.004124
score_time         0.014069
test_accuracy      0.516296
train_accuracy     0.513318
test_precision     0.406146
train_precision    0.396941
test_recall        0.424699
train_recall       0.392079
test_f1            0.415214
train_f1           0.394495
dtype: float64

In [6]:
# KNN
# sweep k = 1..20 and record the mean train and cross-validation score for
# each k, to find the k value that yields the best accuracy estimate
results_dict = {"n_neighbors": [], "mean_train_score": [], "mean_cv_score": []}

for n in range(1, 21):
    knn_model = KNeighborsClassifier(n_neighbors=n)
    cv_scores = cross_validate(
        knn_model, X_train, y_train, cv=5, return_train_score=True
    )
    # one summary record per k, appended column by column
    fold_means = {
        "n_neighbors": n,
        "mean_train_score": cv_scores["train_score"].mean(),
        "mean_cv_score": cv_scores["test_score"].mean(),
    }
    for column, value in fold_means.items():
        results_dict[column].append(value)

results_df = pd.DataFrame(results_dict)

# display the single row with the highest mean cross-validation score
results_df.sort_values(by=["mean_cv_score"], ascending=False).iloc[:1]

Unnamed: 0,n_neighbors,mean_train_score,mean_cv_score
18,19,0.764169,0.740157


In [7]:
# label of the row with the highest mean cross-validation score
best_row_label = results_df["mean_cv_score"].idxmax()

# number of neighbours recorded on that row, as a plain Python int
best_k = int(results_df.loc[best_row_label, "n_neighbors"])
best_k

19

As shown above, k = 19 is the best value in the range 1–20, yielding a mean cross-validation accuracy of about 74%.

In [8]:
# make new model with best k
best_model = KNeighborsClassifier(n_neighbors=best_k)

# retrain classifier on the full training split
best_model.fit(X_train, y_train)

# get predictions on test data and keep them so they can be reused
y_test_pred = best_model.predict(X_test)

# get estimate of accuracy of classifier on test data, computed from the
# stored predictions so the test set is only predicted once
test_score = accuracy_score(y_test, y_test_pred)
test_score

0.7304643261608154

The test-set accuracy of the KNN classifier with k = 19 is about 73%.