In [284]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2,mutual_info_classif
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [285]:
data = pd.read_csv('input.csv')
data.columns = [ x.lower().strip() for x in data.columns]
data.columns
data = data.sample(frac=1)

In [286]:
num_cols = data._get_numeric_data().columns
num_cols

Index(['dexa_freq_during_rx', 'count_of_risks'], dtype='object')

In [287]:
grouping_dict = {}

grouping = ['concom','comorb','risk']

for val in grouping:
    for col in data.columns:
        if col.startswith(val):
            grouping_dict[col] = val

In [288]:
grouping_dict

{'concom_cholesterol_and_triglyceride_regulating_preparations': 'concom',
 'concom_narcotics': 'concom',
 'concom_systemic_corticosteroids_plain': 'concom',
 'concom_anti_depressants_and_mood_stabilisers': 'concom',
 'concom_fluoroquinolones': 'concom',
 'concom_cephalosporins': 'concom',
 'concom_macrolides_and_similar_types': 'concom',
 'concom_broad_spectrum_penicillins': 'concom',
 'concom_anaesthetics_general': 'concom',
 'concom_viral_vaccines': 'concom',
 'comorb_encounter_for_screening_for_malignant_neoplasms': 'comorb',
 'comorb_encounter_for_immunization': 'comorb',
 'comorb_encntr_for_general_exam_w_o_complaint,_susp_or_reprtd_dx': 'comorb',
 'comorb_vitamin_d_deficiency': 'comorb',
 'comorb_other_joint_disorder_not_elsewhere_classified': 'comorb',
 'comorb_encntr_for_oth_sp_exam_w_o_complaint_suspected_or_reprtd_dx': 'comorb',
 'comorb_long_term_current_drug_therapy': 'comorb',
 'comorb_dorsalgia': 'comorb',
 'comorb_personal_history_of_other_diseases_and_conditions': 'como

In [289]:
del grouping_dict['risk_segment_prior_ntm']
del grouping_dict['risk_segment_during_rx']

combined_columns = [x for x in grouping_dict.keys()]

In [290]:
le = LabelEncoder()
for val in combined_columns:
    data[val] = le.fit_transform(data[val])

In [291]:
data.describe()

Unnamed: 0,dexa_freq_during_rx,comorb_encounter_for_screening_for_malignant_neoplasms,comorb_encounter_for_immunization,"comorb_encntr_for_general_exam_w_o_complaint,_susp_or_reprtd_dx",comorb_vitamin_d_deficiency,comorb_other_joint_disorder_not_elsewhere_classified,comorb_encntr_for_oth_sp_exam_w_o_complaint_suspected_or_reprtd_dx,comorb_long_term_current_drug_therapy,comorb_dorsalgia,comorb_personal_history_of_other_diseases_and_conditions,...,risk_family_history_of_osteoporosis,risk_low_calcium_intake,risk_vitamin_d_insufficiency,risk_poor_health_frailty,risk_excessive_thinness,risk_hysterectomy_oophorectomy,risk_estrogen_deficiency,risk_immobilization,risk_recurring_falls,count_of_risks
count,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,...,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0
mean,3.016063,0.447722,0.441881,0.39486,0.319217,0.291764,0.231016,0.23861,0.227512,0.197722,...,0.104556,0.012266,0.477804,0.056075,0.019568,0.015771,0.003213,0.004089,0.020152,1.239486
std,8.136545,0.497332,0.496683,0.488892,0.466241,0.454641,0.421544,0.426296,0.419287,0.398339,...,0.306025,0.110088,0.49958,0.2301,0.13853,0.124607,0.056597,0.063822,0.14054,1.094914
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,3.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,146.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0


In [292]:
data = data.drop(['change_risk_segment','risk_segment_during_rx','tscore_bucket_during_rx','change_t_score'],axis = 1)

In [293]:
binary_cols = ['persistency_flag','gender','ntm_specialist_flag','gluco_record_prior_ntm','gluco_record_during_rx','dexa_during_rx','frag_frac_during_rx','risk_segment_prior_ntm','tscore_bucket_prior_ntm','adherent_flag','idn_indicator','injectable_experience_during_rx','frag_frac_prior_ntm']

In [294]:
for col in binary_cols:
    data[col] = le.fit_transform(data[col])

In [295]:
data = data.drop(['race','ethnicity'],axis =1)

In [296]:
test = data.set_index('ptid').groupby(grouping_dict,axis=1).sum()

test.columns = ['concomitancy_count','comorbidity_count','risk_factors_count']

test = test.reset_index()

test.head(5)

Unnamed: 0,ptid,concomitancy_count,comorbidity_count,risk_factors_count
0,P1240,5,0,1
1,P3183,7,4,2
2,P1642,8,1,1
3,P2634,4,0,2
4,P649,5,2,2


In [297]:
data = data.drop(combined_columns,axis=1)

data = pd.merge(data, test, on='ptid', how = 'inner')

data=data.drop(['count_of_risks'],axis=1)

In [301]:
multiple_class = ['age_bucket','ntm_speciality','ntm_speciality_bucket','region']

In [302]:
for column in multiple_class:
    data[column] = data[column].astype('category')
    data[column] = le.fit_transform(data[column])

In [303]:
data.dtypes

ptid                               object
persistency_flag                    int32
gender                              int32
region                              int32
age_bucket                          int64
ntm_speciality                      int64
ntm_specialist_flag                 int32
ntm_speciality_bucket               int64
gluco_record_prior_ntm              int32
gluco_record_during_rx              int32
dexa_freq_during_rx                 int64
dexa_during_rx                      int32
frag_frac_prior_ntm                 int32
frag_frac_during_rx                 int32
risk_segment_prior_ntm              int32
tscore_bucket_prior_ntm             int32
adherent_flag                       int32
idn_indicator                       int32
injectable_experience_during_rx     int32
concomitancy_count                  int32
comorbidity_count                   int32
risk_factors_count                  int32
dtype: object

In [304]:
data.describe()

Unnamed: 0,persistency_flag,gender,region,age_bucket,ntm_speciality,ntm_specialist_flag,ntm_speciality_bucket,gluco_record_prior_ntm,gluco_record_during_rx,dexa_freq_during_rx,...,frag_frac_prior_ntm,frag_frac_during_rx,risk_segment_prior_ntm,tscore_bucket_prior_ntm,adherent_flag,idn_indicator,injectable_experience_during_rx,concomitancy_count,comorbidity_count,risk_factors_count
count,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,...,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0,3424.0
mean,0.37646,0.056659,1.781834,1.674942,13.590537,0.412091,0.96729,0.235105,0.263435,3.016063,...,0.161215,0.121787,0.56396,0.569801,0.050526,0.746787,0.892523,4.098423,2.174942,1.239486
std,0.484568,0.231223,1.608482,1.22005,11.932789,0.492283,0.620126,0.424126,0.44056,8.136545,...,0.367783,0.327088,0.495965,0.495176,0.219059,0.434915,0.309764,2.779239,2.094203,1.094914
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,5.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0
50%,0.0,0.0,3.0,1.0,5.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,4.0,2.0,1.0
75%,1.0,0.0,3.0,3.0,30.0,1.0,1.0,0.0,1.0,3.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,6.0,3.0,2.0
max,1.0,1.0,4.0,3.0,35.0,1.0,2.0,1.0,1.0,146.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,13.0,10.0,7.0


In [306]:
features = data.copy()
target = features[['persistency_flag']]
features = features.drop(['ptid','persistency_flag'],axis=1)

In [307]:
fs = SelectKBest(score_func=mutual_info_classif,k='all')
fs.fit(features , target)

  y = column_or_1d(y, warn=True)


SelectKBest(k='all',
      score_func=<function mutual_info_classif at 0x000001332521A158>)

In [308]:
feature_columns=features.columns
li = []
for i in range(len(fs.scores_)):
    li.append([feature_columns[i],fs.scores_[i]])

In [309]:
fin = pd.DataFrame(li)
fin = fin.sort_values(1,ascending=False)
fin.columns =['columns','values']
selected_columns = fin['columns'][:10].tolist()
selected_columns

['dexa_freq_during_rx',
 'concomitancy_count',
 'dexa_during_rx',
 'comorbidity_count',
 'gluco_record_during_rx',
 'ntm_specialist_flag',
 'ntm_speciality',
 'injectable_experience_during_rx',
 'idn_indicator',
 'age_bucket']

In [310]:
feature_final = features[selected_columns]

In [320]:
from sklearn.model_selection import cross_val_score, KFold
num_folds = [3, 5, 7, 9, 10, 11, 13, 15, 17, 19, 21, 23]
import numpy as np
scores=[]

In [321]:
for fold in num_folds:
    kf = KFold(fold, shuffle=True, random_state=1)
    model = LogisticRegression()
    score = cross_val_score(model,feature_final,target,scoring='f1',cv=kf)
    score = np.mean(score)
    scores.append([fold,score])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [322]:
df = pd.DataFrame(scores)
df.sort_values(1,ascending=False)

Unnamed: 0,0,1
2,7,0.70908
7,15,0.708521
6,13,0.708268
3,9,0.708199
9,19,0.707053
11,23,0.706804
10,21,0.706654
5,11,0.706355
4,10,0.705905
0,3,0.705889


In [349]:
x_train, x_test,y_train, y_test = train_test_split(features,target,test_size = 0.2, random_state = 9)

In [350]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
log_y_pred = logreg.predict(x_test)
acc_logreg = accuracy_score(y_test,log_y_pred)
acc_logreg

  y = column_or_1d(y, warn=True)


0.8175182481751825

In [416]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

In [352]:
f1_score(y_test,log_y_pred)

0.7390396659707723

In [417]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, log_y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.7942298850574713

In [402]:
knn = KNeighborsClassifier(n_neighbors = 11)
knn.fit(x_train,y_train)
knn_y_pred = knn.predict(x_test)
acc_knn = accuracy_score(y_test,knn_y_pred)
acc_knn

  


0.8321167883211679

In [403]:
f1_score(y_test,knn_y_pred)

0.7578947368421052

In [418]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, knn_y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.8082758620689656

In [395]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators= 100)
random_forest.fit(x_train, y_train)
rf_y_pred = random_forest.predict(x_test)
acc_rf= accuracy_score(y_test,rf_y_pred)
acc_rf

  This is separate from the ipykernel package so we can avoid doing imports until


0.8131386861313868

In [420]:
f1_score(y_test, rf_y_pred)

0.7355371900826446

In [421]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, rf_y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.791632183908046

In [388]:
from sklearn.svm import SVC

In [391]:
svc = SVC()
svc.fit(x_train,y_train)
svc_y_pred = svc.predict(x_test)
acc_svc = accuracy_score(y_test,svc_y_pred)
acc_svc

  y = column_or_1d(y, warn=True)


0.8248175182481752

In [392]:
f1_score(y_test,svc_y_pred)

0.7435897435897435

In [423]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, svc_y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.7974252873563218

In [397]:
from sklearn.linear_model import SGDClassifier

In [398]:
sgd = SGDClassifier()
sgd.fit(x_train,y_train)
sgd_y_pred = sgd.predict(x_test)
acc_sgd = accuracy_score(y_test,sgd_y_pred)
acc_sgd

  y = column_or_1d(y, warn=True)


0.8116788321167884

In [399]:
f1_score(y_test,sgd_y_pred)

0.7435387673956262

In [424]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, sgd_y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.7981379310344828

In [438]:
neighbors = list(range(1, 50, 2))

# empty list that will hold cv scores
cv_scores = []

# perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append([k,scores.mean()])

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [439]:
cv_scores

[[1, 0.7232573995347719],
 [3, 0.7553862196197962],
 [5, 0.7739940108553248],
 [7, 0.7856755166974145],
 [9, 0.7849522740033689],
 [11, 0.791522953931713],
 [13, 0.7889748937194192],
 [15, 0.7918905911606642],
 [17, 0.7977340178070105],
 [19, 0.7991938718216091],
 [21, 0.8021162535761077],
 [23, 0.7999278094168606],
 [25, 0.7988289083179594],
 [27, 0.795179273281463],
 [29, 0.7959118740140638],
 [31, 0.795546910510414],
 [33, 0.7959092002887622],
 [35, 0.7970054276623619],
 [37, 0.7937194192668644],
 [39, 0.7951806101441137],
 [41, 0.7977340178070105],
 [43, 0.7944453356862117],
 [45, 0.7951779364188124],
 [47, 0.7951779364188123],
 [49, 0.7948143097778135]]

In [440]:
knn = KNeighborsClassifier(n_neighbors = 21)
knn.fit(x_train,y_train)
knn_y_pred = knn.predict(x_test)
acc_knn = accuracy_score(y_test,knn_y_pred)
acc_knn

  


0.8218978102189781

In [450]:
f1_score(y_test,knn_y_pred)

0.7426160337552741

In [477]:
knn = KNeighborsClassifier(n_neighbors = 21 , weights = 'distance',algorithm = ' ')
knn.fit(x_train,y_train)
knn_y_pred = knn.predict(x_test)
acc_knn = accuracy_score(y_test,knn_y_pred)
acc_knn

  


0.8204379562043795

In [478]:
f1_score(y_test,knn_y_pred)

0.7421383647798742