## Original dataset

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
drug = pd.read_excel('Healthcare_dataset_1.xls')
drug.shape

(3424, 69)

In [78]:
# Creating statistical insights about original df regarding Ntm_Speciality
drug_ntm_spec = drug[['Ptid', 'Ntm_Speciality']]
drug_ntm_spec2 = drug_ntm_spec.groupby(['Ntm_Speciality']).count()
drug_ntm_spec2 = drug_ntm_spec2.sort_values(by=['Ptid'], ascending=False)

In [5]:
drug_ntm_spec2['Ptid%'] = drug_ntm_spec2['Ptid'] / 3424 * 100

In [6]:
drug_ntm_spec2

Unnamed: 0_level_0,Ptid,Ptid%
Ntm_Speciality,Unnamed: 1_level_1,Unnamed: 2_level_1
GENERAL PRACTITIONER,1535,44.830607
RHEUMATOLOGY,604,17.640187
ENDOCRINOLOGY,458,13.376168
Unknown,310,9.053738
ONCOLOGY,225,6.571262
OBSTETRICS AND GYNECOLOGY,90,2.628505
UROLOGY,33,0.963785
ORTHOPEDIC SURGERY,30,0.876168
CARDIOLOGY,22,0.642523
PATHOLOGY,16,0.46729


In [7]:
drug_ntm_spec2.shape

(36, 2)

Observation no 1 - "unknown" should be distributed among 35 other specialities (knn=35?)

## Preparing data for KNN Imputation

In [7]:
drug.columns

Index(['Ptid', 'Persistency_Flag', 'Gender', 'Race', 'Ethnicity', 'Region',
       'Age_Bucket', 'Ntm_Speciality', 'Ntm_Specialist_Flag',
       'Ntm_Speciality_Bucket', 'Gluco_Record_Prior_Ntm',
       'Gluco_Record_During_Rx', 'Dexa_Freq_During_Rx', 'Dexa_During_Rx',
       'Frag_Frac_Prior_Ntm', 'Frag_Frac_During_Rx', 'Risk_Segment_Prior_Ntm',
       'Tscore_Bucket_Prior_Ntm', 'Risk_Segment_During_Rx',
       'Tscore_Bucket_During_Rx', 'Change_T_Score', 'Change_Risk_Segment',
       'Adherent_Flag', 'Idn_Indicator', 'Injectable_Experience_During_Rx',
       'Comorb_Encounter_For_Screening_For_Malignant_Neoplasms',
       'Comorb_Encounter_For_Immunization',
       'Comorb_Encntr_For_General_Exam_W_O_Complaint,_Susp_Or_Reprtd_Dx',
       'Comorb_Vitamin_D_Deficiency',
       'Comorb_Other_Joint_Disorder_Not_Elsewhere_Classified',
       'Comorb_Encntr_For_Oth_Sp_Exam_W_O_Complaint_Suspected_Or_Reprtd_Dx',
       'Comorb_Long_Term_Current_Drug_Therapy', 'Comorb_Dorsalgia',
       'Com

In [81]:
# My approach was to select only those features that are relevent to Ntm_Speciality, they all start like "Comorb*". 
# This approach should generate appropiate imputation by KNN

In [8]:
columns = ['Ptid', 'Ntm_Speciality', 'Comorb_Encounter_For_Screening_For_Malignant_Neoplasms',
       'Comorb_Encounter_For_Immunization',
       'Comorb_Encntr_For_General_Exam_W_O_Complaint,_Susp_Or_Reprtd_Dx',
       'Comorb_Vitamin_D_Deficiency',
       'Comorb_Other_Joint_Disorder_Not_Elsewhere_Classified',
       'Comorb_Encntr_For_Oth_Sp_Exam_W_O_Complaint_Suspected_Or_Reprtd_Dx',
       'Comorb_Long_Term_Current_Drug_Therapy', 'Comorb_Dorsalgia',
       'Comorb_Personal_History_Of_Other_Diseases_And_Conditions',
       'Comorb_Other_Disorders_Of_Bone_Density_And_Structure',
       'Comorb_Disorders_of_lipoprotein_metabolism_and_other_lipidemias',
       'Comorb_Osteoporosis_without_current_pathological_fracture',
       'Comorb_Personal_history_of_malignant_neoplasm',
       'Comorb_Gastro_esophageal_reflux_disease',
       'Concom_Cholesterol_And_Triglyceride_Regulating_Preparations',
       'Concom_Narcotics', 'Concom_Systemic_Corticosteroids_Plain',
       'Concom_Anti_Depressants_And_Mood_Stabilisers',
       'Concom_Fluoroquinolones', 'Concom_Cephalosporins',
       'Concom_Macrolides_And_Similar_Types',
       'Concom_Broad_Spectrum_Penicillins', 'Concom_Anaesthetics_General',
       'Concom_Viral_Vaccines']

In [9]:
drug_knn = drug[[i for i in columns]]

In [10]:
drug_knn.head()

Unnamed: 0,Ptid,Ntm_Speciality,Comorb_Encounter_For_Screening_For_Malignant_Neoplasms,Comorb_Encounter_For_Immunization,"Comorb_Encntr_For_General_Exam_W_O_Complaint,_Susp_Or_Reprtd_Dx",Comorb_Vitamin_D_Deficiency,Comorb_Other_Joint_Disorder_Not_Elsewhere_Classified,Comorb_Encntr_For_Oth_Sp_Exam_W_O_Complaint_Suspected_Or_Reprtd_Dx,Comorb_Long_Term_Current_Drug_Therapy,Comorb_Dorsalgia,...,Concom_Cholesterol_And_Triglyceride_Regulating_Preparations,Concom_Narcotics,Concom_Systemic_Corticosteroids_Plain,Concom_Anti_Depressants_And_Mood_Stabilisers,Concom_Fluoroquinolones,Concom_Cephalosporins,Concom_Macrolides_And_Similar_Types,Concom_Broad_Spectrum_Penicillins,Concom_Anaesthetics_General,Concom_Viral_Vaccines
0,P1,GENERAL PRACTITIONER,N,Y,Y,N,N,Y,N,Y,...,N,N,N,N,N,N,N,N,N,N
1,P2,GENERAL PRACTITIONER,N,N,Y,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
2,P3,GENERAL PRACTITIONER,Y,N,Y,N,N,N,N,N,...,Y,N,N,N,N,N,N,N,N,N
3,P4,GENERAL PRACTITIONER,N,Y,Y,N,Y,N,N,Y,...,N,Y,Y,N,N,N,N,N,N,Y
4,P5,GENERAL PRACTITIONER,Y,Y,Y,N,N,N,N,Y,...,N,Y,Y,Y,N,N,N,N,N,N


In [11]:
drug_knn.shape

(3424, 26)

In [12]:
columns_comorb = ['Comorb_Encounter_For_Screening_For_Malignant_Neoplasms',
       'Comorb_Encounter_For_Immunization',
       'Comorb_Encntr_For_General_Exam_W_O_Complaint,_Susp_Or_Reprtd_Dx',
       'Comorb_Vitamin_D_Deficiency',
       'Comorb_Other_Joint_Disorder_Not_Elsewhere_Classified',
       'Comorb_Encntr_For_Oth_Sp_Exam_W_O_Complaint_Suspected_Or_Reprtd_Dx',
       'Comorb_Long_Term_Current_Drug_Therapy', 'Comorb_Dorsalgia',
       'Comorb_Personal_History_Of_Other_Diseases_And_Conditions',
       'Comorb_Other_Disorders_Of_Bone_Density_And_Structure',
       'Comorb_Disorders_of_lipoprotein_metabolism_and_other_lipidemias',
       'Comorb_Osteoporosis_without_current_pathological_fracture',
       'Comorb_Personal_history_of_malignant_neoplasm',
       'Comorb_Gastro_esophageal_reflux_disease',
       'Concom_Cholesterol_And_Triglyceride_Regulating_Preparations',
       'Concom_Narcotics', 'Concom_Systemic_Corticosteroids_Plain',
       'Concom_Anti_Depressants_And_Mood_Stabilisers',
       'Concom_Fluoroquinolones', 'Concom_Cephalosporins',
       'Concom_Macrolides_And_Similar_Types',
       'Concom_Broad_Spectrum_Penicillins', 'Concom_Anaesthetics_General',
       'Concom_Viral_Vaccines']

In [82]:
# Changing "Comorb*" columns from categorial into numerical

In [39]:
for i in columns_comorb:
    drug_knn[i].replace('Y', 1)
    drug_knn[i].replace('N', 0)

In [36]:
drug_knn['Ntm_Speciality'].replace('Unknown', np.NaN)

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
3419    1.0
3420    NaN
3421    3.0
3422    NaN
3423    NaN
Name: Ntm_Speciality, Length: 3424, dtype: float64

In [31]:
drug_knn.head()

Unnamed: 0,Ptid,Ntm_Speciality,Comorb_Encounter_For_Screening_For_Malignant_Neoplasms,Comorb_Encounter_For_Immunization,"Comorb_Encntr_For_General_Exam_W_O_Complaint,_Susp_Or_Reprtd_Dx",Comorb_Vitamin_D_Deficiency,Comorb_Other_Joint_Disorder_Not_Elsewhere_Classified,Comorb_Encntr_For_Oth_Sp_Exam_W_O_Complaint_Suspected_Or_Reprtd_Dx,Comorb_Long_Term_Current_Drug_Therapy,Comorb_Dorsalgia,...,Concom_Cholesterol_And_Triglyceride_Regulating_Preparations,Concom_Narcotics,Concom_Systemic_Corticosteroids_Plain,Concom_Anti_Depressants_And_Mood_Stabilisers,Concom_Fluoroquinolones,Concom_Cephalosporins,Concom_Macrolides_And_Similar_Types,Concom_Broad_Spectrum_Penicillins,Concom_Anaesthetics_General,Concom_Viral_Vaccines
0,P1,1,N,Y,Y,N,N,Y,N,Y,...,N,N,N,N,N,N,N,N,N,N
1,P2,1,N,N,Y,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
2,P3,1,Y,N,Y,N,N,N,N,N,...,Y,N,N,N,N,N,N,N,N,N
3,P4,1,N,Y,Y,N,Y,N,N,Y,...,N,Y,Y,N,N,N,N,N,N,Y
4,P5,1,Y,Y,Y,N,N,N,N,Y,...,N,Y,Y,Y,N,N,N,N,N,N


In [83]:
# Changing "Ntm_Speciality" column from categorial into numerical with help of following dictionary

In [56]:
ntm_speciality = {"GENERAL PRACTITIONER": 1,
"RHEUMATOLOGY": 2,
"ENDOCRINOLOGY": 3,
"ONCOLOGY": 4,
"OBSTETRICS AND GYNECOLOGY": 5,
"UROLOGY": 6,
"ORTHOPEDIC SURGERY": 7,
"CARDIOLOGY": 8,
"PATHOLOGY": 9,
"HEMATOLOGY & ONCOLOGY": 10,
"OTOLARYNGOLOGY": 11,
"PEDIATRICS": 12,
"PHYSICAL MEDICINE AND REHABILITATION": 13,
"SURGERY AND SURGICAL SPECIALTIES": 14,
"PULMONARY MEDICINE": 15,
"PSYCHIATRY AND NEUROLOGY": 16,
"ORTHOPEDICS": 17,
"NEPHROLOGY": 18,
"GASTROENTEROLOGY": 19,
"TRANSPLANT SURGERY": 20,
"GERIATRIC MEDICINE": 21,
"PLASTIC SURGERY": 22,
"VASCULAR SURGERY": 23,
"HOSPICE AND PALLIATIVE MEDICINE": 24,
"PAIN MEDICINE": 25,
"CLINICAL NURSE SPECIALIST": 26,
"PODIATRY": 27,
"OPHTHALMOLOGY": 28,
"OCCUPATIONAL MEDICINE": 29,
"RADIOLOGY": 30,
"OBSTETRICS & OBSTETRICS & GYNECOLOGY & OBSTETRICS & GYNECOLOGY": 31,
"NUCLEAR MEDICINE": 32,
"NEUROLOGY": 33,
"EMERGENCY MEDICINE": 34,
"HOSPITAL MEDICINE": 35}

In [28]:
drug_knn['Ntm_Speciality'].replace(ntm_speciality.keys(), ntm_speciality.values())

0             1
1             1
2             1
3             1
4             1
         ...   
3419          1
3420    Unknown
3421          3
3422    Unknown
3423    Unknown
Name: Ntm_Speciality, Length: 3424, dtype: object

In [40]:
drug_knn.tail()

Unnamed: 0,Ptid,Ntm_Speciality,Comorb_Encounter_For_Screening_For_Malignant_Neoplasms,Comorb_Encounter_For_Immunization,"Comorb_Encntr_For_General_Exam_W_O_Complaint,_Susp_Or_Reprtd_Dx",Comorb_Vitamin_D_Deficiency,Comorb_Other_Joint_Disorder_Not_Elsewhere_Classified,Comorb_Encntr_For_Oth_Sp_Exam_W_O_Complaint_Suspected_Or_Reprtd_Dx,Comorb_Long_Term_Current_Drug_Therapy,Comorb_Dorsalgia,...,Concom_Cholesterol_And_Triglyceride_Regulating_Preparations,Concom_Narcotics,Concom_Systemic_Corticosteroids_Plain,Concom_Anti_Depressants_And_Mood_Stabilisers,Concom_Fluoroquinolones,Concom_Cephalosporins,Concom_Macrolides_And_Similar_Types,Concom_Broad_Spectrum_Penicillins,Concom_Anaesthetics_General,Concom_Viral_Vaccines
3419,P3420,1.0,1,0,1,1,0,0,0,1,...,1,1,1,1,1,0,0,0,0,0
3420,P3421,,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3421,P3422,3.0,1,0,0,1,1,0,0,0,...,1,1,0,0,0,0,0,0,1,0
3422,P3423,,1,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3423,P3424,,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [41]:
drug_knn.to_csv('drug_knn.xls')

## KNN Imputer with n_neighbors=5

In [40]:
drug_knn = pd.read_csv('drug_knn.xls', index_col=False)
drug_knn2 = drug_knn.drop(['Ptid', 'Unnamed: 0'], axis=1)

In [41]:
drug_knn2

Unnamed: 0,Ntm_Speciality,Comorb_Encounter_For_Screening_For_Malignant_Neoplasms,Comorb_Encounter_For_Immunization,"Comorb_Encntr_For_General_Exam_W_O_Complaint,_Susp_Or_Reprtd_Dx",Comorb_Vitamin_D_Deficiency,Comorb_Other_Joint_Disorder_Not_Elsewhere_Classified,Comorb_Encntr_For_Oth_Sp_Exam_W_O_Complaint_Suspected_Or_Reprtd_Dx,Comorb_Long_Term_Current_Drug_Therapy,Comorb_Dorsalgia,Comorb_Personal_History_Of_Other_Diseases_And_Conditions,...,Concom_Cholesterol_And_Triglyceride_Regulating_Preparations,Concom_Narcotics,Concom_Systemic_Corticosteroids_Plain,Concom_Anti_Depressants_And_Mood_Stabilisers,Concom_Fluoroquinolones,Concom_Cephalosporins,Concom_Macrolides_And_Similar_Types,Concom_Broad_Spectrum_Penicillins,Concom_Anaesthetics_General,Concom_Viral_Vaccines
0,1.0,0,1,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1.0,0,1,1,0,1,0,0,1,0,...,0,1,1,0,0,0,0,0,0,1
4,1.0,1,1,1,0,0,0,0,1,1,...,0,1,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419,1.0,1,0,1,1,0,0,0,1,0,...,1,1,1,1,1,0,0,0,0,0
3420,,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3421,3.0,1,0,0,1,1,0,0,0,0,...,1,1,0,0,0,0,0,0,1,0
3422,,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [85]:
# Setting up KNN imputer, fitting data and transformation

In [42]:
from sklearn.impute import KNNImputer

In [43]:
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

In [44]:
imputer.fit(drug_knn2)

KNNImputer()

In [84]:
# Transforming array back to dataframe

In [45]:
drug_knn2_trans = imputer.transform(drug_knn2)

In [46]:
drug_knn2_trans

array([[1. , 0. , 1. , ..., 0. , 0. , 0. ],
       [1. , 0. , 0. , ..., 0. , 0. , 0. ],
       [1. , 1. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [3. , 1. , 0. , ..., 0. , 1. , 0. ],
       [1.4, 1. , 0. , ..., 0. , 0. , 0. ],
       [4.8, 0. , 0. , ..., 0. , 0. , 0. ]])

In [47]:
drug_knn2_trans.shape

(3424, 25)

In [48]:
drug_knn3 = pd.DataFrame(drug_knn2_trans)

In [49]:
drug_knn3.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
3419,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3420,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3421,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3422,1.4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3423,4.8,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
drug_knn3.groupby([0]).count()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,19,20,21,22,23,24
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1547,1547,1547,1547,1547,1547,1547,1547,1547,1547,...,1547,1547,1547,1547,1547,1547,1547,1547,1547,1547
1.2,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
1.4,27,27,27,27,27,27,27,27,27,27,...,27,27,27,27,27,27,27,27,27,27
1.6,48,48,48,48,48,48,48,48,48,48,...,48,48,48,48,48,48,48,48,48,48
1.8,29,29,29,29,29,29,29,29,29,29,...,29,29,29,29,29,29,29,29,29,29
2.0,639,639,639,639,639,639,639,639,639,639,...,639,639,639,639,639,639,639,639,639,639
2.2,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
2.4,16,16,16,16,16,16,16,16,16,16,...,16,16,16,16,16,16,16,16,16,16
2.6,15,15,15,15,15,15,15,15,15,15,...,15,15,15,15,15,15,15,15,15,15
2.8,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9


In [52]:
decimals = 0    
drug_knn3[0] = drug_knn3[0].apply(lambda x: round(x, decimals))
drug_knn3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3420,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3421,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3422,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
drug_knn3[0] = drug_knn3[0].astype(int)
drug_knn3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419,1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3420,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3421,3,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3422,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
drug_knn3[0] = drug_knn3[0].replace(ntm_speciality.values(), ntm_speciality.keys())
drug_knn3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,GENERAL PRACTITIONER,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,GENERAL PRACTITIONER,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,GENERAL PRACTITIONER,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,GENERAL PRACTITIONER,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,GENERAL PRACTITIONER,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419,GENERAL PRACTITIONER,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3420,RHEUMATOLOGY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3421,ENDOCRINOLOGY,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3422,GENERAL PRACTITIONER,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
drug_knn3.rename(columns={0:'Ntm_Speciality'}, inplace=True)
drug_knn4 = drug_knn3['Ntm_Speciality']
drug_knn4

0            GENERAL PRACTITIONER
1            GENERAL PRACTITIONER
2            GENERAL PRACTITIONER
3            GENERAL PRACTITIONER
4            GENERAL PRACTITIONER
                  ...            
3419         GENERAL PRACTITIONER
3420                 RHEUMATOLOGY
3421                ENDOCRINOLOGY
3422         GENERAL PRACTITIONER
3423    OBSTETRICS AND GYNECOLOGY
Name: Ntm_Speciality, Length: 3424, dtype: object

In [63]:
orig = drug[['Ptid']]
drug_knn_5_final = orig.join(drug_knn4)
drug_knn_5_final

Unnamed: 0,Ptid,Ntm_Speciality
0,P1,GENERAL PRACTITIONER
1,P2,GENERAL PRACTITIONER
2,P3,GENERAL PRACTITIONER
3,P4,GENERAL PRACTITIONER
4,P5,GENERAL PRACTITIONER
...,...,...
3419,P3420,GENERAL PRACTITIONER
3420,P3421,RHEUMATOLOGY
3421,P3422,ENDOCRINOLOGY
3422,P3423,GENERAL PRACTITIONER


In [64]:
drug_knn_5_final2 = drug_knn_5_final.groupby(['Ntm_Speciality']).count()
drug_knn_5_final2 = drug_knn_5_final2.sort_values(by=['Ptid'], ascending=False)
drug_knn_5_final2['Ptid%'] = drug_knn_5_final2['Ptid'] / 3424 * 100
drug_knn_5_final2

Unnamed: 0_level_0,Ptid,Ptid%
Ntm_Speciality,Unnamed: 1_level_1,Unnamed: 2_level_1
GENERAL PRACTITIONER,1585,46.290888
RHEUMATOLOGY,752,21.962617
ENDOCRINOLOGY,520,15.186916
ONCOLOGY,254,7.418224
OBSTETRICS AND GYNECOLOGY,101,2.949766
UROLOGY,36,1.051402
ORTHOPEDIC SURGERY,34,0.992991
CARDIOLOGY,24,0.700935
PATHOLOGY,17,0.496495
HEMATOLOGY & ONCOLOGY,14,0.408879


In [65]:
drug_knn_5_final.to_csv('drug_knn_5_final.csv')

## KNN Imputer with n_neighbors=35

In [66]:
imputer2 = KNNImputer(n_neighbors=35, weights='uniform', metric='nan_euclidean')

In [67]:
imputer2.fit(drug_knn2)

KNNImputer(n_neighbors=35)

In [68]:
drug_knn35_trans = imputer2.transform(drug_knn2)

In [69]:
drug_knn35_trans

array([[1.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [3.        , 1.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [2.4       , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [2.77142857, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [70]:
drug_knn35_trans.shape

(3424, 25)

In [71]:
drug_knn35_final = pd.DataFrame(drug_knn35_trans)
drug_knn35_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.000000,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.000000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419,1.000000,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3420,2.085714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3421,3.000000,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3422,2.400000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
decimals = 0    
drug_knn35_final[0] = drug_knn35_final[0].apply(lambda x: round(x, decimals))
drug_knn35_final[0] = drug_knn35_final[0].astype(int)
drug_knn35_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419,1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3420,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3421,3,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3422,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
drug_knn35_final.rename(columns={0:'Ntm_Speciality'}, inplace=True)
drug_knn35_final = drug_knn35_final['Ntm_Speciality']
drug_knn_35_final = orig.join(drug_knn35_final)
drug_knn_35_final

Unnamed: 0,Ptid,Ntm_Speciality
0,P1,1
1,P2,1
2,P3,1
3,P4,1
4,P5,1
...,...,...
3419,P3420,1
3420,P3421,2
3421,P3422,3
3422,P3423,2


In [75]:
drug_knn_35_final["Ntm_Speciality"] = drug_knn_35_final['Ntm_Speciality'].replace(ntm_speciality.values(), ntm_speciality.keys())
drug_knn_35_final

Unnamed: 0,Ptid,Ntm_Speciality
0,P1,GENERAL PRACTITIONER
1,P2,GENERAL PRACTITIONER
2,P3,GENERAL PRACTITIONER
3,P4,GENERAL PRACTITIONER
4,P5,GENERAL PRACTITIONER
...,...,...
3419,P3420,GENERAL PRACTITIONER
3420,P3421,RHEUMATOLOGY
3421,P3422,ENDOCRINOLOGY
3422,P3423,RHEUMATOLOGY


In [76]:
drug_knn_35_final2 = drug_knn_35_final.groupby(['Ntm_Speciality']).count()
drug_knn_35_final2 = drug_knn_35_final2.sort_values(by=['Ptid'], ascending=False)
drug_knn_35_final2['Ptid%'] = drug_knn_35_final2['Ptid'] / 3424 * 100
drug_knn_35_final2

Unnamed: 0_level_0,Ptid,Ptid%
Ntm_Speciality,Unnamed: 1_level_1,Unnamed: 2_level_1
GENERAL PRACTITIONER,1538,44.918224
RHEUMATOLOGY,786,22.955607
ENDOCRINOLOGY,574,16.764019
ONCOLOGY,234,6.834112
OBSTETRICS AND GYNECOLOGY,90,2.628505
UROLOGY,33,0.963785
ORTHOPEDIC SURGERY,30,0.876168
CARDIOLOGY,22,0.642523
PATHOLOGY,16,0.46729
HEMATOLOGY & ONCOLOGY,14,0.408879


In [77]:
drug_knn_35_final.to_csv('drug_knn_35_final.csv')