In [1]:
#importing necessary libraries
import numpy as np
from sklearn import preprocessing, neighbors
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
#loading in data 
df = pd.read_csv("D:\code\School\D207\medical_clean.csv")

In [2]:
#dropping unwanted columns
df = df.drop(['CaseOrder', 'State', 'Customer_id', 'Interaction', 'UID', 'City', 'County', 'Zip',
             'Lat', 'Lng', 'TimeZone', 'Job', 'Full_meals_eaten', 'vitD_supp', 'Item1', 'Item2',
             'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8', 
                     'Population', 'Area', 'Children',  'Full_meals_eaten', 'CaseOrder'], axis =1)

In [3]:
#getting dummies for categorical variables
df = pd.concat([df, pd.get_dummies(df['Marital'], prefix='Marital', drop_first=False)], axis=1)
df = pd.concat([df, pd.get_dummies(df['Gender'], prefix='Gender', drop_first=False)], axis=1)
df = pd.concat([df, pd.get_dummies(df['Initial_admin'], prefix='Initial_admin', drop_first=False)], axis=1)
df = pd.concat([df, pd.get_dummies(df['Complication_risk'], prefix='Complication_risk', drop_first=False)], axis=1)
df = pd.concat([df, pd.get_dummies(df['Services'], prefix='Services', drop_first=False)], axis=1)

In [4]:
#dropping original columns now that these have been encoded
df.drop(['Marital'],axis=1, inplace=True)
df.drop(['Gender'],axis=1, inplace=True)
df.drop(['Initial_admin'],axis=1, inplace=True)
df.drop(['Complication_risk'],axis=1, inplace=True)
df.drop(['Services'],axis=1, inplace=True)

In [5]:
#these variables only have two possible values and can be encoded as 1 or 0
df['ReAdmis']=df.ReAdmis.map(dict(Yes=1, No=0))
df['HighBlood']=df.HighBlood.map(dict(Yes=1, No=0))
df['Stroke']=df.Stroke.map(dict(Yes=1, No=0))
df['Overweight']=df.Overweight.map(dict(Yes=1, No=0))
df['Arthritis']=df.Arthritis.map(dict(Yes=1, No=0))
df['Diabetes']=df.Diabetes.map(dict(Yes=1, No=0))
df['Hyperlipidemia']=df.Hyperlipidemia.map(dict(Yes=1, No=0))
df['BackPain']=df.BackPain.map(dict(Yes=1, No=0))
df['Anxiety']=df.Anxiety.map(dict(Yes=1, No=0))
df['Allergic_rhinitis']=df.Allergic_rhinitis.map(dict(Yes=1, No=0))
df['Reflux_esophagitis']=df.Reflux_esophagitis.map(dict(Yes=1, No=0))
df['Asthma']=df.Asthma.map(dict(Yes=1, No=0))
df['Soft_drink']=df.Soft_drink.map(dict(Yes=1, No=0))

In [6]:
#viewing updated dataframe
df.head()

Unnamed: 0,Age,Income,ReAdmis,VitD_levels,Doc_visits,Soft_drink,HighBlood,Stroke,Overweight,Arthritis,...,Initial_admin_Elective Admission,Initial_admin_Emergency Admission,Initial_admin_Observation Admission,Complication_risk_High,Complication_risk_Low,Complication_risk_Medium,Services_Blood Work,Services_CT Scan,Services_Intravenous,Services_MRI
0,53,86575.93,0,19.141466,6,0,1,0,0,1,...,0,1,0,0,0,1,1,0,0,0
1,51,46805.99,0,18.940352,4,0,1,0,1,0,...,0,1,0,1,0,0,0,0,1,0
2,53,14370.14,0,18.057507,4,0,1,0,1,0,...,1,0,0,0,0,1,1,0,0,0
3,78,39741.49,0,16.576858,4,0,0,1,0,1,...,1,0,0,0,0,1,1,0,0,0
4,22,1209.56,0,17.439069,5,1,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0


In [7]:
#separating dependent variable from independent variables
X_values = df.drop(['Complication_risk_High', 'Complication_risk_Low', 
                   'Complication_risk_Medium'], axis = 1)

In [8]:
#making a copy of the data to work on in case of error
data_copy = pd.read_csv("D:\code\School\D207\medical_clean.csv")

In [9]:
#identifying dependent variable
Y = data_copy['Complication_risk']

In [10]:
#writing prepared data to a csv
df.to_csv('prepped_med_data.csv')

In [11]:
#splitting data into testing and training data 
X_train, X_test, Y_train, Y_test = train_test_split(X_values,Y,test_size = .2)


In [12]:
#saving to csv
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
Y_train.to_csv('Y_train.csv')
Y_test.to_csv('Y_test.csv')

  after removing the cwd from sys.path.
  """


In [13]:
#converting dataframe to array 
X = np.array(X_values)
#converting dataframe to array
Y = np.array(Y)
#viewing array 
print(Y)

['Medium' 'High' 'Medium' ... 'High' 'Medium' 'Low']


In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
#scaling data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
#viewing training set
print(X_train)

[[ 0.51645171  0.23045243  1.31366815 ...  2.73068849 -0.67675078
  -0.19670091]
 [-0.83763662 -0.01109839 -0.76122725 ... -0.36620801  1.47764884
  -0.19670091]
 [-0.40239394 -0.52011536 -0.76122725 ... -0.36620801 -0.67675078
  -0.19670091]
 ...
 [-0.59583513 -0.40558608  1.31366815 ...  2.73068849 -0.67675078
  -0.19670091]
 [-0.54747484 -0.76289444 -0.76122725 ... -0.36620801 -0.67675078
  -0.19670091]
 [ 0.6615326  -0.9103863  -0.76122725 ... -0.36620801 -0.67675078
  -0.19670091]]


In [17]:
#creating classifier object
classifier = neighbors.KNeighborsClassifier()
classifier.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [18]:
#creating predictions for X 
y_pred = classifier.predict(X_test)

In [19]:
#viewing predictions
print(y_pred)

['High' 'Medium' 'Medium' ... 'Medium' 'High' 'Medium']


In [20]:
#getting precision score 
precision_score(y_pred, Y_test, average = 'macro')

0.3290537029583485

In [21]:
print(Y_test)

1204    Medium
793        Low
4967      High
7572    Medium
9048       Low
         ...  
3932      High
2943    Medium
9050      High
4146      High
980        Low
Name: Complication_risk, Length: 2000, dtype: object


In [22]:
#creating confusion matrix
cm = confusion_matrix(Y_test, y_pred, labels = ['High',
'Low', 'Medium'])
                                           

In [23]:
#viewing false positive and negative counts
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]

In [24]:
print(TN, FN, TP, FP)

297 190 54 112


In [25]:
#creating program to calculate area under the curve 
def multiclass_roc_auc_score(Y_test, y_pred, average="macro"):
    lb = preprocessing.LabelBinarizer()
    lb.fit(Y_test)
    Y_test = lb.transform(Y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(Y_test, y_pred)

In [26]:
#getting area under the curve 
multiclass_roc_auc_score(Y_test, y_pred)

0.49788968265592315