In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [7]:
# Load the dataset from a CSV located relative to the notebook's working
# directory, then show the frame as the cell's rich output.
DATA_PATH = 'diabetic_data.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [8]:
# List every column label of the loaded frame (the next cell keeps only a
# handful of them).
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [9]:
# Keep only the columns used for modelling: five predictors plus the
# 'diabetesMed' column that is later used as the target.
SELECTED_COLUMNS = [
    'race',
    'gender',
    'age',
    'metformin',
    'insulin',
    'diabetesMed',
]
df = df[SELECTED_COLUMNS]
df

Unnamed: 0,race,gender,age,metformin,insulin,diabetesMed
0,Caucasian,Female,[0-10),No,No,No
1,Caucasian,Female,[10-20),No,Up,Yes
2,AfricanAmerican,Female,[20-30),No,No,Yes
3,Caucasian,Male,[30-40),No,Up,Yes
4,Caucasian,Male,[40-50),No,Steady,Yes
...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),Steady,Down,Yes
101762,AfricanAmerican,Female,[80-90),No,Steady,Yes
101763,Caucasian,Male,[70-80),Steady,Down,Yes
101764,Caucasian,Female,[80-90),No,Up,Yes


In [14]:
# Print the distinct values of every column as a quick sanity check.
# NOTE: the output recorded below this cell was produced at execution count 14,
# i.e. after the cleaning cells (11 and 12) had already run -- it shows no '?'
# race and the age midpoints. On a fresh top-to-bottom run this cell would show
# the raw values instead.
for column, values in df.items():
    unique_values = values.unique()
    print(f'Unique values in column {column}: {unique_values}')

Unique values in column race: ['Caucasian' 'AfricanAmerican' 'Other' 'Asian' 'Hispanic']
Unique values in column gender: ['Female' 'Male']
Unique values in column age: ['5' '15' '25' '35' '45' '55' '65' '75' '85' '95']
Unique values in column metformin: ['No' 'Steady' 'Up' 'Down']
Unique values in column insulin: ['No' 'Up' 'Steady' 'Down']
Unique values in column diabetesMed: ['No' 'Yes']


In [11]:
# Drop rows whose race is the '?' placeholder or whose gender is
# 'Unknown/Invalid'. Both conditions are combined into one boolean mask; the
# result is the same as applying the two filters one after the other.
valid_rows = ~df['race'].isin(['?']) & ~df['gender'].isin(['Unknown/Invalid'])

# Take an explicit copy: later cells assign new values into columns of this
# frame, and assigning into a boolean-filtered result triggers pandas'
# SettingWithCopyWarning (on pandas versions without copy-on-write).
df = df[valid_rows].copy()

In [12]:
# Replace each bracketed age range with the midpoint of the range.
AGE_BUCKETS = ['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)',
               '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)']
AGE_MIDPOINTS = ['5', '15', '25', '35', '45', '55', '65', '75', '85', '95']

# - pd.to_numeric makes the column genuinely numeric instead of leaving digit
#   strings for the estimator to coerce implicitly; it raises if a value
#   outside the known buckets is present rather than passing it through.
# - assign() builds a new frame, so no column is written into a filtered frame
#   (avoids SettingWithCopyWarning).
# - Re-running the cell is harmless: already-numeric ages match no bucket.
df = df.assign(age=pd.to_numeric(df['age'].replace(AGE_BUCKETS, AGE_MIDPOINTS)))

In [13]:
# Display the cleaned frame; 'age' now holds the range midpoints rather than
# the bracketed ranges.
df

Unnamed: 0,race,gender,age,metformin,insulin,diabetesMed
0,Caucasian,Female,5,No,No,No
1,Caucasian,Female,15,No,Up,Yes
2,AfricanAmerican,Female,25,No,No,Yes
3,Caucasian,Male,35,No,Up,Yes
4,Caucasian,Male,45,No,Steady,Yes
...,...,...,...,...,...,...
101761,AfricanAmerican,Male,75,Steady,Down,Yes
101762,AfricanAmerican,Female,85,No,Steady,Yes
101763,Caucasian,Male,75,Steady,Down,Yes
101764,Caucasian,Female,85,No,Up,Yes


In [16]:
# Integer-encode every remaining string column with its own LabelEncoder.
# One encoder per column is kept so the code -> label mapping (classes_) stays
# available for each column.
CATEGORICAL_COLUMNS = ["race", "gender", "metformin", "insulin", "diabetesMed"]
encoders = {column: LabelEncoder() for column in CATEGORICAL_COLUMNS}

# assign() returns a new frame with the encoded columns in their original
# positions, so nothing is written into a filtered frame (avoids
# SettingWithCopyWarning).
df = df.assign(
    **{
        column: encoder.fit_transform(df[column])
        for column, encoder in encoders.items()
    }
)

# Keep the original per-column encoder names so existing references to them
# continue to work.
race, gender, metformin, insulin, diabetesMed = (
    encoders[column] for column in CATEGORICAL_COLUMNS
)

# NOTE(review): LabelEncoder numbers labels in sorted order (e.g. insulin:
# Down=0, No=1, Steady=2, Up=3). These predictors are nominal, and a linear
# model reads the codes as magnitudes. One-hot encoding would likely suit the
# predictors better, but it changes the feature matrix, so it is not done here.

df

Unnamed: 0,race,gender,age,metformin,insulin,diabetesMed
0,2,0,5,1,1,0
1,2,0,15,1,3,1
2,0,0,25,1,1,1
3,2,1,35,1,3,1
4,2,1,45,1,2,1
...,...,...,...,...,...,...
101761,0,1,75,2,0,1
101762,0,0,85,1,2,1
101763,2,1,75,2,0,1
101764,2,0,85,1,3,1


In [17]:
# Separate the target column from the predictors: y is the encoded
# 'diabetesMed' column, X is every other column of the frame.
y = df['diabetesMed']
X = df.drop(columns=['diabetesMed'])

In [18]:
# Hold out 20% of the rows for evaluation.
# - random_state pins the shuffle so the split, and therefore the reported
#   metrics, are reproducible across kernel restarts.
# - stratify=y keeps the class proportions of the target the same in the train
#   and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [20]:
# Fit a logistic-regression classifier on the training partition.
# max_iter is raised to 1000 -- presumably to give the lbfgs solver enough
# iterations to converge; confirm if tuning.
lr = LogisticRegression(max_iter=1000, solver='lbfgs')
lr.fit(X=X_train, y=y_train)

In [22]:
# Predict on the held-out rows and score the predictions.
y_pred = lr.predict(X_test)

# Each scorer is called as scorer(y_true, y_pred) with its default settings.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric rounded to two decimals, in the same order as computed.
report = (
    ("Accuracy: {:.2f}", accuracy),
    ("Precision: {:.2f}", precision),
    ("Recall: {:.2f}", recall),
    ("F1 Score: {:.2f}", f1),
)
for template, value in report:
    print(template.format(value))

Accuracy: 0.66
Precision: 0.74
Recall: 0.86
F1 Score: 0.80
