In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the diabetes dataset
df = pd.read_csv('diabetic_data.csv')

In [3]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [4]:
df = df[["race","gender","age","metformin","insulin","diabetesMed"]]

In [5]:
df

Unnamed: 0,race,gender,age,metformin,insulin,diabetesMed
0,Caucasian,Female,[0-10),No,No,No
1,Caucasian,Female,[10-20),No,Up,Yes
2,AfricanAmerican,Female,[20-30),No,No,Yes
3,Caucasian,Male,[30-40),No,Up,Yes
4,Caucasian,Male,[40-50),No,Steady,Yes
...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),Steady,Down,Yes
101762,AfricanAmerican,Female,[80-90),No,Steady,Yes
101763,Caucasian,Male,[70-80),Steady,Down,Yes
101764,Caucasian,Female,[80-90),No,Up,Yes


In [6]:
df['age'] = df['age'].replace(['[0-10)', '[10-20)', '[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'], ['5', '15', '25', '35', '45', '55', '65', '75', '85', '95'])

In [7]:
df["age"].value_counts

<bound method IndexOpsMixin.value_counts of 0          5
1         15
2         25
3         35
4         45
          ..
101761    75
101762    85
101763    75
101764    85
101765    75
Name: age, Length: 101766, dtype: object>

In [8]:
df

Unnamed: 0,race,gender,age,metformin,insulin,diabetesMed
0,Caucasian,Female,5,No,No,No
1,Caucasian,Female,15,No,Up,Yes
2,AfricanAmerican,Female,25,No,No,Yes
3,Caucasian,Male,35,No,Up,Yes
4,Caucasian,Male,45,No,Steady,Yes
...,...,...,...,...,...,...
101761,AfricanAmerican,Male,75,Steady,Down,Yes
101762,AfricanAmerican,Female,85,No,Steady,Yes
101763,Caucasian,Male,75,Steady,Down,Yes
101764,Caucasian,Female,85,No,Up,Yes


In [9]:
for column in df:
    unique_values = df[column].unique() 
    print(f'Unique values in column {column}: {unique_values}')

Unique values in column race: ['Caucasian' 'AfricanAmerican' '?' 'Other' 'Asian' 'Hispanic']
Unique values in column gender: ['Female' 'Male' 'Unknown/Invalid']
Unique values in column age: ['5' '15' '25' '35' '45' '55' '65' '75' '85' '95']
Unique values in column metformin: ['No' 'Steady' 'Up' 'Down']
Unique values in column insulin: ['No' 'Up' 'Steady' 'Down']
Unique values in column diabetesMed: ['No' 'Yes']


In [10]:
df = df[~df['race'].isin(['?'])]
df = df[~df['gender'].isin(['Unknown/Invalid'])]

In [11]:
race = LabelEncoder()
df["race"] = race.fit_transform(df["race"])
gender = LabelEncoder()
df["gender"] = gender.fit_transform(df["gender"])
metformin = LabelEncoder()
df["metformin"] = metformin.fit_transform(df["metformin"])
insulin = LabelEncoder()
df["insulin"] = insulin.fit_transform(df["insulin"])
diabetesMed = LabelEncoder()
df["diabetesMed"] = diabetesMed.fit_transform(df["diabetesMed"])

In [12]:
df

Unnamed: 0,race,gender,age,metformin,insulin,diabetesMed
0,2,0,5,1,1,0
1,2,0,15,1,3,1
2,0,0,25,1,1,1
3,2,1,35,1,3,1
4,2,1,45,1,2,1
...,...,...,...,...,...,...
101761,0,1,75,2,0,1
101762,0,0,85,1,2,1
101763,2,1,75,2,0,1
101764,2,0,85,1,3,1


In [13]:
# Split the dataset into features and target variables
X = df.drop('diabetesMed', axis=1)
y = df['diabetesMed']

In [14]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [15]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=100)

In [16]:
# Train the classifier on the training set
clf.fit(X_train, y_train)

In [17]:
# Evaluate the classifier on the testing set
y_pred = clf.predict(X_test)

In [18]:
# Print the accuracy of the classifier
print('Accuracy:', clf.score(X_test, y_test))

Accuracy: 0.8621538770792502


In [19]:
X = np.array([["Caucasian","Male",46,"Steady","Steady"]])

In [20]:
X[:,0] = race.transform(X[:,0]) 
X[:,1] = gender.transform(X[:,1])
X[:,3] = metformin.transform(X[:,3]) 
X[:,4] = insulin.transform(X[:,4]) 
X = X.astype(float) 
X

array([[ 2.,  1., 46.,  2.,  2.]])

In [21]:
y_pred = clf.predict(X)



In [22]:
y_pred

array([1])