In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random

In [72]:
# Load the diabetes dataset from the CSV file in the working directory.
data = pd.read_csv(
    'diabetes_prediction_dataset.csv',
    encoding="utf-8",
)

Using Pandas Data Directly

In [73]:
# Target vector: the diabetes label column.
y = data['diabetes']

# Feature matrix: the eight predictor columns. get_dummies one-hot encodes the
# string-valued columns (gender, smoking_history) and leaves the rest as-is.
X = pd.get_dummies(
    data[[
        'gender',
        'age',
        'hypertension',
        'heart_disease',
        'smoking_history',
        'bmi',
        'HbA1c_level',
        'blood_glucose_level',
    ]]
)

In [74]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [75]:
# Random forest with 1000 trees, seeded for repeatability.
# n_jobs=-1 builds the trees on all available CPU cores instead of one at a
# time; random_state still fixes the seeds the trees are grown from.
clf = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

In [77]:
# Predict a class label for every row of the held-out test set.
predictions = clf.predict(X_test)

In [78]:
# Score the predictions against the true test labels and display the result.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy

0.9703

In [79]:
# Scale the forest's feature importances by 1000 for readability and pair each
# one with its column name. The multiplication builds a new array, so the
# array returned by the classifier is never modified in place.
importance = clf.feature_importances_ * 1000
key_importance = list(zip(X.columns, importance))
key_importance

[('age', 106.44974003213441),
 ('hypertension', 14.798725975566057),
 ('heart_disease', 11.02640880588388),
 ('bmi', 123.76433323491548),
 ('HbA1c_level', 401.2993566541934),
 ('blood_glucose_level', 319.49957625352306),
 ('gender_Female', 2.373951905503614),
 ('gender_Male', 2.4366318970867726),
 ('gender_Other', 0.0024504405010010856),
 ('smoking_history_No Info', 4.6644139261790265),
 ('smoking_history_current', 2.4707781884640134),
 ('smoking_history_ever', 2.102018408403119),
 ('smoking_history_former', 3.582689663252006),
 ('smoking_history_never', 3.3184380830081457),
 ('smoking_history_not current', 2.2104865313862034)]

Converting to dictionaries --> undersampling the negative class (removing surplus diabetes-negative rows)

In [82]:
# Convert the frame to a list with one dict per row (column name -> value),
# so the rows can be edited, sorted and dropped with plain Python.
dataset = data.to_dict(orient='records')

In [58]:
# Binary-encode gender in place: 'Male' -> 1, every other value -> 0.
# NOTE(review): this gives 'Other' the same code as 'Female' - confirm that is intended.
# Records whose gender is already 0/1 are skipped, so re-running this cell is
# a no-op. Without the guard a second run would reset every encoded 1 to 0,
# because 1 != 'Male'.
for d in dataset:
    if d['gender'] in (0, 1):
        continue
    d['gender'] = 1 if d['gender'] == 'Male' else 0

In [59]:
# Undersample the negative class so that negatives : positives equals
# removeMultiplier : 1.
#
# The positive count is computed from the data rather than hard-coded.
# assumes the 'diabetes' label is stored as 1 (positive) / 0 (negative) - TODO confirm

removeMultiplier = 1 # ratio of negative data to positive data
numPositive = sum(1 for d in dataset if d['diabetes'] == 1)
numNegative = len(dataset) - numPositive

# Number of surplus rows to drop, clamped so it is never negative and never
# exceeds the number of negative rows available.
numToRemove = int(len(dataset) - (removeMultiplier * numPositive + numPositive))
numToRemove = max(0, min(numToRemove, numNegative))

# A dedicated, seeded generator makes the sampling repeatable on re-run and
# leaves the global `random` state untouched.
rng = random.Random(42)

# Shuffle first so that, after the stable sort below, the negatives sit at the
# tail in random order; trimming the tail then drops a random subset of them.
rng.shuffle(dataset)
dataset.sort(key = lambda d: d['diabetes'], reverse=True)

# Guarded because `del dataset[-0:]` would empty the whole list.
if numToRemove > 0:
    del dataset[-numToRemove:]

# Re-mix positives and negatives before the positional splits that follow.
rng.shuffle(dataset)

In [60]:
# Row counts for a 60% train / 20% validation partition; whatever is left
# over is the remaining share.
nTrain, nValid = (int(share * len(dataset)) for share in (0.6, 0.2))

In [61]:
# Cut the record list into three consecutive, non-overlapping slices:
# the first nTrain rows, the next nValid rows, and everything after that.
dataTrain, dataValid, dataTest = (
    dataset[:nTrain],
    dataset[nTrain:nTrain + nValid],
    dataset[nTrain + nValid:],
)

In [62]:
# Rebuild a DataFrame from the edited list of row dicts.
# NOTE(review): this rebinds `data`, replacing the frame read from the CSV.
# Any earlier cell that reads `data` (e.g. the to_dict conversion) sees this
# frame instead if it is re-run after this cell.
data = pd.DataFrame(dataset)

In [76]:
# Feature names: every field of a record except the label ('diabetes') and
# 'smoking_history', which is left out of this second model.
keys = list(dataset[0])
keys.remove('diabetes')
keys.remove('smoking_history')
keys

['gender',
 'age',
 'hypertension',
 'heart_disease',
 'bmi',
 'HbA1c_level',
 'blood_glucose_level']

In [64]:
# Target vector: the diabetes label column.
y = data['diabetes']

# Feature matrix: the selected columns, with get_dummies applied so that any
# non-numeric column would be one-hot encoded. Displayed as the cell output.
X = pd.get_dummies(data[keys])
X

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
0,1,44.0,0,0,31.82,3.5,126
1,1,9.0,0,0,16.40,4.5,100
2,0,57.0,0,0,27.32,6.6,220
3,0,32.0,1,0,21.22,4.5,130
4,1,74.0,1,1,35.83,6.5,130
...,...,...,...,...,...,...,...
16995,0,55.0,0,0,26.85,5.7,160
16996,1,10.0,0,0,16.71,6.6,155
16997,0,74.0,1,0,29.92,6.1,130
16998,1,74.0,0,0,33.14,6.5,200


In [65]:
# Refresh the feature-name list from the final column labels of X.
keys = X.columns.tolist()

In [66]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [67]:
# Random forest with 1000 trees, seeded for repeatability.
# n_jobs=-1 builds the trees on all available CPU cores instead of one at a
# time; random_state still fixes the seeds the trees are grown from.
clf = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

In [68]:
# Predict a class label for every row of the held-out test set.
predictions = clf.predict(X_test)

In [69]:
# Score the predictions against the true test labels and display the result.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy

0.9007843137254902

In [70]:
# Scale the forest's feature importances by 1000 for readability and pair each
# one with its feature name. The multiplication builds a new array, so the
# array returned by the classifier is never modified in place.
importance = clf.feature_importances_ * 1000
key_importance = list(zip(keys, importance))

In [71]:
# Display the (feature name, scaled importance) pairs built in the previous cell.
key_importance

[('gender', 7.722952854747654),
 ('age', 182.7988610379931),
 ('hypertension', 25.12040902260138),
 ('heart_disease', 14.585259710235125),
 ('bmi', 130.18705719324385),
 ('HbA1c_level', 352.5259225636268),
 ('blood_glucose_level', 287.0595376175522)]