# Predicting Diabetes in a Patient

### Machine Learning

#### Random Forests, SVMs, AdaBoost, KNN, Gaussian, Perceptrons, CARTs

In [1]:
import numpy as np
import pandas as pd

In [2]:
 # machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

## Part 1



__ 1. Read __`diabetesdata.csv`__ file into a pandas dataframe. 
About the data: __

1. __TimesPregnant__: Number of times pregnant 
2. __glucoseLevel__: Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
3. __BP__: Diastolic blood pressure (mm Hg)  
5. __insulin__: 2-Hour serum insulin (mu U/ml) 
6. __BMI__: Body mass index (weight in kg/(height in m)^2) 
7. __pedigree__: Diabetes pedigree function 
8. __Age__: Age (years) 
9. __IsDiabetic__: 0 if not diabetic or 1 if diabetic) 








In [3]:
#Read data & print it
data = pd.read_csv("diabetesdata.csv") 
data.shape

(768, 8)

In [4]:
data.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
0,6,148.0,72,0,33.6,0.627,50.0,1
1,1,,66,0,26.6,0.351,31.0,0
2,8,183.0,64,0,23.3,0.672,,1
3,1,,66,94,28.1,0.167,21.0,0
4,0,137.0,40,168,43.1,2.288,33.0,1


In [5]:
data.describe()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
count,768.0,734.0,768.0,768.0,768.0,768.0,735.0,768.0
mean,3.845052,121.016349,69.105469,79.799479,31.992578,0.471876,33.353741,0.348958
std,3.369578,31.66024,19.355807,115.244002,7.88416,0.331329,11.772944,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,141.0,80.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,846.0,67.1,2.42,81.0,1.0


In [6]:
data.isnull().sum()

TimesPregnant     0
glucoseLevel     34
BP                0
insulin           0
BMI               0
Pedigree          0
Age              33
IsDiabetic        0
dtype: int64

**Percentage of NaN values in each column.**

In [7]:
NullsPerColumn = (data.isnull().sum()/768)

In [8]:
NullsPerColumn = NullsPerColumn.to_frame(name = 'Percentage Null')
NullsPerColumn

Unnamed: 0,Percentage Null
TimesPregnant,0.0
glucoseLevel,0.044271
BP,0.0
insulin,0.0
BMI,0.0
Pedigree,0.0
Age,0.042969
IsDiabetic,0.0


**TOTAL percent of ROWS with NaN values in the dataframe (make sure values are floats).**

In [9]:
PercentNull = data.isnull().sum(axis=1).sum()/data.shape[0] 
PercentNull


0.08723958333333333

**Split __`data`__  into  __`train_df`__ and __`test_df`__  with 15% test split.**


In [10]:
#split values
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data, test_size = 0.15, random_state = 100)


**Replace the Nan values in  __`train_df`__ and __`test_df`__  with the mean of EACH feature.**

In [11]:
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())



**Split __`train_df`__ & __`test_df`__   into  __`X_train`__, __`Y_train`__  and __`X_test`__, __`Y_test`__. __`Y_train`__  and __`Y_test`__ should only have the column we are trying to predict,  __`IsDiabetic`__.**

In [12]:
train_df.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
458,10,148.0,84,237,37.6,1.001,51.0,1
635,13,104.0,72,0,31.2,0.465,38.0,1
457,5,86.0,68,71,30.2,0.364,24.0,0
674,8,91.0,82,0,35.6,0.587,68.0,0
277,0,104.0,64,116,27.8,0.454,23.0,0


In [13]:
test_df.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
173,1,79.0,60,48,43.5,0.678,23.0,0
253,0,86.0,68,0,35.8,0.238,25.0,0
207,5,162.0,104,0,37.7,0.151,52.0,1
737,8,122.821429,72,0,32.0,0.6,42.0,0
191,9,123.0,70,94,33.1,0.374,40.0,0


In [14]:
X_train = train_df.iloc[:,:7]
Y_train = train_df['IsDiabetic']
X_test  = test_df.iloc[:,:7]
Y_test = test_df['IsDiabetic']


**7.Use this dataset to train perceptron, logistic regression and random forest models using 15% test split. Report training and test accuracies.**

In [15]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
logreg_train_acc = logreg.score(X_train, Y_train)
logreg_test_acc = logreg.score(X_test, Y_test)
print ('logreg training acuracy= ',logreg_train_acc)
print('logreg test accuracy= ',logreg_test_acc)

logreg training acuracy=  0.7745398773006135
logreg test accuracy=  0.75




In [16]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
perceptron_train_acc = perceptron.score(X_train, Y_train)
perceptron_test_acc = perceptron.score(X_test, Y_test)
print ('perceptron training acuracy= ',perceptron_train_acc)
print('perceptron test accuracy= ',perceptron_test_acc)


perceptron training acuracy=  0.38190184049079756
perceptron test accuracy=  0.3793103448275862


In [17]:
# Adaboost
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, Y_train)
adaboost_train_acc = adaboost.score(X_train, Y_train)
adaboost_test_acc = adaboost.score(X_test, Y_test)
print ('adaboost training acuracy= ',adaboost_train_acc)
print('adaboost test accuracy= ',adaboost_test_acc)

adaboost training acuracy=  0.8358895705521472
adaboost test accuracy=  0.7241379310344828


In [18]:
# Random Forest

random_forest = xgb.XGBClassifier(n_estimators = 70)
random_forest.fit(X_train, Y_train)
random_forest_train_acc = random_forest.score(X_train, Y_train)
random_forest_test_acc = random_forest.score(X_test, Y_test)
print('random_forest training acuracy= ',random_forest_train_acc)
print('random_forest test accuracy= ',random_forest_test_acc)

random_forest training acuracy=  0.8742331288343558
random_forest test accuracy=  0.7327586206896551


NOTE: Mean imputation might not always be the best way to ﬁll missing values. It depends on the data distribution, someitmes the mean is not really the best representation of the dataset in each column (if unbalanced or very skewed). Median can be used in this case. Other methods include predicting the missing value based on parameters from other features.


## Part 2


__1.Add columns __`BMI_band`__ & __`Pedigree_band`__  to  __`Data`__  by cutting __`BMI`__ & __`Pedigree`__ into 3 intervals. PRINT the first 5 rows of__`data`__.__


In [19]:
# YOUR CODE HERE
data['BMI_band'] = pd.cut(data['BMI'], 3) 
data['Pedigree_band'] = pd.cut(data['Pedigree'],3) 
data.head(5)
 



Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,BMI_band,Pedigree_band
0,6,148.0,72,0,33.6,0.627,50.0,1,"(22.367, 44.733]","(0.0757, 0.859]"
1,1,,66,0,26.6,0.351,31.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
2,8,183.0,64,0,23.3,0.672,,1,"(22.367, 44.733]","(0.0757, 0.859]"
3,1,,66,94,28.1,0.167,21.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
4,0,137.0,40,168,43.1,2.288,33.0,1,"(22.367, 44.733]","(1.639, 2.42]"


__1a. Print the category intervals for __`BMI_band`__ & __`Pedigree_band`.

In [20]:
print('BMI_Band_Interval:\n', data['BMI_band'].unique()) 
print()
print('Pedigree_Band_Interval:\n',data['Pedigree_band'].unique())
                                                        

BMI_Band_Interval:
 [(22.367, 44.733], (-0.0671, 22.367], (44.733, 67.1]]
Categories (3, interval[float64]): [(-0.0671, 22.367] < (22.367, 44.733] < (44.733, 67.1]]

Pedigree_Band_Interval:
 [(0.0757, 0.859], (1.639, 2.42], (0.859, 1.639]]
Categories (3, interval[float64]): [(0.0757, 0.859] < (0.859, 1.639] < (1.639, 2.42]]


__2. Group __`data`__ by __`Pedigree_band`__ & determine ratio of diabetic in each band.__

In [21]:
data.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,BMI_band,Pedigree_band
0,6,148.0,72,0,33.6,0.627,50.0,1,"(22.367, 44.733]","(0.0757, 0.859]"
1,1,,66,0,26.6,0.351,31.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
2,8,183.0,64,0,23.3,0.672,,1,"(22.367, 44.733]","(0.0757, 0.859]"
3,1,,66,94,28.1,0.167,21.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
4,0,137.0,40,168,43.1,2.288,33.0,1,"(22.367, 44.733]","(1.639, 2.42]"


In [22]:
df = data[['Pedigree_band','IsDiabetic']].groupby('Pedigree_band').sum()


In [23]:
 
pedigree_DiabeticRatio = (data[['Pedigree_band','IsDiabetic']].groupby('Pedigree_band').sum() / 
                          data[['Pedigree_band','IsDiabetic']].groupby('Pedigree_band').count())
pedigree_DiabeticRatio

Unnamed: 0_level_0,IsDiabetic
Pedigree_band,Unnamed: 1_level_1
"(0.0757, 0.859]",0.327007
"(0.859, 1.639]",0.540541
"(1.639, 2.42]",0.444444


__2a. Group  __`data`__ by __`BMI_band`__ & determine ratio of diabetic in each band.__

In [24]:

BMI_DiabeticRatio = (data[['BMI_band','IsDiabetic']].groupby('BMI_band').sum() /
                     data[['BMI_band','IsDiabetic']].groupby('BMI_band').count())
BMI_DiabeticRatio

Unnamed: 0_level_0,IsDiabetic
BMI_band,Unnamed: 1_level_1
"(-0.0671, 22.367]",0.039216
"(22.367, 44.733]",0.358297
"(44.733, 67.1]",0.611111


__3. Convert these features - 'BP','insulin','BMI' and 'Pedigree'   into categorical values by mapping different bands of values of these features to integers 0,1,2.__  
 


In [25]:
# YOUR CODE HERE
data['BP'] = pd.cut(data['BP'], 3, labels = [0,1,2])
data['insulin'] = pd.cut(data['insulin'], 3, labels = [0,1,2])
data['BMI'] = pd.cut(data['BMI'], 3, labels = [0,1,2])
data['Pedigree'] = pd.cut(data['Pedigree'], 3, labels = [0,1,2])
data.head()


Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,BMI_band,Pedigree_band
0,6,148.0,1,0,1,0,50.0,1,"(22.367, 44.733]","(0.0757, 0.859]"
1,1,,1,0,1,0,31.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
2,8,183.0,1,0,1,0,,1,"(22.367, 44.733]","(0.0757, 0.859]"
3,1,,1,0,1,0,21.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
4,0,137.0,0,0,1,2,33.0,1,"(22.367, 44.733]","(1.639, 2.42]"



__4. Instead of generalizing the NAN values with the mean of the feature we will try assigning values to NANs based on some hypothesis. For example for age we assume that the relation between BMI and BP of people is a reflection of the age group. We can have 9 types of BMI and BP relations and our aim is to find the median age of each of that group:__


| BMI | 0       | 1      | 2  |
|-----|-------------|------------- |----- |
| BP  |             |              |      |
| 0   | a00         | a01          | a02  |
| 1   | a10         | a11          | a12  |
| 2   | a20         | a21          |  a22 |




In [26]:
guess_ages = np.zeros((3,3),dtype=int) 
guess_ages

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [27]:
for i in range(3):
    for j in range(3):
        guess_df = data[(data['BMI']==i)&(data['BP']==j)]['Age'].dropna() 
        age_guess = guess_df.median()        
        guess_ages[i,j]= int(age_guess)        
        
print('The guess matrix for "Age" is:\n', guess_ages)


The guess matrix for "Age" is:
 [[24 25 55]
 [29 29 37]
 [33 32 31]]


In [28]:
guess_glucoselvl = np.zeros((3,3),dtype=int) 
guess_glucoselvl 

for i in range(3):
    for j in range(3):
        guess_df = data[(data['BP']==i)&(data['Pedigree']==j)]['glucoseLevel'].dropna() 
        glucoselvl_guess = guess_df.median()        
        guess_glucoselvl[i,j]= int(glucoselvl_guess)        
        
print() 
print('The guess matrix for "GlucoseLevel" is:\n', guess_glucoselvl ) 
print()


The guess matrix for "GlucoseLevel" is:
 [[115 127 137]
 [112 115 149]
 [133 129 159]]



In [29]:
for i in range(3): 
    for j in range(3): 
        data.loc[(data.Age.isnull())& (data.BP == i) & (data.BMI == j), 'Age'] = guess_ages[i,j] 
        
data['Age'] = data['Age'].astype(int) 

In [30]:
for i in range(3): 
    for j in range(3): 
        data.loc[(data.glucoseLevel.isnull())& (data.BP == i) & (data.Pedigree == j), 'glucoseLevel'] = guess_glucoselvl[i,j] 
        
data['glucoseLevel'] = data['glucoseLevel'].astype(int) 

In [31]:
data.isnull().sum()

TimesPregnant    0
glucoseLevel     0
BP               0
insulin          0
BMI              0
Pedigree         0
Age              0
IsDiabetic       0
BMI_band         0
Pedigree_band    0
dtype: int64



__5. Now, convert 'glucoseLevel' and 'Age' features also to categorical variables of 4 categories each. PRINT the head of __`data`__ __








In [32]:

data['glucoseLevel'] = pd.cut(data['glucoseLevel'], 4, labels = [0,1,2,3])
data['Age'] = pd.cut(data['Age'], 4, labels = [0,1,2,3])
 


In [33]:
data.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,BMI_band,Pedigree_band
0,6,2,1,0,1,0,1,1,"(22.367, 44.733]","(0.0757, 0.859]"
1,1,2,1,0,1,0,0,0,"(22.367, 44.733]","(0.0757, 0.859]"
2,8,3,1,0,1,0,0,1,"(22.367, 44.733]","(0.0757, 0.859]"
3,1,2,1,0,1,0,0,0,"(22.367, 44.733]","(0.0757, 0.859]"
4,0,2,0,0,1,2,0,1,"(22.367, 44.733]","(1.639, 2.42]"


__6.Use this dataset (with all features in categorical form) to train perceptron, logistic regression and random forest models using 15% test split. Report training and test accuracies.__


In [34]:
train_df, test_df = train_test_split(data, test_size = 0.15, random_state = 100)
X_train = train_df.iloc[:,:7]
Y_train = train_df['IsDiabetic']
X_test = test_df.iloc[:,:7]
Y_test = test_df['IsDiabetic']
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((652, 7), (652,), (116, 7), (116,))

In [35]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
logreg_train_acc = logreg.score(X_train, Y_train)
logreg_test_acc = logreg.score(X_test, Y_test)
print ('logreg training acuracy= ',logreg_train_acc)
print('logreg test accuracy= ',logreg_test_acc)

logreg training acuracy=  0.754601226993865
logreg test accuracy=  0.7155172413793104




In [36]:
# Perceptron
perceptron = Perceptron()
perceptron.fit(X_test, Y_test)
perceptron_train_acc = perceptron.score(X_train, Y_train)
perceptron_test_acc = perceptron.score(X_test, Y_test)
print ('perceptron training acuracy= ',perceptron_train_acc)
print('perceptron test accuracy= ',perceptron_test_acc)



perceptron training acuracy=  0.4325153374233129
perceptron test accuracy=  0.43103448275862066


In [37]:
# Random Forest
random_forest = RandomForestClassifier(n_estimators = 20)
random_forest.fit(X_train, Y_train)
random_forest_train_acc = random_forest.score(X_train, Y_train)
random_forest_test_acc = random_forest.score(X_test, Y_test)
print ('random_forest training acuracy= ',random_forest_train_acc)
print('random_forest test accuracy= ',random_forest_test_acc)

random_forest training acuracy=  0.8742331288343558
random_forest test accuracy=  0.6379310344827587
