In [1]:
import numpy as np
import pandas as pd


In [2]:
 # machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier


## Part 1


**Read __`diabetesdata.csv`__ file into a pandas dataframe.**
About the data: __

1. __TimesPregnant__: Number of times pregnant 
2. __glucoseLevel__: Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
3. __BP__: Diastolic blood pressure (mm Hg)  
5. __insulin__: 2-Hour serum insulin (mu U/ml) 
6. __BMI__: Body mass index (weight in kg/(height in m)^2) 
7. __pedigree__: Diabetes pedigree function 
8. __Age__: Age (years) 
9. __IsDiabetic__: 0 if not diabetic or 1 if diabetic) 








In [3]:
#Read data & print it
data = pd.read_csv('diabetesdata.csv')
print(data.head())

   TimesPregnant  glucoseLevel  BP  insulin   BMI  Pedigree   Age  IsDiabetic
0              6         148.0  72        0  33.6     0.627  50.0           1
1              1           NaN  66        0  26.6     0.351  31.0           0
2              8         183.0  64        0  23.3     0.672   NaN           1
3              1           NaN  66       94  28.1     0.167  21.0           0
4              0         137.0  40      168  43.1     2.288  33.0           1


**Calculate the percentage of NaN values in each column.**

In [4]:
# NullsPerColumn is a dataframe that includes a column called 'Percentage Null'. The values are between 0 and 1.

percentage_NaN= data.isnull().sum() / len(data)

NullsPerColumn = pd.DataFrame({'column_name': data.columns,
                                 'Percentage Null': percentage_NaN})
NullsPerColumn

Unnamed: 0,column_name,Percentage Null
TimesPregnant,TimesPregnant,0.0
glucoseLevel,glucoseLevel,0.044271
BP,BP,0.0
insulin,insulin,0.0
BMI,BMI,0.0
Pedigree,Pedigree,0.0
Age,Age,0.042969
IsDiabetic,IsDiabetic,0.0


**Calculate the TOTAL percent of ROWS with NaN values in the dataframe.**

In [5]:
# Return values between 0 and 1. 

a = data.isnull().sum(axis=1).tolist()
PercentNull = (len(a) - a.count(0))  / len(a)
PercentNull

0.08333333333333333

**Split __`data`__  into  __`train_df`__ and __`test_df`__  with 15% test split.**


In [6]:
#split values
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data, test_size=0.15, random_state=100)


**Replace the Nan values in  __`train_df`__ and __`test_df`__  with the mean of EACH feature.**

In [7]:
train_df = train_df.fillna(train_df.mean(axis=0))
test_df =test_df.fillna(test_df.mean(axis=0))

train_df.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
458,10,148.0,84,237,37.6,1.001,51.0,1
635,13,104.0,72,0,31.2,0.465,38.0,1
457,5,86.0,68,71,30.2,0.364,24.0,0
674,8,91.0,82,0,35.6,0.587,68.0,0
277,0,104.0,64,116,27.8,0.454,23.0,0


**Split __`train_df`__ & __`test_df`__   into  __`X_train`__, __`Y_train`__  and __`X_test`__, __`Y_test`__.**

In [8]:
X_train = train_df.drop(['IsDiabetic'], axis=1)
Y_train = train_df.loc[:, train_df.columns == 'IsDiabetic'].values.ravel()
X_test  = test_df.drop(['IsDiabetic'], axis=1)
Y_test = test_df.loc[:, test_df.columns == 'IsDiabetic'].values.ravel()
[X_train.shape, Y_train.shape, X_test.shape,Y_test.shape]

[(652, 7), (652,), (116, 7), (116,)]

**Use this dataset to train perceptron, logistic regression and random forest models using 15% test split. Report training and test accuracies.**

In [9]:
# Logistic Regression
from sklearn import linear_model
logreg = linear_model.LogisticRegression(solver = 'newton-cg', multi_class='ovr') 
# use newton-cg solver and 'ovr' for multi_class
logreg.fit(X_train, Y_train)
logreg_train_acc = logreg.score(X_train,Y_train)
logreg_test_acc = logreg.score(X_test,Y_test)
print ('logreg training acuracy= ',logreg_train_acc)
print('logreg test accuracy= ',logreg_test_acc)

logreg training acuracy=  0.7806748466257669
logreg test accuracy=  0.75


In [10]:
# Perceptron

perceptron = linear_model.Perceptron()
perceptron.fit(X_train, Y_train)
perceptron_train_acc = perceptron.score(X_train, Y_train)
perceptron_test_acc = perceptron.score(X_test, Y_test)
print ('perceptron training acuracy= ',perceptron_train_acc)
print('perceptron test accuracy= ',perceptron_test_acc)


perceptron training acuracy=  0.38190184049079756
perceptron test accuracy=  0.3793103448275862


In [11]:
# Adaboost
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier()
adaboost.fit(X_train, Y_train)
adaboost_train_acc = adaboost.score(X_train, Y_train)
adaboost_test_acc = adaboost.score(X_test, Y_test)
print ('adaboost training acuracy= ',adaboost_train_acc)
print('adaboost test accuracy= ',adaboost_test_acc)

adaboost training acuracy=  0.8358895705521472
adaboost test accuracy=  0.7241379310344828


In [12]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, Y_train)
random_forest_train_acc = random_forest.score(X_train, Y_train)
random_forest_test_acc = random_forest.score(X_train, Y_train)
print('random_forest training acuracy= ',random_forest_train_acc)
print('random_forest test accuracy= ',random_forest_test_acc)

random_forest training acuracy=  1.0
random_forest test accuracy=  1.0


## Part 2


**Add columns __`BMI_band`__ & __`Pedigree_band`__  to  a new dataframe called __`data2`__  by cutting __`BMI`__ & __`Pedigree`__ into 3 intervals. PRINT the first 5 rows of __`data2`__.**


In [13]:
# YOUR CODE HERE
data2=pd.DataFrame()
data2['BMI_band']= pd.cut(data['BMI'],3)
data2['Pedigree_band']=pd.cut(data['Pedigree'],3)
data2.head(5)

Unnamed: 0,BMI_band,Pedigree_band
0,"(22.367, 44.733]","(0.0757, 0.859]"
1,"(22.367, 44.733]","(0.0757, 0.859]"
2,"(22.367, 44.733]","(0.0757, 0.859]"
3,"(22.367, 44.733]","(0.0757, 0.859]"
4,"(22.367, 44.733]","(1.639, 2.42]"


__Print the category intervals for __`BMI_band`__ & __`Pedigree_band`.

In [14]:
print( 'BMI_Band_Interval:', data2['BMI_band'].unique())

print('Pedigree_Band_Interval:', data2['Pedigree_band'].unique() )

                                                        

BMI_Band_Interval: [(22.367, 44.733], (-0.0671, 22.367], (44.733, 67.1]]
Categories (3, interval[float64]): [(-0.0671, 22.367] < (22.367, 44.733] < (44.733, 67.1]]
Pedigree_Band_Interval: [(0.0757, 0.859], (1.639, 2.42], (0.859, 1.639]]
Categories (3, interval[float64]): [(0.0757, 0.859] < (0.859, 1.639] < (1.639, 2.42]]


__Group __`data`__ by __`Pedigree_band`__ & determine ratio of diabetic in each band.__

In [15]:
# YOUR CODE HERE
data['Pedigree_band']=data2['Pedigree_band']
#pedigree_diabratio = data[data.IsDiabetic == 1].groupby(['Pedigree_band']).count()/data.groupby(['Pedigree_band']).count()
#pedigree_DiabeticRatio = pedigree_diabratio['IsDiabetic']
#pedigree_DiabeticRatio

pedigree_DiabeticRatio = (
    data[['Pedigree_band', 'IsDiabetic']]
    .groupby(['Pedigree_band'], as_index=False)
    .mean()
    .sort_values(by='Pedigree_band', ascending=True)
)
pedigree_DiabeticRatio

Unnamed: 0,Pedigree_band,IsDiabetic
0,"(0.0757, 0.859]",0.327007
1,"(0.859, 1.639]",0.540541
2,"(1.639, 2.42]",0.444444


__Group  __`data`__ by __`BMI_band`__ & determine ratio of diabetic in each band.__

In [16]:
# YOUR CODE HERE
data['BMI_band']=data2['BMI_band']
#BMI_diabratio = data[data.IsDiabetic == 1].groupby(['BMI_band']).count()/data.groupby(['BMI_band']).count()
#BMI_DiabeticRatio = BMI_diabratio['IsDiabetic']
#BMI_DiabeticRatio

BMI_DiabeticRatio = (
    data[["BMI_band", "IsDiabetic"]]
    .groupby(["BMI_band"], as_index=False)
    .mean()
    .sort_values(by="BMI_band", ascending=True)
)
BMI_DiabeticRatio

Unnamed: 0,BMI_band,IsDiabetic
0,"(-0.0671, 22.367]",0.039216
1,"(22.367, 44.733]",0.358297
2,"(44.733, 67.1]",0.611111


__Convert these features - 'BP','insulin','BMI' and 'Pedigree'   into categorical values by mapping different bands of values of these features to integers 0,1,2 in a dataframe called `data3`.__  
 




In [17]:


data3=pd.DataFrame()
columns_to_segment = ["BP", "insulin", "BMI", "Pedigree"]
for column in columns_to_segment:
    data3[column] = pd.cut(data[column], 3, labels=[0, 1, 2])
    
data3.head()

Unnamed: 0,BP,insulin,BMI,Pedigree
0,1,0,1,0
1,1,0,1,0
2,1,0,1,0
3,1,0,1,0
4,0,0,1,2


In [18]:
data['BMI']=data3['BMI']
data['BP']=data3['BP']
data['Pedigree']=data3['Pedigree']
data['insulin']=data3['insulin']
data

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,Pedigree_band,BMI_band
0,6,148.0,1,0,1,0,50.0,1,"(0.0757, 0.859]","(22.367, 44.733]"
1,1,,1,0,1,0,31.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
2,8,183.0,1,0,1,0,,1,"(0.0757, 0.859]","(22.367, 44.733]"
3,1,,1,0,1,0,21.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
4,0,137.0,0,0,1,2,33.0,1,"(1.639, 2.42]","(22.367, 44.733]"
...,...,...,...,...,...,...,...,...,...,...
763,10,101.0,1,0,1,0,63.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
764,2,122.0,1,0,1,0,27.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
765,5,121.0,1,0,1,0,30.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
766,1,126.0,1,0,1,0,47.0,1,"(0.0757, 0.859]","(22.367, 44.733]"



__Now I consider the original dataset again, instead of generalizing the NAN values with the mean of the feature I will try assigning values to NANs based on some hypothesis. For example for age I assume that the relation between BMI and BP of people is a reflection of the age group. I can have 9 types of BMI and BP relations and my aim is to find the median age of each of that group:__

My Age guess matrix will look like this:  

| BMI | 0       | 1      | 2  |
|-----|-------------|------------- |----- |
| BP  |             |              |      |
| 0   | a00         | a01          | a02  |
| 1   | a10         | a11          | a12  |
| 2   | a20         | a21          |  a22 |


__I Create a guess_matrix  for NaN values of *'Age'* ( using 'BMI' and 'BP')  and  *'glucoseLevel'*  (using 'BP' and 'Pedigree') for the given dataset and I assign values accordingly to the NaNs in 'Age' or *'glucoseLevel'* .__




In [19]:


guess_matrix_age = np.zeros((3, 3), dtype=float)

for i in range(0, 3):
    for j in range(0,3):
         guess_age_df = data[(data['BMI'] == i)&(data['BP'] == j)]['Age'].dropna()
         age_guess = guess_age_df.median()
         guess_matrix_age[i,j] = float(age_guess)
            
print("Guess Matrix for Age:\n", guess_matrix_age)

print ('\nAssigning age values to NAN age values in the dataset...')
    
for i in range(0, 3):
    for j in range(0, 3):
        data.loc[(data.Age.isnull()) & (data.BP == i) & (data.BMI == j),'Age'] = guess_matrix_age[i,j]
                    



guess_matrix_glucoseLevel = np.zeros((3, 3), dtype=float)

for i in range(0, 3):
    for j in range(0,3):
         guess_glucLevel_df = data[(data['BP'] == i)&(data['Pedigree'] == j)]['glucoseLevel'].dropna()
         glucLevel_guess = guess_glucLevel_df.median()
         guess_matrix_glucoseLevel[i,j] = float(glucLevel_guess)
            
print("Guess Matrix for Glucose Level:\n", guess_matrix_glucoseLevel)

print ('\nAssigning Glucose Level values to NAN Glucose Level values in the dataset...')
    
for i in range(0, 3):
    for j in range(0, 3):
        data.loc[(data.glucoseLevel.isnull()) & (data.BP == i) & (data.Pedigree == j),'glucoseLevel'] = guess_matrix_glucoseLevel[i,j]
                    



Guess Matrix for Age:
 [[24.5 25.  55.5]
 [29.5 29.  37. ]
 [33.  32.  31. ]]

Assigning age values to NAN age values in the dataset...
Guess Matrix for Glucose Level:
 [[115.  127.5 137. ]
 [112.  115.5 149. ]
 [133.  129.5 159.5]]

Assigning Glucose Level values to NAN Glucose Level values in the dataset...



**Now, convert 'glucoseLevel' and 'Age' features also to categorical variables of 4 categories each in a dataframe called `data4`. PRINT the head of `data4`**








In [20]:


data4 = pd.DataFrame()
data4['glucoseLevel']=pd.cut(data['glucoseLevel'],4, labels=[0,1,2,3])
data4['Age']=pd.cut(data['Age'],4,labels=[0,1,2,3])

print(data4.head())

  glucoseLevel Age
0            2   1
1            2   0
2            3   0
3            2   0
4            2   0


In [21]:
data


Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,Pedigree_band,BMI_band
0,6,148.0,1,0,1,0,50.0,1,"(0.0757, 0.859]","(22.367, 44.733]"
1,1,112.0,1,0,1,0,31.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
2,8,183.0,1,0,1,0,29.0,1,"(0.0757, 0.859]","(22.367, 44.733]"
3,1,112.0,1,0,1,0,21.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
4,0,137.0,0,0,1,2,33.0,1,"(1.639, 2.42]","(22.367, 44.733]"
...,...,...,...,...,...,...,...,...,...,...
763,10,101.0,1,0,1,0,63.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
764,2,122.0,1,0,1,0,27.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
765,5,121.0,1,0,1,0,30.0,0,"(0.0757, 0.859]","(22.367, 44.733]"
766,1,126.0,1,0,1,0,47.0,1,"(0.0757, 0.859]","(22.367, 44.733]"


In [22]:

data['glucoseLevel']=data4['glucoseLevel']
data['Age']=data4['Age']
data= data.drop(['Pedigree_band'], axis=1)
data= data.drop(['BMI_band'], axis=1)
data

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
0,6,2,1,0,1,0,1,1
1,1,2,1,0,1,0,0,0
2,8,3,1,0,1,0,0,1
3,1,2,1,0,1,0,0,0
4,0,2,0,0,1,2,0,1
...,...,...,...,...,...,...,...,...
763,10,2,1,0,1,0,2,0
764,2,2,1,0,1,0,0,0
765,5,2,1,0,1,0,0,0
766,1,2,1,0,1,0,1,1


__Use this dataset (with all features in categorical form) to train perceptron, logistic regression and random forest models using 15% test split. Report training and test accuracies.__


In [23]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(data, test_size=0.15, random_state=100)
X_train = train_df.drop(['IsDiabetic'], axis=1)
Y_train = train_df.loc[:, train_df.columns == 'IsDiabetic'].values.ravel()
X_test  = test_df.drop(['IsDiabetic'], axis=1)
Y_test = test_df.loc[:, test_df.columns == 'IsDiabetic'].values.ravel()

X_train.shape, Y_train.shape, X_test.shape

((652, 7), (652,), (116, 7))

In [24]:
# Logistic Regression

from sklearn import linear_model
logreg = linear_model.LogisticRegression(solver = 'newton-cg', multi_class='ovr') 
logreg.fit(X_train, Y_train)
logreg_train_acc = logreg.score(X_train,Y_train)
logreg_test_acc = logreg.score(X_test,Y_test)
print ('logreg training acuracy= ',logreg_train_acc)
print('logreg test accuracy= ',logreg_test_acc)

logreg training acuracy=  0.7530674846625767
logreg test accuracy=  0.7327586206896551


In [25]:
# Perceptron

perceptron = linear_model.Perceptron()
perceptron.fit(X_train, Y_train)
perceptron_train_acc = perceptron.score(X_train, Y_train)
perceptron_test_acc = perceptron.score(X_test, Y_test)
print ('perceptron training acuracy= ',perceptron_train_acc)
print('perceptron test accuracy= ',perceptron_test_acc)

perceptron training acuracy=  0.5812883435582822
perceptron test accuracy=  0.6120689655172413


In [26]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, Y_train)
random_forest_train_acc = random_forest.score(X_train, Y_train)
random_forest_test_acc = random_forest.score(X_train, Y_train)
print('random_forest training acuracy= ',random_forest_train_acc)
print('random_forest test accuracy= ',random_forest_test_acc)

random_forest training acuracy=  0.8773006134969326
random_forest test accuracy=  0.8773006134969326
