In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
# Question 2

# 2-a
cols=['Number of times pregnant', 
      'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
      'Diastolic blood pressure (mm Hg)', 
      'Triceps skinfold tickness (mm)', 
      '2-Houre serum insulin (mu U/ml)',
      'Body mass index (weight in kg/(height in m)^2)', 
      'Diabetes pedigree function', 
      'Age (years)', 
      'Class variable (0 or 1)'] 

df = pd.read_csv('Diabetes.csv', names=cols, header=None)
display(df)

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skinfold tickness (mm),2-Houre serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   Number of times pregnant                                                  768 non-null    int64  
 1   Plasma glucose concentration a 2 hours in an oral glucose tolerance test  768 non-null    int64  
 2   Diastolic blood pressure (mm Hg)                                          768 non-null    int64  
 3   Triceps skinfold tickness (mm)                                            768 non-null    int64  
 4   2-Houre serum insulin (mu U/ml)                                           768 non-null    int64  
 5   Body mass index (weight in kg/(height in m)^2)                            768 non-null    float64
 6   Diabetes pedigree function                                         

In [3]:
# 2-b
# find the number of null values and missing values in each column
for col in df.columns:
    x=pd.to_numeric(df[col], errors='coerce')
    idx = x.isna()
    seen = set()
    uniq = [x for x in df[idx][col] if x not in seen and not seen.add(x)]
    for j in df[idx][col]:
        df[col] = df[col].replace({j : np.NaN})
        
df.isna().sum()

Number of times pregnant                                                    0
Plasma glucose concentration a 2 hours in an oral glucose tolerance test    0
Diastolic blood pressure (mm Hg)                                            0
Triceps skinfold tickness (mm)                                              0
2-Houre serum insulin (mu U/ml)                                             0
Body mass index (weight in kg/(height in m)^2)                              0
Diabetes pedigree function                                                  0
Age (years)                                                                 0
Class variable (0 or 1)                                                     0
dtype: int64

In [4]:
# removing outlier records from dataset using z-score method
correct_record = []
df_zscore = df.copy()
for col in df_zscore.columns:
    if col != "Class variable (0 or 1)":
        df_zscore['_z'] = stats.zscore(df_zscore[col])
        qu = '_z' + ' <= 3 & ' + '_z' + ' >= -3'
        Without_Outliers = df_zscore.query(qu)
        df_zscore =  Without_Outliers.copy()
        df_zscore[col] = df_zscore['_z']
    
df_zscore.drop(['_z'], axis=1, inplace=True)
print('Outliers record number using z_score \t', df.shape[0] - df_zscore.shape[0])
display(df_zscore)

Outliers record number using z_score 	 83


Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skinfold tickness (mm),2-Houre serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,0.639947,0.852734,0.148818,0.868668,-0.719459,0.200687,0.510998,1.434609,1
1,-0.844885,-1.120621,-0.159937,0.487391,-0.719459,-0.760858,-0.370909,-0.190593,0
2,1.233880,1.949043,-0.262856,-1.355445,-0.719459,-1.214158,0.654787,-0.105056,1
3,-0.844885,-0.995328,-0.159937,0.106115,0.084023,-0.554812,-0.958847,-1.045963,0
5,0.342981,-0.149605,0.251737,-1.355445,-0.719459,-0.898221,-0.850206,-0.276130,0
...,...,...,...,...,...,...,...,...,...
763,1.827813,-0.619451,0.354655,1.694767,0.819123,0.104533,-0.946065,2.546589,0
764,-0.547919,0.038334,0.045900,0.360299,-0.719459,0.640251,-0.406057,-0.532741,0
765,0.342981,0.007011,0.148818,0.106115,0.237881,-0.815803,-0.709612,-0.276130,0
766,-0.844885,0.163626,-0.468693,-1.355445,-0.719459,-0.280085,-0.377299,1.177998,1


In [5]:
# 2-c
y = df_zscore[['Class variable (0 or 1)']]
x = df_zscore.drop(['Class variable (0 or 1)'], axis=1)

In [6]:
# 2-d
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 5)

In [7]:
display(y_train['Class variable (0 or 1)'].value_counts())
s = y_train['Class variable (0 or 1)'].value_counts().sum()
print( '===========> train <===========' )
print( 'Distribution Of 0 :' , round(y_train['Class variable (0 or 1)'].value_counts()[0] * 100 / s, 2), '%')
print( 'Distribution Of 1 :' , round(y_train['Class variable (0 or 1)'].value_counts()[1]  * 100 / s, 2), '%')

0    367
1    181
Name: Class variable (0 or 1), dtype: int64

Distribution Of 0 : 66.97 %
Distribution Of 1 : 33.03 %


In [8]:
display(y_test['Class variable (0 or 1)'].value_counts())
s = y_test['Class variable (0 or 1)'].value_counts().sum()
print( '===========> test <===========' )
print( 'Distribution Of 0 :' , round(y_test['Class variable (0 or 1)'].value_counts()[0] * 100 / s, 2), '%')
print( 'Distribution Of 1 :' , round(y_test['Class variable (0 or 1)'].value_counts()[1]  * 100 / s, 2), '%')

0    92
1    45
Name: Class variable (0 or 1), dtype: int64

Distribution Of 0 : 67.15 %
Distribution Of 1 : 32.85 %


In [9]:
# 2-e
rfy = np.ravel(y_train)
rf01 = RandomForestClassifier(n_estimators = 100, criterion = "entropy", max_depth = 3).fit(x_train, rfy)

In [10]:
# 2-f
score = rf01.score(x_test, y_test)
display(score)

0.781021897810219

In [11]:
# 2-g
for depth in range(1, 15):
    rfy = np.ravel(y_train)
    rf01 = RandomForestClassifier(n_estimators = 100, criterion = "entropy", max_depth = depth).fit(x_train, rfy)
    score = rf01.score(x_test, y_test)
    print("accuracy for max_depth ", depth, ":", score)

accuracy for max_depth  1 : 0.6715328467153284
accuracy for max_depth  2 : 0.7445255474452555
accuracy for max_depth  3 : 0.781021897810219
accuracy for max_depth  4 : 0.781021897810219
accuracy for max_depth  5 : 0.7956204379562044
accuracy for max_depth  6 : 0.781021897810219
accuracy for max_depth  7 : 0.8029197080291971
accuracy for max_depth  8 : 0.7956204379562044
accuracy for max_depth  9 : 0.7737226277372263
accuracy for max_depth  10 : 0.8029197080291971
accuracy for max_depth  11 : 0.8029197080291971
accuracy for max_depth  12 : 0.7956204379562044
accuracy for max_depth  13 : 0.8102189781021898
accuracy for max_depth  14 : 0.7956204379562044
