# About the data 

- number of times pregnant,
- plasma glucose concentration at 2 hours in an oral glucose tolerance test,
- diastolic blood pressure (mmHg),
- triceps skin fold thickness (mm),
- 2-hour serum insulin (mu U/ml),
- body mass index (weight in kg/(height in m2)),
- diabetes pedigree function, age (years) and
- a test whether the patient showed signs of diabetes (coded zero if negative, one if positive).

In [32]:
import pandas as pd

In [33]:
df = pd.read_csv('Diabetes.csv')

In [34]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [35]:
df.shape

(768, 9)

In [36]:
df.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [37]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [38]:
df = df[(df['Glucose']!=0)]

In [39]:
df = df[(df['BMI']!=0)]

In [40]:
df.shape

(752, 9)

**Insulin**: It is very rare for a person to have zero insulin.

In [41]:
df[df['Insulin']==0].shape

(360, 9)

In [42]:
df[df['BloodPressure']==0].shape

(28, 9)

In [43]:
df[df['SkinThickness']==0].shape

(218, 9)

In [44]:
df = df[(df['SkinThickness']!=0) | (df['BloodPressure']!=0) | (df['Insulin']!=0)]

In [45]:
df.shape

(726, 9)

In [46]:
df.Outcome.value_counts()

0    476
1    250
Name: Outcome, dtype: int64

In [47]:
df[df['SkinThickness']==0].shape

(192, 9)

In [48]:
df[df['BloodPressure']==0].shape

(2, 9)

In [49]:
df[df['Insulin']==0].shape

(334, 9)

In [50]:
df = df[df['BloodPressure']!=0]

In [51]:
df1 = df.loc[df['Outcome'] == 1]
df2 = df.loc[df['Outcome'] == 0]
df1 = df1.replace({'Insulin':0}, np.median(df1['Insulin']))
df2 = df2.replace({'Insulin':0}, np.median(df2['Insulin']))
df1 = df1.replace({'SkinThickness':0}, np.median(df1['SkinThickness']))
df2 = df2.replace({'SkinThickness':0}, np.median(df2['SkinThickness']))
dataframe = [df1, df2]
dataset = pd.concat(dataframe)







<IPython.core.display.Javascript object>

In [52]:
Y = dataset.Outcome
x = dataset.drop('Outcome', axis = 1)

In [53]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, Y, test_size = 0.15, random_state = 45)

In [54]:
from imblearn.over_sampling import SMOTE
smt = SMOTE()
x_train, y_train = smt.fit_sample(x_train, y_train)

In [55]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200, bootstrap = True, max_features = 'sqrt')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print('Accuracy of Random Forest on test set: {:.2f}'.format(model.score(x_test, y_test)))

Accuracy of Random Forest on test set: 0.87


In [56]:
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,64,33.6,0.627,50,1
2,8,183,64,28,64,23.3,0.672,32,1
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1
8,2,197,70,45,543,30.5,0.158,53,1


In [57]:
dataset.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [58]:
model.predict([[6,148,72,35,64,33.6,0.627,50]])

array([1])

In [59]:
#import pickle 
#pickle.dump(model,open('model_diabetes.pkl','wb'))