In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('pima-indians-diabetes-database/diabetes.csv')

In [3]:
%matplotlib inline

In [4]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [5]:
df1 = df1.replace({'BloodPressure':0}, np.median(df1['BloodPressure']))
df2 = df2.replace({'BloodPressure':0}, np.median(df2['BloodPressure']))
data = [df1,df2]
df = pd.concat(data)

In [6]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [7]:
df1 = df1.replace({'BMI':0}, np.mean(df1['BMI']))
df2 = df2.replace({'BMI':0}, np.mean(df2['BMI']))

In [8]:
data = [df1,df2]
df = pd.concat(data)

In [9]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [10]:
df1 = df1.replace({'Glucose':0}, np.mean(df1['Glucose']))
df2 = df2.replace({'Glucose':0}, np.mean(df2['Glucose']))

In [11]:
data = [df1,df2]
df = pd.concat(data)

In [12]:
mean_insulin_diabetic = df['Insulin'].loc[df['Outcome']==1].sum()/130
mean_insulin_diabetic.round()

207.0

In [13]:
mean_insulin_non_diabetic = df['Insulin'].loc[df['Outcome']==0].sum()/264
mean_insulin_non_diabetic.round()

130.0

In [14]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [15]:
df1 = df1.replace({'Insulin':0}, mean_insulin_diabetic)
df2 = df2.replace({'Insulin':0}, mean_insulin_non_diabetic)

In [16]:
data = [df1,df2]
df = pd.concat(data)

In [17]:
mean_diabetic_skinThickness = df['SkinThickness'].loc[df['Outcome']==1].sum()/180
mean_diabetic_skinThickness.round()

33.0

In [18]:
mean_nondiabetic_skinThickness = df['SkinThickness'].loc[df['Outcome']==0].sum()/361
mean_nondiabetic_skinThickness.round()

27.0

In [19]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [20]:
df1 = df1.replace({'SkinThickness':0}, mean_diabetic_skinThickness)
df2 = df2.replace({'SkinThickness':0}, mean_nondiabetic_skinThickness)

In [21]:
data = [df1,df2]
df = pd.concat(data)

In [22]:
df.Outcome.unique()

array([1, 0], dtype=int64)

In [23]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,206.846154,33.6,0.627,50,1
2,8,183.0,64,33.0,206.846154,23.3,0.672,32,1
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1
6,3,78.0,50,32.0,88.0,31.0,0.248,26,1
8,2,197.0,70,45.0,543.0,30.5,0.158,53,1


### ML section classification without using smote

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X = df.drop('Outcome', axis=1).values
y = df['Outcome'].values

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [27]:
from sklearn.preprocessing import MinMaxScaler

In [28]:
scaler = MinMaxScaler()

In [29]:
X_train_sc = scaler.fit_transform(X_train)

In [30]:
X_test_sc = scaler.transform(X_test)

### Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train_sc,y_train)

LogisticRegression()

In [32]:
logreg.fit(X_train_sc, y_train)

LogisticRegression()

In [47]:
predictions = logreg.predict(X_test_sc)

In [48]:
print('Misclassified examples: %d' %(y_test!=predictions).sum())

Misclassified examples: 47


In [49]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test_sc, y_test)))

Accuracy of logistic regression classifier on test set: 0.80


In [50]:
from sklearn.metrics import classification_report,confusion_matrix

In [51]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.80      0.91      0.85       145
           1       0.80      0.60      0.69        86

    accuracy                           0.80       231
   macro avg       0.80      0.76      0.77       231
weighted avg       0.80      0.80      0.79       231



In [38]:
confusion_matrix(y_test,predictions)

array([[  0, 145],
       [  0,  86]], dtype=int64)

### Model Fitting: Support Vector Machine (Kernel: rbf)

In [39]:
from sklearn.svm import SVC
classifier_rbf = SVC(kernel = 'rbf')
classifier_rbf.fit(X_train, y_train)
predictions = classifier_rbf.predict(X_test)

In [40]:
print('Accuracy of SVC (RBF) classifier on test set: {:.2f}'.format(classifier_rbf.score(X_test, y_test)))

Accuracy of SVC (RBF) classifier on test set: 0.84


In [41]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87       145
           1       0.80      0.74      0.77        86

    accuracy                           0.84       231
   macro avg       0.83      0.82      0.82       231
weighted avg       0.83      0.84      0.83       231



In [42]:
confusion_matrix(y_test,predictions)

array([[129,  16],
       [ 22,  64]], dtype=int64)

### Random Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=300, bootstrap = True, max_features = 'sqrt')
model.fit(X_train, y_train)

RandomForestClassifier(max_features='sqrt', n_estimators=300)

In [44]:
predictions = model.predict(X_test)
print('Accuracy of Random Forest on test set: {:.2f}'.format(model.score(X_test, y_test)))

Accuracy of Random Forest on test set: 0.85


In [45]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88       145
           1       0.81      0.77      0.79        86

    accuracy                           0.85       231
   macro avg       0.84      0.83      0.84       231
weighted avg       0.85      0.85      0.85       231



In [46]:
confusion_matrix(y_test,predictions)

array([[130,  15],
       [ 20,  66]], dtype=int64)