In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('pima-indians-diabetes-database/diabetes.csv')

In [3]:
%matplotlib inline

In [4]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [5]:
df1 = df1.replace({'BloodPressure':0}, np.median(df1['BloodPressure']))
df2 = df2.replace({'BloodPressure':0}, np.median(df2['BloodPressure']))
data = [df1,df2]
df = pd.concat(data)

In [6]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [7]:
df1 = df1.replace({'BMI':0}, np.mean(df1['BMI']))
df2 = df2.replace({'BMI':0}, np.mean(df2['BMI']))

In [8]:
data = [df1,df2]
df = pd.concat(data)

In [9]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [10]:
df1 = df1.replace({'Glucose':0}, np.mean(df1['Glucose']))
df2 = df2.replace({'Glucose':0}, np.mean(df2['Glucose']))

In [11]:
data = [df1,df2]
df = pd.concat(data)

In [12]:
mean_insulin_diabetic = df['Insulin'].loc[df['Outcome']==1].sum()/130
mean_insulin_diabetic.round()

207.0

In [13]:
mean_insulin_non_diabetic = df['Insulin'].loc[df['Outcome']==0].sum()/264
mean_insulin_non_diabetic.round()

130.0

In [14]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [15]:
df1 = df1.replace({'Insulin':0}, mean_insulin_diabetic)
df2 = df2.replace({'Insulin':0}, mean_insulin_non_diabetic)

In [16]:
data = [df1,df2]
df = pd.concat(data)

In [17]:
mean_diabetic_skinThickness = df['SkinThickness'].loc[df['Outcome']==1].sum()/180
mean_diabetic_skinThickness.round()

33.0

In [18]:
mean_nondiabetic_skinThickness = df['SkinThickness'].loc[df['Outcome']==0].sum()/361
mean_nondiabetic_skinThickness.round()

27.0

In [19]:
df1 = df.loc[df['Outcome']==1]
df2 = df.loc[df['Outcome']==0]

In [20]:
df1 = df1.replace({'SkinThickness':0}, mean_diabetic_skinThickness)
df2 = df2.replace({'SkinThickness':0}, mean_nondiabetic_skinThickness)

In [21]:
data = [df1,df2]
df = pd.concat(data)

### ML section classification without using smote

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X = df.drop('Outcome', axis=1).values
y = df['Outcome'].values

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [25]:
from sklearn.preprocessing import MinMaxScaler

In [26]:
scaler = MinMaxScaler()

In [27]:
X_train = scaler.fit_transform(X_train)

In [28]:
X_test = scaler.transform(X_test)

### Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
predictions = logreg.predict(X_test)

In [31]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.80


In [32]:
from sklearn.metrics import classification_report,confusion_matrix

In [33]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85       145
           1       0.82      0.58      0.68        86

    accuracy                           0.80       231
   macro avg       0.80      0.75      0.77       231
weighted avg       0.80      0.80      0.79       231



In [34]:
confusion_matrix(y_test,predictions)

array([[134,  11],
       [ 36,  50]], dtype=int64)

### Model Fitting: Support Vector Machine (Kernel: rbf)

In [35]:
from sklearn.svm import SVC
classifier_rbf = SVC(kernel = 'rbf')
classifier_rbf.fit(X_train, y_train)
predictions = classifier_rbf.predict(X_test)



In [36]:
print('Accuracy of SVC (RBF) classifier on test set: {:.2f}'.format(classifier_rbf.score(X_test, y_test)))

Accuracy of SVC (RBF) classifier on test set: 0.81


In [37]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.79      0.94      0.86       145
           1       0.86      0.57      0.69        86

    accuracy                           0.81       231
   macro avg       0.82      0.76      0.77       231
weighted avg       0.81      0.81      0.79       231



In [38]:
confusion_matrix(y_test,predictions)

array([[137,   8],
       [ 37,  49]], dtype=int64)

### Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=300, bootstrap = True, max_features = 'sqrt')
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [43]:
predictions = model.predict(X_test)
print('Accuracy of Random Forest on test set: {:.2f}'.format(model.score(X_test, y_test)))

Accuracy of Random Forest on test set: 0.85


In [44]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.87      0.90      0.89       145
           1       0.82      0.77      0.80        86

    accuracy                           0.85       231
   macro avg       0.85      0.84      0.84       231
weighted avg       0.85      0.85      0.85       231



In [45]:
confusion_matrix(y_test,predictions)

array([[131,  14],
       [ 20,  66]], dtype=int64)