In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

# DATA COLLECTION AND ANALYSIS

In [38]:
# Read the dataset from the CSV file in the working directory and preview
# the first rows to check that the columns were parsed as expected.
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [39]:
# Derive a new frame without the SkinThickness column; `df` itself is not
# modified because drop() returns a new object here.
df1 = df.drop(columns=['SkinThickness'])

In [40]:
# Dimensions of the reduced frame as (rows, columns).
df1.shape

(768, 8)

In [41]:
# Number of missing (NaN) entries in each column.
df1.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [42]:
# Class balance: how many rows carry each Outcome value.
df1['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [43]:
# Mean of every remaining column, computed separately for each Outcome value.
df1.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3.298,109.98,68.184,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,100.335821,35.142537,0.5505,37.067164


In [45]:
# Feature matrix: every column of df1 except the Outcome label.
# (`columns=` already selects the axis, so no separate `axis` argument is needed.)
X = df1.drop(columns=['Outcome'])
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,0,33.6,0.627,50
1,1,85,66,0,26.6,0.351,31
2,8,183,64,0,23.3,0.672,32
3,1,89,66,94,28.1,0.167,21
4,0,137,40,168,43.1,2.288,33
...,...,...,...,...,...,...,...
763,10,101,76,180,32.9,0.171,63
764,2,122,70,0,36.8,0.340,27
765,5,121,72,112,26.2,0.245,30
766,1,126,60,0,30.1,0.349,47


In [47]:
# Target vector: the Outcome label of every row, aligned with X by index.
Y = df1['Outcome']
Y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [72]:
# Hold out 10% of the rows for evaluation. Stratifying on Y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=0,
)
def models(x_train, y_train, random_state=0, n_estimators=10, max_iter=1000):
    """Fit three baseline classifiers on the same training data.

    Parameters
    ----------
    x_train
        Training features, in any form accepted by scikit-learn's ``fit``.
    y_train
        Training labels, one per row of ``x_train``.
    random_state : int, default=0
        Seed handed to every estimator so repeated runs build the same models.
    n_estimators : int, default=10
        Number of trees grown by the random forest.
    max_iter : int, default=1000
        Iteration cap for the logistic-regression solver.

    Returns
    -------
    tuple
        ``(log, tree, forest)``: the fitted LogisticRegression,
        DecisionTreeClassifier and RandomForestClassifier, in that order.
        Callers index into this tuple, so the order must not change.
    """
    # Local imports keep the function usable even if the cell defining it is
    # run before the notebook's other import cells.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    log = LogisticRegression(random_state=random_state, solver='lbfgs', max_iter=max_iter)
    tree = DecisionTreeClassifier(random_state=random_state, criterion="entropy")
    forest = RandomForestClassifier(
        random_state=random_state, criterion="entropy", n_estimators=n_estimators
    )

    # All three estimators share the same fit(X, y) interface.
    for estimator in (log, tree, forest):
        estimator.fit(x_train, y_train)

    return log, tree, forest

In [73]:
# Fit all three classifiers on the training split. `model` is the tuple
# returned by models(): index 0 = logistic regression, 1 = decision tree,
# 2 = random forest.
model = models(x_train, y_train)

In [74]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Score every fitted model on the held-out split. The index printed matches the
# position in `model` (0 = logistic regression, 1 = decision tree, 2 = random forest).
for i, clf in enumerate(model):
    # Predict once and reuse the result for both metrics instead of running
    # inference on x_test twice per model.
    y_pred = clf.predict(x_test)
    print("Model:", i)
    print(classification_report(y_test, y_pred))
    print("Accuracy : ", accuracy_score(y_test, y_pred))

Model: 0
              precision    recall  f1-score   support

           0       0.82      0.92      0.87        50
           1       0.81      0.63      0.71        27

    accuracy                           0.82        77
   macro avg       0.82      0.77      0.79        77
weighted avg       0.82      0.82      0.81        77

Accuracy :  0.8181818181818182
Model: 1
              precision    recall  f1-score   support

           0       0.76      0.82      0.79        50
           1       0.61      0.52      0.56        27

    accuracy                           0.71        77
   macro avg       0.68      0.67      0.67        77
weighted avg       0.71      0.71      0.71        77

Accuracy :  0.7142857142857143
Model: 2
              precision    recall  f1-score   support

           0       0.83      0.98      0.90        50
           1       0.94      0.63      0.76        27

    accuracy                           0.86        77
   macro avg       0.89      0.80      

In [82]:
# One sample to classify. The seven values follow the column order of X:
# Pregnancies, Glucose, BloodPressure, Insulin, BMI, DiabetesPedigreeFunction, Age.
user = [1, 85, 66, 0, 26.6, 0.351, 31]
np_array = np.asarray(user)
# predict() works on a 2-D (samples, features) input, so turn the flat vector
# into a single-row matrix.
reshaped_array = np_array[np.newaxis, :]

In [83]:
# The estimators were fitted on a DataFrame, so hand predict() a DataFrame with
# the same column names. Passing a bare ndarray makes recent scikit-learn
# versions warn that the input "does not have valid feature names".
user_features = pd.DataFrame(reshaped_array, columns=X.columns)

# model[2] is the random forest (third element returned by models()).
res = model[2].predict(user_features)

# predict() returns one label per input row; there is exactly one row here, so
# test that single label explicitly rather than relying on the truth value of
# an array.
if res[0] > 0:
    print("Positive")
else:
    print("Negative")

Negative


In [85]:
import pickle

# Persist the random forest (model[2]) to disk so it can be reloaded later
# without retraining. Only unpickle this file from a trusted location.
with open('diabetes_pred.pickle', 'wb') as model_file:
    pickle.dump(model[2], model_file)