In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import pickle
import json

In [2]:
df = pd.read_csv("diabetes_data_upload.csv")
df.sample(10)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
193,36,Male,Yes,No,No,Yes,No,Yes,Yes,Yes,No,Yes,No,No,No,No,Positive
123,47,Male,No,Yes,No,No,No,No,Yes,Yes,No,No,No,No,Yes,Yes,Positive
362,28,Female,No,No,No,No,No,No,Yes,No,No,No,Yes,Yes,No,No,Positive
162,35,Female,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Positive
499,64,Male,No,No,No,Yes,Yes,No,Yes,Yes,Yes,Yes,No,Yes,Yes,No,Negative
285,30,Male,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Negative
512,43,Male,No,No,No,No,No,No,No,No,No,No,No,No,Yes,No,Negative
208,54,Male,No,No,Yes,Yes,No,Yes,No,No,No,Yes,No,No,Yes,No,Negative
387,36,Male,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Negative
240,45,Male,No,No,No,Yes,No,No,No,Yes,No,No,Yes,No,Yes,No,Negative


In [3]:
df.describe()

Unnamed: 0,Age
count,520.0
mean,48.028846
std,12.151466
min,16.0
25%,39.0
50%,47.5
75%,57.0
max,90.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

In [5]:
df.isna().sum()

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64

In [6]:
X = df.drop('class', axis=1)
y = df['class']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True,random_state=4, stratify=y)

In [8]:
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)
print("X_test.shape:", X_test.shape)
print("y_test.shape:", y_test.shape)

X_train.shape: (468, 16)
y_train.shape: (468,)
X_test.shape: (52, 16)
y_test.shape: (52,)


In [11]:
df.columns


Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')

In [18]:
scaler = StandardScaler()
encoder = OneHotEncoder()

In [19]:
ct = make_column_transformer((encoder, ['Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity']))

In [20]:
classifiers = [("Logistic Regression", LogisticRegression()), ("Random Forest Classifier", RandomForestClassifier()), 
               ("K Nearest Neighbors", KNeighborsClassifier(n_neighbors=2))]

In [21]:
for clf_name, clf in classifiers:
    pipe = make_pipeline(ct, clf)
    pipe.fit(X_train, y_train)
    train_pred = pipe.predict(X_train)
    train_acc = accuracy_score(y_train, train_pred)
    print("Train accuracy of ", clf_name, "is: ", train_acc)    
    test_pred =pipe.predict(X_test)
    test_acc = accuracy_score(y_test, test_pred)
    print("Test accuracy of ", clf_name, "is: ",test_acc)

Train accuracy of  Logistic Regression is:  0.9423076923076923
Test accuracy of  Logistic Regression is:  0.9230769230769231
Train accuracy of  Random Forest Classifier is:  0.9935897435897436
Test accuracy of  Random Forest Classifier is:  0.9807692307692307
Train accuracy of  K Nearest Neighbors is:  0.9764957264957265
Test accuracy of  K Nearest Neighbors is:  0.9615384615384616


In [22]:
final_pipe = make_pipeline(ct, RandomForestClassifier())
final_pipe.fit(X_train, y_train)
y_pred = final_pipe.predict(X_test)
y_acc = accuracy_score(y_test, y_pred)
print("Accuracy score of Random Forest Classifier on test set is: ", y_acc)

Accuracy score of Random Forest Classifier on test set is:  0.9807692307692307


In [24]:
import pickle
pickle.dump(final_pipe, open('model.pkl', 'wb'))

In [25]:
model = pickle.load(open('model.pkl', 'rb'))

In [40]:
print(model.predict([X_test.iloc[3]]))

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity'

In [37]:
X_test.iloc[3]

Age                       40
Gender                Female
Polyuria                 Yes
Polydipsia               Yes
sudden weight loss       Yes
weakness                 Yes
Polyphagia                No
Genital thrush            No
visual blurring          Yes
Itching                   No
Irritability              No
delayed healing          Yes
partial paresis          Yes
muscle stiffness         Yes
Alopecia                  No
Obesity                   No
Name: 367, dtype: object

In [None]:
Age                       40
Gender                Female
Polyuria                 Yes
Polydipsia               Yes
sudden weight loss       Yes
weakness                 Yes
Polyphagia                No
Genital thrush            No
visual blurring          Yes
Itching                   No
Irritability              No
delayed healing          Yes
partial paresis          Yes
muscle stiffness         Yes
Alopecia                  No
Obesity                   No