# Diabetes Prediction

In [None]:
# importing libraries
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# load the raw dataset and preview its first rows
# NOTE(review): absolute, machine-specific location — adjust when running elsewhere
DATA_PATH = '/config/workspace/Dataset/diabetes.csv'
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# summary statistics for the columns of `data`
data.describe()

In [None]:
# number of missing (NaN) entries in each column
data.isna().sum()

We can see that the columns Glucose, Insulin, SkinThickness, BMI and BloodPressure contain rows with the value 0.
A value of 0 is not possible for these measurements, so those zeros are really missing readings. We can either remove such rows or replace the zeros with the respective column mean.

In [None]:
# Columns in which a 0 stands in for a missing measurement.
ZERO_AS_MISSING_COLUMNS = ['BMI', 'BloodPressure', 'Glucose', 'Insulin', 'SkinThickness']

# NOTE(review): the fill value is computed on the full dataset, i.e. before the
# train/test split further down.
for column in ZERO_AS_MISSING_COLUMNS:
    # Average only the genuine (non-zero) readings: including the placeholder
    # zeros would drag the fill value below the mean of the observed values.
    observed_mean = data.loc[data[column] != 0, column].mean()
    data[column] = data[column].replace(0, observed_mean)

In [None]:
# one box per column of `data`, drawn on an explicit 10x5 axes
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(
    data=data,
    ax=ax,
    width=0.5,
    fliersize=3,
)

In [None]:
# preview the first rows again, after the zero replacement
data.head()

In [None]:
# target column (dependent variable) and the remaining feature columns
y = data['Outcome']
X = data.drop(columns='Outcome')

In [None]:
# hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
X_train.shape, X_test.shape

In [None]:
# Standardization: rescale every feature to zero mean and unit variance.
# (This puts features on a common scale; it does not remove outliers.)
def scaler_standard(X_train, X_test,
                    scaler_path=r'D:\ShailyPythonPW\ML\DiabeticMLProject\Model\scaler.pkl'):
    """Standardize the train/test features and persist the fitted scaler.

    The scaler is fitted on ``X_train`` only and then applied to both splits,
    so no statistics from the test split are used for fitting.

    Parameters
    ----------
    X_train : array-like
        Training features; used to fit the scaler.
    X_test : array-like
        Test features; transformed with the statistics learned from ``X_train``.
    scaler_path : str or path-like, optional
        Where the fitted scaler is pickled. Defaults to the location the
        notebook has always written to.

    Returns
    -------
    tuple
        ``(x_train_scaled, x_test_scaled)`` as returned by the scaler.
    """
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(X_train)
    x_test_scaled = scaler.transform(X_test)

    # Save the fitted scaler; the context manager guarantees the file is
    # flushed and closed even if pickling raises.
    with open(scaler_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

    return x_train_scaled, x_test_scaled

In [None]:
# scale both splits; the call also pickles the fitted scaler to disk as a side effect
x_train_scaled,x_test_scaled = scaler_standard(X_train,X_test)

## Logistic Regression

In [None]:
# baseline model with default hyperparameters; `fit` returns the estimator itself
log_reg = LogisticRegression().fit(x_train_scaled, y_train)
log_reg

In [None]:
# hyperparameter tuning
# GridSearchCV

# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# parameter grid
# Not every solver supports every penalty: 'newton-cg' and 'lbfgs' handle 'l2'
# only, while 'liblinear' handles both 'l1' and 'l2'. One sub-grid per
# compatible group keeps the search from fitting invalid (penalty, solver)
# pairs, whose failures the warning filter above would hide.
C_VALUES = np.logspace(-3, 3, 7)
parameters = [
    {'penalty': ['l2'], 'C': C_VALUES, 'solver': ['newton-cg', 'lbfgs']},
    {'penalty': ['l1', 'l2'], 'C': C_VALUES, 'solver': ['liblinear']},
]

In [None]:
# 10-fold cross-validated grid search over `parameters`, scored by accuracy
logreg = LogisticRegression()
clf = GridSearchCV(
    estimator=logreg,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)

clf.fit(x_train_scaled, y_train)

In [None]:
# hyperparameter combination with the best mean cross-validated score
clf.best_params_

In [None]:
# mean cross-validated accuracy achieved by the best combination
clf.best_score_

Let's see how well our model performs on the test data set.

In [None]:
# predicted labels for the scaled test features, from the fitted grid search
y_pred = clf.predict(x_test_scaled)

##### Accuracy score

In [None]:
# fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

##### Confusion matrix

In [None]:
# confusion matrix of true vs. predicted labels on the test split
con_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
con_mat

In [None]:
# sklearn's confusion_matrix puts true labels on the rows and predicted labels
# on the columns, ordered by sorted label value. For a binary target the
# layout is therefore
#     [[TN, FP],
#      [FN, TP]]
# assumes Outcome is coded 0 = negative, 1 = positive — TODO confirm
true_negative = con_mat[0][0]
false_positive = con_mat[0][1]
false_negative = con_mat[1][0]
true_positive = con_mat[1][1]

In [None]:
# accuracy = correctly classified rows / all rows
n_correct = true_positive + true_negative
n_total = n_correct + false_negative + false_positive
Accuracy = n_correct / n_total
Accuracy

##### Accuracy is 79%

In [None]:
# precision = true positives / everything predicted positive
predicted_positive = true_positive + false_positive
Precision = true_positive / predicted_positive
Precision

##### Precision is about 90%

In [None]:
# recall = true positives / everything that is actually positive
actual_positive = true_positive + false_negative
Recall = true_positive / actual_positive
Recall

##### Recall is 81%

In [None]:
# F1 = harmonic mean of precision and recall
f1_numerator = 2 * (Recall * Precision)
f1_denominator = Recall + Precision
F1_score = f1_numerator / f1_denominator
F1_score

##### F1_score is 85%

In [None]:
# Persist the model for later prediction.
# Save the tuned estimator selected by the grid search (`clf`), i.e. the model
# whose test metrics are reported in this notebook, rather than the untuned
# baseline `log_reg`. `best_estimator_` is available because GridSearchCV
# refits the best configuration by default.
# NOTE(review): absolute, machine-specific location — adjust when running elsewhere.
MODEL_PATH = r'D:\ShailyPythonPW\ML\DiabeticMLProject\Model\ModelForPrediction.pkl'
with open(MODEL_PATH, 'wb') as file:
    pickle.dump(clf.best_estimator_, file)