In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
df = pd.read_csv("kaggle_diabetes.csv")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'kaggle_diabetes.csv'

In [None]:
# check the rows and columns
df.shape

In [None]:
# check the data types
df.dtypes

In [None]:
# check the null values
df.isnull().sum()

In [None]:
# describe the data
df.describe()

In [None]:
# horizontal describe
df.describe().T

In [None]:
# check the duplicate values
dup_value = df.duplicated().sum()
np.int64(dup_value)

In [None]:
print(dup_value)
print(np.int64(dup_value))

In [None]:
# check the unique values
df.nunique()

In [None]:
# check the nan values
df.isna().sum()

# EDA

In [None]:
# check the correlation
df.corr()

In [None]:
sns.heatmap(df.corr(), annot=True, cmap='PiYG', linewidths=0.2)

In [None]:
# get the top3 correlated features
def get_redundant_pairs(df):
    """Return the label pairs on and below the diagonal of ``df``'s correlation matrix.

    Each pair is ``(column_i, column_j)`` with ``j <= i``, i.e. every
    self-pair plus one ordering of every distinct column pair.

    Parameters
    ----------
    df : object exposing ``columns`` and ``shape`` (a pandas DataFrame in this notebook)

    Returns
    -------
    set of tuple
        The pairs to discard so each correlation is counted only once.
    """
    labels = df.columns
    n_cols = df.shape[1]
    return {
        (labels[row], labels[col])
        for row in range(n_cols)
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=5):
    """Return the ``n`` largest absolute pairwise correlations of ``df``.

    The correlation matrix is flattened to a Series indexed by
    (column, column) pairs; the pairs reported by ``get_redundant_pairs``
    (diagonal and lower triangle) are dropped so each column pair appears
    once, and the remainder is sorted in descending order.

    Parameters
    ----------
    df : pandas.DataFrame
    n : int, default 5
        Number of leading entries to keep.

    Returns
    -------
    pandas.Series
        The first ``n`` entries of the sorted absolute correlations.
    """
    ranked = (
        df.corr()
        .abs()
        .unstack()
        .drop(labels=get_redundant_pairs(df))
        .sort_values(ascending=False)
    )
    # Positional slice: keep the first n rows of the sorted result.
    return ranked.iloc[:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(df, 3))

### Based on the correlations above, glucose is highly correlated with the outcome.

In [None]:
# check the skewness
df.skew()

In [None]:
# check the skewness as diagram
df.skew().plot(kind='barh')

In [None]:
# check the distribution of data
df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# count the target
df['Outcome'].value_counts()

#### Based on the counts above, this is an imbalanced dataset: class 0 -> 75% and class 1 -> 25%.

In [None]:
# check the distribution of pregnancies with target column
sns.countplot(data=df, x='Pregnancies', hue='Outcome')

#### Based on this diagram, if you have had more than 6 pregnancies you are more likely to have diabetes.

In [None]:
# check the distribution of Glucose with target column
plt.figure(figsize=(30,10))
df['Glucose'].value_counts().plot(kind='bar')

In [None]:
plt.figure(figsize=(60,10))
sns.countplot(data=df, x='Glucose', hue='Outcome')

#### Based on the diagram above, if your glucose level is more than 120 there is a high chance that you are diabetic.

In [None]:
# check the distribution of BMI with target column
plt.figure(figsize=(100,10))
sns.countplot(data=df, x='BMI', hue='Outcome') 

In [None]:
# check the distribution of blood pressure with target column
plt.figure(figsize=(30,10))
sns.countplot(data=df, x='BloodPressure', hue='Outcome')

In [None]:
df.columns

In [None]:
# check the distribution of skin thickness with target column
plt.figure(figsize=(30,5))
sns.countplot(data=df, x='SkinThickness', hue='Outcome')

In [None]:
# check the distribution of Age with target column
plt.figure(figsize=(30,5))
sns.countplot(data=df, x='Age', hue='Outcome')

#### Based on the plot above, if your age is more than 35 there is a certain chance of diabetes.

In [None]:
# check the distribution of Insulin with target column
plt.figure(figsize=(100,5))
sns.countplot(data=df, x='Insulin', hue='Outcome')


In [None]:
# check the outliers
df.plot(kind='box', subplots=True, layout=(3,3), figsize=(20,20))
plt.show()

## The dataset is small, so we cannot afford to remove the outliers; instead we standardize the data.

In [None]:
from sklearn.preprocessing import StandardScaler
scr = StandardScaler()

In [None]:
df.head(2)

In [None]:
# Standardization: scale every feature column with the StandardScaler `scr`;
# 'Outcome' is the target and is kept out of the scaler.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test-set statistics leak into the scaling. Fitting on
# the training split only (e.g. inside a Pipeline) would avoid that.
X = df.drop(columns='Outcome')
# Keep the original index as well as the column names so the scaled frame stays
# row-aligned with df['Outcome'] even if df does not have a default RangeIndex.
X = pd.DataFrame(scr.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
X.head(2)

In [None]:
# check the outliers
X.plot(kind='box', subplots=True, layout=(3,3), figsize=(20,20))
plt.show()

In [None]:
y=df['Outcome']

In [None]:
X.shape,y.shape

# Split the dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the target so
# both partitions keep the class ratio of the full (imbalanced) dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

# Train the model with ML and ANN

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate classifiers, keyed by the name used to look up their parameter grid.
# The tree-based models draw random numbers while fitting, so their seed is fixed
# (42, the same value used for the train/test split) to make re-runs reproducible.
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier(),
}

In [None]:
# Hyper-parameter grids for the grid search, one entry per classifier name.
params_gdr = dict(
    LogisticRegression=dict(
        C=[0.001, 0.01, 0.1, 1, 10, 100],
        penalty=['l2'],
        solver=['lbfgs'],
    ),
    RandomForestClassifier=dict(
        n_estimators=[100, 150, 200],
        criterion=['gini', 'entropy'],
    ),
    DecisionTreeClassifier=dict(
        criterion=['gini', 'entropy'],
    ),
    SVC=dict(
        C=[0.001, 0.01, 0.1, 1, 10, 100],
        kernel=['linear', 'rbf'],
    ),
    KNeighborsClassifier=dict(
        n_neighbors=[3, 5, 7],
        weights=['uniform', 'distance'],
    ),
)

In [None]:
from sklearn.model_selection import GridSearchCV
def predict_model(model, para, X_train, y_train):
    """Grid-search every estimator in ``model`` and return the best fit of each.

    Parameters
    ----------
    model : dict
        Maps a display name to an unfitted scikit-learn estimator.
    para : dict
        Maps the same names to the ``param_grid`` searched for that estimator.
    X_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``{name: best_estimator_}`` for every entry of ``model``.

    Raises
    ------
    KeyError
        If ``para`` has no grid for one of the names in ``model``.
    """
    best_estimator = {}
    # Iterate over the mapping passed in as `model`, not a notebook-level
    # global, so the function searches exactly what the caller handed over.
    for model_name, estimator in model.items():
        gdr = GridSearchCV(
            estimator=estimator,
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
            param_grid=para[model_name],
        )
        gdr.fit(X_train, y_train)
        best_estimator[model_name] = gdr.best_estimator_
        print(f"Best parameters for {model_name}: {gdr.best_params_}")
        print(f"Best score for {model_name}: {gdr.best_score_}\n")
    return best_estimator

In [None]:
predict_model(models,params_gdr,X_train,y_train)

# Try with XGBoost

In [None]:
import xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
xg_param = {
    'n_estimators': [100, 150, 200],
    'max_depth': range(2, 11, 1),
    'learning_rate': [0.1, 0.01, 0.5],
}

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
xg_gdr = GridSearchCV(estimator=XGBClassifier(), cv=5, scoring='accuracy', n_jobs=-1, param_grid=xg_param)

In [None]:
xg_gdr.fit(X_train,y_train)

In [None]:
xg_gdr.best_params_

In [None]:
xg_gdr.best_score_

# Try with ANN

In [None]:
from tensorflow import keras
from keras.models import Sequential # type: ignore

In [None]:
from keras.layers import Dense,Dropout,ReLU,BatchNormalization

#### Build the ANN model

In [None]:
model_one = Sequential()

In [None]:
model_one.add(Dense(units=32,kernel_initializer='uniform',activation='relu',input_dim=8))

#### The input layer takes 8 inputs (`input_dim=8`) and the first hidden layer has 32 units

In [None]:
model_one.add(Dense(units=64,activation='relu',kernel_initializer='uniform'))
model_one.add(Dense(units=128,activation='relu',kernel_initializer='uniform'))
model_one.add(Dense(units=32,activation='relu',kernel_initializer='uniform'))
model_one.add(Dense(units=16,activation='relu',kernel_initializer='uniform'))
model_one.add(Dense(units=8,activation='relu',kernel_initializer='uniform'))
model_one.add(Dense(units=1,activation='sigmoid',kernel_initializer='uniform'))

In [None]:
model_one.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
model_one.summary()

In [None]:
# Train the compiled network `model_one`; no notebook-level variable named
# `model` exists. validation_split=0.2 holds out 20% of the training rows for
# per-epoch validation.
history = model_one.fit(X_train, y_train, batch_size=32, epochs=200, validation_split=0.2)

In [None]:
# Based on the training output above, the ANN has the highest accuracy compared to the ML models

# Next, let's build a pipeline that standardizes the data and feeds it into training

In [None]:
# Predict with the trained network `model_one` (there is no notebook-level `model`).
# The output layer is a single sigmoid unit, so these are scores in [0, 1], not
# class labels; threshold them before comparing against y_test.
y_pred = model_one.predict(X_test)