<div style="background-color: #4CAF50; padding: 10px; border-radius: 5px;">
    <p style="color: black; font-size: 16px; font-weight: bold;">
       1. Import Libraries
    </p>
</div>


In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler, StandardScaler
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings ('ignore')


<div style="background-color: #4CAF50; padding: 10px; border-radius: 5px;">
    <p style="color: black; font-size: 16px; font-weight: bold;">
      2.  Load Data
    </p>
</div>


In [86]:
df = pd.read_csv('/content/diabetes.csv')

In [87]:
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [88]:
for column in df.columns:
    num_distinct_values = len(df[column].unique())
    print(f"{column}: {num_distinct_values} distinct values")

Pregnancies: 17 distinct values
Glucose: 136 distinct values
BloodPressure: 47 distinct values
SkinThickness: 51 distinct values
Insulin: 186 distinct values
BMI: 248 distinct values
DiabetesPedigreeFunction: 517 distinct values
Age: 52 distinct values
Outcome: 2 distinct values


In [89]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

preprocessing

In [90]:
df1 = df.copy()

In [91]:
#chaeck 0 value
numeric_columns = df1.select_dtypes(include=['int', 'float']).columns

zero_counts = df1[numeric_columns].eq(0).sum()

print(zero_counts)

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64


In [92]:
columns_non_nol = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

df1[columns_non_nol] = df1[columns_non_nol].replace(0, pd.NA)

for column in columns_non_nol:
    median_value = df1[column].median()
    df1[column].fillna(median_value, inplace=True)



<div style="background-color: #4CAF50; padding: 10px; border-radius: 5px;">
    <p style="color: black; font-size: 16px; font-weight: bold;">
       Normalization Data
    </p>
</div>


In [93]:
features_to_scale_robust = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']


robust_scaler = RobustScaler()

df1[features_to_scale_robust] = robust_scaler.fit_transform(df1[features_to_scale_robust])

def count_outliers(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return ((series < lower_bound) | (series > upper_bound)).sum()

for feature in features_to_scale_robust:
    num_outliers = count_outliers(df1[feature])
    print(f'Number of outliers in {feature}: {num_outliers}')

Number of outliers in Pregnancies: 4
Number of outliers in Glucose: 0
Number of outliers in BloodPressure: 14
Number of outliers in SkinThickness: 87
Number of outliers in Insulin: 346
Number of outliers in BMI: 8
Number of outliers in DiabetesPedigreeFunction: 29
Number of outliers in Age: 9


In [94]:
scaler = StandardScaler()

df1[features_to_scale_robust] = scaler.fit_transform(df1[features_to_scale_robust])

In [95]:
X = df1.drop(['Outcome'], axis=1)
y = df1['Outcome']

In [96]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)


 <div style="background-color: #4CAF50; padding: 10px; border-radius: 5px;">
    <p style="color: black; font-size: 16px; font-weight: bold;">
         Testing  Data
    </p>
</div>


In [97]:
def hypothesis_testing(df, column1, column2):
    t_stat, p_val = stats.ttest_ind(df[column1], df[column2])

columns_for_testing = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']

for i in range(len(columns_for_testing) - 1):
    hypothesis_testing(df1, columns_for_testing[i], columns_for_testing[i + 1])


In [98]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


<div style="background-color: #4CAF50; padding: 10px; border-radius: 5px;">
    <p style="color: black; font-size: 16px; font-weight: bold;">
     5.  Modeling
    </p>
</div>


<div style="background-color: #4CAF50; padding: 10px; border-radius: 5px;">
    <p style="color: black; font-size: 16px; font-weight: bold;">
        SVC
    </p>
</div>


In [101]:
param_dist = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1],
}

svc = SVC()

randomized_search = RandomizedSearchCV(svc, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

randomized_search.fit(X_train, y_train)

best_params = randomized_search.best_params_
print(f"Best Hyperparameters: {best_params}")

best_svc_model = randomized_search.best_estimator_
svc_predicted = best_svc_model.predict(X_test)

svc_acc_score = accuracy_score(y_test, svc_predicted)
svc_conf_matrix = confusion_matrix(y_test, svc_predicted)

print("\nConfusion Matrix for SVM:")
print(svc_conf_matrix)
print("\nAccuracy of Support Vector Classifier:", svc_acc_score * 100, '\n')
print("Classification Report:")
print(classification_report(y_test, svc_predicted))

Best Hyperparameters: {'kernel': 'poly', 'gamma': 'scale', 'C': 10}

Confusion Matrix for SVM:
[[75 24]
 [15 86]]

Accuracy of Support Vector Classifier: 80.5 

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.76      0.79        99
           1       0.78      0.85      0.82       101

    accuracy                           0.81       200
   macro avg       0.81      0.80      0.80       200
weighted avg       0.81      0.81      0.80       200



<div style="background-color: #4CAF50; padding: 10px; border-radius: 5px;">
    <p style="color: black; font-size: 16px; font-weight: bold;">
       Extra Trees Clasifier
    </p>
</div>


In [100]:
param_dist_et = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}

et = ExtraTreesClassifier()

randomized_search_et = RandomizedSearchCV(
    et,
    param_distributions=param_dist_et,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

randomized_search_et.fit(X_train, y_train)

best_params_et = randomized_search_et.best_params_
print(f"Best Hyperparameters for Extra Trees Classifier: {best_params_et}")

best_et_model = randomized_search_et.best_estimator_

et_predicted = best_et_model.predict(X_test)

et_acc_score = accuracy_score(y_test, et_predicted)

et_conf_matrix = confusion_matrix(y_test, et_predicted)



print("\nConfusion Matrix for Extra Trees Classifier:")
print(et_conf_matrix)
print("\nAccuracy of Extra Trees Classifier:", et_acc_score * 100, '\n')
print("Classification Report for Extra Trees Classifier:")
print(classification_report(y_test, et_predicted))


Best Hyperparameters for Extra Trees Classifier: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30, 'criterion': 'gini', 'bootstrap': False}

Confusion Matrix for Extra Trees Classifier:
[[74 25]
 [11 90]]

Accuracy of Extra Trees Classifier: 82.0 

Classification Report for Extra Trees Classifier:
              precision    recall  f1-score   support

           0       0.87      0.75      0.80        99
           1       0.78      0.89      0.83       101

    accuracy                           0.82       200
   macro avg       0.83      0.82      0.82       200
weighted avg       0.83      0.82      0.82       200

