In [None]:
pip install lightgbm

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold

# suppress warnings from final output
import warnings
warnings.simplefilter("ignore")

In [None]:
path = 'diabetes.csv'

In [None]:
df = pd.read_csv(path)

In [None]:
df.shape

 Display the first few rows of the dataset to get an overview

In [None]:
df.head(5)

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
# Descriptive statistics of the data set accessed.
df.describe()

In [None]:
def extract_variable_names(dataframe, cat_threshold=10, card_threshold=20):
    """
    Extracts names of categorical, numerical, and categorical but cardinal variables from a dataframe.

    A column is "categorical-typed" when its dtype is object, pandas string or
    pandas categorical. Every other dtype is treated as numeric-typed, and a
    numeric-typed column with fewer than ``cat_threshold`` distinct values is
    still reported as categorical ("numerical-looking categorical").

    Parameters:
        dataframe: DataFrame
            The input dataframe.
        cat_threshold: int, optional
            Threshold value for identifying numerical-looking categorical variables.
        card_threshold: int, optional
            Threshold value for identifying categorical but cardinal variables.

    Returns:
        cat_vars: list
            List of categorical variable names (categorical-typed columns that
            are not cardinal, followed by numerical-looking categorical columns).
        num_vars: list
            List of numerical variable names.
        card_cat_vars: list
            List of categorical-looking cardinal variable names.
    """

    def is_categorical_typed(series):
        # Object, string and category dtypes all hold labels rather than
        # measurements; checking only for object would let `category` columns
        # (e.g. the output of pd.cut) fall through to the numeric branch.
        dtype = series.dtype
        return dtype == "O" or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))

    # Count distinct values once per column instead of once per list comprehension
    unique_counts = {col: dataframe[col].nunique() for col in dataframe.columns}
    typed_cat = {col for col in dataframe.columns if is_categorical_typed(dataframe[col])}

    # Categorical columns and categorical but cardinal columns
    num_like_cat_vars = [col for col in dataframe.columns
                         if col not in typed_cat and unique_counts[col] < cat_threshold]
    card_cat_vars = [col for col in dataframe.columns
                     if col in typed_cat and unique_counts[col] > card_threshold]
    cat_vars = [col for col in dataframe.columns
                if col in typed_cat and col not in card_cat_vars]
    cat_vars = cat_vars + num_like_cat_vars

    # Numerical columns
    num_vars = [col for col in dataframe.columns
                if col not in typed_cat and col not in num_like_cat_vars]

    # Print summary
    print(f"Number of Observations: {dataframe.shape[0]}")
    print(f"Number of Variables: {dataframe.shape[1]}")
    print(f'Number of Categorical Variables: {len(cat_vars)}')
    print(f'Number of Numerical Variables: {len(num_vars)}')
    print(f'Number of Cardinal Categorical Variables: {len(card_cat_vars)}')
    print(f'Number of Numerical-Looking Categorical Variables: {len(num_like_cat_vars)}')

    return cat_vars, num_vars, card_cat_vars

In [None]:
cat_cols, num_cols, cat_but_car = extract_variable_names(df)

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
#data visualisation in histogram 
df.hist(figsize = (10,12))    #to check the distribution of the features 

Check the distribution of the 'Outcome' variable (diabetes-positive or diabetes-negative)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Human-readable names for the two Outcome codes
outcome_names = {0: 'No Diabetes', 1: 'Diabetes'}

# Left panel: absolute number of observations per outcome
ax = sns.countplot(data=df, x=df['Outcome'].replace(outcome_names),
                   ax=axs[0])

ax.set_xlabel('Outcome', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
axs[0].set_title("Diabetes Count", fontsize=16)

def func(pct, allvals):
    """Format a pie-wedge label: the percentage on the first line and the
    absolute count (derived from the total of ``allvals``) in parentheses on
    the second line."""
    absolute = int(np.round(pct / 100. * np.sum(allvals)))
    return f"{pct:.2f}%\n({absolute:d})"

# Compute the class counts once; they drive both the wedge sizes and labels
outcome_counts = df['Outcome'].value_counts()

explode = [0, 0.07]
# value_counts() orders classes by frequency, so derive the labels from its
# index rather than assuming that class 0 always comes first.
labels = [outcome_names.get(code, str(code)) for code in outcome_counts.index]

# Right panel: share of each outcome
ax2 = outcome_counts.plot.pie(explode=explode, shadow=True,
                              autopct=lambda pct: func(pct, outcome_counts),
                              ylabel='', labels=labels,
                              ax=axs[1], textprops=dict(color="black", size=13))
axs[1].set_title("Diabetes Percentage", fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
print(df['Outcome'].value_counts()*100/len(df),'\n')
print(df['Outcome'].value_counts())

In [None]:
def target_variable_summary_numeric(dataframe, target_col, numeric_col):
    """
    Print the mean of one numeric column for each value of the target column.

    The table is followed by two blank lines and a dashed separator line.
    Nothing is returned.

    Parameters:
        dataframe: DataFrame
            The input dataframe.
        target_col: str
            Name of the target variable column used for grouping.
        numeric_col: str
            Name of the numeric column whose group means are printed.
    """
    group_means = dataframe.groupby(target_col)[[numeric_col]].mean()
    # One call emits: table, the three-newline gap, then the separator line
    print(group_means, "------------------------------", sep="\n\n\n")

# Iterate through numeric columns and generate target variable summaries
for column in num_cols:
    target_variable_summary_numeric(df, "Outcome", column)


In [None]:
# Pairwise correlations between all columns (pandas' default method)
correlation_matrix = df.corr()

# Annotated heatmap of the matrix, values shown with two decimals
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='YlGnBu', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


_When we examine the heatmap, we observe that the strongest correlations are between Pregnancies and Age, Glucose and Outcome, and Insulin and SkinThickness._

In [None]:
# Checking data distributions between the columns
sns.pairplot(df, hue='Outcome', height=4)

In [None]:
# Compare Age against Pregnancies, colouring the points by diabetes outcome

# Split the rows by outcome
healthy_data = df[df.Outcome == 0]
diabetic_data = df[df.Outcome == 1]

# Age vs. Pregnancies scatter: healthy points are drawn first, diabetic on top
for subset, colour, legend_label in (
    (healthy_data, "green", "Healthy"),
    (diabetic_data, "red", "Diabetic"),
):
    plt.scatter(subset.Age, subset.Pregnancies, color=colour, label=legend_label, alpha=0.5)

# Axis labels and legend
plt.xlabel("Age")
plt.ylabel("Pregnancies")
plt.legend()

plt.show()

# Data Processing

### Checking for Missing Values

In [None]:
def check_missing_values(dataframe, include_column_names=False):
    """
    Print a per-column summary of missing values.

    Only columns that contain at least one null are listed, ordered from the
    most to the fewest missing values, with the absolute count and the
    percentage of rows affected (rounded to two decimals).

    Parameters:
        dataframe: DataFrame
            The input dataframe.
        include_column_names: bool, optional
            If True, also return the names of the affected columns.

    Returns:
        List of columns with missing values (in dataframe column order) if
        include_column_names is True, otherwise None.
    """
    # Null count for every column, then keep only the affected ones
    null_totals = dataframe.isnull().sum()
    affected = null_totals[null_totals > 0]

    # Counts sorted descending; the ratios are derived in that same order
    counts_desc = affected.sort_values(ascending=False)
    ratios_desc = counts_desc / dataframe.shape[0] * 100

    summary = pd.DataFrame({
        'Missing Count': counts_desc,
        'Missing Ratio (%)': np.round(ratios_desc, 2),
    })
    print(summary, end="\n")

    if not include_column_names:
        return None
    return affected.index.tolist()

# Call the function to check missing values
check_missing_values(df, include_column_names=True)

_When examining the dataset for missing values, our assessment indicated an absence of such values. Nonetheless, upon closer inspection, we identified occurrences of 0 in fields like Glucose, BloodPressure, SkinThickness, Insulin, and BMI. It is implausible for these variables to assume a value of 0. Consequently, we intend to substitute these 0 values with NaN to accurately represent the absence of valid data points._

In [None]:
columns_with_zeros = [col for col in df.columns if (df[col].min() == 0 and col not in ["Pregnancies", "Outcome"])]

In [None]:
columns_with_zeros

In [None]:
# Columns in which a value of 0 is treated as a missing measurement
columns_with_zeros = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Replace 0 values with NaN in the specified columns.
# Use the lowercase np.nan: the np.NaN alias was removed in NumPy 2.0.
df[columns_with_zeros] = df[columns_with_zeros].replace(0, np.nan)

In [None]:
# For every column that uses 0 as a placeholder, make sure the placeholders
# are NaN and report how many values are missing as a result.
for col in columns_with_zeros:
    # Idempotent: a no-op when the zeros have already been converted
    df[col] = np.where(df[col] == 0, np.nan, df[col])

    # Count the NaNs rather than the zeros: once the placeholders are
    # converted, a zero count is always 0 and says nothing about the data.
    print(f"{col}: {df[col].isna().sum()}")

# Summarise missing counts and ratios for the whole DataFrame
check_missing_values(df)

In [None]:
!pip install missingno

In [None]:
import missingno as msno
msno.bar(df, sort="ascending");

In [None]:
msno.matrix(df);

In [None]:
def fill_missing_with_median(var):
    """Return the median of `var` for each Outcome value.

    Reads the notebook-level `df`, ignores rows where `var` is null, and
    returns a two-column frame (`Outcome`, `var`) with one row per outcome.
    Despite its name, the function does not modify `df`.
    """
    observed = df.loc[df[var].notnull(), [var, 'Outcome']]
    return observed.groupby(['Outcome'])[[var]].median().reset_index()

# Impute every feature column with the median of the observation's own outcome group.
# NOTE(review): this uses the target to fill the features and runs before the
# train/test split, which looks like target leakage — confirm this is intended.
columns = df.columns.drop("Outcome")
for column in columns:
    median_values = fill_missing_with_median(column)
    missing = df[column].isnull()
    # After reset_index, row 0 holds the Outcome == 0 median and row 1 the
    # Outcome == 1 median, so the outcome value doubles as the row label.
    for outcome_value in (0, 1):
        df.loc[(df['Outcome'] == outcome_value) & missing, column] = median_values[column][outcome_value]

In [None]:
# Check the remaining number of missing values
missing_values_count = df.isnull().sum()
print("Number of Missing Values:")
print(missing_values_count)

In [None]:
df.head()

### Dealing with Outliers

In [None]:
def get_outlier_thresholds(dataframe, variable, lower_quantile=0.10, upper_quantile=0.90):
    """
    Calculate the lower and upper outlier thresholds based on quantiles.

    The thresholds lie 1.5 times the inter-quantile range below the lower
    quantile and above the upper quantile.

    Parameters:
        dataframe: DataFrame
            The input dataframe.
        variable: str
            The name of the variable for which outlier thresholds are calculated.
        lower_quantile: float, optional
            The lower quantile value.
        upper_quantile: float, optional
            The upper quantile value.

    Returns:
        lower_limit: float
            The lower outlier threshold.
        upper_limit: float
            The upper outlier threshold.
    """
    lower_value = dataframe[variable].quantile(lower_quantile)
    upper_value = dataframe[variable].quantile(upper_quantile)
    interquantile_range = upper_value - lower_value
    upper_limit = upper_value + 1.5 * interquantile_range
    lower_limit = lower_value - 1.5 * interquantile_range
    return lower_limit, upper_limit

def has_outliers(dataframe, col_name, lower_quantile=0.10, upper_quantile=0.90):
    """
    Check if a column in the dataframe has outliers.

    Parameters:
        dataframe: DataFrame
            The input dataframe.
        col_name: str
            The name of the column to check for outliers.
        lower_quantile: float, optional
            The lower quantile passed to get_outlier_thresholds.
        upper_quantile: float, optional
            The upper quantile passed to get_outlier_thresholds.

    Returns:
        bool
            True if at least one value lies outside the thresholds, False otherwise.
    """
    lower_limit, upper_limit = get_outlier_thresholds(dataframe, col_name,
                                                      lower_quantile, upper_quantile)
    values = dataframe[col_name]
    # Test the mask itself. Testing the truthiness of the selected rows would
    # miss outlier rows whose values are all zero and would depend on the
    # contents of unrelated columns.
    outlier_mask = (values > upper_limit) | (values < lower_limit)
    return bool(outlier_mask.any())

# Iterate through numeric columns and check for outliers
for column in num_cols:
    has_outliers_flag = has_outliers(df, column)
    print(f"{column}: {has_outliers_flag}")

In [None]:
# Create a figure with specified size
plt.figure(figsize=(12, 6))

# Create a boxplot for all columns
df.boxplot()

# Add title and adjust x-axis labels
plt.title("Boxplot of All Columns with Outliers")
plt.xticks(rotation=45)

# Display the boxplot
plt.show()

In [None]:
def replace_outliers_with_thresholds(dataframe, column):
    """
    Cap the values of a column at its outlier thresholds, modifying the dataframe in place.

    Values below the lower threshold are raised to it and values above the
    upper threshold are lowered to it; missing values are left untouched.

    Parameters:
        dataframe: DataFrame
            The input dataframe.
        column: str
            The name of the column to replace outliers.

    Returns:
        None
    """
    low_limit, up_limit = get_outlier_thresholds(dataframe, column)

    # clip() returns a new Series, upcasting an integer column when a limit
    # is fractional. Writing float limits into an integer column with .loc is
    # a deprecated silent upcast in recent pandas releases.
    dataframe[column] = dataframe[column].clip(lower=low_limit, upper=up_limit)

# Iterate through numeric columns and replace outliers
for column in num_cols:
    replace_outliers_with_thresholds(df, column)
    print(f"Outliers replaced for {column}")


In [None]:
# check for outliers
for column in num_cols:
    has_outliers_flag = has_outliers(df, column)
    print(f"{column}: {has_outliers_flag}")

### Feature Engineering

In [None]:
# BMI bands (right-inclusive bins)
df['BMI_CAT'] = pd.cut(
    df['BMI'],
    bins=[0, 18.5, 24.9, 29.9, 34.9, 39.9, float('inf')],
    labels=["Underweight", "Normal", "Overweight", "Obesity 1", "Obesity 2", "Obesity 3"],
)

def set_insulin(row):
    """Return "Normal" when the row's Insulin is within 16..166 (inclusive), else "Abnormal"."""
    return "Normal" if 16 <= row["Insulin"] <= 166 else "Abnormal"

# Row-wise insulin flag
df['Insulin_CAT'] = df.apply(set_insulin, axis=1)

# Blood-pressure bands
bp_bins = [0, 60, 80, 90, 120, np.inf]
bp_labels = ["Low_Blood_Pressure", "Normal", "Prehypertension", "Hypertension", "Hypertensive_Crisis"]
df['BloodPressure_CAT'] = pd.cut(df['BloodPressure'], bins=bp_bins, labels=bp_labels)

# Glucose bands.
# NOTE(review): "Overweight" is an odd label for a glucose band; it is kept
# because later cells select the resulting dummy column by that exact name.
df['Glucose_CAT'] = pd.cut(
    df['Glucose'],
    bins=[0, 70, 99, 126, float('inf')],
    labels=["Low", "Normal", "Overweight", "High"],
)

# Three equal-frequency age groups
df['Age_CAT'] = pd.qcut(df['Age'], q=3, labels=["Young", "Mature", "Old"])

# Pregnancy count: none, exactly one, more than one
df['Preg_CAT'] = pd.cut(
    df['Pregnancies'],
    bins=[-1, 0, 1, float('inf')],
    labels=["Never", "One_Time", "Many_Times"],
)

# Signal that all derived columns have been added
print("Feature engineering completed.")

In [None]:
df.head()

In [None]:
cat_cols, num_cols, cat_but_car = extract_variable_names(df)

In [None]:
# Engineered categorical columns to plot against the outcome
new_cols = ["Insulin_CAT", "BloodPressure_CAT", "Glucose_CAT", "BMI_CAT", "Age_CAT", "Preg_CAT"]

# Grid layout: 3 x 2 panels
rows = 3
cols = 2
total_subplots = rows * cols

fig = plt.figure(figsize=(15, 11))

# One panel per column; columns beyond the grid capacity are not drawn
for subplot_counter, col in enumerate(new_cols[:total_subplots], start=1):
    plt.subplot(rows, cols, subplot_counter)

    # Bars are split by Outcome
    ax = sns.countplot(data=df, x=col, hue="Outcome")

    ax.set_ylabel('Count')
    ax.set_xlabel(f'{col}', size=15)

# Tighten the layout, then set the vertical gap between rows
plt.tight_layout()
plt.subplots_adjust(hspace=0.17)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encode_binary(dataframe, column):
    """Replace `column` with its LabelEncoder integer codes (in place) and return the frame."""
    dataframe[column] = LabelEncoder().fit_transform(dataframe[column])
    return dataframe

# Object-typed columns with exactly two distinct values are label encoded
binary_columns = [
    name for name, dtype in df.dtypes.items()
    if dtype == "object" and df[name].nunique() == 2
]
for col in binary_columns:
    df = label_encode_binary(df, col)

def one_hot_encode(dataframe, categorical_columns, drop_first=True):
    """Return a copy of `dataframe` with `categorical_columns` expanded into dummy columns."""
    return pd.get_dummies(dataframe, columns=categorical_columns, drop_first=drop_first)

# Columns with 3 to 10 distinct values are one-hot encoded
ohe_columns = [col for col in df.columns if 2 < df[col].nunique() <= 10]
df = one_hot_encode(df, ohe_columns)

# Preview the encoded frame
df.head()

In [None]:
df.columns

## Feature scaling

In [None]:
cat_df = df[['Insulin_CAT','BMI_CAT_Normal', 'BMI_CAT_Overweight', 'BMI_CAT_Obesity 1','BMI_CAT_Obesity 2', 
             'BMI_CAT_Obesity 3', 'BloodPressure_CAT_Normal','BloodPressure_CAT_Prehypertension', 
             'BloodPressure_CAT_Hypertension','BloodPressure_CAT_Hypertensive_Crisis', 'Glucose_CAT_Normal',
             'Glucose_CAT_Overweight', 'Glucose_CAT_High', 'Age_CAT_Mature','Age_CAT_Old', 'Preg_CAT_One_Time', 
             'Preg_CAT_Many_Times']]

In [None]:
# Separate the target variable 'Outcome' from the dataset
y = df["Outcome"]

# Numeric feature matrix: everything except the target and the encoded
# categorical columns already collected in cat_df. Deriving the drop list
# from cat_df keeps the two selections from drifting apart.
X = df.drop(columns=['Outcome', *cat_df.columns])

# Store the remaining column names in 'cols'
cols = X.columns

# Store the original index in 'index'
index = X.index

In [None]:
# Scale the numeric features, then re-attach the encoded categorical columns.
from sklearn.preprocessing import RobustScaler

# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# so test-set statistics influence the scaling — confirm this is acceptable.
transformer = RobustScaler()
scaled_values = transformer.fit_transform(X)

# Back to a DataFrame with the original labels, side by side with cat_df
X = pd.concat(
    [pd.DataFrame(scaled_values, columns=cols, index=index), cat_df],
    axis=1,
)

In [None]:
X.head()

In [None]:
#Split the data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Print the shapes of X, X_train, and X_test
print("Shape of X:", X.shape)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

In [None]:
# Initialize classifiers.
# A fixed seed (the same value used for the train/test split) makes the
# stochastic learners give the same results on every run.
RANDOM_STATE = 42

rf = RandomForestClassifier(random_state=RANDOM_STATE)
svm = SVC()
lr = LogisticRegression()
knn = KNeighborsClassifier()
# NOTE: this is scikit-learn's GradientBoostingClassifier, not XGBoost; the
# variable name is kept because later cells refer to it.
XGB = GradientBoostingClassifier(random_state=RANDOM_STATE)
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
Ada = AdaBoostClassifier(random_state=RANDOM_STATE)
lgbm = LGBMClassifier(random_state=RANDOM_STATE)

In [None]:
models = [
    ('Logistic Regression', lr),
    ('K-Nearest Neighbors', knn),
    ('Decision Tree', dt),
    ('Random Forest', rf),
    ('Support Vector Machine', svm),
    ('Gradient Boosting', XGB),
    ('AdaBoost', Ada),
    ('LightGBM', lgbm)
]

# Evaluate each model in the list
results = []
names = []

In [None]:
# Filter out LightGBM warnings
warnings.filterwarnings("ignore", category=UserWarning, message="Found whitespace in feature_names")
warnings.filterwarnings("ignore", category=UserWarning, message="No further splits with positive gain")
warnings.filterwarnings("ignore", category=UserWarning, message="Auto-choosing col-wise multi-threading")

In [None]:
# Start from empty lists so that re-running this cell does not append
# duplicate entries to the results.
results = []
names = []

for name, model in models:
    # An integer `cv` with a classifier gives stratified 10-fold splits, so no
    # separate (non-stratified) KFold object is needed.
    cv_results = cross_val_score(model, X, y, cv=10, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s: %.2f (%.2f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

## Model Tuning

In [None]:
# Create a list of classifiers and their respective parameter grids for hyperparameter tuning
classifier_parameter_grids = {
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    },
    'Decision Tree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Support Vector Machine': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [3, 4, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'LightGBM': {
       'n_estimators': [50, 100, 200],
       'learning_rate': [0.01, 0.1, 0.2],
       'max_depth': [3, 4, 5],
       'subsample': [0.8, 1.0],
       'colsample_bytree': [0.8, 1.0],
       'reg_alpha': [0, 0.1, 0.5],
       'reg_lambda': [0, 0.1, 0.5]
    }
}

models = [
    ('Logistic Regression', lr),
    ('K-Nearest Neighbors', knn),
    ('Decision Tree', dt),
    ('Random Forest', rf),
    ('Support Vector Machine', svm),
    ('Gradient Boosting', XGB),
    ('AdaBoost', Ada),
    ('LightGBM', lgbm)
]

In [None]:
from sklearn.model_selection import GridSearchCV

# Define a dictionary to store the best estimator for each classifier after hyperparameter tuning
best_estimators = {}

# Iterate over each classifier and their respective parameter grid
for name, classifier in models:
    # Retrieve the parameter grid for the current classifier
    param_grid = classifier_parameter_grids[name]

    # 10-fold cross-validated grid search scored on accuracy. n_jobs=-1 runs
    # the candidate fits in parallel; it does not change which parameters win.
    grid_search = GridSearchCV(classifier, param_grid, cv=10, scoring="accuracy", n_jobs=-1)

    # Fit the GridSearchCV object on the training data
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters for the current classifier
    print(f'Best hyperparameters for {name}: {grid_search.best_params_}')

    # Store the best estimator (refit on the full training set) in the dictionary
    best_estimators[name] = grid_search.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score

# Create a dictionary to store the tuned models
tuned_models = {}

# Iterate over the best_estimators dictionary
for name, best_estimator in best_estimators.items():
    # GridSearchCV (refit=True by default) has already refit each best
    # estimator on the whole training set. Fitting again is redundant and, for
    # randomised learners, would evaluate a different model from the one the
    # search selected.

    # Make predictions on the testing data
    y_pred = best_estimator.predict(X_test)

    # Calculate and print the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy of {name}: {accuracy:.2f}')

    # Store the trained model in the dictionary
    tuned_models[name] = best_estimator

In [None]:
def _predict_on_test(model_name):
    """Predict test-set labels with the tuned estimator stored under `model_name`."""
    return best_estimators[model_name].predict(X_test)

# Test-set predictions of every tuned classifier
y_pred_lr = _predict_on_test('Logistic Regression')
y_pred_knn = _predict_on_test('K-Nearest Neighbors')
y_pred_dt = _predict_on_test('Decision Tree')
y_pred_rf = _predict_on_test('Random Forest')
y_pred_svm = _predict_on_test('Support Vector Machine')
y_pred_gb = _predict_on_test('Gradient Boosting')
y_pred_ada = _predict_on_test('AdaBoost')
y_pred_lgbm = _predict_on_test('LightGBM')

In [None]:
def _accuracy_pct(predictions):
    """Test-set accuracy of `predictions` as a percentage rounded to two decimals."""
    return round(accuracy_score(y_test, predictions) * 100, 2)

# Accuracy of each tuned model on the held-out test set
lr_acc = _accuracy_pct(y_pred_lr)
knn_acc = _accuracy_pct(y_pred_knn)
dt_acc = _accuracy_pct(y_pred_dt)
rf_acc = _accuracy_pct(y_pred_rf)
svm_acc = _accuracy_pct(y_pred_svm)
gb_acc = _accuracy_pct(y_pred_gb)
ada_acc = _accuracy_pct(y_pred_ada)
lgbm_acc = _accuracy_pct(y_pred_lgbm)

# Report one line per model
for report_label, report_value in (
    ('Logistic Regression Accuracy', lr_acc),
    ('K-Nearest Neighbors Accuracy', knn_acc),
    ('Decision Tree Accuracy', dt_acc),
    ('Random Forest Accuracy', rf_acc),
    ('Support Vector Machine', svm_acc),
    ('Gradient Boosting', gb_acc),
    ('AdaBoost Accuracy', ada_acc),
    ('LightGBM Accuracy', lgbm_acc),
):
    print(f'{report_label} {report_value}%')

In [None]:
from sklearn.metrics import f1_score, roc_auc_score

# Create a dictionary to store the F1 scores and ROC AUC scores
scores = {}

# Iterate over the tuned_models dictionary
for name, model in tuned_models.items():
    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate the F1 score
    f1 = f1_score(y_test, y_pred)

    # ROC AUC needs a continuous score. Prefer class probabilities and fall
    # back to decision_function values for models that expose no
    # predict_proba (for example an SVC built without probability=True).
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]  # Probability of positive class
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    # ROC AUC is None only when the model offers no continuous score at all
    roc_auc = roc_auc_score(y_test, y_score) if y_score is not None else None

    # Store the F1 score and ROC AUC score in the scores dictionary
    scores[name] = {'F1 Score': f1, 'ROC AUC Score': roc_auc}

# Print the F1 scores and ROC AUC scores for each model
for name, score in scores.items():
    if score['ROC AUC Score'] is not None:
        print(f'{name} - F1 Score: {score["F1 Score"]:.2f}, ROC AUC Score: {score["ROC AUC Score"]:.2f}')
    else:
        print(f'{name} - F1 Score: {score["F1 Score"]:.2f}, ROC AUC Score: N/A')

In [None]:
from sklearn.metrics import roc_curve, auc

# Create a dictionary to store the ROC curves and AUC values
roc_curves = {}

# Create a figure and axis for the ROC curve plot
plt.figure(figsize=(8, 6))

# Iterate over the tuned_models dictionary
for name, model in tuned_models.items():
    # Use class probabilities when available, otherwise decision_function
    # values, so models without predict_proba still get a curve.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        continue  # no continuous score available for this model

    # Calculate the ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_score)

    # Calculate the AUC value
    roc_auc = auc(fpr, tpr)

    # Store the ROC curve and AUC value in the roc_curves dictionary
    roc_curves[name] = {'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc}

    # Plot the ROC curve
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

# Plot the diagonal reference line for random guessing
plt.plot([0, 1], [0, 1], 'k--')

# Set plot labels and title
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')

# Show the plot
plt.show()

Print the confusion matrix and classification report for each classifier

In [None]:
# For every tuned classifier, print its test-set confusion matrix followed by
# its classification report, each section ending with a blank-line gap.
for name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    report_sections = (
        (f'Confusion matrix for {name}:', confusion_matrix(y_test, y_pred)),
        (f'Classification report for {name}:', classification_report(y_test, y_pred)),
    )
    for heading, content in report_sections:
        print(heading)
        print(content)
        print('\n')

Plot the Confusion Matrix Heatmaps

In [None]:
# Grid dimensions for the heatmaps. The names are grid_cols / grid_rows so
# that the notebook-wide `num_cols` list of numeric column names is not
# overwritten by an integer.
num_classifiers = len(best_estimators)
grid_cols = 2
grid_rows = int(np.ceil(num_classifiers / grid_cols))

# squeeze=False always returns a 2-D array of axes, so a single-row grid needs
# no special handling.
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(12, 6 * grid_rows), squeeze=False)
fig.suptitle('Confusion Matrix Heatmaps')
flat_axes = axes.ravel()

# One heatmap per tuned classifier
for ax, (name, estimator) in zip(flat_axes, best_estimators.items()):
    # Generate predicted labels for test data using the current classifier
    y_pred = estimator.predict(X_test)

    # fmt="d" prints the cell counts as plain integers instead of the default
    # two-significant-digit format.
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="YlGnBu", ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

# Hide any panel left over when the classifier count does not fill the grid
for unused_ax in flat_axes[num_classifiers:]:
    unused_ax.set_visible(False)

# Adjust layout (leaving room for the suptitle) and show the subplots
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

### Model Comparison

In [None]:
# Model comparison table: test-set accuracy (%) per tuned model.
# NOTE: this rebinds `models`, which earlier cells use for the list of
# (name, estimator) pairs; the name is kept because the plotting cell below
# reads this table as `models`.
models = pd.DataFrame({
    'Model' : [ 'Logistic Regression','K-Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Support Vector Machine','Gradient Boosting', 'AdaBoost', 'LightGBM'],
    'Score' : [ lr_acc, knn_acc, dt_acc, rf_acc, svm_acc, gb_acc, ada_acc, lgbm_acc ]
})

# Keep the ranking instead of discarding the sorted copy
models = models.sort_values(by = 'Score', ascending = False).reset_index(drop=True)
models

In [None]:
# Plot the accuracy for all the models, best model first
ranked = models.sort_values(by='Score', ascending=False)

plt.figure(figsize=(10, 6))
bars = plt.barh(ranked['Model'], ranked['Score'], color='skyblue')
plt.xlabel('Accuracy (%)')
plt.ylabel('Model')
plt.title('Model Comparison')
plt.xlim(0, 100)  # Set x-axis limit to percentage scale
# barh draws the first row at the bottom; with the rows sorted descending,
# inverting the axis puts the highest score on top.
plt.gca().invert_yaxis()

# Add text labels on each bar with scores in percentage format
for bar in bars:
    plt.text(bar.get_width(), bar.get_y() + bar.get_height() / 2, f'{bar.get_width():.2f}%',
             va='center', color='black', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Use the tuned AdaBoost model exactly as it was evaluated. It is already
# fitted on the training data; refitting here could report importances for a
# different fit from the one whose scores were shown.
ada_model = tuned_models['AdaBoost']

# Get feature importances from the AdaBoost model
feature_importances = ada_model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
importance_df_sorted = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the sorted feature importance
print(importance_df_sorted)

# Plot the sorted feature importance
plt.figure(figsize=(10, 6))
plt.barh(importance_df_sorted['Feature'], importance_df_sorted['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance for AdaBoost')
plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature on top
plt.tight_layout()
plt.show()

In [None]:
# Use the tuned Decision Tree exactly as it was evaluated. It is already
# fitted on the training data; refitting here could report importances for a
# different tree from the one whose scores were shown.
dt_model = tuned_models['Decision Tree']

# Get feature importances from the fitted model
feature_importances = dt_model.feature_importances_

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort the DataFrame by Importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print feature importance
print(feature_importance_df)

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance (Decision Tree)')
plt.gca().invert_yaxis()  # Invert y-axis to have the highest importance on top
plt.tight_layout()
plt.show()