### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score,StratifiedKFold

from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import KFold
#import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
import os

### Importing Data File

In [None]:

# Current working directory of the kernel (relative paths are resolved against it)
dir_path = os.getcwd()

# Read the balanced diabetes CSV into a DataFrame
dataset_csv = '../../AIBetic2Dataset/balanced_diabetes_data.csv'
dataFrame = pd.read_csv(dataset_csv)


In [None]:

# Preview the first 50 rows of the dataset
dataFrame.head(50)

In [None]:

# Count the missing (NaN) entries in every column
missing_mask = dataFrame.isna()
missing_mask.sum()

In [None]:


# Print a summary of the DataFrame (columns, non-null counts and dtypes)
dataFrame.info()

### Distribution of Different Features

In [None]:
# Bar chart of the number of rows per 'class' value; hue mirrors x so that each
# class gets its own palette colour, and the redundant legend is suppressed
class_distribution_plot = sns.countplot(
    data=dataFrame,
    x='class',
    hue='class',
    palette='viridis',
    legend=False,
)

# Leave the Axes object as the cell's last expression so the figure is rendered
class_distribution_plot


In [None]:


# Create a figure with two side-by-side axes: pie chart (left), bar chart (right)
figure, subplot_axes = plt.subplots(1, 2, figsize=(14,7))

# Number of rows per class. value_counts() orders its result by frequency, so
# the order of the classes is data-dependent and must not be assumed.
class_counts = dataFrame['class'].value_counts()

# Derive the wedge labels from the index of class_counts so every wedge is
# labelled with the class it really represents. Numeric codes are mapped to
# readable names; any other value (e.g. already a string) is shown as-is.
class_display_names = {1: "Positive", 0: "Negative"}
pie_labels = [class_display_names.get(class_value, str(class_value)) for class_value in class_counts.index]

# Define the color palette and explode parameters
color_palette_pie = sns.color_palette("pastel", 7)
# Pull out only the first (most frequent) wedge, whatever the number of classes
explode_params = [0.1 if position == 0 else 0 for position in range(len(class_counts))]

# Create a pie chart on the first subplot
subplot_axes[0].pie(class_counts, autopct='%1.0f%%', startangle=60, labels=pie_labels, colors=color_palette_pie, explode=explode_params, shadow=True, wedgeprops={"linewidth":2,"edgecolor":"k"})
subplot_axes[0].set_title("Target Variable Composition")

# Create a horizontal bar plot on the second subplot and print each count inside its bar
class_counts.plot(kind='barh', ax=subplot_axes[1], color='skyblue',legend=False)
for idx, val in enumerate(class_counts.values):
    subplot_axes[1].text(val * 0.7, idx, str(val), weight='bold', fontsize=20)
subplot_axes[1].set_title("Number of Instances for Each Class")

# Adjust the layout and display the plots
plt.tight_layout()
plt.show()

In [None]:


# Grouped count plot of 'Gender', with one bar colour per 'class' value
gender_plot = sns.countplot(x=dataFrame['Gender'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'Gender' value and one column per class
cross_tab_criteria = ['Gender', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'Gender' group (classes become columns)
class_by_gender = dataFrame.groupby("Gender")["class"]
class_by_gender.value_counts().unstack()

Females in the dataset turned out to be positive more often.

In [None]:
# Grouped count plot of 'Polyuria', with one bar colour per 'class' value
polyuria_plot = sns.countplot(x=dataFrame['Polyuria'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'Polyuria' value and one column per class
cross_tab_criteria = ['Polyuria', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'Polyuria' group (classes become columns)
class_by_polyuria = dataFrame.groupby("Polyuria")["class"]
class_by_polyuria.value_counts().unstack()

In [None]:


# Grouped count plot of 'Polydipsia', with one bar colour per 'class' value
polydipsia_plot = sns.countplot(x=dataFrame['Polydipsia'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'Polydipsia' value and one column per class
cross_tab_criteria = ['Polydipsia', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'Polydipsia' group (classes become columns)
class_by_polydipsia = dataFrame.groupby("Polydipsia")["class"]
class_by_polydipsia.value_counts().unstack()

In [None]:


# Grouped count plot of 'sudden weight loss', with one bar colour per 'class' value
sudden_weight_loss_plot = sns.countplot(x=dataFrame['sudden weight loss'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'sudden weight loss' value and one column per class
cross_tab_criteria = ['sudden weight loss', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'sudden weight loss' group (classes become columns)
class_by_sudden_weight_loss = dataFrame.groupby("sudden weight loss")["class"]
class_by_sudden_weight_loss.value_counts().unstack()

In [None]:


# Grouped count plot of 'weakness', with one bar colour per 'class' value
weakness_plot = sns.countplot(x=dataFrame['weakness'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'weakness' value and one column per class
cross_tab_criteria = ['weakness', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'weakness' group (classes become columns)
class_by_weakness = dataFrame.groupby("weakness")["class"]
class_by_weakness.value_counts().unstack()

In [None]:


# Grouped count plot of 'Polyphagia', with one bar colour per 'class' value
polyphagia_plot = sns.countplot(x=dataFrame['Polyphagia'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'Polyphagia' value and one column per class
cross_tab_criteria = ['Polyphagia', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'Polyphagia' group (classes become columns)
class_by_polyphagia = dataFrame.groupby("Polyphagia")["class"]
class_by_polyphagia.value_counts().unstack()

In [None]:


# Grouped count plot of 'Genital thrush', with one bar colour per 'class' value
genital_thrush_plot = sns.countplot(x=dataFrame['Genital thrush'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'Genital thrush' value and one column per class
cross_tab_criteria = ['Genital thrush', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'Genital thrush' group (classes become columns)
class_by_genital_thrush = dataFrame.groupby("Genital thrush")["class"]
class_by_genital_thrush.value_counts().unstack()

In [None]:


# Grouped count plot of 'visual blurring', with one bar colour per 'class' value
visual_blurring_plot = sns.countplot(x=dataFrame['visual blurring'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'visual blurring' value and one column per class
cross_tab_criteria = ['visual blurring', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'visual blurring' group (classes become columns)
class_by_visual_blurring = dataFrame.groupby("visual blurring")["class"]
class_by_visual_blurring.value_counts().unstack()

In [None]:


# Grouped count plot of 'Itching', with one bar colour per 'class' value
itching_plot = sns.countplot(x=dataFrame['Itching'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'Itching' value and one column per class
cross_tab_criteria = ['Itching', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'Itching' group (classes become columns)
class_by_itching = dataFrame.groupby("Itching")["class"]
class_by_itching.value_counts().unstack()

In [None]:


# Grouped count plot of 'Irritability', with one bar colour per 'class' value
irritability_plot = sns.countplot(x=dataFrame['Irritability'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'Irritability' value and one column per class
cross_tab_criteria = ['Irritability', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'Irritability' group (classes become columns)
class_by_irritability = dataFrame.groupby("Irritability")["class"]
class_by_irritability.value_counts().unstack()

In [None]:


# Grouped count plot of 'delayed healing', with one bar colour per 'class' value
delayed_healing_plot = sns.countplot(x=dataFrame['delayed healing'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'delayed healing' value and one column per class
cross_tab_criteria = ['delayed healing', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'delayed healing' group (classes become columns)
class_by_delayed_healing = dataFrame.groupby("delayed healing")["class"]
class_by_delayed_healing.value_counts().unstack()

In [None]:


# Grouped count plot of 'partial paresis', with one bar colour per 'class' value
partial_paresis_plot = sns.countplot(x=dataFrame['partial paresis'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'partial paresis' value and one column per class
cross_tab_criteria = ['partial paresis', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'partial paresis' group (classes become columns)
class_by_partial_paresis = dataFrame.groupby("partial paresis")["class"]
class_by_partial_paresis.value_counts().unstack()

In [None]:


# Grouped count plot of 'muscle stiffness', with one bar colour per 'class' value
muscle_stiffness_plot = sns.countplot(x=dataFrame['muscle stiffness'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'muscle stiffness' value and one column per class
cross_tab_criteria = ['muscle stiffness', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'muscle stiffness' group (classes become columns)
class_by_muscle_stiffness = dataFrame.groupby("muscle stiffness")["class"]
class_by_muscle_stiffness.value_counts().unstack()

In [None]:


# Grouped count plot of 'Alopecia', with one bar colour per 'class' value
alopecia_plot = sns.countplot(x=dataFrame['Alopecia'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'Alopecia' value and one column per class
cross_tab_criteria = ['Alopecia', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'Alopecia' group (classes become columns)
class_by_alopecia = dataFrame.groupby("Alopecia")["class"]
class_by_alopecia.value_counts().unstack()

In [None]:


# Grouped count plot of 'Obesity', with one bar colour per 'class' value
obesity_plot = sns.countplot(x=dataFrame['Obesity'], hue=dataFrame['class'], palette='Greens')

# The table below has one row per 'Obesity' value and one column per class
cross_tab_criteria = ['Obesity', 'class']
row_field, column_field = cross_tab_criteria

# Light-to-dark blue colormap used to shade the table cells
color_map = sns.light_palette("blue", as_cmap=True)

# normalize='columns' turns the counts into fractions that sum to 1 within each class
cross_tab = pd.crosstab(dataFrame[row_field], dataFrame[column_field], normalize='columns')

# Express the fractions as percentages with two decimals
cross_tab_percentages = (cross_tab * 100).round(2)

# Apply the gradient shading and display the styled table
styled_cross_tab = cross_tab_percentages.style.background_gradient(cmap=color_map)
styled_cross_tab

In [None]:
# Row counts per 'class' value inside each 'Obesity' group (classes become columns)
class_by_obesity = dataFrame.groupby("Obesity")["class"]
class_by_obesity.value_counts().unstack()

### Occurrences of Symptoms in Patients

In [None]:

# Define a function to create a bar plot
def plotBar(value, title):
    """Draw a bar chart of how often each distinct value occurs in a column.

    Parameters
    ----------
    value : pandas.Series
        Column whose value frequencies are plotted.
    title : str
        Title shown above the chart.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Set the figure size
    plt.figure(figsize=(6,4))
    # Get the counts of each unique value in the input series
    value_counts = value.value_counts()
    # 'green' for a present symptom, 'red' for any other value. A present
    # symptom is either the string 'Yes' or its numeric encoding 1, so the
    # colouring works for both representations of the column.
    colors = ['green' if (v == 'Yes' or v == 1) else 'red' for v in value_counts.index]
    # Use string tick labels so the x-axis is categorical even when the
    # values are numeric (otherwise 0/1 would be placed on a continuous axis)
    plt.bar(value_counts.index.astype(str), value_counts.values, color=colors)
    # Set the title of the plot
    plt.title(title)
    # Display the plot
    plt.show()

# Every column other than 'Age', 'class' and 'Gender' is treated as a symptom column
non_symptom_columns = ["Age", "class", "Gender"]
df_symptoms = dataFrame[dataFrame.columns.difference(non_symptom_columns)]

# One bar chart per symptom, titled with the capitalized column name
for symptom_name in df_symptoms.columns:
    plotBar(df_symptoms[symptom_name], symptom_name.capitalize())


  

In [None]:


# Keep only the rows whose 'class' value is 1 and drop the non-symptom columns
positive_diabetes = dataFrame[dataFrame["class"] == 1].copy()
positive_diabetes.drop(['Age', 'Gender', 'class'], axis=1, inplace=True)

# Map binary values to 'Yes' and 'No'
positive_diabetes = positive_diabetes.replace({1: 'Yes', 0: 'No'})

# One row per symptom, one column per distinct value ('Yes' / 'No').
# A symptom that never takes one of the values gets NaN from value_counts;
# treat that as a count of 0 so the percentages below stay well defined.
symptom_counts = positive_diabetes.apply(pd.Series.value_counts).transpose().fillna(0)

# Ensure 'Yes' and 'No' columns exist
if 'Yes' in symptom_counts.columns and 'No' in symptom_counts.columns:
    # Share of positive patients with / without each symptom
    total_per_symptom = symptom_counts['Yes'] + symptom_counts['No']
    symptom_counts['Symptom_Present_Percentage'] = symptom_counts['Yes'] / total_per_symptom * 100
    symptom_counts['Symptom_Absent_Percentage'] = symptom_counts['No'] / total_per_symptom * 100
    symptom_counts.drop(['Yes', 'No'], inplace=True, axis=1)

    # '\033[1m' is the ANSI escape sequence that switches the terminal to bold
    print('\033[1m' + '\n\t People who were tested positive for Diabetes')
    print('**********************************************************')
    # Most frequent symptoms first
    sorted_symptoms = symptom_counts.sort_values(by='Symptom_Present_Percentage', ascending=False)
    print(sorted_symptoms)
else:
    print("Error: 'Yes' and/or 'No' columns not found in symptom_counts DataFrame")


### Data Pre-Processing

In [None]:
# Encode the target: 'Positive' becomes 1 and 'Negative' becomes 0
class_encoding = {'Positive': 1, 'Negative': 0}
dataFrame['class'] = dataFrame['class'].replace(class_encoding)

# Show the first rows of the encoded target
dataFrame['class'].head()

In [None]:
# The 'class' column is the prediction target
target = dataFrame['class']
# Every remaining column is used as an input feature
features = dataFrame.drop(columns=['class'])

In [None]:
# Boolean mask of the feature columns stored with dtype 'object'
is_object_dtype = features.dtypes == 'object'

# Names of those columns, lower-cased to match the renamed frame below
object_columns = features.columns[is_object_dtype].str.lower()

# Lower-case every column name of the features DataFrame
features.columns = features.columns.str.lower()

print(object_columns)

In [None]:
# info() prints the column summary itself and returns None, which is printed as well
features_summary = features.info()
print(features_summary)

In [None]:


# A single encoder instance is re-fitted for every object-typed column
label_encoder = LabelEncoder()

for column in object_columns:
    # Cast to string first, then replace the column by its integer codes
    column_as_text = features[column].astype(str)
    features[column] = label_encoder.fit_transform(column_as_text)

# Show the column summary after encoding
print(features.info())

In [None]:
# Preview the first 50 rows of the encoded features
features.head(50)

In [None]:


# Pairwise correlation between all feature columns
correlation_matrix = features.corr()

# Draw the matrix as an annotated heatmap on its own figure
fig, ax = plt.subplots(figsize=(12, 10))
heatmap_options = {"annot": True, "fmt": ".2f", "cmap": 'viridis', "ax": ax}
heatmap = sns.heatmap(correlation_matrix, **heatmap_options)
heatmap.set_title("Correlation Matrix of Diabetes Dataset")

# Display the plot
plt.show()

In [None]:

# Score every feature against the target with the chi-squared test and keep the 10 best
k_best_selector = SelectKBest(score_func=chi2, k=10)
fitted_selector = k_best_selector.fit(features, target)

# Table pairing each feature name with its chi-squared score
feature_scores = pd.DataFrame({
    'Feature': features.columns,
    'Score': fitted_selector.scores_,
})
feature_scores

In [None]:
# The 10 rows of the score table with the highest chi-squared score
top_chi2_features = feature_scores.nlargest(n=10, columns='Score')
top_chi2_features

In [None]:
# Column positions kept by the fitted chi-squared selector
chi2_support_indices = fitted_selector.get_support(indices=True)

# Translate the positions into feature names
top_chi2_features = list(features.columns[chi2_support_indices])
top_chi2_features

In [None]:

# Pie chart of the chi-squared scores, one wedge per feature
featureview = pd.Series(data=fitted_selector.scores_, index=features.columns)
featureview.plot.pie(figsize=(10, 10))  # Increase the size as needed


In [None]:
# Keep the features whose variance exceeds the threshold. 0.24 sits just below
# 0.25 = 0.5 * (1 - 0.5), the variance of a perfectly balanced binary (0/1)
# feature, so only near-balanced binary features and wide-ranging numeric
# columns pass.
#variance_selector = VarianceThreshold(threshold=(0.5 * (1 - 0.5)))
variance_threshold = 0.24  # single source of truth for the selector and the table filter
variance_selector = VarianceThreshold(threshold=variance_threshold)
fitted_variance = variance_selector.fit(features)
print(fitted_variance)

# Create a DataFrame with the variances
variance_data = pd.DataFrame(fitted_variance.variances_)

# Create a DataFrame with the column names
column_data1 = pd.DataFrame(features.columns)

# Concatenate the two DataFrames along the columns
high_variance_features = pd.concat([variance_data, column_data1], axis=1)
high_variance_features.columns = ['Variance', 'Feature']

# Show the features whose variance is above the same threshold the selector uses
top_variance_features = high_variance_features[high_variance_features['Variance'] > variance_threshold]
top_variance_features

In [None]:
# Column positions that passed the variance threshold
selected_features_indices = fitted_variance.get_support(indices=True)

# Translate the positions into feature names
selected_features = list(features.columns[selected_features_indices])
selected_features

In [None]:
# Combine the features picked by the chi-squared test and by the variance threshold
combined_feature_names = set(top_chi2_features) | set(selected_features)

# Iterating a set of strings yields an order that can change between Python
# processes (string hash randomization). Walk the original column order instead
# so the feature matrix has the same column layout on every run.
final_selected_features = [column for column in features.columns if column in combined_feature_names]
print(final_selected_features)
# Select these features from the original dataset
X_final = features[final_selected_features]




#X_FS = X[['Polydipsia','sudden weight loss','partial paresis','Irritability','Polyphagia','Age','visual blurring']]

### Splitting the dataset into training and testing sets

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    target,
    test_size=0.2,
    random_state=0,
)


### Data Normalization

In [None]:
# MinMax Scaling
# The frames returned by train_test_split are derived from X_final. Work on
# explicit copies so the column assignments below modify independent objects
# (this avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

minmax = MinMaxScaler()
# Fit the scaler on the training data only, then apply the same scaling to the
# test data. Assigning the flattened result replaces the whole column, so its
# dtype becomes float cleanly.
X_train['age'] = minmax.fit_transform(X_train[['age']]).ravel()
X_test['age'] = minmax.transform(X_test[['age']]).ravel()

X_train.head()

### Model Building/Logistic Regression

Logistic regression is a good fit for binary classification problems such as diabetes (yes or no). It is interpretable and works well with smaller datasets.

In [None]:


# L2-regularised logistic regression, fitted on the training split
log_reg_model = LogisticRegression(penalty='l2', random_state=0)
log_reg_model.fit(X_train, y_train)

# 10-fold stratified cross-validation accuracy on the training split
strat_k_fold = StratifiedKFold(n_splits=10)
cv_scores = cross_val_score(log_reg_model, X_train, y_train, scoring='accuracy', cv=strat_k_fold)

# Predictions on the held-out test split
y_predict_lr = log_reg_model.predict(X_test)

# Accuracy, precision, recall and F1 on the test split
acc_score, prec_score, rec_score, f1_sc = (
    metric(y_test, y_predict_lr)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One-row summary table for this model
log_reg_results = pd.DataFrame([{
    'Model': 'Logistic Regression',
    'Accuracy': acc_score,
    'Cross Val Accuracy': cv_scores.mean(),
    'Precision': prec_score,
    'Recall': rec_score,
    'F1 Score': f1_sc,
}])

# Display the results
log_reg_results

### Model Building/Random Forest



In [None]:

# Random forest with a fixed seed, fitted on the training split
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train, y_train)

# 10-fold stratified cross-validation (shuffled, seeded) on the training split
strat_k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
cv_scores_rf = cross_val_score(random_forest_model, X_train, y_train, scoring='accuracy', cv=strat_k_fold)

# Predictions on the held-out test split
y_predict_rf = random_forest_model.predict(X_test)

# Accuracy, precision, recall and F1 on the test split
acc_score_rf, prec_score_rf, rec_score_rf, f1_sc_rf = (
    metric(y_test, y_predict_rf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One-row summary table for this model
random_forest_results = pd.DataFrame([{
    'Model': 'Random Forest',
    'Accuracy': acc_score_rf,
    'Cross Val Accuracy': cv_scores_rf.mean(),
    'Precision': prec_score_rf,
    'Recall': rec_score_rf,
    'F1 Score': f1_sc_rf,
}])

# Display the results
random_forest_results

### Model Building/ Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.optimizers import Adam

# Neural Network Model
def create_model(input_dim):
    """Build and compile a small feed-forward binary classifier.

    The network has one hidden layer of 16 ReLU units and a single sigmoid
    output unit. It is compiled with binary cross-entropy loss, the Adam
    optimizer (learning rate 0.001) and accuracy as the tracked metric.

    Parameters
    ----------
    input_dim : int
        Number of input features.

    Returns
    -------
    The compiled Sequential model.
    """
    model = Sequential()
    layer_stack = (
        Dense(16, input_dim=input_dim, activation='relu'),  # Hidden layer
        Dense(1, activation='sigmoid'),  # Output layer
    )
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# One input unit per training column
nn_model = create_model(X_train.shape[1])

# Stop when the validation loss has not improved for 10 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train with 10% of the training data held out for validation
nn_history = nn_model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=10,
    verbose=1,
    callbacks=[early_stopping],
    validation_split=0.1,
)

# Threshold the predicted probabilities at 0.5 to obtain class labels
nn_probabilities = nn_model.predict(X_test)
y_predict_nn = (nn_probabilities > 0.5).astype(int)

# Accuracy, precision, recall and F1 on the test split
nn_acc_score, nn_prec_score, nn_rec_score, nn_f1_sc = (
    metric(y_test, y_predict_nn)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One-row summary table for this model (no cross-validation column)
result_nn = pd.DataFrame({
    'Model': ['Neural Network'],
    'Accuracy': [nn_acc_score],
    'Precision': [nn_prec_score],
    'Recall': [nn_rec_score],
    'F1 Score': [nn_f1_sc],
})

# Display the results
result_nn


### Model Building/ Support Vector Machines (SVM)

In [None]:

# Linear-kernel support vector classifier, fitted on the training split
svm_classifier = svm.SVC(kernel='linear', random_state=0)
svm_classifier.fit(X_train, y_train)

# 10-fold stratified cross-validation accuracy on the training split
strat_k_fold = StratifiedKFold(n_splits=10)
cv_scores_svm = cross_val_score(svm_classifier, X_train, y_train, scoring='accuracy', cv=strat_k_fold)

# Predictions on the held-out test split
y_pred_svm = svm_classifier.predict(X_test)

# Accuracy, precision, recall and F1 on the test split
acc_score_svm, prec_score_svm, rec_score_svm, f1_sc_svm = (
    metric(y_test, y_pred_svm)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One-row summary table for this model
results_svm = pd.DataFrame([{
    'Model': 'SVM',
    'Accuracy': acc_score_svm,
    'Cross Val Accuracy': cv_scores_svm.mean(),
    'Precision': prec_score_svm,
    'Recall': rec_score_svm,
    'F1 Score': f1_sc_svm,
}])

# Display the results
results_svm

### Model Building/ K Nearest Neighbors

In [None]:


# 5-nearest-neighbours classifier with Euclidean distance, fitted on the training split
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn_classifier.fit(X_train, y_train)

# 10-fold stratified cross-validation (shuffled, seeded) on the training split
strat_k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores_knn = cross_val_score(knn_classifier, X_train, y_train, scoring='accuracy', cv=strat_k_fold)

# Predictions on the held-out test split
y_predict_knn = knn_classifier.predict(X_test)

# Accuracy, precision, recall and F1 on the test split
acc_score_knn, prec_score_knn, rec_score_knn, f1_sc_knn = (
    metric(y_test, y_predict_knn)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One-row summary table for this model
knn_results = pd.DataFrame([{
    'Model': 'KNN',
    'Accuracy': acc_score_knn,
    'Cross Val Accuracy': cv_scores_knn.mean(),
    'Precision': prec_score_knn,
    'Recall': rec_score_knn,
    'F1 Score': f1_sc_knn,
}])

# Display the results
knn_results

### Model Building/XGB Classifier


In [None]:
# Initialize XGBoost classifier
classifier = xgb.XGBClassifier(random_state=0)

# Fit the classifier to the training set
classifier.fit(X_train, y_train)

# Predicting the Test set results. Stored under an XGBoost-specific name so
# these predictions cannot be mistaken for (or silently overwrite) the SVM ones.
y_predict_xgb = classifier.predict(X_test)

# Evaluate model on the test split
acc = accuracy_score(y_test, y_predict_xgb)
prec = precision_score(y_test, y_predict_xgb)
rec = recall_score(y_test, y_predict_xgb)
f1 = f1_score(y_test, y_predict_xgb)

# Perform cross-validation. Stratified folds keep the class ratio of every
# fold equal to that of the training set, matching the strategy used for the
# other models so the cross-validation accuracies are comparable.
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
acc_xgb = cross_val_score(classifier, X_train, y_train, cv=kfold, scoring='accuracy')


# Compile the results into a DataFrame
results_xgb = pd.DataFrame([['XGB', acc, acc_xgb.mean(), prec, rec, f1]], 
                       columns=['Model', 'Accuracy', 'Cross Val Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Display the results
results_xgb

In [None]:
# Combine the results of all models, including XGBoost, into one comparison table.
# result_nn has no 'Cross Val Accuracy' column, so that cell is NaN for the neural network.
combined_results = pd.concat(
    [log_reg_results, results_svm, knn_results, result_nn, random_forest_results, results_xgb],
    ignore_index=True,
)

# Display the combined results
combined_results

### Confusion matrix to visualize the performance of the models

In [None]:
# Recompute the test-set predictions of the four fitted scikit-learn models,
# in this order: logistic regression, SVM, random forest, KNN
y_predict_lr, y_predict_svm, y_predict_rf, y_predict_knn = (
    fitted_model.predict(X_test)
    for fitted_model in (log_reg_model, svm_classifier, random_forest_model, knn_classifier)
)


In [None]:

# Print the distinct labels found in the true test labels and in every prediction vector
labelled_vectors = (
    ("Unique values in y_test:", y_test),
    ("Unique values in y_predict_lr:", y_predict_lr),
    ("Unique values in y_predict_svm:", y_predict_svm),
    ("Unique values in y_predict_rf:", y_predict_rf),
    ("Unique values in y_predict_knn:", y_predict_knn),
)
for message, label_vector in labelled_vectors:
    print(message, pd.unique(label_vector))

In [None]:

# Create confusion matrices for each model
cm_lr = confusion_matrix(y_test, y_predict_lr)
cm_svm = confusion_matrix(y_test, y_predict_svm)
cm_rf = confusion_matrix(y_test, y_predict_rf)
cm_knn = confusion_matrix(y_test, y_predict_knn)

# Function to plot confusion matrix
def plot_confusion_matrix(cm, model_name):
    """Draw a confusion matrix as an annotated blue heatmap on a new figure.

    Parameters
    ----------
    cm : array-like
        The matrix handed to ``sns.heatmap``; every cell is annotated using
        the integer format ``"d"``.
    model_name : str
        Name inserted into the figure title.

    The figure is created but not shown; the caller decides when to call
    ``plt.show()``.
    """
    # Work on an explicit Axes rather than pyplot's implicit "current axes".
    _, heatmap_axes = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap='Blues', ax=heatmap_axes)
    heatmap_axes.set_title(f'Confusion Matrix for {model_name}')
    heatmap_axes.set_ylabel('Actual label')
    heatmap_axes.set_xlabel('Predicted label')

# Draw one heatmap figure per model, then render all of them together.
for model_matrix, model_title in (
    (cm_lr, "Logistic Regression"),
    (cm_svm, "SVM"),
    (cm_rf, "Random Forest"),
    (cm_knn, "KNN"),
):
    plot_confusion_matrix(model_matrix, model_title)

plt.show()

# Confusion matrix of the neural network's predictions against y_test
nn_confusion_matrix = confusion_matrix(y_test, y_predict_nn)

# Render it as an annotated heatmap on its own figure.
# NOTE(review): the tick labels assume the first row/column is the negative
# class and the second the positive class — confirm against the label encoding.
nn_figure, nn_axes = plt.subplots(figsize=(8, 6))
sns.heatmap(
    nn_confusion_matrix,
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=['Predicted Negative', 'Predicted Positive'],
    yticklabels=['Actual Negative', 'Actual Positive'],
    ax=nn_axes,
)
nn_axes.set_title('Confusion Matrix for Neural Network Model')
nn_axes.set_ylabel('True Class')
nn_axes.set_xlabel('Predicted Class')
plt.show()


A confusion matrix is a tool commonly used in classification tasks to visualise the performance of a model. It is a square matrix that compares the actual target values with the values predicted by the model. For a binary classifier it has four components:

True Positives (TP): These are cases where the model correctly predicts the positive class.

True Negatives (TN): These are cases where the model correctly predicts the negative class.

False Positives (FP), also known as Type I error: These are cases where the model incorrectly predicts the positive class.

False Negatives (FN), also known as Type II error: These are cases where the model incorrectly predicts the negative class.

The confusion matrix typically looks like this:

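| Actual \ Predicted | Predicted Positive | Predicted Negative |
|--------------------|--------------------|--------------------|
| Actual Positive    | TP                 | FN                 |
| Actual Negative    | FP                 | TN                 |

Note: scikit-learn's `confusion_matrix` orders the classes by label value, so in the heatmaps plotted above the negative class comes first: the top-left cell is TN and the bottom-right cell is TP.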


Interpretation:
Accuracy: (TP + TN) / (TP + TN + FP + FN). This measures how often the classifier makes the correct prediction. It’s the ratio of the number of correct predictions to the total number of predictions.

Precision: TP / (TP + FP). This measures how many of the items identified as positive were actually positive. It’s important in contexts where False Positives are more significant than False Negatives.

Recall (Sensitivity): TP / (TP + FN). This measures how many of the actual positive items were identified correctly. It’s important in contexts where False Negatives are more significant than False Positives.

F1 Score: 2 * (Precision * Recall) / (Precision + Recall). This is a harmonic mean of precision and recall and gives a balance between them.

Specificity: TN / (TN + FP). This measures the proportion of actual negatives that are correctly identified as such.

Contextual Importance:
- In medical testing, high recall is usually more important, because missing a positive (disease) case (a False Negative) can be more harmful than raising a False Positive.
- In spam detection, high precision is usually more important, because classifying a legitimate email as spam (a False Positive) is often seen as worse than failing to catch a spam email (a False Negative).

### Model Testing


In [None]:
import unittest

class TestAllModels(unittest.TestCase):
    """Smoke tests that run the trained models on a separate test CSV.

    The dataset is loaded and preprocessed once for the whole class. Each
    test feeds the resulting feature matrix to one model, checks that the
    predictions are well-formed (one binary label per row), and prints the
    actual and predicted labels side by side for manual inspection.

    The models (``svm_classifier``, ``log_reg_model``, ``nn_model``,
    ``knn_classifier``) are expected to exist as notebook-level variables.
    """

    @classmethod
    def setUpClass(cls):
        """Load and preprocess the test dataset once for all test methods."""
        super().setUpClass()

        # Load your test dataset
        dataset = pd.read_csv('../../AIBetic2Dataset/test_model_dataset.csv')

        # Convert the Yes/No symptom columns to 1/0. 'Age' and 'Gender' are
        # deliberately not listed: they hold no Yes/No values.
        yes_no_columns = ['delayed healing', 'Polyuria', 'Alopecia', 'Itching', 'visual blurring', 'weakness', 'partial paresis', 'Polydipsia', 'Polyphagia', 'sudden weight loss', 'Irritability']
        for column in yes_no_columns:
            dataset[column] = dataset[column].replace({'Yes': 1, 'No': 0})

        dataset['Gender'] = dataset['Gender'].replace({'Male': 1, 'Female': 0})

        dataset['class'] = dataset['class'].replace({'Positive': 1, 'Negative': 0})

        # Convert column names to lowercase
        dataset.columns = dataset.columns.str.lower()

        # Select columns; this order is also the feature order fed to the models.
        selected_columns = ['itching', 'sudden weight loss', 'weakness', 'partial paresis', 'polyphagia', 'visual blurring', 'irritability', 'alopecia', 'polydipsia', 'delayed healing', 'polyuria', 'age', 'gender', 'class']

        # .copy() so the scaling below writes to an independent frame rather
        # than to a slice of the original one.
        dataset = dataset[selected_columns].copy()

        # Scale the 'age' column BEFORE splitting off the features. Doing it
        # afterwards only changed cls.dataset and left cls.X_test unscaled.
        # NOTE(review): this fits a fresh scaler on the test data; reusing the
        # scaler fitted on the training data would keep both on the same
        # scale — confirm which one is intended.
        minmax = MinMaxScaler()
        dataset[['age']] = minmax.fit_transform(dataset[['age']])

        # Split the dataset into features and target
        cls.dataset = dataset
        cls.X_test = dataset.drop('class', axis=1)
        cls.y_test = dataset['class']

    def _check_and_report(self, model_name, y_pred):
        """Assert the predictions are well-formed, then print them next to the actual labels."""
        y_pred = np.asarray(y_pred).ravel()
        self.assertEqual(
            len(y_pred), len(self.y_test),
            f"{model_name}: expected one prediction per test row",
        )
        self.assertTrue(
            np.isin(y_pred, (0, 1)).all(),
            f"{model_name}: predictions must be binary class labels (0 or 1)",
        )
        print(f"Model: {model_name}")
        print(pd.DataFrame({'Actual': self.y_test, 'Predicted': y_pred}))

    def test_svm(self):
        """Run the support vector machine on the test features."""
        self._check_and_report("Support Vector Machine", svm_classifier.predict(self.X_test))

    def test_log_reg(self):
        """Run the logistic regression model on the test features."""
        self._check_and_report("Logistic Regression", log_reg_model.predict(self.X_test))

    def test_nn(self):
        """Run the neural network and threshold its outputs at 0.5."""
        scores = nn_model.predict(self.X_test)
        # Outputs above 0.5 count as the positive class; flatten to 1-D.
        y_pred = (scores > 0.5).astype(int).flatten()
        self._check_and_report("Neural Network", y_pred)

    def test_knn(self):
        """Run the k-nearest-neighbours classifier on the test features."""
        self._check_and_report("K-Nearest Neighbors", knn_classifier.predict(self.X_test))

    
# Run the test cases in-process when this code executes as the main module.
# The single argv entry is a placeholder for the program name, so unittest
# does not try to parse the host process's own command-line arguments.
# exit=False stops unittest.main from calling sys.exit() once the run ends
# (presumably so the notebook kernel keeps running — confirm).
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)