# Machine Learning Pipeline

### A. Load the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the diabetes dataset from a CSV file located next to this notebook.
df = pd.read_csv('./diabetes_dataset_with_notes.csv')
# Quick preview to confirm the file was read.
df.head()

### B. Perform Data Pre-processing

#### i. Perform basic data exploration, such as the following: (10 pts)

In [None]:
# 1. Preview the top of the table (5 rows, pandas' default for head()).
df.head(n=5)

In [None]:
# 2. Missing values: number of null entries in each column.
df.isnull().sum()

In [None]:
# 3. Summary statistics of numerical features.
#    describe() is called with its defaults, so the table reports pandas'
#    standard descriptive statistics per column.
df.describe()

In [None]:
# 4. Check data types: show the dtype pandas assigned to every column.
#    Useful for spotting the non-numeric columns before encoding.
df.dtypes

In [None]:
# 5. Class balance: number of rows for each value of the 'diabetes' column
#    (diabetic vs. non-diabetic cases).
df['diabetes'].value_counts()

#### ii. The dataset contains categorical attributes, encode them using Label Encoding (5 pts)

In [None]:
# Text-valued columns that need a numeric representation.
categorical_cols = ['location', 'gender', 'smoking_history']

# Work on a copy so the original frame (df) is left unchanged.
df_encoded = df.copy()

# One LabelEncoder per categorical column, keyed by column name, kept so the
# integer codes can be mapped back to the original labels later.
label_encoders = {name: LabelEncoder() for name in categorical_cols}

# LabelEncoder gives every distinct value an integer code (0 .. n_classes - 1).
# The codes go into a new '<column>_encoded' column; the original text column
# stays in place as a reference.
for name, encoder in label_encoders.items():
    df_encoded[f'{name}_encoded'] = encoder.fit_transform(df_encoded[name])

df_encoded.head()

#### iii. Perform data visualization (15 pts)

##### 1. Plot the Class Distribution

In [None]:
# Count each class once; .get(..., 0) keeps the tick labels valid even if one
# of the classes is missing from the data.
class_counts = df_encoded['diabetes'].value_counts()
non_diabetic_count = class_counts.get(0, 0)
diabetic_count = class_counts.get(1, 0)

# Legend text is read from the encoder fitted on 'gender': classes_[i] is the
# original label that was encoded as i, so the legend cannot drift out of sync
# with the encoding the way a hard-coded list of names could.
gender_labels = [str(label) for label in label_encoders['gender'].classes_]

# Creating the diabetes distribution plot using Seaborn's countplot
plt.figure(figsize=(10, 6))  # (width, height) of the figure in inches

# Uses the encoded dataset: x is the diabetes class, hue splits every bar by
# the encoded gender, palette selects the colour scheme.
sns.countplot(x='diabetes', data=df_encoded, palette='Set2', hue="gender_encoded")

# Some added customization with Matplotlib
plt.title('Class Distribution of Diabetes based on Gender', fontsize=14)  # Title of the graph
plt.xlabel('Diabetes Status (Left = No, Right = Yes)', fontsize=14)  # x-axis (horizontal) label
plt.ylabel('Count', fontsize=14)  # y-axis (vertical) label
# Replace the 0/1 tick labels with readable names that include the class totals.
plt.xticks(
    ticks=[0, 1],
    labels=[f'Non-Diabetic Total: {non_diabetic_count}', f'Diabetic total: {diabetic_count}'],
    fontsize=12,
)

# Legend entries follow the hue levels (0, 1, 2, ...), i.e. the encoder's class order.
plt.legend(title='Gender', labels=gender_labels, fontsize=12, title_fontsize=14)

plt.show()

##### 2. Visualize Feature Correlations

In [None]:
# Columns that take part in the correlation analysis: the numeric attributes
# of the dataset plus the label-encoded versions of the categorical ones.
numerical_cols = [
    'year', 'age',
    'race:AfricanAmerican', 'race:Asian', 'race:Caucasian', 'race:Hispanic', 'race:Other',
    'hypertension', 'heart_disease',
    'bmi', 'hbA1c_level', 'blood_glucose_level',
    'diabetes',
    'location_encoded', 'gender_encoded', 'smoking_history_encoded',
]

# Restrict the encoded frame to those columns and compute pairwise correlations.
numerical_df = df_encoded[numerical_cols]
corr_matrix = numerical_df.corr()

# Draw the matrix as an annotated heatmap on a fixed [-1, 1] colour scale
# centred at 0, so positive and negative correlations are visually symmetric.
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=0.5,
    square=True,
)

ax.set_title('Feature Correlation Heatmap', fontsize=16, pad=20)
plt.xticks(rotation=45, ha='right', fontsize=12)  # Slanted x labels stay readable
plt.yticks(fontsize=12)

# Adjust the layout to prevent clipped labels, then display the plot.
plt.tight_layout()
plt.show()

##### 3. Boxplot to Check Outliers

In [None]:
# Step 1: Select continuous numerical columns
continuous_cols = ['year', 'age', 'bmi', 'hbA1c_level', 'blood_glucose_level']

# Step 2: Set Seaborn style (applies to every plot drawn after this call)
sns.set_style("whitegrid")

# Step 3: Create one subplot per column. The panel count and figure width are
# derived from the list, so adding or removing a column cannot leave an empty
# panel or index past the end of the axes array.
n_panels = len(continuous_cols)
fig, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(3 * n_panels, 5))
# plt.subplots returns a bare Axes (not an array) when there is one panel;
# normalise to a 1-D array so the loop below works for any column count.
axes = np.atleast_1d(axes)

# Step 4: Plot each feature in its own subplot
for ax, col in zip(axes, continuous_cols):
    sns.boxplot(y=df_encoded[col], ax=ax, color='skyblue')
    ax.set_title(col, fontsize=12)
    ax.set_ylabel('')  # The panel title already names the feature

# Step 5: Customize and display
plt.suptitle('Boxplots to Check Outliers in Continuous Features', fontsize=16, y=1.05)
plt.tight_layout()
plt.show()

### C. Perform Feature Selection. 
##### There are 17 attributes in the dataset. Drop the unnecessary columns and keep only 10 attributes. Justify why you chose those 10 attributes. (10 pts)

In [None]:
# Source frame is df_encoded: the original attributes plus the three
# '*_encoded' columns created in the label-encoding step.

# Step 1: Define the 10 selected attributes (nine predictors + the 'diabetes' target)
selected_cols = ['age', 'gender_encoded', 'location_encoded', 'hypertension',
                 'heart_disease', 'smoking_history_encoded', 'bmi', 'hbA1c_level',
                 'blood_glucose_level', 'diabetes']

# Step 2: Create a new DataFrame with only the selected columns.
# .copy() makes it an independent frame, so later in-place edits (e.g. scaling
# a column) neither raise SettingWithCopyWarning nor touch df_encoded.
data_selected = df_encoded[selected_cols].copy()

# Step 3: Verify the result
print("Shape of selected dataset:", data_selected.shape)
print("\nFirst few rows of selected dataset:")
print(data_selected.head())

#### Justification Summary
##### Clinical Relevance: bmi, hbA1c_level, blood_glucose_level are direct diabetes indicators; hypertension, heart_disease, smoking_history_encoded are risk factors.
##### Demographic Insight: age, gender_encoded, location_encoded capture patient context with moderate predictive value.
##### Target: diabetes is essential as the outcome.
##### Dropped: Race columns are redundant and less impactful; year and clinical_notes lack immediate usability without further processing.