# Data Preprocessing:
* Load the dataset.
* Handle categorical and numerical features.
* Split data for supervised learning tasks.

In [25]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

In [3]:
# Load the Salary Dataset
df = pd.read_csv(r'E:\Neuronetix Traning\Tasks\Task 5\heart_2020_cleaned.csv')  
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [4]:
# Preprocessing: One-Hot Encoding categorical variables, Scaling numerical variables
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease'].apply(lambda x: 1 if x == 'Yes' else 0)  # Binary encoding for target

In [5]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Encoding Data

In [45]:
### Encoding variable
yes_no_dict = {
    'Yes':1,
    'No':0
}

for colname in ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']:
    df[colname] = df[colname].map(yes_no_dict)
     

In [47]:
### Encoding sex
f_m_dict = {
    'Male':1,
    'Female':0
}

for colname in ['Sex']:
    df[colname] = df[colname].map(f_m_dict)

In [48]:
### Encoding AgeCategory
age_dict = {
    '18-24' :1,
    '25-29' :2,
    '30-34' :3,
    '35-39' :4,
    '40-44' :5,
    '45-49' :6,
    '50-54' :7,
    '55-59' :8,
    '60-64' :9,
    '65-69' :10,
    '70-74' :11,
    '75-79' :12,
    '80 or older' :13,

}

for colname in ['AgeCategory']:
    df[colname] = df[colname].map(age_dict)

In [49]:
### Encoding GenHealth
gen_dict = {
    'Poor' :1,
    'Fair' :2,
    'Good' :3,
    'Very good' :4,
    'Excellent' :5,
    

}

for colname in ['GenHealth']:
    df[colname] = df[colname].map(gen_dict)

In [50]:
# Onehot Encoding
onehot_columns = ['Race', 'Diabetic']
df= pd.get_dummies(df, columns = onehot_columns)

In [51]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,0,16.60,1,0,0,3.0,30.0,0,0,8,...,False,False,False,False,False,True,False,False,True,False
1,0,20.34,0,0,1,0.0,0.0,0,0,13,...,False,False,False,False,False,True,True,False,False,False
2,0,26.58,1,0,0,20.0,30.0,0,1,10,...,False,False,False,False,False,True,False,False,True,False
3,0,24.21,0,0,0,0.0,0.0,0,0,12,...,False,False,False,False,False,True,True,False,False,False
4,0,23.71,0,0,0,28.0,0.0,1,0,5,...,False,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,1,9,...,False,False,False,True,False,False,False,False,True,False
319791,0,29.84,1,0,0,0.0,0.0,0,1,4,...,False,False,False,True,False,False,True,False,False,False
319792,0,24.24,0,0,0,0.0,0.0,0,0,6,...,False,False,False,True,False,False,True,False,False,False
319793,0,32.81,0,0,0,0.0,0.0,0,0,2,...,False,False,False,True,False,False,True,False,False,False


In [52]:
# Split data into training and test sets for supervised learning
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Supervised Learning: Logistic Regression and Random Forest

## Logistic Regression 

In [53]:
# Logistic Regression pipeline
logreg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('classifier', LogisticRegression(max_iter=1000))])

In [54]:
# Train Logistic Regression
logreg_pipeline.fit(X_train, y_train)

In [55]:
# Predict and evaluate Logistic Regression
y_pred_logreg = logreg_pipeline.predict(X_test)
print("Logistic Regression Report:\n", classification_report(y_test, y_pred_logreg))
print("Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_logreg))
print("Logistic Regression ROC AUC: ", roc_auc_score(y_test, y_pred_logreg))

Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.96     87727
           1       0.55      0.11      0.18      8212

    accuracy                           0.92     95939
   macro avg       0.74      0.55      0.57     95939
weighted avg       0.89      0.92      0.89     95939

Logistic Regression Accuracy:  0.9161237869896497
Logistic Regression ROC AUC:  0.5497257324732852


## Random Forest

In [56]:
# Random Forest pipeline
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', RandomForestClassifier(random_state=42))])

In [57]:
# Train Random Forest
rf_pipeline.fit(X_train, y_train)

In [58]:
# Predict and evaluate Random Forest
y_pred_rf = rf_pipeline.predict(X_test)
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("Random Forest ROC AUC: ", roc_auc_score(y_test, y_pred_rf))

Random Forest Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.95     87727
           1       0.33      0.14      0.19      8212

    accuracy                           0.90     95939
   macro avg       0.63      0.55      0.57     95939
weighted avg       0.87      0.90      0.88     95939

Random Forest Accuracy:  0.9025422403819093
Random Forest ROC AUC:  0.5547163613077635


# Unsupervised Learning: K-Means and GMM

In [59]:
# Preprocessing for unsupervised learning (no train-test split since it's unsupervised)
X_unsupervised = df.drop('HeartDisease', axis=1)

## K-Means Clustering

In [65]:
kmeans = KMeans(n_clusters=2, random_state=42)
clusters_kmeans = kmeans.fit_predict(X_unsupervised)

# Evaluate K-Means
silhouette_kmeans = silhouette_score(X_unsupervised, clusters_kmeans)
print(f"K-Means Silhouette Score: {silhouette_kmeans}")

K-Means Silhouette Score: 0.5741185594944092


## Gaussian Mixture Model (GMM)

In [61]:
# Gaussian Mixture Model (GMM)
gmm = GaussianMixture(n_components=2, random_state=42)
clusters_gmm = gmm.fit_predict(X_unsupervised)

# Evaluate GMM
silhouette_gmm = silhouette_score(X_unsupervised, clusters_gmm)
print(f"GMM Silhouette Score: {silhouette_gmm}")

GMM Silhouette Score: 0.15195150241127484


# Comparison:
* For Supervised: Compare models based on classification performance (e.g., ROC-AUC).
* For Unsupervised: Compare models based on clustering metrics (e.g., Silhouette Score).

In [66]:
# Compare models
print(f"\nComparison:\nLogistic Regression Accuracy: {accuracy_score(y_test, y_pred_logreg)}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(f"K-Means Silhouette Score: {silhouette_kmeans}")
print(f"GMM Silhouette Score: {silhouette_gmm}")


Comparison:
Logistic Regression Accuracy: 0.9161237869896497
Random Forest Accuracy: 0.9025422403819093
K-Means Silhouette Score: 0.5741185594944092
GMM Silhouette Score: 0.15195150241127484
