In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, silhouette_score
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import numpy as np

# Load dataset
# The cleaned heart-disease CSV is read from a fixed location into a DataFrame.
# NOTE(review): this is an absolute path on one author's machine, so the cell
# only runs where that exact file exists.
heart_disease_csv = 'H:\\Samuel\'s folder\\2024 - OPIT\\Assessments\\ML\\Assessment 3 ML - Group Project\\Data\\heart_disease_cleaned.csv'
heart_disease_data = pd.read_csv(heart_disease_csv)

# Separate features (X) and target (y).
# The ID column and the target column 'num' are removed from the feature matrix.
X = heart_disease_data.drop(columns=['id', 'num'])
y = heart_disease_data['num']

# Identify categorical and numerical columns by dtype.
# 'number' matches every numeric dtype rather than only float64/int64, and
# 'category' is accepted alongside object/bool. A column that lands in neither
# list is dropped by the ColumnTransformer built from these lists (its
# `remainder` is left at the default), so the selection must not miss any.
categorical_cols = X.select_dtypes(include=['object', 'bool', 'category']).columns
numerical_cols = X.select_dtypes(include='number').columns

# Preprocessing pipelines

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode (unknown categories are ignored rather than raising).
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical features: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Send each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Split dataset into training and testing sets.
# 20% of the rows are held out for testing; the split is stratified on y and
# seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Initialize models
# Keys are the display names used when the results are printed; every
# estimator is given the same fixed seed.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)

# Train and evaluate supervised models.
# For each classifier: fit preprocessing + model on the training split, predict
# on the test split, and store the per-class report and the macro one-vs-rest
# ROC-AUC under the model's display name.
model_results = {}

for name, model in models.items():
    # NOTE: `preprocessor` is the same object in every pipeline, so each fit()
    # below refits it in place on X_train.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    # Train the model
    pipeline.fit(X_train, y_train)

    # Predictions and evaluation
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    # Collect metrics.
    # zero_division=0 reports the same 0.0 that the default ("warn") already
    # yields for a class with no predicted samples, but without raising an
    # UndefinedMetricWarning for it.
    metrics = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    # ROC-AUC needs class probabilities; it is None for models without them.
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro') if y_pred_proba is not None else None

    model_results[name] = {
        "classification_report": metrics,
        "roc_auc_score": auc
    }

# Perform K-means clustering on the preprocessed training features.
# n_init is pinned to 10: scikit-learn 1.2/1.3 emit a FutureWarning when it is
# left implicit, and 1.4+ change the default to 'auto' (a single run with
# k-means++ initialisation), which would alter the clustering between versions.
# NOTE(review): n_clusters=2 although the target `num` shows five classes in the
# recorded classification report — confirm this is intended.
pipeline_kmeans = Pipeline([
    ('preprocessor', preprocessor),
    ('kmeans', KMeans(n_clusters=2, n_init=10, random_state=42))
])

pipeline_kmeans.fit(X_train)
# Cluster label assigned to each training row during fit.
clusters = pipeline_kmeans.named_steps['kmeans'].labels_
# Silhouette is computed in the same transformed feature space K-means was fit on.
silhouette_avg = silhouette_score(pipeline_kmeans.named_steps['preprocessor'].transform(X_train), clusters)

# Output results
# One section per supervised model (name, raw classification-report dict,
# ROC-AUC), followed by the K-means silhouette score.
print("Supervised Model Results:")
for name in model_results:
    results = model_results[name]
    print(
        f"\n{name}:\n",
        f"Classification Report:\n{results['classification_report']}",
        f"ROC-AUC Score: {results['roc_auc_score']}",
        sep="\n",
    )

print(
    "\nUnsupervised Model Results:",
    f"K-means Silhouette Score: {silhouette_avg}",
    sep="\n",
)


Supervised Model Results:

Logistic Regression:

Classification Report:
{'0': {'precision': 0.9047619047619048, 'recall': 0.926829268292683, 'f1-score': 0.9156626506024096, 'support': 82.0}, '1': {'precision': 0.631578947368421, 'recall': 0.6792452830188679, 'f1-score': 0.6545454545454545, 'support': 53.0}, '2': {'precision': 0.4444444444444444, 'recall': 0.18181818181818182, 'f1-score': 0.25806451612903225, 'support': 22.0}, '3': {'precision': 0.30303030303030304, 'recall': 0.47619047619047616, 'f1-score': 0.37037037037037035, 'support': 21.0}, '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6.0}, 'accuracy': 0.6847826086956522, 'macro avg': {'precision': 0.45676311992101465, 'recall': 0.4528166418640418, 'f1-score': 0.4397285983294534, 'support': 184.0}, 'weighted avg': {'precision': 0.6728563833826992, 'recall': 0.6847826086956522, 'f1-score': 0.6697306715919736, 'support': 184.0}}
ROC-AUC Score: 0.8810212533459065

Random Forest:

Classification Report:
{'0': {'p

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, silhouette_score
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import numpy as np

# Load dataset using the provided file path
# NOTE(review): this is an absolute path on one author's machine; the recorded
# run of this cell ended in FileNotFoundError. Confirm the location before
# re-running.
file_path = r"C:\Users\Fredrich Bernard\Desktop\Online Writing\Pullman\ML\Data\heart_disease_cleaned.csv"
heart_disease_data = pd.read_csv(file_path)

# Preview the first few rows to confirm successful loading. Leaving the frame
# as the cell's last expression lets Jupyter render it as a table instead of
# the plain-text dump produced by print().
heart_disease_data.head()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Fredrich Bernard\\Desktop\\Online Writing\\Pullman\\ML\\Data\\heart_disease_cleaned.csv'

In [None]:
# Separate features (X) and target (y).
# The ID column and the target column 'num' are removed from the feature matrix.
X = heart_disease_data.drop(columns=['id', 'num'])
y = heart_disease_data['num']

# Identify categorical and numerical columns by dtype.
# 'number' matches every numeric dtype rather than only float64/int64, and
# 'category' is accepted alongside object/bool. A column that lands in neither
# list is dropped by the ColumnTransformer built from these lists (its
# `remainder` is left at the default), so the selection must not miss any.
categorical_cols = X.select_dtypes(include=['object', 'bool', 'category']).columns
numerical_cols = X.select_dtypes(include='number').columns

# Preprocessing pipelines

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode (unknown categories are ignored rather than raising).
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical features: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Send each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Split dataset into training and testing sets.
# 20% of the rows are held out for testing; the split is stratified on y and
# seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Initialize models
# Keys are the display names used when the results are printed; every
# estimator is given the same fixed seed.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)

# Train and evaluate supervised models.
# For each classifier: fit preprocessing + model on the training split, predict
# on the test split, and store the per-class report and the macro one-vs-rest
# ROC-AUC under the model's display name.
model_results = {}

for name, model in models.items():
    # NOTE: `preprocessor` is the same object in every pipeline, so each fit()
    # below refits it in place on X_train.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    # Train the model
    pipeline.fit(X_train, y_train)

    # Predictions and evaluation
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    # Collect metrics.
    # zero_division=0 reports the same 0.0 that the default ("warn") already
    # yields for a class with no predicted samples, but without raising an
    # UndefinedMetricWarning for it.
    metrics = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    # ROC-AUC needs class probabilities; it is None for models without them.
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro') if y_pred_proba is not None else None

    model_results[name] = {
        "classification_report": metrics,
        "roc_auc_score": auc
    }

# Perform K-means clustering on the preprocessed training features.
# n_init is pinned to 10: scikit-learn 1.2/1.3 emit a FutureWarning when it is
# left implicit, and 1.4+ change the default to 'auto' (a single run with
# k-means++ initialisation), which would alter the clustering between versions.
# NOTE(review): n_clusters=2 although the target `num` shows five classes in the
# recorded classification report — confirm this is intended.
pipeline_kmeans = Pipeline([
    ('preprocessor', preprocessor),
    ('kmeans', KMeans(n_clusters=2, n_init=10, random_state=42))
])

pipeline_kmeans.fit(X_train)
# Cluster label assigned to each training row during fit.
clusters = pipeline_kmeans.named_steps['kmeans'].labels_
# Silhouette is computed in the same transformed feature space K-means was fit on.
silhouette_avg = silhouette_score(pipeline_kmeans.named_steps['preprocessor'].transform(X_train), clusters)

# Output results
# One section per supervised model (name, raw classification-report dict,
# ROC-AUC), followed by the K-means silhouette score.
print("Supervised Model Results:")
for name in model_results:
    results = model_results[name]
    print(
        f"\n{name}:\n",
        f"Classification Report:\n{results['classification_report']}",
        f"ROC-AUC Score: {results['roc_auc_score']}",
        sep="\n",
    )

print(
    "\nUnsupervised Model Results:",
    f"K-means Silhouette Score: {silhouette_avg}",
    sep="\n",
)

  super()._check_params_vs_input(X, default_n_init=10)


Supervised Model Results:

Logistic Regression:

Classification Report:
{'0': {'precision': 0.9047619047619048, 'recall': 0.926829268292683, 'f1-score': 0.9156626506024096, 'support': 82.0}, '1': {'precision': 0.631578947368421, 'recall': 0.6792452830188679, 'f1-score': 0.6545454545454544, 'support': 53.0}, '2': {'precision': 0.4444444444444444, 'recall': 0.18181818181818182, 'f1-score': 0.2580645161290322, 'support': 22.0}, '3': {'precision': 0.30303030303030304, 'recall': 0.47619047619047616, 'f1-score': 0.37037037037037035, 'support': 21.0}, '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6.0}, 'accuracy': 0.6847826086956522, 'macro avg': {'precision': 0.45676311992101465, 'recall': 0.4528166418640418, 'f1-score': 0.4397285983294533, 'support': 184.0}, 'weighted avg': {'precision': 0.6728563833826992, 'recall': 0.6847826086956522, 'f1-score': 0.6697306715919736, 'support': 184.0}}
ROC-AUC Score: 0.8812752402579284

Random Forest:

Classification Report:
{'0': {'pr