In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.linear_model import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
import matplotlib.pyplot as plt

ImportError: cannot import name 'LinearDiscriminantAnalysis' from 'sklearn.linear_model' (C:\Users\RCallis\AppData\Roaming\Python\Python311\site-packages\sklearn\linear_model\__init__.py)

# 1. Read in the dataset and review its columns

In [None]:
nigeria = pd.read_csv('data/heart_attack_nigeria_youth_vs_adult.csv')
print(nigeria.columns)

# 2. Clean the dataset

In [None]:
nigeria.columns = (
    nigeria.columns
    .str.strip()          
    .str.lower()  
    .str.replace(' ', '_')  
    .str.replace('[^a-z0-9_]', '', regex=True)
)

print(nigeria.columns)

# 3. Fix null values and categorical data

In [None]:
print(nigeria.isnull().sum())

In [None]:
nigeria.fillna('Unknown', inplace=True)

In [None]:
nigeria = pd.get_dummies(nigeria, drop_first=True)

# 4. Find a model

## 

In [None]:
X = nigeria.drop('heart_attack', axis=1)
Y = nigeria['heart_attack']

# Split the dataset into training and validation sets
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.20, random_state=1, shuffle=True)

# Standardize the features for PCA and clustering
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Models for supervised learning
supervised_models = [
    ('LDA', LinearDiscriminantAnalysis()),  # Linear Discriminant Analysis
    ('KNN', KNeighborsClassifier()),       # K-Nearest Neighbors (default n_neighbors=5)
    ('CART', DecisionTreeClassifier()),    # Decision Tree Classifier (default max_depth=None)
    ('NB', GaussianNB()),                  # Naive Bayes
    ('RF', RandomForestClassifier()),      # Random Forest
    ('Bagging', BaggingClassifier()),      # Bagging Classifier
]

# Unsupervised models
unsupervised_models = [
    ('PCA', PCA(n_components=2)),           # Principal Component Analysis for dimensionality reduction
    ('KMeans', KMeans(n_clusters=3)),       # K-Means clustering with 3 clusters
    ('DBSCAN', DBSCAN()),                   # DBSCAN clustering
    ('Agglomerative', AgglomerativeClustering(n_clusters=3))  # Agglomerative Clustering
]

# Train and evaluate supervised models
supervised_results = []
names = []

for name, model in supervised_models:
    kfold = StratifiedKFold(n_splits=10, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    supervised_results.append(cv_results)
    names.append(name)
    print(f'{name} (Supervised): {cv_results.mean():.6f} ({cv_results.std():.6f})')

# Train and evaluate unsupervised models
unsupervised_results = []
unsupervised_names = []

for name, model in unsupervised_models:
    if name == 'PCA':
        # Apply PCA and print explained variance
        pca_result = model.fit_transform(X_train_scaled)
        unsupervised_results.append(pca_result)
        unsupervised_names.append(f'{name} (Components)')
        print(f'{name} (Unsupervised): Explained Variance = {model.explained_variance_ratio_.sum():.6f}')
    elif name == 'KMeans':
        # Fit KMeans and print inertia
        kmeans_result = model.fit_predict(X_train_scaled)
        unsupervised_results.append(kmeans_result)
        unsupervised_names.append(f'{name} (Clusters)')
        print(f'{name} (Unsupervised): Inertia = {model.inertia_:.6f}')
    elif name == 'DBSCAN':
        # Fit DBSCAN and print number of clusters
        dbscan_result = model.fit_predict(X_train_scaled)
        unsupervised_results.append(dbscan_result)
        unsupervised_names.append(f'{name} (Clusters)')
        print(f'{name} (Unsupervised): Number of clusters = {len(set(dbscan_result))}')
    elif name == 'Agglomerative':
        # Fit Agglomerative Clustering and print number of clusters
        agglomerative_result = model.fit_predict(X_train_scaled)
        unsupervised_results.append(agglomerative_result)
        unsupervised_names.append(f'{name} (Clusters)')
        print(f'{name} (Unsupervised): Number of clusters = {len(set(agglomerative_result))}')