In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# Function to preprocess the data
def preprocess_data(df):
    """Label-encode the categorical columns and standardize every feature.

    The input dataframe is not modified; encoding is applied to a copy.

    Args:
        df: Feature dataframe. It must contain every column listed in
            ``categorical_cols`` below. All columns (encoded or not) are
            passed to ``StandardScaler``, so the remaining columns are
            expected to be numeric already.

    Returns:
        The standardized feature matrix produced by
        ``StandardScaler.fit_transform``.

    Raises:
        KeyError: If any of the expected categorical columns is missing.
    """
    categorical_cols = ['General_Health', 'Checkup', 'Exercise', 'Skin_Cancer',
                        'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis',
                        'Sex', 'Age_Category', 'Smoking_History', 'Alcohol_Consumption',
                        'Fruit_Consumption', 'Green_Vegetables_Consumption',
                        'FriedPotato_Consumption']

    # Fail early with the full list of missing columns instead of stopping
    # at the first one halfway through the encoding loop.
    missing = [col for col in categorical_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Missing categorical columns: {missing}")

    # Work on a copy so the caller's dataframe keeps its original values.
    encoded = df.copy()

    # A fresh encoder per column keeps each column's mapping independent.
    for col in categorical_cols:
        encoded[col] = LabelEncoder().fit_transform(encoded[col])

    # Standardize all features so they contribute on a comparable scale.
    return StandardScaler().fit_transform(encoded)

# Function to perform and visualize k-means clustering
def kmeans_analysis(X, n_clusters=3):
    """Cluster X with k-means and show a 2-D PCA scatter plot of the result.

    Args:
        X: Feature matrix to cluster.
        n_clusters: Number of clusters requested from k-means.

    Returns:
        The cluster label assigned to each row of X.
    """
    # Fit k-means with a fixed seed and get one label per sample.
    model = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = model.fit_predict(X)

    # Project onto the first two principal components for plotting only;
    # the clustering itself ran on the full feature matrix.
    projected = PCA(n_components=2).fit_transform(X)
    first_pc = projected[:, 0]
    second_pc = projected[:, 1]

    # Scatter plot coloured by cluster label.
    plt.figure(figsize=(10, 6))
    points = plt.scatter(first_pc, second_pc, c=cluster_labels, cmap='viridis')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title('K-Means Clustering Results (PCA Visualization)')
    plt.colorbar(points, label='Cluster Label')
    plt.show()

    return cluster_labels

# Function to perform and visualize DBSCAN clustering
def dbscan_analysis(X, eps=0.5, min_samples=5):
    """Cluster X with DBSCAN and show a 2-D PCA scatter plot of the result.

    Args:
        X: Feature matrix to cluster.
        eps: Neighbourhood radius passed to DBSCAN.
        min_samples: Minimum neighbourhood size passed to DBSCAN.

    Returns:
        The cluster label assigned to each row of X.
    """
    # Fit DBSCAN and get one label per sample.
    model = DBSCAN(eps=eps, min_samples=min_samples)
    cluster_labels = model.fit_predict(X)

    # Project onto the first two principal components for plotting only;
    # the clustering itself ran on the full feature matrix.
    projected = PCA(n_components=2).fit_transform(X)
    first_pc = projected[:, 0]
    second_pc = projected[:, 1]

    # Scatter plot coloured by cluster label.
    plt.figure(figsize=(10, 6))
    points = plt.scatter(first_pc, second_pc, c=cluster_labels, cmap='viridis')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title('DBSCAN Clustering Results (PCA Visualization)')
    plt.colorbar(points, label='Cluster Label')
    plt.show()

    return cluster_labels

# Function to analyze clusters in relation to heart disease
def analyze_clusters(df, cluster_labels, algorithm_name):
    """Print per-cluster statistics and plot heart disease by cluster.

    Args:
        df: Dataframe containing the 'Heart_Disease', 'Age_Category', 'BMI'
            and 'Exercise' columns. A 'Cluster' column is added to it (or
            overwritten) in place.
        cluster_labels: One cluster label per row of ``df``.
        algorithm_name: Name used in the printed header and the plot title.

    Returns:
        None. The statistics are printed and the histogram is shown.
    """
    # Add the cluster labels to the dataframe (visible to the caller).
    df['Cluster'] = cluster_labels

    # The statistics need numeric values, but the caller may pass the raw
    # (un-encoded) dataframe. Encode non-numeric columns on a working copy
    # so the caller's original values are left alone.
    stat_cols = ['Heart_Disease', 'Age_Category', 'BMI', 'Exercise']
    stats = df[['Cluster'] + stat_cols].copy()
    for col in stat_cols:
        if not pd.api.types.is_numeric_dtype(stats[col]):
            codes = stats[col].astype('category').cat.codes
            # Category code -1 marks a missing value; keep it as NaN so it
            # does not distort the means.
            stats[col] = codes.where(codes >= 0)

    # Compute the statistics per cluster.
    cluster_analysis = stats.groupby('Cluster').agg({
        'Heart_Disease': ['count', 'mean'],
        'Age_Category': 'mean',
        'BMI': 'mean',
        'Exercise': 'mean'
    })

    print(f"\n{algorithm_name} Clustering Analysis:")
    print("================================")
    print("\nCluster Statistics:")
    print(cluster_analysis)

    # Visualize heart disease distribution in clusters.
    plt.figure(figsize=(10, 6))
    for cluster in stats['Cluster'].unique():
        cluster_data = stats[stats['Cluster'] == cluster]
        # The column is named 'Heart_Disease' (with an underscore).
        plt.hist(cluster_data['Heart_Disease'], alpha=0.5,
                 label=f'Cluster {cluster}', bins=20)

    plt.title(f'Heart Disease Distribution by {algorithm_name} Cluster')
    plt.xlabel('Heart Disease')
    plt.ylabel('Count')
    plt.legend()
    plt.show()

# Main execution
def main():
    """Load the dataset, cluster it with k-means and DBSCAN, and report.

    Reads ``../CVD_cleaned.csv`` (relative to the current working
    directory), removes the 'Heart_Disease' column from the clustering
    features, preprocesses them, and then runs each clustering algorithm
    followed by the per-cluster analysis.
    """
    file_path = "../CVD_cleaned.csv"
    df = pd.read_csv(file_path)

    # The target variable is excluded from the clustering features.
    X = df.drop(columns='Heart_Disease')
    X_scaled = preprocess_data(X)

    # Each entry: banner to print, clustering function, display name.
    runs = (
        ("\nPerforming k-means clustering...", kmeans_analysis, "K-Means"),
        ("\nPerforming DBSCAN clustering...", dbscan_analysis, "DBSCAN"),
    )
    for banner, cluster_fn, algorithm_name in runs:
        print(banner)
        labels = cluster_fn(X_scaled)
        analyze_clusters(df, labels, algorithm_name)


if __name__ == "__main__":
    main()

**Data Preprocessing:**

- Converts categorical variables to integer codes using LabelEncoder
- Standardizes all features so that each one carries equal weight
- Handles the mix of numerical and categorical data


**K-Means Analysis:**

- Groups patients into distinct clusters based on all features
- Can identify natural groupings of patients with similar characteristics
- Useful for finding patterns that might correlate with heart disease
- The implementation uses 3 clusters by default, but this can be adjusted via `n_clusters`


**DBSCAN Analysis:**

- Identifies clusters of varying shapes and sizes
- Can detect outliers (labeled as -1)
- Better at handling non-spherical clusters
- The parameters (`eps` and `min_samples`) can be tuned based on your data


**Visualization and Analysis:**

- Uses PCA to create 2D visualizations of the clusters
- Provides summary statistics for each cluster
- Shows the distribution of heart disease within each cluster
- Includes relevant metrics such as mean age category, BMI, and exercise level

To use this script:

1. Replace the data loading section with your actual dataset
2. Adjust the preprocessing if needed for your specific column names
3. Tune the clustering parameters (`n_clusters` for k-means, `eps` and `min_samples` for DBSCAN)
4. Run the analysis to get insights about patient groups