In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
import warnings

# Suppress warnings to keep the output clean
warnings.filterwarnings('ignore')

# Load the dataset
file_path = r'C:\Users\USER\Downloads\CVD_cleaned (1).csv'
data = pd.read_csv(file_path)

# Encode categorical variables
label_encoder = LabelEncoder()
categorical_columns = ['Checkup', 'Exercise', 'Sex', 'Age_Category', 'Smoking_History', 'Alcohol_Consumption']
for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])

# Define features and target variable
target_column = 'General_Health'
X = data.drop(['General_Health', 'Heart_Disease', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis'], axis=1)
y = data[target_column]

# Encode the target variable if it is categorical
if y.dtype == 'object' or y.dtype.name == 'category':
    y = label_encoder.fit_transform(y)

# Check class distribution of the target variable
unique, counts = np.unique(y, return_counts=True)
print(f"Class distribution of {target_column}: {dict(zip(unique, counts))}")

# Reduce the dataset size for quicker iterations (using a subset of the data)
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=0.2, stratify=y, random_state=42)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)

# Apply SMOTE to balance the training dataset
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Feature selection
selector = SelectKBest(score_func=f_classif, k=10)
X_train_selected = selector.fit_transform(X_train_scaled, y_train_resampled)
X_test_selected = selector.transform(X_test_scaled)

# Define a function to evaluate models
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    report = classification_report(y_test, predictions)
    return accuracy, f1, report

# Train and evaluate Random Forest with Randomized Search
rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

random_search_rf = RandomizedSearchCV(RandomForestClassifier(random_state=42), rf_params, n_iter=5, cv=3, n_jobs=-1, verbose=2)
random_search_rf.fit(X_train_selected, y_train_resampled)
best_rf_model = random_search_rf.best_estimator_
rf_accuracy, rf_f1, rf_report = evaluate_model(best_rf_model, X_test_selected, y_test)
print("Best Random Forest Accuracy:", rf_accuracy)
print("Best Random Forest F1 Score:", rf_f1)
print("Best Random Forest Classification Report:\n", rf_report)

# Train and evaluate Gradient Boosting with Randomized Search
gb_params = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

random_search_gb = RandomizedSearchCV(GradientBoostingClassifier(random_state=42), gb_params, n_iter=5, cv=3, n_jobs=-1, verbose=2)
random_search_gb.fit(X_train_selected, y_train_resampled)
best_gb_model = random_search_gb.best_estimator_
gb_accuracy, gb_f1, gb_report = evaluate_model(best_gb_model, X_test_selected, y_test)
print("Best Gradient Boosting Accuracy:", gb_accuracy)
print("Best Gradient Boosting F1 Score:", gb_f1)
print("Best Gradient Boosting Classification Report:\n", gb_report)

# Train and evaluate XGBoost with Randomized Search
xgb_params = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

random_search_xgb = RandomizedSearchCV(xgb.XGBClassifier(random_state=42, use_label_encoder=False), xgb_params, n_iter=5, cv=3, n_jobs=-1, verbose=2)
random_search_xgb.fit(X_train_selected, y_train_resampled)
best_xgb_model = random_search_xgb.best_estimator_
xgb_accuracy, xgb_f1, xgb_report = evaluate_model(best_xgb_model, X_test_selected, y_test)
print("Best XGBoost Accuracy:", xgb_accuracy)
print("Best XGBoost F1 Score:", xgb_f1)
print("Best XGBoost Classification Report:\n", xgb_report)

# Train and evaluate Logistic Regression
lr_model = LogisticRegression(max_iter=500, random_state=42)
lr_model.fit(X_train_selected, y_train_resampled)
lr_accuracy, lr_f1, lr_report = evaluate_model(lr_model, X_test_selected, y_test)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression F1 Score:", lr_f1)
print("Logistic Regression Classification Report:\n", lr_report)

Class distribution of General_Health: {0: 55954, 1: 35810, 2: 95364, 3: 11331, 4: 110395}
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Random Forest Accuracy: 0.34668933139064273
Best Random Forest F1 Score: 0.3511388765133881
Best Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.40      0.36      2238
           1       0.22      0.24      0.23      1433
           2       0.39      0.32      0.35      3814
           3       0.14      0.28      0.19       453
           4       0.42      0.38      0.40      4416

    accuracy                           0.35     12354
   macro avg       0.30      0.33      0.31     12354
weighted avg       0.36      0.35      0.35     12354

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Gradient Boosting Accuracy: 0.3418326048243484
Best Gradient Boosting F1 Score: 0.3447175839296761
Best Gradient Boosting Classification Report:
           