In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

# Resolve the dataset path relative to the directory the script is run from.
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Construct the full file path
file_path = os.path.join(current_directory, "CVD_cleaned.csv")

# Load the dataset. Only the read itself is guarded, so a failure in the
# summary calls below is not mistaken for a missing file.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    print("Please make sure the file is in the correct directory.")
    # Stop with a non-zero status. The bare `exit()` helper is injected by the
    # `site` module for interactive use and reports success (status 0).
    raise SystemExit(1) from None
else:
    # Display basic information about the dataset
    df.info()
    print(df.head())

# Integer-encode every object-dtype column in place. One encoder is created
# per column and kept in `label_encoders` so it can be reused later.
object_columns = df.select_dtypes(include=['object']).columns
label_encoders = {column: LabelEncoder() for column in object_columns}
for column, encoder in label_encoders.items():
    df[column] = encoder.fit_transform(df[column])

# Define features and target: 'General_Health' is predicted from all other columns.
X = df.drop(columns=['General_Health'])
y = df['General_Health']

# Split data BEFORE any fitted preprocessing so that no statistic computed
# from the test rows can influence what the models are trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Work on explicit copies so the column assignments below modify independent
# frames rather than objects derived from X.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features: learn mean/std from the training split only, then
# apply that same transformation to the test split.
scaler = StandardScaler()
num_cols = ['Height_(cm)', 'Weight_(kg)', 'BMI']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Baseline classifiers, each seeded with random_state=42.
svm_model = SVC(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Fit every classifier on the training split, in the order they were created.
for estimator in (svm_model, rf_model, lr_model):
    estimator.fit(X_train, y_train)

# Score each fitted model on the test split, print its metrics, and record its
# accuracy in `results` keyed by the model's display name.
models = {
    'SVM': svm_model,
    'Random Forest': rf_model,
    'Logistic Regression': lr_model,
}
results = {}
separator = "-" * 50

for model_name, fitted_model in models.items():
    test_predictions = fitted_model.predict(X_test)
    results[model_name] = accuracy_score(y_test, test_predictions)
    print(f"Model: {model_name}\nAccuracy: {results[model_name]:.4f}")
    print("Classification Report:\n", classification_report(y_test, test_predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, test_predictions))
    print(separator)

# Bar chart of the accuracies collected in `results`, one bar per model.
model_names = list(results.keys())
accuracy_values = list(results.values())
bar_colors = ['blue', 'green', 'red']

plt.figure(figsize=(8, 6))
plt.bar(model_names, accuracy_values, color=bar_colors)
plt.ylabel('Accuracy')
plt.title('Model Accuracies')
plt.show()

# Randomized hyperparameter search for the Random Forest: 5 sampled
# configurations from the candidate values below, 3-fold cross-validation,
# ranked by accuracy.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

random_search_rf = RandomizedSearchCV(
    rf_model,
    param_grid_rf,
    n_iter=5,
    cv=3,
    scoring='accuracy',
    random_state=42,
)
random_search_rf.fit(X_train, y_train)

best_rf_params = random_search_rf.best_params_
print(f"Best hyperparameters for Random Forest: {best_rf_params}")

# Example prediction (replace with actual user input)
example_index = 0  # positional index of the row to take from X_test

# Index with a list so the result stays a one-row DataFrame. The search was
# fitted on a DataFrame, and converting the row to a bare NumPy array would
# drop the column names and collapse all columns to a single dtype.
example_input = X_test.iloc[[example_index]]

# Predict using the best Random Forest model
prediction = random_search_rf.predict(example_input)[0]

# If the target column was label-encoded earlier in this script, translate the
# integer class back to its original label; otherwise report the raw value.
target_encoder = label_encoders.get('General_Health')
if target_encoder is not None:
    predicted_label = target_encoder.inverse_transform([prediction])[0]
else:
    predicted_label = prediction
print(f"Predicted health status for example input: {predicted_label}")
