In [None]:
# Hypertension Risk Prediction Project
# Advanced ML Project – 6 Credits / 200 Marks

# -----------------------------
# Step 1: Import Libraries
# -----------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import joblib

# -----------------------------
# Step 2: Load & Explore Dataset
# -----------------------------
# Read the raw data; the path is resolved relative to the working directory.
df = pd.read_csv("hypertension_dataset.csv")  # replace with your dataset path
print("First 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nDataset Description:")
print(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())

# Visualize target distribution (one bar per value of the 'Hypertension' column)
sns.countplot(x='Hypertension', data=df)
plt.title("Hypertension Distribution")
plt.show()

# -----------------------------
# Step 3: Data Preprocessing
# -----------------------------
categorical_cols = ['Gender', 'Smoking', 'Alcohol', 'Family_History', 'Physical_Activity']  # adjust as per dataset

# Impute categorical gaps with the most frequent value BEFORE encoding.
# If NaN reached LabelEncoder it would either be rejected or become a class
# of its own (depending on the scikit-learn version), and the median fill
# further down could no longer repair it.
for col in categorical_cols:
    if df[col].isnull().any():
        most_frequent = df[col].mode(dropna=True)
        # An all-NaN column has no mode; leave it untouched in that case.
        if not most_frequent.empty:
            df[col] = df[col].fillna(most_frequent.iloc[0])

# Encode categorical features. Each column gets its own fitted encoder, kept
# in `label_encoders`, so the original category labels can be recovered
# later with inverse_transform.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Fill the remaining missing values with the column median.
# numeric_only=True keeps this from raising if a non-numeric column is left.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split below — confirm this is acceptable for the evaluation.
df.fillna(df.median(numeric_only=True), inplace=True)

# Split features and target
X = df.drop('Hypertension', axis=1)
y = df['Hypertension']

# Train-test split (80/20). stratify=y keeps the class proportions of the
# target the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: statistics are fitted on the training partition only and
# then applied to the test partition. Both become NumPy arrays here.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -----------------------------
# Step 4: Train Multiple ML Models
# -----------------------------
# Candidate classifiers, keyed by the display name used in reports and plots.
# Estimators with internal randomness are seeded with the same value as the
# train/test split so the comparison is reproducible from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True, random_state=42),
    # NOTE(review): use_label_encoder is reported as deprecated by recent
    # XGBoost releases — confirm against the installed version.
    "XGBoost": xgb.XGBClassifier(
        use_label_encoder=False, eval_metric='logloss', random_state=42
    ),
}

# Fit every model on the training partition and record its test accuracy.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"\n{name} Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-------------------------------")

# -----------------------------
# Step 5: Compare Model Performance
# -----------------------------
# One bar per trained model, in the order the models were evaluated.
model_names = list(results)
model_accuracies = [results[label] for label in model_names]

plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=model_accuracies)
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # accuracy is a fraction, so pin the axis to [0, 1]
plt.xticks(rotation=45)
plt.show()

# -----------------------------
# Step 6: Feature Importance (Random Forest)
# -----------------------------
# The fitted Random Forest is picked by name here, not by comparing scores.
best_model = models["Random Forest"]

# X still holds the original column labels, which serve as bar labels for
# the forest's per-feature importances.
features = X.columns
importances = best_model.feature_importances_

plt.figure(figsize=(10, 6))
sns.barplot(y=features, x=importances)
plt.title("Feature Importance - Random Forest")
plt.show()

# -----------------------------
# Step 7: ROC Curve for Best Model
# -----------------------------
# Take the second probability column from predict_proba as the score
# that is swept to build the curve.
class_probabilities = best_model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)  # area under the (fpr, tpr) curve

roc_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a reference line
plt.title('ROC Curve - Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# -----------------------------
# Step 8: Hyperparameter Tuning (Random Forest)
# -----------------------------
# Exhaustive search over 3 x 4 x 3 = 36 parameter combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10]
}
grid = GridSearchCV(
    # Seeded so that differences between candidates come from the
    # hyperparameters and not from a different random forest per fit.
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # the fits are independent, so use all available cores
)
grid.fit(X_train, y_train)
print("\nBest Hyperparameters:", grid.best_params_)
print(f"Best CV Accuracy: {grid.best_score_:.4f}")

# best_estimator_ is the winning configuration refit on the full training set.
best_model = grid.best_estimator_

# Score the tuned model on the held-out test partition so it can be compared
# with the untuned models evaluated earlier.
tuned_acc = accuracy_score(y_test, best_model.predict(X_test))
print(f"Tuned Random Forest Test Accuracy: {tuned_acc:.4f}")

# -----------------------------
# Step 9: Save Best Model
# -----------------------------
joblib.dump(best_model, "hypertension_rf_model.pkl")
print("\nModel saved as 'hypertension_rf_model.pkl'")

# The model was fitted on StandardScaler output, so the fitted scaler has to
# be persisted as well; without it the saved model cannot be applied to raw
# feature values at prediction time.
joblib.dump(scaler, "hypertension_scaler.pkl")
print("Scaler saved as 'hypertension_scaler.pkl'")

# -----------------------------
# Step 10: Cross-Validation
# -----------------------------
# 5-fold cross-validation of the selected model over the complete feature
# frame X and target y (X is the frame built before the train/test split,
# i.e. it has not been passed through the scaler).
cv_scores = cross_val_score(best_model, X, y, cv=5)
mean_cv_accuracy = np.mean(cv_scores)
print("\nCross-validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", mean_cv_accuracy)
