<a href="https://colab.research.google.com/github/Shivani-2204/Foresights-of-heart-stroke/blob/main/Implementation_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from google.colab import files
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Upload heart_data.csv
print("Please upload the first dataset heart_data.csv")
uploaded1 = files.upload()  # Upload first dataset
file1 = list(uploaded1.keys())[0]
df1 = pd.read_csv(file1)

# Preprocessing for heart_data.csv
df1 = df1.drop_duplicates()
if df1.isnull().sum().sum() > 0:
    df1 = df1.fillna(df1.median())

scaler = StandardScaler()
numerical_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]
df1[numerical_cols] = scaler.fit_transform(df1[numerical_cols])
df1["target"] = df1["target"].apply(lambda x: 1 if x > 0 else 0)

# Apply Local Outlier Factor (LOF) for outlier removal
try:
    lof = LocalOutlierFactor(n_neighbors=5, contamination=0.05)
    is_inlier = lof.fit_predict(df1[numerical_cols])
    df1 = df1[is_inlier == 1].reset_index(drop=True)
    print("LOF applied successfully on heart_data.csv; outliers removed.")
except Exception as e:
    print("LOF error on heart_data.csv:", e)

# Upload cleveland_heart.csv
print("Please upload the second dataset cleveland_heart.csv")
uploaded2 = files.upload()  # Upload second dataset
file2 = list(uploaded2.keys())[0]
df2 = pd.read_csv(file2)

# Preprocessing for cleveland_heart.csv
df2 = df2.drop(columns=["id", "dataset"], errors='ignore')
df2 = df2.rename(columns={"thalch": "thalach", "num": "target"})

categorical_cols = ["sex", "cp", "restecg", "slope", "thal"]
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df2[col] = le.fit_transform(df2[col].astype(str))
    label_encoders[col] = le

df2[numerical_cols] = scaler.transform(df2[numerical_cols])
bool_cols = ["fbs", "exang"]
for col in bool_cols:
    if col in df2.columns:
        df2[col] = df2[col].astype(int)

# Apply Local Outlier Factor (LOF) for outlier removal
try:
    lof = LocalOutlierFactor(n_neighbors=5, contamination=0.05)
    is_inlier = lof.fit_predict(df2[numerical_cols])
    df2 = df2[is_inlier == 1].reset_index(drop=True)
    print("LOF applied successfully on cleveland_heart.csv; outliers removed.")
except Exception as e:
    print("LOF error on cleveland_heart.csv:", e)

df2["target"] = df2["target"].apply(lambda x: 1 if x > 0 else 0)

# Hyperparameter Tuning
def tune_model(model, param_grid, X_train, y_train, use_random=False):
    if use_random:
        search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=20, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
    else:
        search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    search.fit(X_train, y_train)
    return search.best_estimator_

# Define models and hyperparameters
model_params = {
    "Random Forest": (RandomForestClassifier(class_weight="balanced"), {
        "n_estimators": np.arange(50, 500, 50),
        "max_depth": [None, 10, 20, 30, 50],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4, 8]
    }, True),  # Use RandomizedSearchCV for Random Forest
    "Logistic Regression": (LogisticRegression(max_iter=1000, class_weight="balanced"), {"C": [0.01, 0.1, 1, 10, 100]}, False),
    "Decision Tree": (DecisionTreeClassifier(class_weight="balanced"), {
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }, False),
    "SVM": (SVC(probability=True, class_weight="balanced"), {
        "C": [0.1, 1, 10, 100],
        "kernel": ["linear", "rbf", "poly"],
        "gamma": ["scale", "auto"]
    }, False),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": np.arange(1, 20, 2)}, False)
}

# Prepare results dictionary
results = {metric: {"heart_data.csv": [], "cleveland_heart.csv": []} for metric in ["Accuracy", "Precision", "Recall", "F1-Score", "Specificity"]}
optimized_models = {}

def train_and_evaluate(dataset_name, X_train, X_test, y_train, y_test):
    for model_name, (model, param_grid, use_random) in model_params.items():
        best_model = tune_model(model, param_grid, X_train, y_train, use_random)
        optimized_models[model_name] = best_model
        y_pred = best_model.predict(X_test)

        # Calculate metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

        # Store metrics
        results["Accuracy"][dataset_name].append(accuracy_score(y_test, y_pred))
        results["Precision"][dataset_name].append(precision_score(y_test, y_pred))
        results["Recall"][dataset_name].append(recall_score(y_test, y_pred))
        results["F1-Score"][dataset_name].append(f1_score(y_test, y_pred))
        results["Specificity"][dataset_name].append(specificity)
        print(f"{model_name} optimized and evaluated on {dataset_name}")

        # NEW: Print all Performance metrics for each model on each dataset
        print(f" - Accuracy: {acc:.4f}")
        print(f" - Precision: {prec:.4f}")
        print(f" - Recall: {rec:.4f}")
        print(f" - F1-Score: {f1:.4f}")
        print(f" - Specificity: {specificity:.4f}\n")

# Train models on heart_data.csv
X1, y1 = df1.drop(columns=["target"]), df1["target"]
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, stratify=y1, random_state=42)
train_and_evaluate("heart_data.csv", X1_train, X1_test, y1_train, y1_test)

# Train models on cleveland_heart.csv
X2, y2 = df2.drop(columns=["target"]), df2["target"]
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, stratify=y2, random_state=42)
train_and_evaluate("cleveland_heart.csv", X2_train, X2_test, y2_train, y2_test)

# Plotly Visualization
def plot_metric_comparison(metric_name):
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=list(optimized_models.keys()),
        y=results[metric_name]["heart_data.csv"],
        name="heart_data.csv",
        marker_color='orange'  # changed from blue
    ))
    fig.add_trace(go.Bar(
        x=list(optimized_models.keys()),
        y=results[metric_name]["cleveland_heart.csv"],
        name="cleveland_heart.csv",
        marker_color='green'  # changed from red
    ))

    fig.update_layout(
        title={
            'text': f'{metric_name} Comparison Across Models',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 30}
        },
        xaxis=dict(
            title='Models',
            titlefont=dict(size=30),
            tickfont=dict(size=20)
        ),
        yaxis=dict(
            title=metric_name,
            titlefont=dict(size=30),
            tickfont=dict(size=20)
        ),
        legend=dict(
            title='Dataset',
            font=dict(size=20)
        ),
        barmode='group',
        hovermode='x',
        template='plotly_white',
        font=dict(size=20)
    )
    fig.show()

for metric in results.keys():
    plot_metric_comparison(metric)

print("\nModel training, tuning, and evaluation completed successfully!")



Please upload the first dataset heart_data.csv


Saving heart_data.csv to heart_data.csv
LOF applied successfully on heart_data.csv; outliers removed.
Please upload the second dataset cleveland_heart.csv


Saving cleveland_heart.csv to cleveland_heart.csv
LOF applied successfully on cleveland_heart.csv; outliers removed.
Random Forest optimized and evaluated on heart_data.csv
 - Accuracy: 0.8621
 - Precision: 0.8529
 - Recall: 0.9062
 - F1-Score: 0.8788
 - Specificity: 0.8077

Logistic Regression optimized and evaluated on heart_data.csv
 - Accuracy: 0.8621
 - Precision: 0.8750
 - Recall: 0.8750
 - F1-Score: 0.8750
 - Specificity: 0.8462

Decision Tree optimized and evaluated on heart_data.csv
 - Accuracy: 0.8448
 - Precision: 0.8710
 - Recall: 0.8438
 - F1-Score: 0.8571
 - Specificity: 0.8462

SVM optimized and evaluated on heart_data.csv
 - Accuracy: 0.8793
 - Precision: 0.8378
 - Recall: 0.9688
 - F1-Score: 0.8986
 - Specificity: 0.7692

KNN optimized and evaluated on heart_data.csv
 - Accuracy: 0.8103
 - Precision: 0.7692
 - Recall: 0.9375
 - F1-Score: 0.8451
 - Specificity: 0.6538

Random Forest optimized and evaluated on cleveland_heart.csv
 - Accuracy: 0.8686
 - Precision: 0.8936



Model training, tuning, and evaluation completed successfully!
