In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load dataset
df = pd.read_csv("heart_disease.csv")

# Step 2: Drop rows with missing values
df.dropna(inplace=True)

# Step 3: Encode categorical variables
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['prevalentStroke'] = df['prevalentStroke'].map({'yes': 1, 'no': 0})
df['Heart_ stroke'] = df['Heart_ stroke'].map({'yes': 1, 'No': 0})

# One-hot encode 'education'
df = pd.get_dummies(df, columns=['education'], drop_first=True)

# Step 4: Define features and target
X = df.drop('Heart_ stroke', axis=1)
y = df['Heart_ stroke']

# Step 5: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 6: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Step 7: Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Step 8: Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Apply SMOTE to training data only
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_resampled).value_counts())

# Train model on resampled data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Predict on original test set
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy: 0.8278688524590164
              precision    recall  f1-score   support

           0       0.84      0.98      0.90       610
           1       0.40      0.07      0.11       122

    accuracy                           0.83       732
   macro avg       0.62      0.52      0.51       732
weighted avg       0.77      0.83      0.77       732

Before SMOTE: Heart_ stroke
0    2489
1     435
Name: count, dtype: int64
After SMOTE: Heart_ stroke
0    2489
1    2489
Name: count, dtype: int64
Accuracy: 0.7855191256830601
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       610
           1       0.30      0.22      0.26       122

    accuracy                           0.79       732
   macro avg       0.58      0.56      0.57       732
weighted avg       0.76      0.79      0.77       732

