In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv('heart_attack_risk_prediction_1.csv')
data

Unnamed: 0,Age,Gender,Diabetes,Hypertension,Obesity,Smoking,Alcohol_Consumption,Physical_Activity,Diet_Score,Cholesterol_Level,Triglyceride_Level,Systolic_BP,Diastolic_BP,Air_Pollution_Exposure,Family_History,Stress_Level,Heart_Attack_History,Heart_Attack_Risk
0,42,0,0,0,1,1,0,0,9,248,125,93,119,1,0,4,0,0
1,26,1,0,0,0,0,1,1,4,272,51,134,115,0,0,7,0,0
2,78,1,0,0,1,0,0,1,6,268,213,104,117,0,1,10,0,0
3,58,1,1,0,1,0,0,1,9,224,250,91,65,0,0,1,1,0
4,22,1,0,0,0,0,0,1,5,277,129,141,109,0,0,9,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,20,0,0,0,0,1,0,1,6,160,238,133,74,1,1,10,0,0
9996,34,0,0,0,0,0,0,0,5,257,241,124,78,0,1,1,0,1
9997,76,1,0,0,0,0,0,0,2,205,69,151,98,1,0,1,1,0
9998,52,1,0,0,0,0,0,1,5,155,288,127,60,1,0,7,0,0


In [5]:
x = data.drop(columns=['Heart_Attack_Risk'])
y = data['Heart_Attack_Risk']
x.shape, y.shape

((10000, 17), (10000,))

In [7]:
# Selecting binary columns, then we can split the dataset into two, 
# One of them contains binary values and the other can have continuous numerical columns
# So that we can applyscaling on only these continuous numerical columns

# Select binary columns
binary_columns = [col for col in x.columns if x[col].nunique() == 2]
print("Binary columns:\n", binary_columns)

Binary columns:
 ['Gender', 'Diabetes', 'Hypertension', 'Obesity', 'Smoking', 'Alcohol_Consumption', 'Physical_Activity', 'Air_Pollution_Exposure', 'Family_History', 'Heart_Attack_History']


In [9]:
# Select columns having conotinuous numerical values

numerical_columns = [col for col in data.columns if data[col].nunique() > 2]
print("Numerical columns:\n", numerical_columns)

Numerical columns:
 ['Age', 'Diet_Score', 'Cholesterol_Level', 'Triglyceride_Level', 'Systolic_BP', 'Diastolic_BP', 'Stress_Level']


In [11]:
# Applying Train-test split BEFORE scaling

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7000, 17), (3000, 17), (7000,), (3000,))

In [15]:
# Scaling only numerical columns AFTER the split
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled_numerical = scaler.fit_transform(X_train[numerical_columns])
X_test_scaled_numerical = scaler.transform(X_test[numerical_columns])

# Reconstructing final X_train and X_test with binary_columns
X_train_final = pd.DataFrame(X_train_scaled_numerical, columns=numerical_columns, index=X_train.index)
X_train_final[binary_columns] = X_train[binary_columns]

X_test_final = pd.DataFrame(X_test_scaled_numerical, columns=numerical_columns, index=X_test.index)
X_test_final[binary_columns] = X_test[binary_columns]

In [21]:
X_train_final.shape, X_test_final.shape, y_train.shape, y_test.shape

((7000, 17), (3000, 17), (7000,), (3000,))

In [25]:
X_test_final.head()

Unnamed: 0,Age,Diet_Score,Cholesterol_Level,Triglyceride_Level,Systolic_BP,Diastolic_BP,Stress_Level,Gender,Diabetes,Hypertension,Obesity,Smoking,Alcohol_Consumption,Physical_Activity,Air_Pollution_Exposure,Family_History,Heart_Attack_History
6252,0.898305,0.9,0.590604,0.610442,0.078652,0.0,0.0,1,1,0,0,1,0,0,1,0,0
4684,0.423729,0.3,0.268456,0.052209,0.314607,0.898305,1.0,0,0,0,0,0,0,0,0,0,0
1731,0.847458,0.5,0.744966,0.184739,0.685393,0.694915,0.0,1,0,0,1,0,0,1,1,0,1
4742,0.355932,0.7,0.899329,0.783133,0.404494,0.779661,0.666667,0,0,0,0,0,1,0,1,0,0
4521,0.305085,1.0,0.724832,0.971888,0.314607,0.779661,0.555556,1,0,0,0,0,0,1,0,0,0


In [27]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Define the SVM model
svm_model = SVC(
    kernel='rbf',
    C=1,
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=42
)

# Train the model
svm_model.fit(X_train_final, y_train)

# Predict on test set
y_pred_svm = svm_model.predict(X_test_final)

# Predict probabilities for ROC AUC
y_prob_svm = svm_model.predict_proba(X_test_final)[:, 1]

# Evaluate
print("SVM Evaluation on Test Data:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))


SVM Evaluation on Test Data:
Accuracy: 0.5026666666666667
Precision: 0.3066316480630335
Recall: 0.5171650055370985
F1 Score: 0.38499587798845836
ROC AUC Score: 0.48191768972285987

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.50      0.58      2097
           1       0.31      0.52      0.38       903

    accuracy                           0.50      3000
   macro avg       0.51      0.51      0.48      3000
weighted avg       0.58      0.50      0.52      3000



In [27]:
# Predict on the train set
y_pred_train = svm_model.predict(X_train_final)

# Evaluate the model
print("Model Evaluation on Test Data:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Precision:", precision_score(y_train, y_pred_train))
print("Recall:", recall_score(y_train, y_pred_train))
print("F1 Score:", f1_score(y_train, y_pred_train))
print("ROC AUC Score:", roc_auc_score(y_train, y_pred_train))

print("\n Classification Report:\n", classification_report(y_train, y_pred_train))

Model Evaluation on Test Data:
Accuracy: 0.6248571428571429
Precision: 0.4275402554136591
Recall: 0.7319391634980988
F1 Score: 0.539782684893095
ROC AUC Score: 0.6553895163895722

 Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.58      0.68      4896
           1       0.43      0.73      0.54      2104

    accuracy                           0.62      7000
   macro avg       0.63      0.66      0.61      7000
weighted avg       0.71      0.62      0.64      7000



**Conclusion:** Even after scaling, the model still struggles — same trend as before. The class imbalance continues to dominate despite class_weight='balanced'