In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler


In [2]:
# Read the CSV file into a DataFrame
heart_dataset = pd.read_csv(filepath_or_buffer="../../datasets/heart.csv")

# Count the rows per class of the 'target' column; the bare last
# expression is what the cell displays
target_counts = heart_dataset["target"].value_counts()
target_counts


target
1    165
0    138
Name: count, dtype: int64

In [3]:
# Separate the label column (y) from the remaining feature columns (X)
y = heart_dataset["target"]
X = heart_dataset.drop(columns="target")


In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps
# the shuffle, and therefore the split, reproducible across runs
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Peek at the first training rows as a sanity check on the split
X_train.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
132,42,1,1,120,295,0,1,162,0,0.0,2,0,2
202,58,1,0,150,270,0,0,111,1,0.8,2,0,3
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2
75,55,0,1,135,250,0,0,161,0,1.4,1,0,2
176,60,1,0,117,230,1,1,160,1,1.4,2,2,3


In [5]:
# Standardize the features for the SVM. The scaler is fitted on the
# training split only, then the same fitted transform is applied to both
# the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Under-sample the scaled training data with Tomek Links.
# Only the training split is resampled; the test split is left untouched.
tl = TomekLinks()
resampled_pair = tl.fit_resample(X_train_scaled, Y_train)
X_train_resampled, Y_train_resampled = resampled_pair

# Report how many training rows remain after resampling
print(f"Original dataset shape: {X_train.shape}, Resampled dataset shape: {X_train_resampled.shape}")


Original dataset shape: (242, 13), Resampled dataset shape: (227, 13)


In [7]:
# Support vector classifier with a linear kernel; all other
# hyperparameters are left at the library defaults
model = svm.SVC(
    kernel="linear",
)


In [8]:
# Fit the classifier on the scaled, Tomek-Links-resampled training data
model.fit(X=X_train_resampled, y=Y_train_resampled)


In [9]:
# Predict labels for the held-out test rows, using the test features
# scaled with the scaler that was fitted on the training split
y_pred = model.predict(X=X_test_scaled)


In [10]:
# Score the test-set predictions: overall accuracy, the confusion matrix,
# and the per-class precision / recall / F1 report
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=Y_test, y_pred=y_pred)
class_report = classification_report(y_true=Y_test, y_pred=y_pred)

# Emit the three results, one after another, in a single print call
print(
    f"Accuracy: {accuracy:.4f}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)


Accuracy: 0.8525
Confusion Matrix:
[[26  3]
 [ 6 26]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85        29
           1       0.90      0.81      0.85        32

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.86      0.85      0.85        61



In [11]:
# 5-fold cross-validation of the classifier on the scaled training data.
# NOTE(review): this runs on X_train_scaled / Y_train, i.e. without the
# Tomek Links resampling used for the final fit, and the scaler was fitted
# on the whole training split before the folds are formed. The score is
# therefore not an exact estimate of the trained pipeline; consider
# wrapping scaler + sampler + SVC in a pipeline and cross-validating that.
cv_scores = cross_val_score(model, X_train_scaled, Y_train, cv=5)
mean_cv_accuracy = cv_scores.mean()
print(f"Cross-Validation Accuracy: {mean_cv_accuracy:.4f}")


Cross-Validation Accuracy: 0.8139


In [12]:
import pickle

# Serialize the trained classifier to disk.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically, even if pickling raises.
# NOTE: the model was trained on standardized features, but the fitted
# `scaler` is not stored in this file; whoever loads the model must apply
# the same scaling to new inputs before calling predict.
file_name = 'heart_model.sav'
with open(file_name, 'wb') as model_file:
    pickle.dump(model, model_file)