In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from sklearn.impute import SimpleImputer

In [5]:
# Read the dataset CSV into the DataFrame `data`.
# NOTE(review): hard-coded absolute path (looks like a Colab working directory) —
# the file must exist at exactly this location for the notebook to re-run.
data=pd.read_csv('/content/heart 2.csv')

In [7]:
# Display the column labels of the loaded frame
data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [6]:
# Row count of the loaded frame (first entry of its shape tuple)
num_rows = data.shape[0]
print(num_rows)

1025


In [8]:
# One-hot encode categorical columns (by default pd.get_dummies converts
# object/string/category columns).
data_encoded = pd.get_dummies(data)

# 'target' is the label column; every other column is used as a feature.
y = data_encoded['target']
X = data_encoded.drop(columns=['target'])

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Mean-impute missing feature values.
# The imputer is fitted on the training split only and then applied to both
# splits, so the test split does not influence the fitted means.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [10]:
# Maps each oversampling technique's name to its (X_resampled, y_resampled) pair
results = dict()

In [11]:
# Function to print class distribution
def print_class_distribution(y):
    """Print the number of class-0 and class-1 labels in ``y`` and their ratio.

    Parameters
    ----------
    y : array-like
        Binary labels coded as 0 and 1. A pandas Series, NumPy array or
        list is accepted.

    Notes
    -----
    A class that does not occur in ``y`` is reported with a count of 0
    instead of raising ``KeyError``. When class 1 is absent the ratio is
    printed as ``inf`` (or ``nan`` if class 0 is absent as well).
    """
    # Wrapping in pd.Series lets plain arrays and lists use value_counts too.
    counts = pd.Series(y).value_counts()

    # .get(..., 0) tolerates a label that never appears in y.
    class0_count = int(counts.get(0, 0))
    class1_count = int(counts.get(1, 0))

    if class1_count:
        ratio = class0_count / class1_count
    elif class0_count:
        ratio = float("inf")
    else:
        ratio = float("nan")

    print("Class 0 count:", class0_count)
    print("Class 1 count:", class1_count)
    print("Ratio of Class 0 to Class 1:", ratio)

In [12]:
# Impute missing labels in the training target with the most frequent class.
# SimpleImputer is already imported in the first cell and is not used here,
# so it is not re-imported in this cell.
# NOTE(review): the resampling cells visible below pass y_train, not
# y_train_imputed, to fit_resample — confirm which one is intended.

# Boolean mask marking the NaN entries of y_train
nan_indices = y_train.isna()

# Most frequent class label (Series.mode ignores NaN by default)
most_frequent_class = y_train.mode()[0]

# fillna returns a new Series, so y_train itself is left untouched
y_train_imputed = y_train.fillna(most_frequent_class)


In [13]:
# SMOTE: oversample the training split and report class counts before/after
smote = SMOTE(random_state=42)

print("\nResults for SMOTE:")
print("Before SMOTE:")
print_class_distribution(y_train)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE:")
print_class_distribution(y_resampled)

# Keep the resampled training set under its technique name
results['SMOTE'] = X_resampled, y_resampled


Results for SMOTE:
Before SMOTE:
Class 0 count: 397
Class 1 count: 423
Ratio of Class 0 to Class 1: 0.9385342789598109

After SMOTE:
Class 0 count: 423
Class 1 count: 423
Ratio of Class 0 to Class 1: 1.0


In [14]:
# Borderline-SMOTE: oversample the training split and report class counts before/after
borderline_smote = BorderlineSMOTE(random_state=42)

print("\nResults for Borderline-SMOTE:")
print("Before Borderline-SMOTE:")
print_class_distribution(y_train)

X_resampled, y_resampled = borderline_smote.fit_resample(X_train, y_train)

print("\nAfter Borderline-SMOTE:")
print_class_distribution(y_resampled)

# Keep the resampled training set under its technique name
results['Borderline-SMOTE'] = X_resampled, y_resampled


Results for Borderline-SMOTE:
Before Borderline-SMOTE:
Class 0 count: 397
Class 1 count: 423
Ratio of Class 0 to Class 1: 0.9385342789598109

After Borderline-SMOTE:
Class 0 count: 423
Class 1 count: 423
Ratio of Class 0 to Class 1: 1.0


In [15]:
# Borderline-SMOTE SVM (SVMSMOTE): oversample the training split and report
# class counts before/after
svm_smote = SVMSMOTE(random_state=42)

print("\nResults for Borderline-SMOTE SVM:")
print("Before Borderline-SMOTE SVM:")
print_class_distribution(y_train)

X_resampled, y_resampled = svm_smote.fit_resample(X_train, y_train)

print("\nAfter Borderline-SMOTE SVM:")
print_class_distribution(y_resampled)

# Keep the resampled training set under its technique name
results['Borderline-SMOTE SVM'] = X_resampled, y_resampled


Results for Borderline-SMOTE SVM:
Before Borderline-SMOTE SVM:
Class 0 count: 397
Class 1 count: 423
Ratio of Class 0 to Class 1: 0.9385342789598109

After Borderline-SMOTE SVM:
Class 0 count: 423
Class 1 count: 423
Ratio of Class 0 to Class 1: 1.0


In [16]:
# Fit one random forest per stored resampled training set and score each on
# the same held-out test split.
for technique in results:
    X_resampled, y_resampled = results[technique]
    print(f"\nTraining RandomForestClassifier for {technique}:")

    # fit() returns the estimator itself, so construction and training chain
    clf = RandomForestClassifier(random_state=42).fit(X_resampled, y_resampled)

    # Predictions are made on X_test, which was never passed to fit_resample
    y_pred = clf.predict(X_test)
    print("\nClassification Report on Test Set:")
    print(classification_report(y_test, y_pred))


Training RandomForestClassifier for SMOTE:

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205


Training RandomForestClassifier for Borderline-SMOTE:

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205


Training RandomForestClassifier for Borderline-SMOTE SVM:

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.97  