In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [52]:
df = pd.read_csv('https://raw.githubusercontent.com/SumanGouda/Scikit-Learn-/refs/heads/main/PROJECT/HEART%20DISEASE/heart_disease_uci.csv')

### <span style="color:cyan"><i>Data Cleaning</i></span>

In [53]:
# Percentage data missing from each column: 
total = (df.isnull().sum() / len(df)) * 100 

categorical_cols = ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
numerical_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']

total


id           0.000000
age          0.000000
sex          0.000000
dataset      0.000000
cp           0.000000
trestbps     6.413043
chol         3.260870
fbs          9.782609
restecg      0.217391
thalch       5.978261
exang        5.978261
oldpeak      6.739130
slope       33.586957
ca          66.413043
thal        52.826087
num          0.000000
dtype: float64

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer

In [41]:
df_dropped = df.dropna()
X = df_dropped[['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num']]
y = df_dropped['num']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7833333333333333


In [43]:
X_train_df = pd.DataFrame(X_train, columns=['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca'])
X_train_df

Unnamed: 0,age,trestbps,chol,thalch,oldpeak,ca
6,62,140.0,268.0,160.0,3.6,2.0
185,63,140.0,195.0,179.0,0.0,2.0
187,66,160.0,246.0,120.0,0.0,3.0
147,41,112.0,250.0,179.0,0.0,0.0
30,69,140.0,239.0,151.0,1.8,2.0
...,...,...,...,...,...,...
190,50,129.0,196.0,163.0,0.0,0.0
71,67,125.0,254.0,163.0,0.2,2.0
107,57,128.0,229.0,150.0,0.4,1.0
274,59,134.0,204.0,162.0,0.8,2.0


In [62]:
# Split first to avoid data leakage
X = df.drop('num', axis=1)
y = df['num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define transformer
transformer = ColumnTransformer(
    transformers=[
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_cols),
        ('num', KNNImputer(n_neighbors=3, weights='distance'), numerical_cols)
    ]
)

# Fit and transform on training data only
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)  # Only transform test data

# Create DataFrames ensuring correct column order
features = categorical_cols + numerical_cols  
X_train_df = pd.DataFrame(X_train_transformed, columns=features)
X_test_df = pd.DataFrame(X_test_transformed, columns=features)
X_train_df

Unnamed: 0,sex,dataset,cp,fbs,restecg,exang,slope,thal,age,trestbps,chol,thalch,oldpeak,ca
0,Male,VA Long Beach,asymptomatic,False,st-t abnormality,True,flat,normal,62.0,138.902072,170.0,120.0,3.0,0.545062
1,Male,Hungary,non-anginal,False,normal,False,flat,normal,54.0,150.0,203.0,122.0,0.0,0.275869
2,Male,VA Long Beach,non-anginal,False,normal,False,flat,normal,51.0,151.396204,339.0,131.077975,1.170443,0.0
3,Female,Cleveland,non-anginal,False,normal,False,flat,normal,50.0,120.0,219.0,158.0,1.6,0.0
4,Male,Cleveland,atypical angina,False,normal,False,upsloping,normal,52.0,120.0,325.0,172.0,0.2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,Male,Cleveland,asymptomatic,False,normal,True,upsloping,reversable defect,59.0,140.0,177.0,162.0,0.0,1.0
732,Male,Cleveland,asymptomatic,False,lv hypertrophy,True,upsloping,reversable defect,61.0,140.0,207.0,138.0,1.9,1.0
733,Male,VA Long Beach,asymptomatic,True,normal,True,downsloping,reversable defect,75.0,160.0,310.0,112.0,2.0,3.0
734,Female,Hungary,atypical angina,False,normal,True,flat,normal,53.0,140.0,216.0,142.0,2.0,0.936101


In [None]:
oridanl_columns = [['cp', 'restecg', 'slope', 'thal']]


Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')