In [1]:
'''
Perform the following operations using Python on Heart Diseases data sets  
a. Data cleaning(Remove NA, ?, Negative values etc.)  
b. Error correcting(Outlier detection and removal)  
c. Data transformation  
d. Build Data model using regression and kNN methods and compare accuracy of 
heart disease prediction.
'''

'\nPerform the following operations using Python on Heart Diseases data sets  \na. Data cleaning(Remove NA, ?, Negative values etc.)  \nb. Error correcting(Outlier detection and removal)  \nc. Data transformation  \nd. Build Data model using regression and kNN methods and compare accuracy of \nheart disease prediction.\n'

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

In [49]:
# List of dataset paths
files = [
    r"DSBDALExam DataSets\HeartDisease\Cleavland.csv",
    r"DSBDALExam DataSets\HeartDisease\hung.csv",
    r"DSBDALExam DataSets\HeartDisease\Switzerland.csv"
]

# Define column names based on known attributes used in past experiments
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", 
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# Combine datasets
df_list = []
for file in files:
    df = pd.read_csv(file, header=None, names=columns, na_values='?')
    df_list.append(df)

data = pd.concat(df_list, ignore_index=True)

In [50]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [51]:
# Convert all columns to numeric
data = data.apply(pd.to_numeric, errors='coerce')

In [52]:
# Check for missing values
print("\n🧹 Missing Values:")
print(data.isna().sum())

# Check for negative values
print("\n📉 Negative Values in Columns:")
negative_counts = (data < 0).sum()
print(negative_counts[negative_counts > 0])


🧹 Missing Values:
age           0
sex           0
cp            0
trestbps      3
chol         23
fbs          83
restecg       2
thalach       2
exang         2
oldpeak       6
slope       207
ca          413
thal        320
target        0
dtype: int64

📉 Negative Values in Columns:
oldpeak    11
dtype: int64


In [53]:
# Drop rows with too many missing values (optional threshold)
data = data.dropna(thresh=10)

In [54]:
# Fill remaining missing values with median
imputer = SimpleImputer(strategy='median')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

In [55]:
# Remove outliers using IQR
def remove_outliers(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    return df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

data_cleaned = remove_outliers(data_imputed)

In [56]:
# Print number of outliers removed
print(f"\n🚨 Rows after outlier removal: {data_cleaned.shape[0]} (from {data_imputed.shape[0]})")


🚨 Rows after outlier removal: 398 (from 716)


In [57]:
# Convert target to binary (0: no disease, 1: disease)
data_cleaned['target'] = data_cleaned['target'].apply(lambda x: 1 if x > 0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['target'] = data_cleaned['target'].apply(lambda x: 1 if x > 0 else 0)


In [58]:
# Features and target
X = data_cleaned.drop('target', axis=1)
y = data_cleaned['target']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [59]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [60]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)

In [61]:
# KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)

In [62]:
# Print Results
print("Logistic Regression Accuracy: ", lr_acc)
print("KNN Accuracy: ", knn_acc)

Logistic Regression Accuracy:  0.75
KNN Accuracy:  0.8
