In [12]:
import pandas as pd
# Load the heart-disease dataset from the working directory and preview it.
csv_path = 'heartdisease.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


**DATA CLEANING**

In [13]:
# Count the missing entries in every column, report them, then discard
# every row that has at least one missing value.
null_counts = df.isna().sum()
print("Missing Values", null_counts)
df.dropna(inplace=True)

Missing Values Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
AHD           0
dtype: int64


In [14]:
# Report how many rows are exact repeats of an earlier row, then keep only
# the first occurrence of each.
n_duplicates = df.duplicated().sum()
print(n_duplicates)
df.drop_duplicates(inplace=True)

0


**DATA INTEGRATION**

In [16]:
# Two disjoint groups of columns taken from the same frame.
subset1 = ['Age', 'Sex', 'ChestPain', 'RestBP', 'Chol']
subset2 = ['Fbs', 'RestECG', 'MaxHR', 'ExAng', 'Oldpeak']

# Slice each group out of df and glue the slices back together side by side
# (axis=1); rows are matched on the shared index.
column_slices = [df[cols] for cols in (subset1, subset2)]
merged_df = pd.concat(column_slices, axis=1)
merged_df.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak
0,63,1,typical,145,233,1,2,150,0,2.3
1,67,1,asymptomatic,160,286,0,2,108,1,1.5
2,67,1,asymptomatic,120,229,0,2,129,1,2.6
3,37,1,nonanginal,130,250,0,0,187,0,3.5
4,41,0,nontypical,130,204,0,2,172,0,1.4


**ERROR CORRECTION (OUTLIER REMOVAL)**

In [23]:
import matplotlib.pyplot as plt
import seaborn as sns


# Numeric columns inspected for outliers. The list is defined here because this
# is the first cell that uses it; no earlier cell assigns it, so without this
# line the loop below fails with NameError on a fresh kernel (Restart & Run All).
# The same list is assigned again, unchanged, in the scaling cell further down.
numerical_cols = ['Age', 'Sex', 'RestBP', 'Chol', 'Fbs', 'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca']

# Boxplot before correction: one small figure per column.
for col in numerical_cols:
    plt.figure(figsize=(4, 2))
    sns.boxplot(data=df, x=col)
    plt.title(f'Before removing outliers - {col}')
    plt.show()

In [32]:
# Remove outliers using IQR method
def remove_outliers(col, factor=1.5):
    """Return the entries of ``col`` that lie inside the IQR fences.

    Parameters
    ----------
    col : pandas.Series
        Numeric series to filter.
    factor : float, default 1.5
        Multiple of the inter-quartile range used to place the fences.
        The default reproduces the usual 1.5 * IQR rule.

    Returns
    -------
    pandas.Series
        The values of ``col`` that are not strictly below
        ``Q1 - factor * IQR`` and not strictly above ``Q3 + factor * IQR``,
        with their original index labels. NaN entries are kept, because
        they compare False against both fences.
    """
    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    # Keep the "not an outlier" form (rather than lower <= x <= upper) so that
    # NaN values pass through untouched.
    is_outlier = (col < lower_fence) | (col > upper_fence)
    return col[~is_outlier]

# Columns that receive IQR-based outlier handling: the continuous measurements.
# The list is spelled out here so this cell does not depend on a variable that
# is only assigned in a later cell.
# NOTE(review): Sex, Fbs, RestECG, ExAng, Slope and Ca look like 0/1 flags or
# small integer codes in the data preview and are left out on purpose. On a
# 0/1 column whose minority class is under 25 % of the rows, Q1 == Q3, the IQR
# is 0, and every minority value would be flagged and then overwritten with the
# median. Confirm the column roles against the dataset description.
outlier_cols = ['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']

for col in outlier_cols:
    # remove_outliers returns a shorter Series; assigning it back aligns on the
    # index, so the rows that were filtered out become NaN in df[col].
    df[col] = remove_outliers(df[col])

# Replace the NaNs left by the removed outliers with each column's median.
df[outlier_cols] = df[outlier_cols].fillna(df[outlier_cols].median())

# Boxplot after correction: one small figure per treated column.
for col in outlier_cols:
    plt.figure(figsize=(4, 2))
    sns.boxplot(data=df, x=col)
    plt.title(f'After removing outliers - {col}')
    plt.show()

**DATA TRANSFORMATION**

In [33]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Separate features and target.
# 'Unnamed: 0' is also removed from the features: it is listed as a real column
# in the isnull() summary and its previewed values are 1, 2, 3, ..., so it
# appears to be a row counter saved with the CSV rather than a measurement.
# Left in X it would be fed to the models unscaled. errors='ignore' keeps the
# cell working if that column is absent.
y = df['AHD']
X = df.drop(columns=['AHD', 'Unnamed: 0'], errors='ignore')

# Label encode 'ChestPain' and 'Thal' columns (string labels -> integer codes).
# fit_transform refits the encoder on each call, so after this cell `le` holds
# the classes of 'Thal' only.
le = LabelEncoder()
X['ChestPain'] = le.fit_transform(X['ChestPain'])
X['Thal'] = le.fit_transform(X['Thal'])

# Standardize the numeric columns to zero mean and unit variance.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split done in the model-building cell, so test-set statistics influence the
# scaling. Consider fitting it on the training rows only.
scaler = StandardScaler()
numerical_cols = ['Age', 'Sex', 'RestBP', 'Chol', 'Fbs', 'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca']
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

**DATA MODEL BUILDING**

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Hold out 20 % of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_preds))

# Support Vector Machine
svc_model = SVC()
svc_model.fit(X_train, y_train)
svc_preds = svc_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, svc_preds))

# Confusion Matrix
# Use the classifier's own class order for both the matrix rows/columns and the
# tick labels, so the axes show the actual target labels instead of 0/1 and the
# two cannot get out of sync.
svc_cm = confusion_matrix(y_test, svc_preds, labels=svc_model.classes_)
ConfusionMatrixDisplay(svc_cm, display_labels=svc_model.classes_).plot()
plt.title("SVM Confusion Matrix")
plt.show()