In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
def clean(filepath, drop_columns=("ca", "thal")):
    """Load a CSV file and return it without the given columns.

    Parameters
    ----------
    filepath : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.
    drop_columns : iterable of str, optional
        Column names to remove after loading. Defaults to ``("ca", "thal")``,
        the columns the notebook discards because (per the original comment)
        more than half of their values are missing.

    Returns
    -------
    pandas.DataFrame
        The loaded data without ``drop_columns``.

    Raises
    ------
    KeyError
        If any name in ``drop_columns`` is not a column of the file.
    """
    data = pd.read_csv(filepath)

    # Return a new frame rather than mutating in place, so the function has
    # no hidden side effects and can be chained.
    return data.drop(columns=list(drop_columns))


In [None]:
# Load the dataset through the cleaning helper defined above.
data = clean(
    filepath="/kaggle/input/heart-disease-data/heart_disease_uci.csv",
)

In [None]:
# Preview the first ten rows of the cleaned frame.
data.iloc[:10]

In [None]:
# Print a per-column summary (dtype and non-null count) of the cleaned frame.
data.info()


In [None]:
# Percentage of missing values in each column.
missing_counts = data.isna().sum()
missing_counts.div(len(data)).mul(100)

In [None]:
# Pairwise correlation of the numeric columns; the row identifier is excluded.
numeric_without_id = data.select_dtypes(include="number").drop(columns=["id"])
correlation = numeric_without_id.corr()
correlation



In [None]:
# Correlations are signed and bounded, so use a diverging colormap centred at
# zero with the full [-1, 1] range; otherwise colours are scaled to whatever
# values happen to be present and the sign is not readable from the plot.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation,
    vmin=-1,
    vmax=1,
    center=0,
    cmap="coolwarm",
    annot=True,  # print each coefficient so the figure stands on its own
    fmt=".2f",
    ax=ax,
)
ax.set_title("Correlation between numeric columns")
plt.show()

In [None]:
# Scatter-matrix of every numeric column except the row identifier.
numerical_features = (
    data
    .select_dtypes(include="number")
    .drop(columns=["id"])
)
sns.pairplot(data=numerical_features)
plt.show()


In [None]:
# Summary statistics for all columns except the row identifier.
data.drop(columns=["id"]).describe()

In [None]:
# Separate the label column from the predictors.
target = "num"

# "id" is a row identifier, not a measurement: the exploratory cells above
# already exclude it, so keep it out of the feature matrix as well to stop the
# models from memorising individual rows.
X = data.drop(columns=[target, "id"])
y = data[target]

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each partition.
for label, part in (
    ("Training Features:\n", X_train),
    ("Testing Features:\n", X_test),
    ("Training Labels:\n", y_train),
    ("Testing Labels:\n", y_test),
):
    print(label, part.shape)

In [None]:
# Candidate classifiers to compare. The random forest is seeded so that
# re-running this cell reports the same accuracy.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
}

# Route columns by dtype. One-hot encoding every column (as before) turned each
# distinct numeric value into its own indicator, discarding magnitude and
# leaving unseen test values encoded as all zeros. The row identifier is left
# out of both lists, so ColumnTransformer's default remainder="drop" keeps it
# away from the models.
numeric_columns = (
    X_train.select_dtypes(include="number").columns.difference(["id"]).tolist()
)
categorical_columns = X_train.select_dtypes(exclude="number").columns.tolist()

# Train and evaluate each model with its own freshly built preprocessing steps.
for model_name, model in models.items():
    print(f"Training {model_name}")

    preprocessor = ColumnTransformer(
        transformers=[
            # Numeric columns: fill gaps with the training mean, then scale so
            # distance- and margin-based models are not dominated by columns
            # with large raw ranges.
            (
                "numeric",
                make_pipeline(SimpleImputer(strategy="mean"), StandardScaler()),
                numeric_columns,
            ),
            # Non-numeric columns: one indicator per category; categories that
            # appear only in the test set are encoded as all zeros.
            (
                "categorical",
                OneHotEncoder(handle_unknown="ignore"),
                categorical_columns,
            ),
        ]
    )
    pipeline = make_pipeline(preprocessor, model)

    # Fit preprocessing and model on the training split only.
    pipeline.fit(X_train, y_train)

    # Predict on the held-out split.
    y_pred = pipeline.predict(X_test)

    # Report the share of correctly classified test rows.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}\n")

