In [1]:
import sys
from pathlib import Path
import polars as pl

# Put the parent of the current working directory on the import path so the
# local `utils` module can be imported (presumably it lives there - confirm).
project_root = Path().resolve().parent
sys.path.append(str(project_root))

from utils import download_data_from_kaggle

# Fetch the Kaggle heart-attack dataset; the second argument is the target
# folder (the cell output shows the extracted CSV files being moved into it).
kaggle_dataset = "rashikrahmanpritom/heart-attack-analysis-prediction-dataset"
download_data_from_kaggle(kaggle_dataset, "data")

Fetching rashikrahmanpritom/heart-attack-analysis-prediction-dataset
Downloading from https://www.kaggle.com/api/v1/datasets/download/rashikrahmanpritom/heart-attack-analysis-prediction-dataset?dataset_version_number=2...


100%|██████████| 4.11k/4.11k [00:00<00:00, 3.89MB/s]

Extracting files...
Moving heart.csv to data
Moving o2Saturation.csv to data





### Data Understanding

* CP: Chest Pain
    * 0: typical angina
    * 1: atypical angina
    * 2: non-anginal pain
    * 3: asymptomatic

* TRTBPS: Resting Blood Pressure
* CHOL: Cholesterol in mg/dl
* FBS: Fasting blood sugar > 120 mg/dl
* REST_ECG: Resting electrocardiographic results
    * 0: Normal
    * 1: ST-T Wave abnormality
    * 2: Probable left ventricular hypertrophy under Estes' criteria
* THALACHH: Maximum heart rate achieved
* EXNG: Exercise-induced angina
* OLDPEAK: ST Depression induced by exercise relative to rest
* SLP: The slope of the peak exercise ST segment
    * 0: upsloping
    * 1: flat
    * 2: downward sloping
* CAA: The number of major vessels
* THALL: thalassemia
    * 0: NA
    * 1: Fixed defect
    * 2: Normal
    * 3: Reversible defect
* OUTPUT: The diagnosis of heart disease
    * 0: Negative diagnosis
    * 1: Positive diagnosis
    




In [2]:
# Abbreviated CSV headers -> descriptive column names used in the rest of the
# notebook. Columns not listed here (age, sex, restecg, oldpeak) keep their
# original names.
column_renames = {
    "cp": "chest_pain",
    "trtbps": "resting_blood_pressure",
    "chol": "cholesterol",
    "fbs": "fasting_blood_sugar",
    "thalachh": "max_heart_rate",
    "exng": "excercise_ind_angina",
    "slp": "peak_slope",
    "caa": "num_major_vessles",
    "thall": "thalassemia",
    "output": "positive_diagnosis",
}

raw_frame = pl.read_csv("data/heart.csv")
data = raw_frame.rename(column_renames)

# Preview the first rows to confirm the rename took effect.
data.head()

age,sex,chest_pain,resting_blood_pressure,cholesterol,fasting_blood_sugar,restecg,max_heart_rate,excercise_ind_angina,oldpeak,peak_slope,num_major_vessles,thalassemia,positive_diagnosis
i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64,i64
63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Count missing values per column (one row, one count per column).
data.null_count()

age,sex,chest_pain,resting_blood_pressure,cholesterol,fasting_blood_sugar,restecg,max_heart_rate,excercise_ind_angina,oldpeak,peak_slope,num_major_vessles,thalassemia,positive_diagnosis
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Class balance of the target: number of rows per diagnosis value.
data.get_column("positive_diagnosis").value_counts()

positive_diagnosis,count
i64,u32
0,138
1,165


In [None]:
# Add one "<feature>_div_age" ratio column per base feature.
#
# `data` is reassigned in place, so this cell can be executed more than once
# in a live kernel. Columns that already carry the ratio suffix are skipped so
# a re-run only recomputes the same ratios instead of also dividing them by
# age again and adding "<feature>_div_age_div_age" columns.
ratio_suffix = "_div_age"
ratio_sources = [
    col
    for col in data.columns
    if col not in ["age", "positive_diagnosis"] and not col.endswith(ratio_suffix)
]

data = data.with_columns(
    [(pl.col(col) / pl.col("age")).alias(col + ratio_suffix) for col in ratio_sources]
)

In [6]:
import plotly.express as px

# Heatmap of the pairwise correlation matrix of every column in `data`.
corr_matrix = data.corr()
column_labels = data.columns

heatmap = px.imshow(corr_matrix, width=800, height=600)

# Label the y axis with the column names, one tick per row position.
heatmap.update_yaxes(
    tickvals=list(range(len(column_labels))), ticktext=column_labels
)

In [7]:
from utils import plot_many


# Plot every column against the diagnosis target on a 4x4 grid.
# NOTE(review): `features` is all of `data.columns`, which includes the target
# column itself, and the grid has 16 cells - confirm `plot_many` copes with
# both (especially once the "_div_age" columns are present).
figure_size = {"width": 1600, "height": 800}
grid_shape = {"n_col": 4, "n_row": 4}

plot_many(
    data=data,
    target="positive_diagnosis",
    features=data.columns,
    **figure_size,
    **grid_shape,
)

In [8]:
# Mean of every other column, computed separately for each diagnosis value.
data.group_by("positive_diagnosis").agg(pl.all().mean())

positive_diagnosis,age,sex,chest_pain,resting_blood_pressure,cholesterol,fasting_blood_sugar,restecg,max_heart_rate,excercise_ind_angina,oldpeak,peak_slope,num_major_vessles,thalassemia
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,56.601449,0.826087,0.478261,134.398551,251.086957,0.15942,0.449275,139.101449,0.550725,1.585507,1.166667,1.166667,2.543478
1,52.49697,0.563636,1.375758,129.30303,242.230303,0.139394,0.593939,158.466667,0.139394,0.58303,1.593939,0.363636,2.121212


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

In [16]:
# One stratified shuffle split with 20% of the rows held out for testing;
# the fixed random_state keeps the split the same between runs.
target_col = "positive_diagnosis"
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, test_index = next(splitter.split(data, data[target_col]))

train_rows = data[train_index]
test_rows = data[test_index]

# Features are every column except the target; labels are plain numpy arrays.
X_train = train_rows.drop(target_col)
y_train = train_rows[target_col].to_numpy()

X_test = test_rows.drop(target_col)
y_test = test_rows[target_col].to_numpy()

# Fit the scaler on the training rows only, then apply the same
# transformation to the held-out rows.
scaler = StandardScaler()
X_train_prepped = scaler.fit_transform(X_train)
X_test_prepped = scaler.transform(X_test)

In [17]:
# Fit a logistic regression on the scaled training features, then predict
# labels for the scaled held-out rows. `fit` returns the estimator itself,
# so construction and fitting can be chained.
model = LogisticRegression(random_state=0).fit(X_train_prepped, y_train)
predictions = model.predict(X_test_prepped)

In [12]:
from sklearn.metrics import classification_report

In [13]:
# Text report of precision / recall / F1 per class on the held-out rows;
# printed so the report's line breaks and column alignment are rendered.
report_text = classification_report(y_test, predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.88      0.82      0.85        28
           1       0.86      0.91      0.88        33

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

