## Imports

In [1]:
import numpy as np
import pandas as pd 
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

## Exploratory data analysis (EDA)

In [2]:
# Read the pre-transformed heart disease table and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer="data/raw/pretransformed_heart_disease.csv",
)
df.head(n=5)

Unnamed: 0,Age (in years),Sex,Chest pain type,Resting blood pressure (in mm Hg on admission to the hospital),Serum cholesterol (in mg/dl),Fasting blood sugar > 120 mg/dl,Resting electrocardiographic results,Maximum heart rate achieved,Exercise-induced angina,ST depression induced by exercise relative to rest,Slope of the peak exercise ST segment,Number of major vessels (0–3) colored by fluoroscopy,Thalassemia,Diagnosis of heart disease
0,63,male,typical angina,145,233,True,showing probable or definite left ventricular ...,150,no,2.3,downsloping,0.0,fixed defect,< 50% diameter narrowing
1,67,male,asymptomatic,120,229,False,showing probable or definite left ventricular ...,129,yes,2.6,flat,2.0,reversable defect,> 50% diameter narrowing
2,37,male,non-anginal pain,130,250,False,normal,187,no,3.5,downsloping,0.0,normal,< 50% diameter narrowing
3,41,female,atypical angina,130,204,False,showing probable or definite left ventricular ...,172,no,1.4,upsloping,0.0,normal,< 50% diameter narrowing
4,56,male,atypical angina,120,236,False,normal,178,no,0.8,upsloping,0.0,normal,< 50% diameter narrowing


In [3]:
# Print a structural summary of the frame: index range, per-column non-null
# counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 14 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   Age (in years)                                                  219 non-null    int64  
 1   Sex                                                             219 non-null    object 
 2   Chest pain type                                                 219 non-null    object 
 3   Resting blood pressure (in mm Hg on admission to the hospital)  219 non-null    int64  
 4   Serum cholesterol (in mg/dl)                                    219 non-null    int64  
 5   Fasting blood sugar > 120 mg/dl                                 219 non-null    bool   
 6   Resting electrocardiographic results                            219 non-null    object 
 7   Maximum heart rate achieved                          

In [4]:
# Normalise the column labels: trim surrounding whitespace, lower-case them,
# and replace every space with an underscore. Other punctuation is kept as-is.
df.columns = [label.strip().lower().replace(" ", "_") for label in df.columns]

In [5]:
# Numeric columns that are compared against each other in the scatter grid.
numeric_columns = [
    "age_(in_years)",
    "resting_blood_pressure_(in_mm_hg_on_admission_to_the_hospital)",
    "serum_cholesterol_(in_mg/dl)",
    "maximum_heart_rate_achieved",
]

# Build one scatter plot per unordered pair of numeric columns, coloured by
# the diagnosis column.
pair_plots = []
for x_col, y_col in itertools.combinations(numeric_columns, 2):
    # Axis titles are the column names with underscores turned into spaces.
    x_axis = alt.X(f"{x_col}:Q", title=x_col.replace("_", " ").capitalize())
    y_axis = alt.Y(f"{y_col}:Q", title=y_col.replace("_", " ").capitalize())
    chart = (
        alt.Chart(df)
        .mark_circle(size=60)
        .encode(
            x=x_axis,
            y=y_axis,
            color="diagnosis_of_heart_disease:N",
            tooltip=[x_col, y_col, "diagnosis_of_heart_disease"],
        )
        .properties(width=200, height=200)
    )
    pair_plots.append(chart)

# Arrange the scatter plots in rows of len(numeric_columns) - 1 charts each,
# then stack the rows vertically.
charts_per_row = len(numeric_columns) - 1
grid_rows = []
for row_start in range(0, len(pair_plots), charts_per_row):
    row_charts = pair_plots[row_start:row_start + charts_per_row]
    grid_rows.append(alt.hconcat(*row_charts))
pair_plot_grid = alt.vconcat(*grid_rows)

pair_plot_grid


In [6]:
# 1. Age distribution: histogram of patient ages using Altair's default binning
age_dist = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("age_(in_years):Q", bin=True, title="Age (Years)"),
        y=alt.Y("count()", title="Count"),
        tooltip=["count()"],
    )
    .properties(title="Age Distribution", width=400, height=300)
)

age_dist


In [7]:
# 2. Heart disease diagnosis by sex: record counts per diagnosis category,
#    with the bars coloured by sex
diagnosis_sex = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X(
            "diagnosis_of_heart_disease:N",
            title="Heart Disease Diagnosis",
            axis=alt.Axis(labelAngle=0),  # keep the category labels horizontal
        ),
        y=alt.Y("count()", title="Count"),
        color="sex:N",
        tooltip=["count()", "sex"],
    )
    .properties(title="Heart Disease Diagnosis by Sex", width=400, height=300)
)

diagnosis_sex

In [8]:
# 3. Resting blood pressure distribution: histogram built step by step
#    (mark -> encodings -> title/size)
bp_field = "resting_blood_pressure_(in_mm_hg_on_admission_to_the_hospital)"

resting_bp = alt.Chart(df).mark_bar()
resting_bp = resting_bp.encode(
    x=alt.X(f"{bp_field}:Q", bin=True, title="Resting Blood Pressure (mm Hg)"),
    y=alt.Y("count()", title="Count"),
    tooltip=["count()"],
)
resting_bp = resting_bp.properties(
    title="Resting Blood Pressure Distribution",
    width=400,
    height=300,
)

resting_bp

In [9]:
# 4. Cholesterol levels by sex: one box plot of serum cholesterol per sex
cholesterol_field = "serum_cholesterol_(in_mg/dl)"

# Channel definitions collected up front and unpacked into encode() below.
cholesterol_encoding = {
    "x": alt.X("sex:N", title="Sex"),
    "y": alt.Y(f"{cholesterol_field}:Q", title="Cholesterol (mg/dl)"),
    "color": "sex:N",
    "tooltip": [cholesterol_field, "sex"],
}

cholesterol_sex = (
    alt.Chart(df)
    .mark_boxplot()
    .encode(**cholesterol_encoding)
    .properties(title="Cholesterol Levels by Sex", width=400, height=300)
)

cholesterol_sex


In [10]:
# 5. Maximum heart rate distribution: histogram using Altair's default binning.
#    Title and size are passed straight to the Chart constructor.
heart_rate = (
    alt.Chart(
        df,
        title="Maximum Heart Rate Distribution",
        width=400,
        height=300,
    )
    .mark_bar()
    .encode(
        x=alt.X("maximum_heart_rate_achieved:Q", bin=True, title="Maximum Heart Rate"),
        y=alt.Y("count()", title="Count"),
        tooltip=["count()"],
    )
)

heart_rate


In [11]:
# 6. Diagnosis by chest pain type: record counts per chest pain type, with the
#    bars coloured by diagnosis
pain_type_axis = alt.X(
    "chest_pain_type:N",
    title="Chest Pain Type",
    axis=alt.Axis(labelAngle=0),  # keep the category labels horizontal
)
record_count_axis = alt.Y("count()", title="Count")

chest_pain_diag = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=pain_type_axis,
        y=record_count_axis,
        color="diagnosis_of_heart_disease:N",
        tooltip=["count()", "chest_pain_type", "diagnosis_of_heart_disease"],
    )
    .properties(
        title="Heart Disease Diagnosis by Chest Pain Type",
        width=400,
        height=300,
    )
)

chest_pain_diag



In [12]:
# 7. ST depression by diagnosis: one box plot of exercise-induced ST depression
#    per diagnosis category
st_field = "st_depression_induced_by_exercise_relative_to_rest"
diagnosis_field = "diagnosis_of_heart_disease"

st_depression = (
    alt.Chart(df)
    .mark_boxplot()
    .encode(
        x=alt.X(
            f"{diagnosis_field}:N",
            title="Heart Disease Diagnosis",
            axis=alt.Axis(labelAngle=0),  # keep the category labels horizontal
        ),
        y=alt.Y(f"{st_field}:Q", title="ST Depression"),
        color=f"{diagnosis_field}:N",
        tooltip=[st_field, diagnosis_field],
    )
    .properties(title="ST Depression by Diagnosis", width=400, height=300)
)

st_depression

In [13]:
# 8. Number of major vessels by chest pain type: one box plot per chest pain type
vessels_field = "number_of_major_vessels_(0–3)_colored_by_fluoroscopy"

# Title and size collected up front and unpacked into properties() below.
vessels_layout = {
    "title": "Number of Major Vessels by Chest Pain Type",
    "width": 400,
    "height": 300,
}

vessels_chest_pain = alt.Chart(df).mark_boxplot()
vessels_chest_pain = vessels_chest_pain.encode(
    x=alt.X("chest_pain_type:N", title="Chest Pain Type"),
    y=alt.Y(f"{vessels_field}:Q", title="Number of Major Vessels"),
    color="chest_pain_type:N",
    tooltip=[vessels_field, "chest_pain_type"],
)
vessels_chest_pain = vessels_chest_pain.properties(**vessels_layout)

vessels_chest_pain


In [14]:
# 9. Diagnosis distribution: record counts per diagnosis category, one colour
#    per category
diagnosis_axis = alt.X(
    "diagnosis_of_heart_disease:N",
    title="Heart Disease Diagnosis",
    axis=alt.Axis(labelAngle=0),  # keep the category labels horizontal
)

diagnosis_dist = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=diagnosis_axis,
        y=alt.Y("count()", title="Count"),
        color="diagnosis_of_heart_disease:N",
        tooltip=["count()"],
    )
    .properties(
        title="Heart Disease Diagnosis Distribution",
        width=400,
        height=300,
    )
)

diagnosis_dist


## ML-analysis (code)

In [15]:
#Import data

## Written analysis

In [16]:
#Title


In [17]:
#Summary

In [18]:
#Methods / results

In [19]:
#Discussion

In [20]:
#References