## IMPORTS

In [37]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
 
from sklearn.model_selection import RandomizedSearchCV, cross_validate, train_test_split

## EDA

In [None]:
#import data

In [None]:
#visualize different parameters

## ML-analysis (code)

In [32]:
# Load the heart disease CSV into a DataFrame and preview the first five rows
data = pd.read_csv(filepath_or_buffer='data/raw/pretransformed_heart_disease.csv')
data.head(n=5)

Unnamed: 0,Age (in years),Sex,Chest pain type,Resting blood pressure (in mm Hg on admission to the hospital),Serum cholesterol (in mg/dl),Fasting blood sugar > 120 mg/dl,Resting electrocardiographic results,Maximum heart rate achieved,Exercise-induced angina,ST depression induced by exercise relative to rest,Slope of the peak exercise ST segment,Number of major vessels (0–3) colored by fluoroscopy,Thalassemia,Diagnosis of heart disease
0,63,male,typical angina,145,233,True,showing probable or definite left ventricular ...,150,no,2.3,downsloping,0.0,fixed defect,< 50% diameter narrowing
1,67,male,asymptomatic,120,229,False,showing probable or definite left ventricular ...,129,yes,2.6,flat,2.0,reversable defect,> 50% diameter narrowing
2,37,male,non-anginal pain,130,250,False,normal,187,no,3.5,downsloping,0.0,normal,< 50% diameter narrowing
3,41,female,atypical angina,130,204,False,showing probable or definite left ventricular ...,172,no,1.4,upsloping,0.0,normal,< 50% diameter narrowing
4,56,male,atypical angina,120,236,False,normal,178,no,0.8,upsloping,0.0,normal,< 50% diameter narrowing


In [33]:
# Print each column's non-null count and dtype to spot missing values and type issues
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 14 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   Age (in years)                                                  232 non-null    int64  
 1   Sex                                                             232 non-null    object 
 2   Chest pain type                                                 232 non-null    object 
 3   Resting blood pressure (in mm Hg on admission to the hospital)  232 non-null    int64  
 4   Serum cholesterol (in mg/dl)                                    232 non-null    int64  
 5   Fasting blood sugar > 120 mg/dl                                 232 non-null    bool   
 6   Resting electrocardiographic results                            232 non-null    object 
 7   Maximum heart rate achieved                          

In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns;
# a count below the number of rows indicates missing values in that column
data.describe()

Unnamed: 0,Age (in years),Resting blood pressure (in mm Hg on admission to the hospital),Serum cholesterol (in mg/dl),Maximum heart rate achieved,ST depression induced by exercise relative to rest,Number of major vessels (0–3) colored by fluoroscopy
count,232.0,232.0,232.0,232.0,232.0,228.0
mean,53.646552,130.732759,244.775862,154.431034,0.785345,0.464912
std,9.326919,16.832178,51.427179,21.014826,0.96871,0.820227
min,29.0,94.0,126.0,88.0,0.0,0.0
25%,46.0,120.0,211.0,143.0,0.0,0.0
50%,54.0,130.0,239.0,158.0,0.4,0.0
75%,60.0,140.0,270.25,170.0,1.4,1.0
max,77.0,192.0,564.0,202.0,4.4,3.0


In [35]:
# Hold out 20% of the rows as a test set. The fixed random_state makes the
# split identical on every "Restart Kernel -> Run All", so all downstream
# results are reproducible.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=123)

# Separate the features (X) from the target column (y) in each split
target_column = 'Diagnosis of heart disease'
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]
X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

In [36]:
# Lists of feature names
# NOTE: the names must match the DataFrame column labels exactly
# (labels are case-sensitive, e.g. 'Sex', not 'sex').
categorical_features = ['Sex',
                        'Chest pain type',
                        'Fasting blood sugar > 120 mg/dl',
                        'Resting electrocardiographic results',
                        'Exercise-induced angina',
                        'Slope of the peak exercise ST segment',
                        'Thalassemia']

# Every remaining feature column is treated as numeric. Iterating over
# X_train.columns (instead of using set arithmetic) keeps the original column
# order, so the list is identical on every run.
numeric_features = [
    column for column in X_train.columns if column not in categorical_features
]

# Create transformer pipelines
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Binary features keep a single indicator column
# (drop='if_binary'); categories not seen during fit are ignored at transform time.
# (fill_value is only used with strategy="constant", so it is not passed here.)
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", drop='if_binary', dtype=int, sparse_output=False),
)

# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Create the column transformer: route each group of columns to its pipeline
preprocessor = make_column_transformer(
    (categorical_transformer, categorical_features),
    (numeric_transformer, numeric_features),
)
# Show the preprocessor
preprocessor

## Written analysis

In [None]:
#Title


In [None]:
#summary

In [None]:
#Methods / results

In [None]:
#Discussion

In [None]:
#References