## 1. Load data
- pandas is a convenient package to represent and process tabular data: https://pandas.pydata.org/docs/user_guide/10min.html

In [None]:
import numpy as np  # this package is for matrix computation
import pandas as pd  # this package is for data formating and processing

# read the three competition CSV files in one pass:
# training set, test features, and the sample submission
train_df, test_X_df, sample_y_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/heart-attack/train.csv',
        '/kaggle/input/heart-attack/test_X.csv',
        '/kaggle/input/heart-attack/sample_submission.csv',
    )
)

In [None]:
# inspect the training set (features plus the ground-truth label 'HeartDisease'):
# print the column summary first, then show the first five rows
train_df.info()
train_df.head(5)

In [None]:
# inspect the test set (features only):
# print the column summary first, then show the first five rows
test_X_df.info()
test_X_df.head(5)

In [None]:
# inspect the submission format (predicted labels only):
# print the column summary first, then show the first five rows
# NOTE: your submitted prediction on test_X should follow this format,
# otherwise you may receive errors on Kaggle
sample_y_df.info()
sample_y_df.head(5)

## 2. Data processing
- Categorical feature -> numerical feature
- Feature scaling: https://scikit-learn.org/stable/modules/preprocessing.html
- ...

In [None]:
# this function is to convert categorical feature to numerical (one-hot representation)
def convert_categorical_to_numerical(df, exclude=('PatientID', 'HeartDisease')):
    """Return a copy of ``df`` with its categorical columns one-hot encoded.

    'Sex' and 'ChestPainType' are always encoded when present, using the
    prefixes 'sex' and 'chest'. Every other column that holds strings or
    pandas categoricals is encoded as well, using its own column name as
    the prefix. The original categorical columns are dropped from the result.

    Args:
        df: input DataFrame; it is left unmodified.
        exclude: names of columns that must never be encoded, whatever their
            dtype. The default protects the ID and label columns.

    Returns:
        A new DataFrame with the untouched columns first (in their original
        order), followed by the float-valued indicator columns.

    Note:
        The indicator columns are derived from the category values present in
        ``df``. Two frames with different category values therefore produce
        different columns.
    """
    new_df = df.copy()  # so operations on new_df will not influence df

    # keep the established prefixes for the two columns handled explicitly
    prefixes = {'Sex': 'sex', 'ChestPainType': 'chest'}
    skipped = set(exclude or ())

    def _is_categorical(series):
        # string-like (object / string dtype) or pandas Categorical column
        dtype = series.dtype
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    # explicitly named columns first (keeps the previous output order),
    # then any remaining categorical column in its original position order
    categorical_cols = [
        col for col in prefixes
        if col in new_df.columns and col not in skipped
    ]
    categorical_cols += [
        col for col in new_df.columns
        if col not in prefixes
        and col not in skipped
        and _is_categorical(new_df[col])
    ]

    # check get_dummies doc: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html for more info
    encoded = [
        pd.get_dummies(new_df[col], prefix=prefixes.get(col, col), dtype=float)
        for col in categorical_cols
    ]

    # replace the categorical columns with their numerical indicator columns
    new_df = new_df.drop(columns=categorical_cols)
    return pd.concat([new_df, *encoded], axis=1)

# convert features for training and testing data
my_train_df = convert_categorical_to_numerical(train_df)
my_test_X_df = convert_categorical_to_numerical(test_X_df)

# The two frames are one-hot encoded independently, so a category value that
# appears in only one of them produces different columns. Align the test
# columns to the training feature columns (everything except the label):
# indicator columns missing from the test set are added as all zeros, and
# columns the training set never had are dropped.
feature_columns = my_train_df.columns.drop('HeartDisease', errors='ignore')
my_test_X_df = my_test_X_df.reindex(columns=feature_columns, fill_value=0.0)

my_train_df.head(n=5)

In [None]:
# You may apply the feature processing tricks mentioned in class,
# e.g., feature normalization/standardization, etc.


## 3. Create classifier and fit the data
- sklearn is a convenient package for ML: https://scikit-learn.org/stable/
- you are encouraged to try any ML models: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
- you are encouraged to try model selection methods: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

# split the converted frames into the label and the model inputs
# (the label and the "PatientID" column are removed from the inputs)
train_y = my_train_df["HeartDisease"]
train_X = my_train_df.drop(columns=["HeartDisease", "PatientID"])
test_X = my_test_X_df.drop(columns=["PatientID"])

# build a KNN classifier with a manually chosen hyperparameter and train it;
# fit() returns the estimator itself, so construction and fitting are chained
# ("n_neighbors" is the hyperparameter you may tune)
model = KNeighborsClassifier(n_neighbors=10).fit(train_X, train_y)

# report accuracy first, then F1 score, both measured on the training data
train_y_pred = model.predict(train_X)
for metric in (accuracy_score, f1_score):
    print(metric(train_y, train_y_pred))

In [None]:
# model selection: try several candidate "n_neighbors" values,
# scoring each one with 5-fold cross validation
hyperpara_grid = {'n_neighbors': [3, 5, 10, 15]}
base_model = KNeighborsClassifier()
clf = GridSearchCV(estimator=base_model, param_grid=hyperpara_grid, cv=5)
clf.fit(train_X, train_y)

cv_results = clf.cv_results_
print(cv_results.keys())  # names of all recorded cross-validation results
print(cv_results['mean_test_score'])  # average validation score per candidate value

## 4. Make predictions and format them into required submission file

In [None]:
# predict labels for the test features with the fitted model
test_y_pred = model.predict(test_X)

# build the Kaggle submission: start from the ID column of the test set,
# then attach the predicted labels as a second column
submission_df = my_test_X_df[['PatientID']].copy()
submission_df['HeartDisease'] = test_y_pred

# write the file without the row index and preview the first three rows
submission_df.to_csv("y_predict.csv", index=False)
submission_df.head(3)