## 1. Load data
- pandas is a convenient package to represent and process tabular data: https://pandas.pydata.org/docs/user_guide/10min.html

In [None]:
import numpy as np  # this package is for matrix computation
import pandas as pd  # this package is for data formating and processing

# load data from data file
train_df = pd.read_csv('../data/train.csv')
test_X_df = pd.read_csv('../data/test_X.csv')
sample_y_df = pd.read_csv('../data/sample_submission.csv')

In [29]:
# take a look at your training set (with features and ground-truth label 'HeartDisease')
train_df.info()
train_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PatientID       720 non-null    int64  
 1   Age             720 non-null    int64  
 2   Sex             720 non-null    object 
 3   ChestPainType   720 non-null    object 
 4   RestingBP       720 non-null    int64  
 5   Cholesterol     720 non-null    int64  
 6   FastingBS       720 non-null    int64  
 7   RestingECG      720 non-null    object 
 8   MaxHR           720 non-null    int64  
 9   ExerciseAngina  720 non-null    object 
 10  Oldpeak         720 non-null    float64
 11  ST_Slope        720 non-null    object 
 12  HeartDisease    720 non-null    int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 73.2+ KB


Unnamed: 0,PatientID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,761,52,M,ASY,112,230,0,Normal,160,N,0.0,Up,1
1,181,51,M,ATA,130,224,0,Normal,150,N,0.0,Up,0
2,309,57,M,ASY,95,0,1,Normal,182,N,0.7,Down,1
3,84,56,M,ASY,150,213,1,Normal,125,Y,1.0,Flat,1
4,88,43,M,TA,120,291,0,ST,155,N,0.0,Flat,1


In [30]:
# take a look at your test set (with only features)
test_X_df.info()
test_X_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PatientID       198 non-null    int64  
 1   Age             198 non-null    int64  
 2   Sex             198 non-null    object 
 3   ChestPainType   198 non-null    object 
 4   RestingBP       198 non-null    int64  
 5   Cholesterol     198 non-null    int64  
 6   FastingBS       198 non-null    int64  
 7   RestingECG      198 non-null    object 
 8   MaxHR           198 non-null    int64  
 9   ExerciseAngina  198 non-null    object 
 10  Oldpeak         198 non-null    float64
 11  ST_Slope        198 non-null    object 
dtypes: float64(1), int64(6), object(5)
memory usage: 18.7+ KB


Unnamed: 0,PatientID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,581,48,M,ASY,140,208,0,Normal,159,Y,1.5,Up
1,45,59,M,NAP,130,318,0,Normal,120,Y,1.0,Flat
2,509,58,M,ASY,110,198,0,Normal,110,N,0.0,Flat
3,232,38,F,ATA,120,275,0,Normal,129,N,0.0,Up
4,810,55,F,ATA,135,250,0,LVH,161,N,1.4,Flat


In [31]:
# take a look at the format of submission (with only predicted labels)
# your submitted prediction on test_X should follow this format, otherwise you may receive errors on Kaggle
sample_y_df.info()
sample_y_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   PatientID     198 non-null    int64
 1   HeartDisease  198 non-null    int64
dtypes: int64(2)
memory usage: 3.2 KB


Unnamed: 0,PatientID,HeartDisease
0,581,0
1,45,0
2,509,1
3,232,0
4,810,0


## 2. Data processing
- Categorical feature -> numerical feature
- Feature scaling: https://scikit-learn.org/stable/modules/preprocessing.html
- ...

In [32]:
# this function is to convert categorical feature to numerical (one-hot representation)
def convert_categorical_to_numerical(df):
    new_df = df.copy()  # so operations on new_df will not influence df

    # check get_dummies doc: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html for more info
    sex = pd.get_dummies(new_df['Sex'], prefix='sex', dtype=float) # convert Sex to integer values
    chest = pd.get_dummies(new_df['ChestPainType'], prefix='chest', dtype=float) # convert ChestPainType to integer values

    # YOUR TASK: convert other categorical features
    resting = pd.get_dummies(new_df['RestingECG'], prefix="restingECG", dtype=float)
    angina = pd.get_dummies(new_df['ExerciseAngina'], prefix="angina", dtype=float)
    st_slope = pd.get_dummies(new_df['ST_Slope'], prefix="stslope", dtype=float)



    # drop categorical features with their numerical values
    # YOUR TASK: drop other categorical features
    new_df.drop(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina','ST_Slope' ], axis=1, inplace=True)

    # create new dataframe with only numerical values
    # YOUR TASK: concatenate with other converted features
    new_df = pd.concat([new_df, sex, chest, resting, angina, st_slope], axis=1)

    return new_df

# convert features for training and testing data
my_train_df = convert_categorical_to_numerical(train_df)
my_test_X_df = convert_categorical_to_numerical(test_X_df)

my_train_df.head(n=5)

Unnamed: 0,PatientID,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,sex_F,sex_M,...,chest_NAP,chest_TA,restingECG_LVH,restingECG_Normal,restingECG_ST,angina_N,angina_Y,stslope_Down,stslope_Flat,stslope_Up
0,761,52,112,230,0,160,0.0,1,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,181,51,130,224,0,150,0.0,0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,309,57,95,0,1,182,0.7,1,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,84,56,150,213,1,125,1.0,1,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,88,43,120,291,0,155,0.0,1,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [33]:
# You may apply feature proceccing tricks mentioned in class
# e.g., feature normalization/standardization etc
for column_name, column_data in my_train_df.iteritems():
    min_val = column_data.min()
    max_val = column_data.max()
    q1 = column_data.quantile(0.25)
    q3 = column_data.quantile(0.75)

    print(f"{column_name}: min: {min_val} MAX: {max_val} Q1: {q1} Q3: {q3}")

PatientID: min: 1 MAX: 917 Q1: 236.75 Q3: 690.25
Age: min: 28 MAX: 77 Q1: 48.0 Q3: 60.0
RestingBP: min: 80 MAX: 200 Q1: 120.0 Q3: 140.0
Cholesterol: min: 0 MAX: 603 Q1: 170.75 Q3: 265.25
FastingBS: min: 0 MAX: 1 Q1: 0.0 Q3: 0.0
MaxHR: min: 60 MAX: 202 Q1: 120.0 Q3: 154.0
Oldpeak: min: -2.0 MAX: 6.2 Q1: 0.0 Q3: 1.6
HeartDisease: min: 0 MAX: 1 Q1: 0.0 Q3: 1.0
sex_F: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 0.0
sex_M: min: 0.0 MAX: 1.0 Q1: 1.0 Q3: 1.0
chest_ASY: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 1.0
chest_ATA: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 0.0
chest_NAP: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 0.0
chest_TA: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 0.0
restingECG_LVH: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 0.0
restingECG_Normal: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 1.0
restingECG_ST: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 0.0
angina_N: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 1.0
angina_Y: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 1.0
stslope_Down: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 0.0
stslope_Flat: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 1.0
stslope_Up: min: 0.0 MAX: 1.0 Q1: 0.0 Q3: 1

  for column_name, column_data in my_train_df.iteritems():


In [34]:
# removing outliers of oldpeak, maxhr, and age
def identify_outliers(df):
  outlier_rows = []
  for column_name, items in df.iteritems():
      if column_name in ["Age", "MaxHR","Oldpeak"]:
          q1 = items.quantile(0.25)
          q3 = items.quantile(0.75)

          iqr = q3 - q1
          lb = q1 - (1.5 * iqr)
          ub = q3 + (1.5 * iqr)

          column_outliers = []

          for index, row in df.iterrows():
              if row[column_name] < lb or row[column_name] > ub:
                  column_outliers.append(index)

          outlier_rows.append((column_name, column_outliers))  # Append outliers for this column
  return outlier_rows

def print_outliers(outlier_rows):
  for column_name, outliers in outlier_rows:
      print(f"For column {column_name}, the outliers are:")
      print(outliers)
      print("------------------")

def drop_outliers(outlier_rows, df):
  for column_name, outliers in outlier_rows:
      df = df.drop(outliers)
  return df


def cap_outliers(df, column_name):
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    df[column_name] = np.where(df[column_name] < lower_bound, lower_bound, df[column_name])
    df[column_name] = np.where(df[column_name] > upper_bound, upper_bound, df[column_name])

    return df

# Cap outliers for the 'Oldpeak' column in the training dataset
my_train_df = cap_outliers(my_train_df, 'Oldpeak')

# Display the updated statistics for 'Oldpeak'
my_train_df['Oldpeak'].describe()




outliers = identify_outliers(my_train_df)

print_outliers(outliers)

my_train_df = drop_outliers(outliers, my_train_df)

print("------------------------------")


outliers = identify_outliers(my_train_df)

print_outliers(outliers)





  for column_name, items in df.iteritems():


For column Age, the outliers are:
[318, 630, 686]
------------------
For column MaxHR, the outliers are:
[278, 299]
------------------
For column Oldpeak, the outliers are:
[]
------------------
------------------------------


  for column_name, items in df.iteritems():


For column Age, the outliers are:
[]
------------------
For column MaxHR, the outliers are:
[]
------------------
For column Oldpeak, the outliers are:
[]
------------------


## 3. Create classifier and fit the data
- sklearn is a convenient package for ML: https://scikit-learn.org/stable/
- you are encouraged to try any ML models: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
- you are encouraged to try model selection methods: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

# prepare features and labels for training/testing
train_X = my_train_df.drop(["HeartDisease", "PatientID"], axis=1)
train_y = my_train_df["HeartDisease"]
test_X = my_test_X_df.drop(["PatientID"], axis=1)

# define and fit your model, with manually set hyperparameter
# e.g., here is an example of KNN classifier, and you may tune the hyperparameter "n_neighbors"
model = KNeighborsClassifier(n_neighbors=10)
model.fit(train_X, train_y)

# evaluate accuracy/f1 score on training data
train_y_pred = model.predict(train_X)
print(accuracy_score(train_y, train_y_pred))
print(f1_score(train_y, train_y_pred))

0.7580419580419581
0.7866831072749692


In [40]:
# model selection: hyperparameter tuning
hyperpara_grid = {'n_neighbors':[3, 5, 10, 15]} # candidate values for the hyperparameter to try
base_model = KNeighborsClassifier()
clf = GridSearchCV(base_model, hyperpara_grid, cv=5) # 5-fold cross validation
clf.fit(train_X, train_y)
print(clf.cv_results_.keys()) # all results for 5-fold cross validation
print(clf.cv_results_['mean_test_score']) # average validation performance for different hyperparameter values

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])
[0.68671329 0.7006993  0.68811189 0.69370629]


In [43]:
#re-train model after finding the optimal hyper params:

optimal_k = clf.best_params_['n_neighbors']
final_model = KNeighborsClassifier(n_neighbors=optimal_k)
final_model.fit(train_X, train_y)

train_y_pred = final_model.predict(train_X)
print(accuracy_score(train_y, train_y_pred))
print(f1_score(train_y, train_y_pred))


0.8013986013986014
0.8301435406698564


## 4. Make predictions and format them into required submission file

In [42]:
# make predictions on test data
test_y_pred = final_model.predict(test_X)

# prepare the prediction file to submit on Kaggle
submission_df = pd.DataFrame({
    'PatientID': my_test_X_df['PatientID'],
    'HeartDisease': test_y_pred
    }
)
submission_df.to_csv("y_predict.csv", index=False)
submission_df.head(3)

Unnamed: 0,PatientID,HeartDisease
0,581,1
1,45,1
2,509,0
