## 1. Load data
- pandas is a convenient package to represent and process tabular data: https://pandas.pydata.org/docs/user_guide/10min.html

In [1]:
import numpy as np  # this package is for matrix computation
import pandas as pd  # this package is for data formating and processing

# load data from data file
train_df = pd.read_csv('/Users/shubhangkaushik/Downloads/fall-2023-ist-557-individual-project-i/train.csv')
test_X_df = pd.read_csv('/Users/shubhangkaushik/Downloads/fall-2023-ist-557-individual-project-i/test_X.csv')
sample_y_df = pd.read_csv('/Users/shubhangkaushik/Downloads/fall-2023-ist-557-individual-project-i/sample_submission.csv')

In [2]:
# take a look at your training set (with features and ground-truth label 'HeartDisease')
train_df.info()
train_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PatientID       720 non-null    int64  
 1   Age             720 non-null    int64  
 2   Sex             720 non-null    object 
 3   ChestPainType   720 non-null    object 
 4   RestingBP       720 non-null    int64  
 5   Cholesterol     720 non-null    int64  
 6   FastingBS       720 non-null    int64  
 7   RestingECG      720 non-null    object 
 8   MaxHR           720 non-null    int64  
 9   ExerciseAngina  720 non-null    object 
 10  Oldpeak         720 non-null    float64
 11  ST_Slope        720 non-null    object 
 12  HeartDisease    720 non-null    int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 73.2+ KB


Unnamed: 0,PatientID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,761,52,M,ASY,112,230,0,Normal,160,N,0.0,Up,1
1,181,51,M,ATA,130,224,0,Normal,150,N,0.0,Up,0
2,309,57,M,ASY,95,0,1,Normal,182,N,0.7,Down,1
3,84,56,M,ASY,150,213,1,Normal,125,Y,1.0,Flat,1
4,88,43,M,TA,120,291,0,ST,155,N,0.0,Flat,1


In [3]:
# take a look at your test set (with only features)
test_X_df.info()
test_X_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PatientID       198 non-null    int64  
 1   Age             198 non-null    int64  
 2   Sex             198 non-null    object 
 3   ChestPainType   198 non-null    object 
 4   RestingBP       198 non-null    int64  
 5   Cholesterol     198 non-null    int64  
 6   FastingBS       198 non-null    int64  
 7   RestingECG      198 non-null    object 
 8   MaxHR           198 non-null    int64  
 9   ExerciseAngina  198 non-null    object 
 10  Oldpeak         198 non-null    float64
 11  ST_Slope        198 non-null    object 
dtypes: float64(1), int64(6), object(5)
memory usage: 18.7+ KB


Unnamed: 0,PatientID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,581,48,M,ASY,140,208,0,Normal,159,Y,1.5,Up
1,45,59,M,NAP,130,318,0,Normal,120,Y,1.0,Flat
2,509,58,M,ASY,110,198,0,Normal,110,N,0.0,Flat
3,232,38,F,ATA,120,275,0,Normal,129,N,0.0,Up
4,810,55,F,ATA,135,250,0,LVH,161,N,1.4,Flat


In [4]:
# take a look at the format of submission (with only predicted labels)
# your submitted prediction on test_X should follow this format, otherwise you may receive errors on Kaggle
sample_y_df.info()
sample_y_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   PatientID     198 non-null    int64
 1   HeartDisease  198 non-null    int64
dtypes: int64(2)
memory usage: 3.2 KB


Unnamed: 0,PatientID,HeartDisease
0,581,0
1,45,0
2,509,1
3,232,0
4,810,0


## 2. Data processing
- Categorical feature -> numerical feature
- Feature scaling: https://scikit-learn.org/stable/modules/preprocessing.html
- ...

In [5]:
# this function is to convert categorical feature to numerical (one-hot representation)
def convert_categorical_to_numerical(df):
    new_df = df.copy()  # so operations on new_df will not influence df
    
    # check get_dummies doc: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html for more info
    sex = pd.get_dummies(new_df['Sex'], prefix='sex', dtype=float) # convert Sex to integer values
    chest = pd.get_dummies(new_df['ChestPainType'], prefix='chest', dtype=float) # convert ChestPainType to integer values
    ecg = pd.get_dummies(new_df['RestingECG'], prefix='ecg', dtype=float) 
    exercise = pd.get_dummies(new_df['ExerciseAngina'], prefix='exercise', dtype=float)
    slope = pd.get_dummies(new_df['ST_Slope'], prefix='slope', dtype=float)
    # YOUR TASK: convert other categorical features
    
    
    
    # drop categorical features with their numerical values
    # YOUR TASK: drop other categorical features
    new_df.drop(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], axis=1, inplace=True) 
    
    # create new dataframe with only numerical values
    # YOUR TASK: concatenate with other converted features
    new_df = pd.concat([new_df, sex, chest, ecg, exercise, slope], axis=1)
    
    return new_df

# convert features for training and testing data
my_train_df = convert_categorical_to_numerical(train_df)
my_test_X_df = convert_categorical_to_numerical(test_X_df)

my_train_df.head(n=5)

Unnamed: 0,PatientID,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,sex_F,sex_M,...,chest_NAP,chest_TA,ecg_LVH,ecg_Normal,ecg_ST,exercise_N,exercise_Y,slope_Down,slope_Flat,slope_Up
0,761,52,112,230,0,160,0.0,1,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,181,51,130,224,0,150,0.0,0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,309,57,95,0,1,182,0.7,1,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,84,56,150,213,1,125,1.0,1,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,88,43,120,291,0,155,0.0,1,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [6]:
# You may apply feature proceccing tricks mentioned in class
# e.g., feature normalization/standardization etc


## 3. Create classifier and fit the data
- sklearn is a convenient package for ML: https://scikit-learn.org/stable/
- you are encouraged to try any ML models: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
- you are encouraged to try model selection methods: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [17]:
#from sklearn import linear_model
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score


# prepare features and labels for training/testing
train_X = my_train_df.drop(["HeartDisease", "PatientID"], axis=1)
train_y = my_train_df["HeartDisease"]
test_X = my_test_X_df.drop(["PatientID"], axis=1)

# model selection: hyperparameter tuning
# Creating the hyperparameter grid
#c_space = np.logspace(-5, 5, 10, 15)
#param_grid = {'C': c_space}

# define and fit your model, with manually set hyperparameter
# e.g., here is an example of KNN classifier, and you may tune the hyperparameter "n_neighbors"
#model = linear_model.LogisticRegression(max_iter=800)
 
# Instantiating Decision Tree classifier
model = RandomForestClassifier()
model.fit(train_X,train_y)
# Instantiating RandomizedSearchCV object


# evaluate accuracy/f1 score on training data
train_y_pred = model.predict(train_X)
print(accuracy_score(train_y, train_y_pred))
print(f1_score(train_y, train_y_pred))

1.0
1.0


In [23]:
#aram_grid = {
#   'n_estimators': [25, 50, 100, 150],
#   'max_features': ['sqrt', 'log2', None],
#   'max_depth': [5,10,15],
#   'max_leaf_nodes': [5,10,15]
#}
# Creating the hyperparameter grid
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

model_grid = RandomForestClassifier(max_depth=12,
                                    max_features="log2",
                                    max_leaf_nodes=12,
                                    n_estimators=150)
#clf=GridSearchCV(model_grid,param_grid, cv=15) 
clf=RandomizedSearchCV(model, param_dist, cv = 10)
clf.fit(train_X, train_y)
print(clf.cv_results_.keys()) # all results for 5-fold cross validation
print(clf.cv_results_['mean_test_score']) # average validation performance for different hyperparameter values

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_max_features', 'param_min_samples_leaf', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])
[0.86527778 0.86527778 0.87083333 0.87222222 0.85416667 0.85555556
 0.87222222 0.86805556 0.87638889 0.85972222]


## 4. Make predictions and format them into required submission file

In [24]:
# make predictions on test data
test_y_pred = model.predict(test_X)

# prepare the prediction file to submit on Kaggle
submission_df = pd.DataFrame({
    'PatientID': my_test_X_df['PatientID'],
    'HeartDisease': test_y_pred
    }
)
submission_df.info()
submission_df.to_csv("y_predict.csv", index=False)
print(submission_df)
submission_df.to_clipboard()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   PatientID     198 non-null    int64
 1   HeartDisease  198 non-null    int64
dtypes: int64(2)
memory usage: 3.2 KB
     PatientID  HeartDisease
0          581             1
1           45             1
2          509             1
3          232             0
4          810             0
..         ...           ...
193        595             1
194        742             0
195        230             0
196        437             1
197        111             1

[198 rows x 2 columns]
