In [88]:
# Preprocessing of heart disease CSV data

import pandas as pd
from sklearn.model_selection import train_test_split
# from sklearn.decomposition import PCA # Will revisit for visualizations
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

# Read the CSV so that it can be processed
heartDiseaseData = pd.read_csv("Heart_Disease_Prediction.csv")

# Used the replace method from pandas library per the documentation to convert the target column (Heart Disease) values in the dataframe
# Replaced instances of "Presence" with 1 and instances of "Absence" with 0
# Referred to https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html
heartDiseaseData.replace(to_replace = "Presence", value = 1, inplace = True)
heartDiseaseData.replace(to_replace = "Absence", value = 0, inplace = True)
print(heartDiseaseData)

# Dropped the target column (again Heart Disease) to form the attributes dataframe
# Referred to https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html
heartDiseaseAttr = heartDiseaseData.drop(columns = ["Heart Disease"])
print(heartDiseaseAttr)

# Specified the heart disease target var
heartDiseaseTarget = heartDiseaseData["Heart Disease"]
print(heartDiseaseTarget)
print("\n")

# Referred to https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html for splitting the data into training and testing sets
# From prior experience, an 80-20 train-test split might be useful
heartDiseaseAttrTrain, heartDiseaseAttrTest, heartDiseaseTargetTrain, heartDiseaseTargetTest = train_test_split(heartDiseaseAttr, heartDiseaseTarget, test_size = 0.20, random_state = 50, shuffle = True)

# Normalizing in accordance with our project proposal (our team discussed min-max normalization)
# Referred to https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler
# Fit the scaler on the heart disease attributes of the training data, and then transformed the attributes in both the training and testing data
heartDiseaseMMScaler = MinMaxScaler()
fittedHeartDiseaseMMScale = heartDiseaseMMScaler.fit(heartDiseaseAttrTrain)
normHeartDiseaseAttrTrain = fittedHeartDiseaseMMScale.transform(heartDiseaseAttrTrain)
normHeartDiseaseTargetTrain = fittedHeartDiseaseMMScale.transform(heartDiseaseAttrTest)

# Referred to https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
heartDiseaseLogRegClassifier = LogisticRegression().fit(normHeartDiseaseAttrTrain, heartDiseaseTargetTrain)
heartDiseaseLogRegClassifierPred = heartDiseaseLogRegClassifier.predict(normHeartDiseaseTargetTrain)
print(heartDiseaseLogRegClassifier.score(normHeartDiseaseTargetTrain, heartDiseaseTargetTest))

print(heartDiseaseAttrTrain)

     Age  Sex  Chest pain type   BP  Cholesterol  FBS over 120  EKG results  \
0     70    1                4  130          322             0            2   
1     67    0                3  115          564             0            2   
2     57    1                2  124          261             0            0   
3     64    1                4  128          263             0            0   
4     74    0                2  120          269             0            2   
..   ...  ...              ...  ...          ...           ...          ...   
265   52    1                3  172          199             1            0   
266   44    1                2  120          263             0            0   
267   56    0                2  140          294             0            2   
268   57    1                4  140          192             0            0   
269   67    1                4  160          286             0            2   

     Max HR  Exercise angina  ST depression  Slope 

  heartDiseaseData.replace(to_replace = "Absence", value = 0, inplace = True)
