In [1]:
import os
import csv
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif

def makeDir():
    """Create the ``images`` and ``data`` directories in the working directory.

    One status line is printed per directory:

    * created                      -> "Successfully created ..."
    * already present              -> "... failed; it already exists."
    * any other ``OSError`` (for example a permissions problem) -> the real
      cause is printed instead of being mislabelled as "already exists".

    No exception is propagated; directory creation stays best-effort.

    Returns:
        None
    """
    # "images" receives the saved figures, "data" the cleaned data file.
    for path in ("images", "data"):
        try:
            os.mkdir(path)
        except FileExistsError:
            print ("\nCreation of the directory '%s' failed; it already exists.\n" % path)
        except OSError as err:
            # Keep going, but report what actually went wrong.
            print ("\nCreation of the directory '%s' failed: %s\n" % (path, err))
        else:
            print ("\nSuccessfully created the directory '%s'\n" % path)

def dataLoad():
    """Read ``heart.csv`` from the ``data`` directory into a DataFrame.

    The path is assembled with ``os.path.join`` so it resolves on every
    platform; the previous literal used a backslash, which only works on
    Windows and relies on an invalid string escape (``\\h``).

    Returns:
        pandas.DataFrame: the file contents as parsed by ``pd.read_csv``.

    Raises:
        FileNotFoundError: if ``data/heart.csv`` does not exist.
    """
    file = os.path.join('data', 'heart.csv')
    df = pd.read_csv(file)
    return df

def cleanedDataFile(df):
    """Write ``df`` to ``data/heartCleaned.csv`` (comma-separated, no index column).

    NOTE: this file defines ``cleanedDataFile`` a second time further down;
    when the cell runs, that later definition replaces this one.

    Args:
        df: pandas DataFrame to save.

    Returns:
        None
    """
    # Build the path portably (the old "data\\heartCleaned.csv" literal only
    # resolved on Windows) and make sure the target directory exists.
    os.makedirs("data", exist_ok=True)
    csv_path = os.path.join("data", "heartCleaned.csv")
    df.to_csv(csv_path, sep= ',', index= False)
    print(f"\n\nCleaned data is saved to: \n{csv_path}")

def eda(df):
    """Print a text-only exploratory summary of ``df`` to stdout.

    Reported in order: ``describe()`` statistics, ``info()``, number of
    duplicated rows, per-column non-null counts, row value counts together
    with the non-null mask, unique-value counts, null counts (per column and
    in total) and the correlation matrix of the numeric columns.

    Args:
        df: pandas DataFrame to summarise. It is not modified.

    Returns:
        None
    """
    print(f"Basic stats on data:\n{df.describe()}\n")
    # DataFrame.info() writes its own report and returns None, so it must not
    # be interpolated into an f-string (that printed the literal "None").
    print("Data info:")
    df.info()
    print()
    # check for dupes
    print('\nNumber of Duplicates:\n', len(df[df.duplicated()]))
    print(f"\nCounts\n{df.count()}\n")
    # frequency of each distinct row, followed by the element-wise non-null
    # mask (previously mislabelled "Nulls")
    print(f"\nValue Counts:\n{df.value_counts()}\nNon-null mask:\n{df.notna()}\n")
    # examine unique values; look for binary variables and other non-continuous data
    print(f"\nUnique values:\n{df.nunique()}\n")
    # checking for NaN values
    print(f"\nNulls in cloumns: \n{df.isna().sum()}\n")
    # count for whole dataframe
    print(f"Number of nulls in entire dataset: {df.isna().sum().sum()}\n")
    # A bare `df.corr()` inside a function displays nothing, and on newer
    # pandas it raises when the frame holds string columns. Restrict to the
    # numeric columns and print the result.
    numeric_corr = df.select_dtypes(include="number").corr()
    print(f"\nCorrelations (numeric columns):\n{numeric_corr}\n")
def viz(df):
    """Draw the exploratory figures for the raw data and save them under ``images``.

    Figures, in order:
      1. One histogram per column of ``df`` (not saved).
      2. Cholesterol histogram                       -> cholesterol.png
      3. Age distribution with KDE                   -> CVD_by_Age.png
      4. Correlation of numeric columns with
         ``HeartDisease``                            -> heartdiseaseCorr.png
      5. Box plots of Age, RestingBP, Cholesterol,
         MaxHR and Oldpeak                           -> outliers.png
      6. Count plots of every object-dtype column,
         restricted to rows with HeartDisease == 1   -> categoricalHistograms.png

    Args:
        df: pandas DataFrame containing at least the columns ``Age``,
            ``RestingBP``, ``Cholesterol``, ``MaxHR``, ``Oldpeak`` and
            ``HeartDisease``. It is not modified.

    Returns:
        None
    """
    # Look for zeroes in datasets that may be unreasonable.
    fig, axs = plt.subplots(len(df.columns), figsize=(5, 20))
    for n, col in enumerate(df.columns):
        a = df[col].hist(ax=axs[n])
        a.set_title(col)
    fig.tight_layout()

    # cholesterol stood out with about 18% of the values being 0 - visualize
    # (figure quoted from the original author's note).
    # Use a dedicated figure: when no `ax` is given, a pandas Series plot is
    # drawn on the current axes, i.e. on top of the histogram grid above.
    fig, ax = plt.subplots(figsize=(8, 5))
    df.Cholesterol.plot(kind="hist", bins=20, ax=ax)
    ax.legend()
    fig.savefig(os.path.join("images", "cholesterol.png"))
    plt.show()

    # Viz for title page.
    # displot is figure-level and creates its own figure, so no plt.figure()
    # call is made beforehand (it only produced an extra blank figure).
    sns.displot(df['Age'], color="red", label="Age", kde=True)
    plt.legend()
    plt.savefig(os.path.join("images", "CVD_by_Age.png"))
    plt.show()

    # look at correlations to target; correlations are only defined for the
    # numeric columns, so select them explicitly (newer pandas raises on
    # string columns instead of silently dropping them).
    numericCorr = df.select_dtypes(include='number').corr()
    fig, ax = plt.subplots()
    sns.heatmap(
        numericCorr[['HeartDisease']].sort_values(by='HeartDisease', ascending=False),
        vmin=-0.5, vmax=1, annot=True, cmap='BrBG', fmt='.2f', ax=ax
        )
    fig.savefig(os.path.join('images', 'heartdiseaseCorr.png'))

    # Look for outliers in numeric variables
    numericCols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
    fig = plt.figure(figsize=(10, 10))
    fig.patch.set_facecolor('#f1faee')
    for position, name in enumerate(numericCols, start=1):
        ax = fig.add_subplot(3, 2, position)
        ax.boxplot(df[name])
        ax.set_title(name, fontsize=12, color='Black')
    fig.suptitle('Numeric Variables', fontsize=20)
    fig.savefig(os.path.join('images', 'outliers.png'))

    # check categorical variables (only for rows where heart disease is present)
    heart_df = df[df['HeartDisease'] == 1]
    categoricalCols = [c for c in heart_df.columns if heart_df[c].dtypes == 'object']
    fig = plt.figure(figsize=(10, 10))
    fig.patch.set_facecolor('#f1faee')
    # Two plots per row; at least the original 3 rows, more if needed.
    rows = max(3, (len(categoricalCols) + 1) // 2)
    for position, name in enumerate(categoricalCols, start=1):
        ax = fig.add_subplot(rows, 2, position)
        sns.countplot(y=heart_df[name], data=heart_df,
                      order=heart_df[name].value_counts().index,
                      palette='Blues_r', edgecolor='black', ax=ax)
    fig.suptitle('Categorical Variables', fontsize=20)
    # Save only after the plots are drawn; saving first wrote an empty image.
    fig.savefig(os.path.join('images', 'categoricalHistograms.png'))
def dataCleanTransform(df):
    """Clean the raw frame, one-hot encode its categorical columns and plot the result.

    Steps:
      1. Replace ``RestingBP == 0`` with 132 on a copy of ``df``.
      2. One-hot encode ``Sex``, ``ChestPainType``, ``RestingECG``,
         ``ExerciseAngina`` and ``ST_Slope`` with ``pd.get_dummies``.
      3. Print the encoded frame and its ``describe()`` statistics.
      4. Draw pair plots (saved as ``images/pairplot.png`` and
         ``images/pairplot2.png``) and two correlation heat maps (saved as
         ``images/correlationHeatMap.png`` and ``images/TriangleHeatMap.png``).

    Args:
        df: raw pandas DataFrame containing ``RestingBP`` and the five
            categorical columns listed above. The caller's frame is left
            untouched; all changes are made on a copy.

    Returns:
        pandas.DataFrame: the non-categorical columns of the cleaned data
        joined with the dummy columns.
    """
    # through visualization found 1 zero value for blood pressure = dead
    # impute mean value for 0 in the resting BP with the mean for that demographic
    # (132 is the original author's chosen value).
    # Work on a copy and assign through .loc: an in-place mask() on an
    # attribute-accessed Series mutates the caller's data and is not
    # guaranteed to write back to the frame on newer pandas.
    cleanedDF = df.copy()
    cleanedDF.loc[cleanedDF['RestingBP'] == 0, 'RestingBP'] = 132

    # encode categorical variables
    categorical = ['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope']
    encoded = pd.get_dummies(cleanedDF[categorical])
    # info() prints by itself and returns None; wrapping it in print() only
    # added a stray "None" line.
    encoded.info()
    # join original dataframe and encoded set of features:
    # drop the string columns, then append the dummy columns
    encodedDF = cleanedDF.drop(columns=categorical).join(encoded)
    print(encodedDF)

    print(f"\n\nNew, cleaned data stats: \n"
        f"{encodedDF.describe()}")
    # get column names
    headers = encodedDF.columns

    # Visualize new dataframe with pairplots
    # Feature Pair Plots (all columns against all columns; slow to render)
    sns.pairplot(data=encodedDF,
        x_vars=headers,
        y_vars=headers,
        diag_kind='kde'
    )
    # Too many features to see anything substantial; splitting into two.
    # The second half starts at column 12 so that no column is skipped
    # (it previously started at 13, silently omitting column 12).
    a = sns.pairplot(data=encodedDF.iloc[:,0:12],diag_kind='kde',corner=True)
    a.savefig(os.path.join("images", "pairplot.png"))
    b = sns.pairplot(data=encodedDF.iloc[:,12:],diag_kind='kde',corner=True)
    b.savefig(os.path.join("images", "pairplot2.png"))
    # b has no useful data, just print a
    print(
        f"\n\nPair plots \n"
        f"{a}"
    )

    # Correlations of the numeric columns of the cleaned (not yet encoded)
    # data, computed once and reused by both heat maps.
    corr = cleanedDF.select_dtypes(include='number').corr()

    # Heatmap
    fig, ax = plt.subplots(figsize= (10,6))
    sns.heatmap(corr, vmin=-1, vmax=1, cmap= 'BrBG', annot=True, ax=ax)
    fig.savefig(os.path.join('images', 'correlationHeatMap.png'))

    # The output above is not as clear as it could be. Let's mask some of the output.
    # Boolean upper-triangle mask so that only the lower triangle is drawn.
    upper = np.triu(np.ones_like(corr, dtype=np.bool_))
    fig, ax = plt.subplots(figsize=(16,6))
    sns.heatmap(
        corr, mask=upper, vmin=-0.5, vmax=1, annot=True, cmap='Blues', ax=ax
        )
    fig.savefig(os.path.join('images', 'TriangleHeatMap.png'))

    return encodedDF

def cleanedDataFile(df):
    """Write ``df`` to ``data/heartCleaned.csv`` (comma-separated, no index column).

    The ``data`` directory is created if it is missing, and the path is built
    with ``os.path.join`` so it resolves on every platform (the previous
    backslash literal only worked on Windows).

    Args:
        df: pandas DataFrame to save.

    Returns:
        None
    """
    os.makedirs("data", exist_ok=True)
    csv_path = os.path.join("data", "heartCleaned.csv")
    df.to_csv(csv_path, sep= ',', index= False)
    print(f"\n\nCleaned data is saved to: \n{csv_path}\n")

def _printScores(y_true, y_pred):
    """Print accuracy and F1 score (rounded to 3 decimals) for one set of predictions."""
    print('Accuracy score: ', round(accuracy_score(y_true, y_pred), 3))
    print('F1 Score: ', round(f1_score(y_true, y_pred), 3))


def _confusionFrame(y_true, y_pred):
    """Return the confusion matrix as a labelled DataFrame with the positive class first.

    ``labels=[1, 0]`` is passed explicitly: by default scikit-learn orders the
    classes ascending (0, then 1), which would put the negative class in the
    row/column labelled "+".
    """
    return pd.DataFrame(
        metrics.confusion_matrix(y_true, y_pred, labels=[1, 0]),
        index=['Actual +', 'Actual -'],
        columns=['Predicted +', 'Predicted -'])


def model(df):
    """Fit and report logistic-regression and random-forest classifiers for ``HeartDisease``.

    Everything is reported through ``print``; nothing is returned. Steps:
      1. Logistic regression on the unscaled features.
      2. Logistic regression on standardized numeric features plus the
         remaining (dummy) columns, with confusion matrix and report.
      3. Random forest with 100 trees, then a grid-searched random forest
         (5-fold CV, F1 scoring), with confusion matrix and report.
      4. Mutual-information ranking of all features, saved as
         ``images/featureImportance.png``.
      5. Random forest trained on the ten highest-ranked features only.

    Every split uses ``test_size=0.20`` and ``random_state=20``.

    Args:
        df: pandas DataFrame with a binary ``HeartDisease`` column (values 0/1)
            and the columns ``Age``, ``RestingBP``, ``Cholesterol``,
            ``FastingBS``, ``MaxHR`` and ``Oldpeak``. Any object-dtype column
            is one-hot encoded; all other columns are used as features as-is.

    Returns:
        None
    """
    target = 'HeartDisease'
    numericCols = ['Age','RestingBP','Cholesterol','FastingBS','MaxHR','Oldpeak']

    # check counts for heart disease or not, and is it balanced or not?
    # (the percentages were computed but never shown before)
    print(f"Class balance (%):\n{round(df[target].value_counts()/len(df[target])*100)}\n")

    pd.set_option('display.max_columns', 40)

    # Dummy variables for any remaining categorical (object) columns
    cat = df.select_dtypes(include=object).columns
    df_dummy = pd.get_dummies(df, columns=cat)

    inp = df_dummy.drop(columns=target)
    out = df_dummy[target]

    x_train, x_test, y_train, y_test = train_test_split(
        inp, out, test_size=0.20, random_state=20)

    # Log regression on unscaled inputs
    log_reg = LogisticRegression(solver='liblinear').fit(x_train, y_train)
    y_pred_lr = log_reg.predict(x_test)
    _printScores(y_test, y_pred_lr)

    # Standardize the numeric columns and re-attach every other feature.
    # The other features are selected by name instead of by position
    # (previously `iloc[:, 7:]`), and the scaled frame keeps the original
    # index so the join cannot misalign rows.
    # NOTE(review): the scaler is fitted on all rows before the split, so
    # test-set statistics leak into training; consider fitting on the
    # training rows only.
    X = df_dummy[numericCols]
    X_std = pd.DataFrame(
        StandardScaler().fit_transform(X), columns=X.columns, index=X.index)
    X_nr = X_std.join(df_dummy.drop(columns=numericCols + [target]))

    # train test split
    x_train, x_test, y_train, y_test = train_test_split(
        X_nr, out, test_size=0.20, random_state=20)

    log_reg_nr = LogisticRegression(solver='liblinear').fit(x_train, y_train)
    y_pred_lr_nr = log_reg_nr.predict(x_test)
    _printScores(y_test, y_pred_lr_nr)

    # Confusion matrix and report for the standardized logistic regression.
    # Both use this model's predictions (the report previously used the
    # predictions of the unscaled model).
    print(_confusionFrame(y_test, y_pred_lr_nr))
    print(metrics.classification_report(y_test, y_pred_lr_nr))

    # Random Forest
    rf = RandomForestClassifier(random_state=20, n_estimators=100)
    rf = rf.fit(x_train, y_train)
    y_pred_rf = rf.predict(x_test)
    _printScores(y_test, y_pred_rf)

    # Tune the model with grid search
    params = {
        'n_estimators': [100, 200, 300],
        # 'auto' meant the same as 'sqrt' for classifiers and is rejected by
        # current scikit-learn releases, so only 'sqrt' is searched.
        'max_features': ['sqrt'],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False],
    }

    grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=20),
                               param_grid=params,
                               scoring='f1',
                               cv=5,
                               verbose=0,
                               n_jobs=-1)

    grid_search.fit(x_train, y_train)
    print("best score: ", grid_search.best_score_)
    print("best param: ", grid_search.best_params_)

    # apply best params to the RF model
    best_para = grid_search.best_params_
    rf_2 = RandomForestClassifier(random_state=20, **best_para)
    rf_2 = rf_2.fit(x_train, y_train)
    y_pred_rf_2 = rf_2.predict(x_test)
    print(f'\nReport on Ramdom Forest\n')
    _printScores(y_test, y_pred_rf_2)

    # Confusion matrix and report for RF (printed once)
    print(_confusionFrame(y_test, y_pred_rf_2))
    print(metrics.classification_report(y_test, y_pred_rf_2))

    # See what SKlearn says are important features with dummy variables in model.
    # Scores are labelled with the columns of `inp`, the frame they were
    # computed from; the previous label list still contained the target and
    # dropped the last feature, shifting the names. A fixed random_state
    # makes the ranking reproducible.
    plt.figure(figsize=(15,8))
    imp = mutual_info_classif(inp, out, random_state=20)
    feature_imp = pd.Series(imp, index=inp.columns).sort_values(ascending=True)
    feature_imp.plot(kind = 'barh', color = 'teal')
    plt.title("Feature importance plot")
    plt.savefig(os.path.join("images", "featureImportance.png"))

    # See impact of using only the top 10 features, taken from the ranking
    # above rather than from a hard-coded list.
    topFeatures = feature_imp.nlargest(10).index.tolist()
    inp_feat = df_dummy[topFeatures]
    out_feat = df_dummy[target]

    x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
        inp_feat, out_feat, test_size=0.20, random_state=20)

    rf_feat = RandomForestClassifier(random_state=20, n_estimators=100)
    rf_feat = rf_feat.fit(x_train_2, y_train_2)
    y_pred_rf_feat = rf_feat.predict(x_test_2)
    print(f'\nReport on Ramdom Forest with only top 10 feautres\n')
    _printScores(y_test_2, y_pred_rf_feat)

def main():
    """Run the full pipeline end to end.

    Order of operations: create the output directories, load the raw data,
    print the EDA summary, draw the exploratory figures, clean and encode the
    data, train and report the models, then save the encoded data to disk.
    """
    makeDir()
    rawFrame = dataLoad()
    eda(rawFrame)
    viz(rawFrame)
    encodedFrame = dataCleanTransform(rawFrame)

    # Optional pandas-profiling report, disabled for now:
    # https://pandas-profiling.github.io/pandas-profiling/docs/master/index.html
    #   profile = ProfileReport(rawFrame, title="Cardiovascular Disease Data Profiling Report", explorative=True)
    #   profile.to_file("CVD.html")      # export for report
    #   profile.to_notebook_iframe()     # 3.1 widgets() is broken; regress to 3.0 for
    #                                    # widgets or use to_notebook_iframe (CAO: Nov 2021)

    print("\n\nNow running models to train and test data with new dataset.\n")
    model(encodedFrame)
    cleanedDataFile(encodedFrame)
    # Other notebooks in the library (not run from here):
    #   %run ./eda.ipynb
    #   %run ./models.ipynb

# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Creation of the directory images failed; it already exists.


Creation of the directory data failed; it already exists.

Basic stats on data:
              Age   RestingBP  Cholesterol   FastingBS       MaxHR  \
count  918.000000  918.000000   918.000000  918.000000  918.000000   
mean    53.510893  132.396514   198.799564    0.233115  136.809368   
std      9.432617   18.514154   109.384145    0.423046   25.460334   
min     28.000000    0.000000     0.000000    0.000000   60.000000   
25%     47.000000  120.000000   173.250000    0.000000  120.000000   
50%     54.000000  130.000000   223.000000    0.000000  138.000000   
75%     60.000000  140.000000   267.000000    0.000000  156.000000   
max     77.000000  200.000000   603.000000    1.000000  202.000000   

          Oldpeak  HeartDisease  
count  918.000000    918.000000  
mean     0.887364      0.553377  
std      1.066570      0.497414  
min     -2.600000      0.000000  
25%      0.000000      0.000000  
50%      0.600000     

  plt.show()
  plt.show()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Sex_F              918 non-null    uint8
 1   Sex_M              918 non-null    uint8
 2   ChestPainType_ASY  918 non-null    uint8
 3   ChestPainType_ATA  918 non-null    uint8
 4   ChestPainType_NAP  918 non-null    uint8
 5   ChestPainType_TA   918 non-null    uint8
 6   RestingECG_LVH     918 non-null    uint8
 7   RestingECG_Normal  918 non-null    uint8
 8   RestingECG_ST      918 non-null    uint8
 9   ExerciseAngina_N   918 non-null    uint8
 10  ExerciseAngina_Y   918 non-null    uint8
 11  ST_Slope_Down      918 non-null    uint8
 12  ST_Slope_Flat      918 non-null    uint8
 13  ST_Slope_Up        918 non-null    uint8
dtypes: uint8(14)
memory usage: 12.7 KB
None
     Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  HeartDisease  \
0     40        140          289     