# Model Predictions

# 100K Train, 2.2M Test

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
### vvvvvvvvvvvvvvvvvvvvvvvvvvvvv global vars vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv

RANDOM_STATE = 42

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ end global vars ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


def menu(choices, prompts):
    """Show a menu and keep asking until the user types a valid choice.

    Each entry of ``prompts`` is printed next to the entry of ``choices``
    at the same position. The user's reply is then read from stdin and
    compared against ``choices``; since ``input`` returns a string, the
    valid options should be strings as well.

    Args:
        choices: Collection of accepted answers, indexable in parallel
            with ``prompts``.
        prompts: Sequence of human-readable labels, one per choice.

    Returns:
        The reply typed by the user, once it is found in ``choices``.
    """
    ask = 'Enter your selection from the options above: '

    # Render the menu: one "<prompt> :  <choice>" line per option.
    print()
    for position, label in enumerate(prompts):
        print()
        print(label, ": ", choices[position])
    print()

    # Re-prompt until the reply is one of the accepted choices.
    selection = input(ask)
    while selection not in choices:
        print('Please enter your selection from the options above: ')
        selection = input(ask)
    return selection


def preProcess(file):
    """Load a CSV, derive the model columns and return the frame plus target.

    Steps, in order:

    1. Read ``file`` with pandas (first row is the header).
    2. Add an ``LR`` column filled with empty strings and an
       ``age_group_float`` column that maps the ``age_group`` labels
       ``"-65"``, ``"65-74"``, ``"75-84"``, ``"85+"`` to 0-3 (any other
       value is left as it is).
    3. Ask on stdin whether to print diagnostics (answer ``3`` to enable).
    4. If the frame contains nulls, drop the rows that have no value in
       ``age_group_float`` or ``dep_target_hd``.
    5. Label-encode ``dep_target_hd``.

    Args:
        file: Path of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        tuple: ``(X, y)`` where ``X`` is the whole DataFrame (the target
        column is still part of it) and ``y`` is the array returned by
        ``LabelEncoder.fit_transform`` for ``dep_target_hd``.
    """
    from sklearn.preprocessing import LabelEncoder

    df = pd.read_csv(file, header=0)
    df["LR"] = ""
    df["age_group_float"] = df["age_group"].replace({"-65":0, "65-74":1, "75-84":2, "85+":3})
    print()
    answer = input('To display dataset parameters, enter 3 otherwise just press enter: ')
    show_details = answer == '3'

    if show_details:
        print('df.head(): \n', df.head(), '\n')

        # use describe and info to get a better idea what the data looks like:
        print('df.info(): ')
        df.info()  # info() writes its report itself and returns None
        print()

        print('df.describe(): ')
        print(df.describe(), '\n')

        # find dimensions of the data:
        print(" data set dimensions:{}".format(df.shape))

    # Find missing data. The previous code called df.dropna() and threw the
    # result away, so nothing was ever removed. A frame-wide dropna() is not
    # a usable fix either: a column that is empty in every row would make it
    # discard the entire data set. Only rows lacking a value this function
    # itself relies on are removed.
    required = ["age_group_float", "dep_target_hd"]
    if df.isnull().sum().sum() > 0:
        rows_before = len(df)
        df = df.dropna(subset=required)
        print('found null data; rows dropped for missing', required, ':',
              rows_before - len(df))

    if show_details:
        # isna() is an alias of isnull(), so one report is enough.
        print('df.isnull().sum()')
        print(df.isnull().sum(), '\n')

    print('\ncompletion of search for missing/null data: \n\n')

    X = df
    y = df["dep_target_hd"]

    if show_details:
        print('df X:')
        print(X, '\n')

        print('df Y:')

        print(y, '\n')

    y = LabelEncoder().fit_transform(y)

    if show_details:
        print('Y transformed into 0s and 1s:')
        print(y, '\n')

        print('X: ')
        print(X, '\n')
        print('X_tshape: ', X.shape, '\n')

    return X, y


# Logistic Regression Algorithm:
def LR(X_train_data, y_train, X_test_data,X_test, y_test, choices, choice, results):
    """Fit logistic regression, evaluate it and start the prediction table.

    Args:
        X_train_data: Feature matrix used for fitting.
        y_train: Labels matching ``X_train_data``.
        X_test_data: Feature matrix to predict on.
        X_test: Frame holding the rows behind ``X_test_data``; the reporting
            columns (``person_id``, the ten feature columns and
            ``dep_target_hd``) are copied out of it.
        y_test: True labels matching ``X_test_data``.
        choices: List of model names; ``choices[choice]`` is the key under
            which this model's accuracy is stored.
        choice: Index into ``choices`` for this model.
        results: Dict of accuracies, updated in place.

    Returns:
        tuple: ``(choice + 1, results, sub)`` where ``sub`` is a new
        DataFrame with the reporting columns plus an ``LR`` column of
        predictions.
    """
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(solver='lbfgs', random_state=RANDOM_STATE)
    classifier.fit(X_train_data, y_train)
    y_pred = classifier.predict(X_test_data)

    report_columns = [
        "person_id", "gender_source_value", "race_source_value",
        "ethnicity_source_value", "i_diabetes", "i_chronic", "i_high_bp",
        "i_obesity", "i_statin", "i_aspirin", "age_group_float",
        "dep_target_hd",
    ]
    # Take the copy first and attach the predictions to the copy, so the
    # frame passed in by the caller is never written to.
    sub = pd.DataFrame(data=X_test)[report_columns].copy()
    sub['LR'] = y_pred
    display(sub.head(5))

    cm = confusion_matrix(y_test, y_pred)

    # Rows are true labels, columns are predicted labels. For labels 0/1:
    # cm[0,0] is the true negatives
    # cm[1,1] is the true positives
    # cm[0,1] are the false positives
    # cm[1,0] are the false negatives

    print('\nResults for Logistic Regression: \n')
    print('cm: ')
    print(cm, '\n')

    score = classifier.score(X_test_data, y_test)
    print('score: ', score, '\n')

    # Correct predictions sit on the diagonal; dividing by the matrix total
    # (the number of test cases) gives accuracy for any number of classes.
    acc = np.trace(cm) / cm.sum()
    print('acc: ', acc, '\n')

    results[choices[choice]] = acc
    choice += 1
    print('results: ', results, '\n')

    print('\n')

    return choice, results, sub

# KNeighborsClassifier
def KNN(X_train_data, y_train, X_test_data,X_test, y_test, choices, choice, results, sub):
    """Fit a 2-nearest-neighbours classifier and record its predictions.

    Args:
        X_train_data: Feature matrix used for fitting.
        y_train: Labels matching ``X_train_data``.
        X_test_data: Feature matrix to predict on.
        X_test: Not used here; kept so all model functions share a signature.
        y_test: True labels matching ``X_test_data``.
        choices: List of model names; ``choices[choice]`` is the results key.
        choice: Index into ``choices`` for this model.
        results: Dict of accuracies, updated in place.
        sub: Prediction table; a ``KNN`` column is added to it in place.

    Returns:
        tuple: ``(choice + 1, results, sub)``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    nearest = 2
    # Minkowski distance with p=2 is the Euclidean distance.
    classifier = KNeighborsClassifier(n_neighbors=nearest, metric='minkowski', p=2)
    classifier.fit(X_train_data, y_train)
    print('This is the KNN: ')

    y_pred = classifier.predict(X_test_data)
    sub['KNN'] = y_pred
    display(sub.head(5))

    cm = confusion_matrix(y_test, y_pred)

    print('\nResults for KNN: \n')
    print('cm: ')
    print(cm, '\n')

    # Diagonal = correct predictions; matrix total = number of test cases.
    # Unlike cm[0,0] + cm[1,1] this stays valid for any number of classes.
    acc = np.trace(cm) / cm.sum()
    print('acc: ', acc, '\n')

    print('\n')

    results[choices[choice]] = acc
    choice += 1
    print('results: ', results, '\n')

    print('\n')

    return choice, results, sub

# SVM using SVC:
def SVC(X_train_data, y_train, X_test_data,X_test, y_test, choices, choice, results, sub):
    """Fit a linear-kernel support vector classifier and record its predictions.

    Args:
        X_train_data: Feature matrix used for fitting.
        y_train: Labels matching ``X_train_data``.
        X_test_data: Feature matrix to predict on.
        X_test: Not used here; kept so all model functions share a signature.
        y_test: True labels matching ``X_test_data``.
        choices: List of model names; ``choices[choice]`` is the results key.
        choice: Index into ``choices`` for this model.
        results: Dict of accuracies, updated in place.
        sub: Prediction table; an ``SVM`` column is added to it in place.

    Returns:
        tuple: ``(choice + 1, results, sub)``.
    """
    # Imported under an alias: this function is itself called SVC, and
    # rebinding that name inside its own body is needlessly confusing.
    from sklearn.svm import SVC as SupportVectorClassifier

    classifier = SupportVectorClassifier(kernel='linear', random_state=RANDOM_STATE)
    classifier.fit(X_train_data, y_train)
    print('This is the SVM')
    print('\n')

    y_pred = classifier.predict(X_test_data)
    sub['SVM'] = y_pred
    display(sub.head(5))

    cm = confusion_matrix(y_test, y_pred)

    print('\nResults for SVC: \n')
    print('cm: ')
    print(cm, '\n')

    # Diagonal = correct predictions; matrix total = number of test cases.
    # Unlike cm[0,0] + cm[1,1] this stays valid for any number of classes.
    acc = np.trace(cm) / cm.sum()
    print('acc: ', acc, '\n')

    results[choices[choice]] = acc
    choice += 1
    print('results: ', results, '\n')
    print('\n')

    return choice, results, sub


#Gaussian Naive Bayes
def GNB(X_train_data, y_train, X_test_data,X_test, y_test, choices, choice, results, sub):
    """Fit Gaussian naive Bayes and record its predictions.

    Args:
        X_train_data: Feature matrix used for fitting.
        y_train: Labels matching ``X_train_data``.
        X_test_data: Feature matrix to predict on.
        X_test: Not used here; kept so all model functions share a signature.
        y_test: True labels matching ``X_test_data``.
        choices: List of model names; ``choices[choice]`` is the results key.
        choice: Index into ``choices`` for this model.
        results: Dict of accuracies, updated in place.
        sub: Prediction table; a ``GNB`` column is added to it in place.

    Returns:
        tuple: ``(choice + 1, results, sub)``.
    """
    from sklearn.naive_bayes import GaussianNB

    classifier = GaussianNB()
    classifier.fit(X_train_data, y_train)
    print('This is the GaussianNB')

    print('\n')

    y_pred = classifier.predict(X_test_data)
    sub['GNB'] = y_pred
    display(sub.head(5))

    cm = confusion_matrix(y_test, y_pred)

    print('\nResults for GNB: \n')
    print('cm: ')
    print(cm, '\n')

    # Diagonal = correct predictions; matrix total = number of test cases.
    # Unlike cm[0,0] + cm[1,1] this stays valid for any number of classes.
    acc = np.trace(cm) / cm.sum()
    print('acc: ', acc, '\n')

    results[choices[choice]] = acc
    choice += 1
    print('results: ', results, '\n')
    print('\n')

    return choice, results, sub


#Decision Tree Classifier:
def DTC(X_train_data, y_train, X_test_data,X_test, y_test, choices, choice, results, sub):
    """Fit an entropy-criterion decision tree and record its predictions.

    Args:
        X_train_data: Feature matrix used for fitting.
        y_train: Labels matching ``X_train_data``.
        X_test_data: Feature matrix to predict on.
        X_test: Not used here; kept so all model functions share a signature.
        y_test: True labels matching ``X_test_data``.
        choices: List of model names; ``choices[choice]`` is the results key.
        choice: Index into ``choices`` for this model.
        results: Dict of accuracies, updated in place.
        sub: Prediction table; a ``DTC`` column is added to it in place.

    Returns:
        tuple: ``(choice + 1, results, sub)``.
    """
    from sklearn.tree import DecisionTreeClassifier

    classifier = DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE)
    classifier.fit(X_train_data, y_train)
    print('This is the decision tree classifier')
    print('\n')

    y_pred = classifier.predict(X_test_data)
    sub['DTC'] = y_pred
    display(sub.head(5))

    cm = confusion_matrix(y_test, y_pred)

    print('\nResults for DTC: \n')
    print('cm: ')
    print(cm, '\n')

    # Diagonal = correct predictions; matrix total = number of test cases.
    # Unlike cm[0,0] + cm[1,1] this stays valid for any number of classes.
    acc = np.trace(cm) / cm.sum()
    print('acc: ', acc, '\n')

    results[choices[choice]] = acc
    choice += 1
    print('results: ', results, '\n')
    print('\n')

    return choice, results, sub

#Random Forest Classifier:
def RF(X_train_data, y_train, X_test_data,X_test, y_test, choices, choice, results, sub):
    """Fit a 10-tree random forest, record its predictions and export all of them.

    After adding its own ``RF`` column this function writes ``person_id``
    together with the ``LR``, ``KNN``, ``SVM``, ``GNB``, ``DTC`` and ``RF``
    columns of ``sub`` to
    ``heart_disease_conditions_model_predictions_2.2M.csv``. ``sub`` must
    therefore already contain the other five prediction columns, otherwise
    the column selection raises ``KeyError``.

    Args:
        X_train_data: Feature matrix used for fitting.
        y_train: Labels matching ``X_train_data``.
        X_test_data: Feature matrix to predict on.
        X_test: Not used here; kept so all model functions share a signature.
        y_test: True labels matching ``X_test_data``.
        choices: List of model names; ``choices[choice]`` is the results key.
        choice: Index into ``choices`` for this model.
        results: Dict of accuracies, updated in place.
        sub: Prediction table; an ``RF`` column is added to it in place.

    Returns:
        tuple: ``(choice + 1, results, sub)``.
    """
    from sklearn.ensemble import RandomForestClassifier

    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=RANDOM_STATE)
    classifier.fit(X_train_data, y_train)
    print('This is the Random Forest Classifier')
    print('\n')

    y_pred = classifier.predict(X_test_data)
    sub['RF'] = y_pred
    display(sub.head(5))

    cm = confusion_matrix(y_test, y_pred)

    print('\nResults for RF: \n')
    print('cm: ')
    print(cm, '\n')

    # Diagonal = correct predictions; matrix total = number of test cases.
    # Unlike cm[0,0] + cm[1,1] this stays valid for any number of classes.
    acc = np.trace(cm) / cm.sum()
    print('acc: ', acc, '\n')

    # update the results dictionary with the results of this run:
    results[choices[choice]] = acc
    choice += 1
    print('results: ', results, '\n')

    # Export every model's prediction per person; the row index is omitted.
    predictions = sub[["person_id", "LR", "KNN", "SVM", "GNB", "DTC", "RF"]].copy()
    predictions.to_csv(r'heart_disease_conditions_model_predictions_2.2M.csv', index=False)

    return choice, results, sub

In [18]:
def main():
    """
    Report testing accuracy for each of the 6 classifiers.

    Both CSV files are run through ``preProcess``. The first file is split
    80/20 and only its training part is used to fit the models; its test
    part is displayed but not used for scoring. Every model is evaluated on
    the whole second file, and ``RF`` (the last model) writes the combined
    predictions to disk.

    NOTE(review): if the first file is a sample of the second one, the
    evaluation rows include the training rows - confirm the files are
    disjoint before quoting these accuracies.
    """
    from sklearn.model_selection import train_test_split

    print('\n\n Comparing classifiers using heart_disease_conditions_100K_20220407.csv\n\n')
    print('heart_disease_conditions_100K_20220407.csv')
    print('input and pre-process the dataset: \n')

    file = r'heart_disease_conditions_100K_20220407.csv'
    file2 = r"heart_disease_conditions_22M_20220407.csv"

    X, y = preProcess(file)
    print("X.shape", X.shape)
    print("Y.shape", y.shape)
    X2, y2 = preProcess(file2)
    print("X2.shape", X2.shape)
    print("Y2.shape", y2.shape)

    print('split dataset into train and test segments:\n')
    testSize = 0.200
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = testSize, random_state = RANDOM_STATE)

    # The model inputs: one list shared by the train, test and evaluation
    # matrices so the three can never drift apart.
    feature_columns = [
        "gender_source_value", "race_source_value", "ethnicity_source_value",
        "i_diabetes", "i_chronic", "i_high_bp", "i_obesity", "i_statin",
        "i_aspirin", "age_group_float",
    ]
    X_train_data = X_train.loc[:, feature_columns].values
    X_test_data = X_test.loc[:, feature_columns].values

    X2_test_data = X2.loc[:, feature_columns].values
    y2_test = y2

    print("X_train_data:")
    display(X_train_data)
    print("X_test_data:")
    display(X_test_data)
    print("y_train:")
    display(y_train)
    print("y_test:")
    display(y_test)

    choices = ['LR', 'KNN', 'SVM', 'GNB', 'DTC', 'RF']

    # store choices/results pairs where choices is key, results is value:
    results = {}
    choice = 0

    # LR creates the prediction table; each later model adds its own column.
    choice, results, sub = LR(X_train_data, y_train, X2_test_data, X2, y2_test, choices, choice, results)

    # Order matters: it must follow ``choices``, and RF has to run last
    # because it exports the columns produced by all the others.
    for run_model in (KNN, SVC, GNB, DTC, RF):
        choice, results, sub = run_model(X_train_data, y_train, X2_test_data, X2, y2_test, choices, choice, results, sub)


main()



 Comparing classifiers using heart_disease_conditions_100K_20220407.csv


heart_disease_conditions_100K_20220407.csv
input and pre-process the dataset: 


To display dataset parameters, enter 3 otherwise just press enter: 3
df.head(): 
    person_id  gender_concept_id  year_of_birth  month_of_birth  day_of_birth  \
0       1054               8507           1942              11             1   
1       1067               8507           1938               6             1   
2       2017               8532           1926               4             1   
3       2963               8532           1936               6             1   
4       3248               8532           1923              12             1   

   birth_datetime  race_concept_id  ethnicity_concept_id  location_id  \
0             NaN             8527              38003564          609   
1             NaN             8527              38003564          615   
2             NaN             8527              38003564     


To display dataset parameters, enter 3 otherwise just press enter: 3
df.head(): 
    person_id  gender_concept_id  year_of_birth  month_of_birth  day_of_birth  \
0         30               8532           1959              11             1   
1         46               8507           1931               1             1   
2         66               8532           1920               4             1   
3         67               8507           1948              11             1   
4         99               8532           1920               9             1   

   birth_datetime  race_concept_id  ethnicity_concept_id  location_id  \
0             NaN             8527              38003564           30   
1             NaN             8527              38003564           46   
2             NaN             8527              38003564           63   
3             NaN             8527              38003564           64   
4             NaN             8527              38003564           89  

person_id                            0
gender_concept_id                    0
year_of_birth                        0
month_of_birth                       0
day_of_birth                         0
birth_datetime                 2226856
race_concept_id                      0
ethnicity_concept_id                 0
location_id                          0
provider_id                    2226856
care_site_id                   2226856
person_source_value_left             0
gender_source_value                  0
gender_source_concept_id       2226856
race_source_value                    0
race_source_concept_id         2226856
ethnicity_source_value               0
ethnicity_source_concept_id    2226856
ethnicity_concept_name               0
race_concept_name                    0
gender_concept_name                  0
dep_target_hd                        0
i_diabetes                           0
i_chronic                            0
i_high_bp                            0
i_obesity                

X_train_data:


array([[1, 1, 1, ..., 1, 1, 0],
       [1, 2, 2, ..., 0, 0, 3],
       [2, 1, 1, ..., 0, 0, 3],
       ...,
       [2, 2, 2, ..., 0, 1, 3],
       [2, 2, 2, ..., 1, 1, 2],
       [2, 1, 1, ..., 0, 0, 2]])

X_test_data:


array([[1, 1, 1, ..., 0, 0, 3],
       [1, 1, 1, ..., 0, 0, 1],
       [1, 1, 1, ..., 0, 0, 2],
       ...,
       [2, 1, 1, ..., 1, 0, 1],
       [1, 1, 1, ..., 0, 0, 3],
       [1, 1, 1, ..., 0, 0, 1]])

y_train:


array([1, 1, 0, ..., 0, 0, 0])

y_test:


array([1, 1, 1, ..., 1, 1, 0])

Unnamed: 0,person_id,gender_source_value,race_source_value,ethnicity_source_value,i_diabetes,i_chronic,i_high_bp,i_obesity,i_statin,i_aspirin,age_group_float,dep_target_hd,LR
0,30,2,1,1,0,1,0,0,0,0,0,0,0
1,46,1,1,1,1,1,0,0,0,0,2,0,1
2,66,2,1,1,1,0,0,0,0,0,3,1,0
3,67,1,1,1,0,0,0,0,0,0,0,0,0
4,99,2,1,1,0,0,0,0,0,0,3,1,0



Results for Logistic Regression: 

cm: 
[[1076619  194902]
 [ 391232  564103]] 

score:  0.736788548518629 

acc:  0.736788548518629 

results:  {'LR': 0.736788548518629} 



This is the KNN: 


Unnamed: 0,person_id,gender_source_value,race_source_value,ethnicity_source_value,i_diabetes,i_chronic,i_high_bp,i_obesity,i_statin,i_aspirin,age_group_float,dep_target_hd,LR,KNN
0,30,2,1,1,0,1,0,0,0,0,0,0,0,0
1,46,1,1,1,1,1,0,0,0,0,2,0,1,1
2,66,2,1,1,1,0,0,0,0,0,3,1,0,0
3,67,1,1,1,0,0,0,0,0,0,0,0,0,0
4,99,2,1,1,0,0,0,0,0,0,3,1,0,0



Results for KNN: 

cm: 
[[1117180  154341]
 [ 564861  390474]] 

acc:  0.6770325517231469 



results:  {'LR': 0.736788548518629, 'KNN': 0.6770325517231469} 



This is the SVM




Unnamed: 0,person_id,gender_source_value,race_source_value,ethnicity_source_value,i_diabetes,i_chronic,i_high_bp,i_obesity,i_statin,i_aspirin,age_group_float,dep_target_hd,LR,KNN,SVM
0,30,2,1,1,0,1,0,0,0,0,0,0,0,0,0
1,46,1,1,1,1,1,0,0,0,0,2,0,1,1,1
2,66,2,1,1,1,0,0,0,0,0,3,1,0,0,1
3,67,1,1,1,0,0,0,0,0,0,0,0,0,0,0
4,99,2,1,1,0,0,0,0,0,0,3,1,0,0,0



Results for SVC: 

cm: 
[[935227 336294]
 [270513 684822]] 

acc:  0.7275050564562774 

results:  {'LR': 0.736788548518629, 'KNN': 0.6770325517231469, 'SVM': 0.7275050564562774} 



This is the GaussianNB




Unnamed: 0,person_id,gender_source_value,race_source_value,ethnicity_source_value,i_diabetes,i_chronic,i_high_bp,i_obesity,i_statin,i_aspirin,age_group_float,dep_target_hd,LR,KNN,SVM,GNB
0,30,2,1,1,0,1,0,0,0,0,0,0,0,0,0,1
1,46,1,1,1,1,1,0,0,0,0,2,0,1,1,1,1
2,66,2,1,1,1,0,0,0,0,0,3,1,0,0,1,0
3,67,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
4,99,2,1,1,0,0,0,0,0,0,3,1,0,0,0,0



Results for GNB: 

cm: 
[[1015048  256473]
 [ 330390  624945]] 

acc:  0.7364611811450763 

results:  {'LR': 0.736788548518629, 'KNN': 0.6770325517231469, 'SVM': 0.7275050564562774, 'GNB': 0.7364611811450763} 



This is the decision tree classifier




Unnamed: 0,person_id,gender_source_value,race_source_value,ethnicity_source_value,i_diabetes,i_chronic,i_high_bp,i_obesity,i_statin,i_aspirin,age_group_float,dep_target_hd,LR,KNN,SVM,GNB,DTC
0,30,2,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0
1,46,1,1,1,1,1,0,0,0,0,2,0,1,1,1,1,1
2,66,2,1,1,1,0,0,0,0,0,3,1,0,0,1,0,1
3,67,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,99,2,1,1,0,0,0,0,0,0,3,1,0,0,0,0,0



Results for DTC: 

cm: 
[[971279 300242]
 [283198 672137]] 

acc:  0.7379983258908523 

results:  {'LR': 0.736788548518629, 'KNN': 0.6770325517231469, 'SVM': 0.7275050564562774, 'GNB': 0.7364611811450763, 'DTC': 0.7379983258908523} 



This is the Random Forest Classifier




Unnamed: 0,person_id,gender_source_value,race_source_value,ethnicity_source_value,i_diabetes,i_chronic,i_high_bp,i_obesity,i_statin,i_aspirin,age_group_float,dep_target_hd,LR,KNN,SVM,GNB,DTC,RF
0,30,2,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,46,1,1,1,1,1,0,0,0,0,2,0,1,1,1,1,1,1
2,66,2,1,1,1,0,0,0,0,0,3,1,0,0,1,0,1,1
3,67,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,99,2,1,1,0,0,0,0,0,0,3,1,0,0,0,0,0,0



Results for RF: 

cm: 
[[967153 304368]
 [277770 677565]] 

0.7385830067143991
acc:  0.7385830067143991 

results:  {'LR': 0.736788548518629, 'KNN': 0.6770325517231469, 'SVM': 0.7275050564562774, 'GNB': 0.7364611811450763, 'DTC': 0.7379983258908523, 'RF': 0.7385830067143991} 

