In [5]:
# Configuration: file paths and column names used by the cells below.
# clean_params     - CSV with the cleaned training set (all parameters / features).
# clean_labels     - CSV with the label column plus the column used to join it to the parameters.
# clean_new_params - CSV with the cleaned parameter set for which predictions should be generated.
# label_column     - name of the label column.
# join_column      - name of the column on which parameters and labels are merged.
# index_column     - identifier column for the prediction output (same as join_column).
# drop_columns     - columns removed from the parameters before fitting / predicting.
clean_params = './Data/clean_train_values.csv'
clean_labels = './Data/train_labels.csv'
label_column = 'heart_disease_present'
join_column = 'patient_id'
index_column = join_column
drop_columns = ['patient_id']
clean_new_params = "./Data/clean_test_values.csv"


In [6]:
#Define a function that imports the scikit-learn modules used in this notebook

def module_import():
    """Import every scikit-learn class and helper this notebook relies on.

    All names are bound only inside this function, so calling it does not make
    them available to other cells; it merely verifies that the imports resolve.
    Returns None.
    """
    # Classifiers
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression  # TODO get params and generate search space
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC  # probability=True
    from sklearn.tree import DecisionTreeClassifier

    # Scalers
    from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

    # Pipeline construction, hyper-parameter search, and scoring
    from sklearn.metrics import log_loss
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

# Smoke test: calling module_import() confirms that all of its imports resolve.
module_import()

In [9]:
#Define a function that loads and merges the parameter and label data

In [10]:
def notebook_Load(param_path, label_path, merge_col):
    """Read the parameter and label CSV files and inner-join them.

    param_path -- string path of the parameter CSV
    label_path -- string path of the label CSV
    merge_col  -- string name of the column on which to merge

    Both files are read with their first column as the index.
    Returns the merged pandas DataFrame.
    """
    import pandas as pd

    params_frame, labels_frame = (
        pd.read_csv(csv_path, index_col = 0) for csv_path in (param_path, label_path)
    )
    return params_frame.merge(labels_frame, on=merge_col, how='inner')

In [11]:
# Load and merge the training parameters and labels; `df` is reused by the train_test call below.
df = notebook_Load(clean_params, clean_labels, join_column)

In [12]:
#Define the train/test split function

In [13]:
def train_test(df, label_col, drop_cols):
    """Split a merged frame into stratified train and test sets.

    df        -- pandas DataFrame holding the parameters and the label column
    label_col -- string name of the label column
    drop_cols -- list of column names (or a single name) to drop from the parameters

    Returns the result of sklearn's train_test_split on (parameters, labels)
    with a 20% test share, random_state 42, stratified on the labels.
    """
    from sklearn.model_selection import train_test_split

    targets = df[label_col]
    # Remove the label first, then the extra columns, exactly as two separate drops.
    features = df.drop([label_col], axis = 1).drop(drop_cols, axis = 1)
    split_options = dict(test_size=0.2, random_state=42, stratify=targets)
    return train_test_split(features, targets, **split_options)

In [14]:
# Smoke test: split the merged frame; the returned train/test objects are displayed as the cell output.
train_test(df, label_column, drop_columns)

[     slope_of_peak_exercise_st_segment  resting_blood_pressure  \
 139                                  1                     130   
 0                                    1                     128   
 169                                  2                     110   
 65                                   1                     126   
 98                                   1                     155   
 43                                   1                     140   
 156                                  1                     140   
 158                                  1                     120   
 73                                   1                     120   
 114                                  1                     112   
 38                                   1                     120   
 25                                   2                     120   
 123                                  1                     130   
 68                                   2                     13

In [15]:
#Define a function that ranks the scores by the metric (ascending, so the lowest value comes first)

In [16]:
def max_score(scores):
    """Build a results table ranked by the metric, best (lowest) first.

    scores -- any object that can be converted to a pandas DataFrame with four
              fields per row: the model, the scaler, the metric used for
              scoring, and the file path of the saved model.

    Returns a DataFrame with columns ['Model', 'Scaler', 'metric', 'file Path'],
    sorted ascending by 'metric' and re-indexed 0..n-1 so that row 0 is the
    lowest-metric entry both by position and by label. An empty `scores`
    yields an empty table with the same columns.
    """
    import pandas as pd

    column_names = ['Model','Scaler','metric', 'file Path']
    Score_df = pd.DataFrame(scores)
    if Score_df.shape[1] == 0:
        # Nothing was scored: return an empty, correctly labelled table instead
        # of failing on the column assignment.
        Score_df = pd.DataFrame(columns=column_names)
    else:
        Score_df.columns = column_names
    # reset_index returns a new frame; the result must be kept. drop=True avoids
    # adding the old positions as an extra 'index' column.
    Score_df = Score_df.sort_values("metric").reset_index(drop=True)
    return Score_df

In [17]:
#Define a function that builds the parameter grids, models, and scalers

In [20]:
def pipeline_loader():
    """Build the search space used by the grid search.

    Returns a tuple (Parameter_Grid, scalers, models):
      Parameter_Grid -- list of dicts, one per model and in the same order as
                        `models`, keyed '<step name>__<parameter>'
      scalers        -- list of scaler instances to try with every model
      models         -- list of (step name, estimator instance) tuples
    """
    import numpy as np

    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression  # TODO get params and generate search space
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler
    from sklearn.svm import SVC  # probability=True
    from sklearn.tree import DecisionTreeClassifier

    # Candidate value ranges shared between several grids.
    unit_steps = np.arange(0, 1.1, 0.1)
    neighbor_counts = np.arange(1,6,1)
    log_steps = np.logspace(-3, 2, 6)
    feature_caps = ['sqrt','log2']
    boost_rates = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]

    naive_bayes_grid = {
        'NB__alpha': unit_steps,
        'NB__binarize': unit_steps,
        'NB__fit_prior': [True,False],
    }
    neighbors_grid = {
        'KNN__n_neighbors': neighbor_counts,
        'KNN__weights': ['uniform','distance'],
        'KNN__algorithm': ['ball_tree','kd_tree','brute'],
    }
    support_vector_grid = {
        'SVC__C': log_steps,
        'SVC__kernel': ['linear','poly','rbf','sigmoid'],
        'SVC__gamma': log_steps,
    }
    tree_grid = {
        'DTC__criterion': ['gini','entropy'],
        'DTC__max_features': feature_caps,
    }
    boosting_grid = {
        'GBC__loss': ['deviance','exponential'],
        'GBC__max_features': feature_caps,
        'GBC__learning_rate': boost_rates,
    }
    logistic_grid = {
        'LR__penalty': ['l2'],
        'LR__C': log_steps,
        'LR__solver': ['liblinear','sag','saga'],
    }

    # The grid order must line up with the model order below: callers pair them by position.
    Parameter_Grid = [
        naive_bayes_grid,
        neighbors_grid,
        support_vector_grid,
        tree_grid,
        boosting_grid,
        logistic_grid,
    ]
    scalers = [StandardScaler(), MaxAbsScaler(), MinMaxScaler()]
    models = [
        ('NB', BernoulliNB()),
        ('KNN', KNeighborsClassifier()),
        ('SVC', SVC(probability = True)),
        ('DTC', DecisionTreeClassifier()),
        ('GBC', GradientBoostingClassifier()),
        ('LR', LogisticRegression()),
    ]
    return (Parameter_Grid, scalers, models)
    
    

In [22]:
#Define the grid search function

In [23]:
def gridSearch(Parameter_Grid, scaler, model, X_train,y_train, X_test,y_test):
    """Grid-search one scaler/model pipeline and score it with log loss.

    Parameter_Grid -- parameter grid for the model step, passed to GridSearchCV
    scaler         -- scaler instance used as the 'scaler' pipeline step
    model          -- (step name, estimator) tuple used as the second pipeline step
    X_train, y_train -- data the search is fitted on (5-fold cross-validation)
    X_test, y_test   -- held-out data used to compute the log loss

    The fitted search object is pickled under ./Data/ only when the log loss is
    below 0.50. DeprecationWarning, UserWarning and FutureWarning are silenced
    through the global warnings filters.

    Returns (model, scaler, loss, filename): the `model` and `scaler` arguments
    as passed in, the log loss on the test data, and the pickle path ("" when
    nothing was saved).
    """
    import pickle
    import warnings

    from sklearn.metrics import log_loss
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    for noisy_category in (DeprecationWarning, UserWarning, FutureWarning):
        warnings.filterwarnings("ignore", category=noisy_category)

    # Fit the search on the training data.
    search = GridSearchCV(Pipeline([('scaler', scaler), model]), param_grid=Parameter_Grid, cv=5)
    search.fit(X_train, y_train)

    # Score using the predicted probability of the second class column.
    positive_proba = search.predict_proba(X_test)[:,1]
    loss = log_loss(y_test, positive_proba)

    # Only models that clear the loss threshold are persisted.
    if not loss < 0.50:
        return (model, scaler, loss, "")

    scaler_text = str(scaler)
    scaler_name = scaler_text[0:scaler_text.find("(")]
    filename = "".join(
        ["./Data/_", "_", str(model[0]), "_", scaler_name, "Loss_", str(loss), "_Model.pkl"]
    )
    with open(filename, 'wb') as handle:
        pickle.dump(search, handle)
    return (model, scaler, loss, filename)

In [None]:
#Define the function that runs the grid search for every model/scaler combination

In [None]:
def Classification_Search(param_path, label_path, merge_col, label_col, drop_cols):
    """Run the grid search for every model/scaler combination and rank the results.

    param_path -- string path of the parameter CSV
    label_path -- string path of the label CSV
    merge_col  -- column on which parameters and labels are merged
    label_col  -- name of the label column
    drop_cols  -- columns to drop from the parameters before fitting

    Returns the table produced by max_score from the per-combination results.
    """
    merged = notebook_Load(param_path, label_path, merge_col)
    X_train, X_test, y_train, y_test = train_test(merged, label_col, drop_cols)
    Parameter_Grid, scalers, models = pipeline_loader()

    # Each model is paired with the grid at the same position and tried with every scaler.
    scores = [
        gridSearch(Parameter_Grid[position], candidate_scaler, candidate_model,
                   X_train, y_train, X_test, y_test)
        for position, candidate_model in enumerate(models)
        for candidate_scaler in scalers
    ]
    return max_score(scores)

In [None]:
# Run the full search over every model/scaler pair and keep the ranked results table.
Final_scores = Classification_Search(clean_params, clean_labels, join_column, 
                      label_column, drop_columns)

In [None]:
# Show the first three rows of the ranked results.
Final_scores[0:3]

In [None]:
def predict_New_Scores(model_file, param_path, index_col, label_col, drop_cols):
    """Score a new parameter set with a pickled model and write the predictions to CSV.

    model_file -- path of the pickled, fitted model (must expose predict_proba)
    param_path -- CSV path of the new values to predict; read with its first
                  column as the index
    index_col  -- name of the identifier column written next to each prediction;
                  it may be a regular column or the name of the index
    label_col  -- name given to the prediction column in the output
    drop_cols  -- column name or list of column names removed from the
                  parameters before predicting

    The output has two columns, index_col and label_col, where label_col holds
    the predicted probability taken from the second predict_proba column. It is
    written to the working directory as
    "Predictions_<model file stem>_<month-day-hour-minute-second>.csv".

    Returns the path of the written file.
    Raises KeyError if index_col is neither a column nor the index name.
    """
    import os
    import pickle
    from datetime import datetime

    import pandas as pd

    # SECURITY: unpickling can execute arbitrary code; only load model files you trust.
    with open(model_file, 'rb') as file:
        model = pickle.load(file)

    new_Params = pd.read_csv(param_path, index_col = 0)

    # Capture the identifiers BEFORE dropping columns: index_col is normally one
    # of drop_cols and would otherwise be gone when the output is assembled.
    columns_to_drop = [drop_cols] if isinstance(drop_cols, str) else list(drop_cols)
    if index_col in new_Params.columns:
        identifiers = new_Params[index_col].to_numpy()
    elif new_Params.index.name == index_col:
        identifiers = new_Params.index.to_numpy()
        # The identifier lives in the index, so there is no such column to drop.
        columns_to_drop = [col for col in columns_to_drop if col != index_col]
    else:
        raise KeyError("index_col %r is neither a column nor the index of %r"
                       % (index_col, param_path))

    features = new_Params.drop(columns_to_drop, axis = 1)
    predictions = pd.DataFrame({
        index_col: identifiers,
        label_col: model.predict_proba(features)[:,1],
    })

    # Take a single timestamp so the parts cannot straddle a clock tick, and use
    # only the model file's base name so no path separator leaks into the name.
    now = datetime.now()
    model_tag = os.path.splitext(os.path.basename(model_file))[0]
    stamp = "-".join(str(part) for part in
                     (now.month, now.day, now.hour, now.minute, now.second))
    predict_file_Path = "Predictions_" + model_tag + "_" + stamp + ".csv"
    predictions.to_csv(predict_file_Path, index = False)
    return predict_file_Path
    
    
    

In [None]:
#predict_New_Scores()

In [None]:
# Predict with the best-ranked model: row 0 of Final_scores has the lowest metric, and its
# 'file Path' column holds the pickled model path (an empty string if that model was not saved).
# Arguments follow the signature order: index column first, then label column.
predict_New_Scores(Final_scores.iloc[0]['file Path'], clean_new_params, index_column, label_column, drop_columns)