<a href="https://colab.research.google.com/github/StevenBryceLee/Generalized_Trainers/blob/master/General_Classification_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Training inputs ---
# CSV file holding the training feature values
clean_params = "./Data/clean_train_values.csv"
# CSV file holding the training labels
clean_labels = "./Data/train_labels.csv"

# --- Column configuration ---
# Column present in both CSVs, used as the key when joining the two data sets
join_column = "patient_id"
# Column to use as the index; this is the same column as the join key
index_column = join_column
# Column the models are trained to predict
label_column = "heart_disease_present"
# Columns to remove from the merged data frame before modelling
drop_columns = ["patient_id"]

# --- Prediction input ---
# CSV file with the unlabeled test values whose labels must be predicted
clean_new_params = "./Data/clean_test_values.csv"


In [1]:
def notebook_Load(param_path, label_path, merge_col):
    '''
    Read a parameter csv and a label csv and return them joined into one dataframe.
    Intended for data sets whose training parameters and labels ship as separate files.

    param_path is a string, the path of the parameter csv
    label_path is a string, the path of the label csv
    merge_col is a string, the name of the column both files are joined on

    Both files are read with their first column as the index.
    Only rows whose merge_col value appears in both files are kept (inner join).
    '''
    import pandas as pd

    #Read both files the same way: first csv column becomes the index
    features, targets = (
        pd.read_csv(csv_path, index_col=0) for csv_path in (param_path, label_path)
    )
    return features.merge(targets, on=merge_col, how='inner')

In [None]:
#Build the merged training frame from the configured parameter and label files
df = notebook_Load(param_path=clean_params, label_path=clean_labels, merge_col=join_column)

In [None]:
def train_test(df, label_col, drop_cols):
    '''
    Split a labelled dataframe into training and testing pieces.
    train_test_split is imported inside the function so the function works on its own.

    df is a pandas dataframe holding both parameters and the label
    label_col is a string, the name of the label column
    drop_cols is a list of column names (or a single column name) to remove from the parameters

    returns X_train, X_test, y_train, y_test
    20% of the rows are held out for testing, the split is stratified on the label
    and uses a fixed random_state of 42 so it is repeatable
    '''
    from sklearn.model_selection import train_test_split

    #The label column as a series
    targets = df[label_col]
    #Parameters are everything except the label and the requested drop columns
    features = df.drop(columns=[label_col]).drop(columns=drop_cols)

    return train_test_split(
        features,
        targets,
        test_size=0.2,
        random_state=42,
        stratify=targets,
    )

In [None]:
#Smoke test: show the split pieces produced from the merged training frame
train_test(df=df, label_col=label_column, drop_cols=drop_columns)

[     slope_of_peak_exercise_st_segment  resting_blood_pressure  \
 139                                  1                     130   
 0                                    1                     128   
 169                                  2                     110   
 65                                   1                     126   
 98                                   1                     155   
 43                                   1                     140   
 156                                  1                     140   
 158                                  1                     120   
 73                                   1                     120   
 114                                  1                     112   
 38                                   1                     120   
 25                                   2                     120   
 123                                  1                     130   
 68                                   2                     13

In [None]:
def max_score(scores):
    '''
    Rank the results of the model search by their metric.

    scores is any object which can be converted to a pandas dataframe with four
    columns, in this order: the model, the scaler, the metric value and the path
    of the saved model file. An empty collection is accepted and yields an empty
    dataframe with the same four columns.

    returns a dataframe with columns 'Model', 'Scaler', 'metric', 'file Path',
    sorted by ascending metric, with a fresh 0..n-1 index so that row 0 is the
    entry with the smallest metric
    '''
    #import pandas to ensure that the function does not fail
    import pandas as pd

    column_names = ['Model', 'Scaler', 'metric', 'file Path']

    Score_df = pd.DataFrame(scores)
    if Score_df.shape[1] == 0:
        #Nothing was scored: assigning four names to zero columns would raise,
        #so build the empty frame with the expected columns directly
        Score_df = pd.DataFrame(columns=column_names)
    else:
        #Set columns for readability
        Score_df.columns = column_names

    #Sort by the metric, then renumber the rows.
    #reset_index returns a new frame, so its result has to be kept.
    Score_df = Score_df.sort_values("metric").reset_index(drop=True)

    return Score_df

In [None]:
def pipeline_loader():
    '''
    Build the hyper-parameter grids, scalers and models used by the model search
    and return them to the caller, so none of them has to live in a global variable.

    returns a tuple (Parameter_Grid, scalers, models)
    Parameter_Grid is a list of dictionaries, one per model, keyed as
        '<step name>__<parameter>' for use with a pipeline grid search
    scalers is a list of scaler instances
    models is a list of (step name, estimator) tuples, in the same order as
        Parameter_Grid
    '''

    #Imports to ensure that the function does not fail
    import numpy as np
    import sklearn

    #Set of models with which to test
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC # probability=True
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression

    #Set of scalers
    from sklearn.preprocessing import MaxAbsScaler
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler

    #step for parameter searching for variables between 0 and 1
    value_range = np.arange(0, 1.1, 0.1)

    #number of neighbors for KNN algorithm
    neighbor_range = np.arange(1,6,1)

    #Support vector machine range for SVC hyper-parameters
    c_range = np.logspace(-3, 2, 6)

    #List of maximum features for DTC and GBC algorithms
    max_feat = ['sqrt','log2']

    #Learning rate for GBC algorithm
    gbc_learn = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]

    #The GBC log-loss option was renamed from 'deviance' to 'log_loss' in
    #scikit-learn 1.1 and the old name was later removed, so pick the name the
    #installed version understands.
    version_parts = sklearn.__version__.split('.')
    try:
        sklearn_version = (int(version_parts[0]), int(version_parts[1]))
    except (IndexError, ValueError):
        #Unparseable version string: assume a current release
        sklearn_version = (1, 1)
    gbc_log_loss = 'log_loss' if sklearn_version >= (1, 1) else 'deviance'

    #Full parameter grid
    Parameter_Grid = [
    {
        'NB__alpha':value_range,
        'NB__binarize':value_range,
        'NB__fit_prior':[True,False]
    },
    {
        'KNN__n_neighbors':neighbor_range,
        'KNN__weights':['uniform','distance'],
        'KNN__algorithm':['ball_tree','kd_tree','brute'],
    },
    {
        'SVC__C':c_range,
        'SVC__kernel':['linear','poly','rbf','sigmoid'],
        'SVC__gamma':c_range,
    },
    {
        'DTC__criterion':['gini','entropy'],
        'DTC__max_features':max_feat,
    },
    {
        'GBC__loss':[gbc_log_loss,'exponential'],
        'GBC__max_features':max_feat,
        'GBC__learning_rate':gbc_learn
    },
    {
        'LR__penalty':['l2'],
        'LR__C':c_range,
        'LR__solver':['liblinear','sag','saga']
    }]

    #Scaling data helping functions
    scalers = [StandardScaler(), MaxAbsScaler(), MinMaxScaler()]

    #Type of models. Note, this must match the length and order of the parameter grid,
    #and each step name must match the prefix used in that model's grid keys
    models = [
            ('NB', BernoulliNB()),
            ('KNN', KNeighborsClassifier()),
            #probability=True is required so that predict_proba is available
            ('SVC', SVC(probability = True)),
            ('DTC', DecisionTreeClassifier()),
            ('GBC', GradientBoostingClassifier()),
            ('LR', LogisticRegression())
            ]

    return(Parameter_Grid,scalers,models)
  
  

In [None]:
def gridSearch(Parameter_Grid, scaler, model, X_train,y_train, X_test,y_test,warnings=False):
    '''
    Run a 5-fold grid search for one scaler/model combination, score the best
    estimator on the held-out data with log loss, and pickle the fitted search
    when the loss is good enough.

    Parameter_Grid is a dictionary of hyper-parameters for the given model, keyed
        as '<step name>__<parameter>'
    scaler is a single scaler instance, used as the 'scaler' pipeline step
    model is a (step name, estimator) tuple, used as the second pipeline step;
        the estimator must support predict_proba
    X_train is training parameter data for the algorithm
    y_train is training label data for the algorithm
    X_test is split training data for testing
    y_test is the split training labels for testing
    warnings is a flag: when False (the default) DeprecationWarning, UserWarning
        and FutureWarning are silenced while the search runs; when True they are shown

    returns a tuple (model, scaler, loss, filename); filename is the path of the
    pickled grid search, or an empty string when the model was not saved
    '''
    #Imports to prevent function failure
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import log_loss
    import pickle
    #Imported under another name so the module does not shadow the `warnings` flag
    import warnings as warnings_module

    #Create and instantiate the pipeline for gridsearch
    pipeline = Pipeline([('scaler', scaler), model])

    #initiate gridsearch
    grid = GridSearchCV(pipeline, param_grid=Parameter_Grid, cv=5)

    #catch_warnings restores the previous warning filters on exit, so silencing
    #warnings here does not leak into the rest of the session
    with warnings_module.catch_warnings():
        if not warnings:
            for category in (DeprecationWarning, UserWarning, FutureWarning):
                warnings_module.filterwarnings("ignore", category=category)

        grid.fit(X_train,y_train)

        #Predicted probability of the second class (column 1) for each test row
        predictions = grid.predict_proba(X_test)[:,1]

        #Get loss between predictions and testing data
        loss = log_loss(y_test,predictions)

    #Determine whether or not to save the model, the loss threshold is arbitrary
    filename = ""
    if loss < 0.50:
        #File name is built from the step name, the scaler class name and the loss
        filename = ("./Data/_" + str(model[0]) + "_" + type(scaler).__name__ +
                    "Loss_" + str(loss) + "_Model.pkl")
        with open(filename, 'wb') as file:
            pickle.dump(grid, file)

    return(model,scaler,loss, filename)

In [None]:
def Classification_Search(param_path, label_path, merge_col, label_col, drop_cols):
    '''
    Run the full model search: load the data, split it, and grid search every
    model with every scaler. Models that score well enough are dumped to disk
    by gridSearch.

    param_path is the path to the parameters csv with no labels
    label_path is the path to the labels csv of the training data
    merge_col is the name of the column on which the parameters and labels are merged
    label_col is the name of the column which is to be predicted
    drop_cols is the list of columns to be dropped

    returns a dataframe of the searched models ranked by max_score
    '''
    #Load and merge the csv files, then split into training and testing data
    merged = notebook_Load(param_path, label_path, merge_col)
    X_train, X_test, y_train, y_test = train_test(merged, label_col, drop_cols)

    #Get the parameter grid, scalers, and models
    Parameter_Grid, scalers, models = pipeline_loader()

    #One grid search per (model, scaler) pair; each model uses the grid stored
    #at the same position in Parameter_Grid
    scores = [
        gridSearch(Parameter_Grid[position], scaler, named_model,
                   X_train, y_train, X_test, y_test)
        for position, named_model in enumerate(models)
        for scaler in scalers
    ]

    #Sort and return the scored models
    return max_score(scores)

In [None]:
#Run the whole search on the configured training files and keep the ranked results
Final_scores = Classification_Search(
    param_path=clean_params,
    label_path=clean_labels,
    merge_col=join_column,
    label_col=label_column,
    drop_cols=drop_columns,
)

In [None]:
#Show the first three rows of the ranked results
Final_scores.head(3)

In [None]:
def predict_New_Scores(model_file, param_path, index_col, label_col, drop_cols):
    '''
    Predict labels for new data with a pickled model and save the results to a csv
    in the current working directory.

    model_file is a pickled model file path as a string; the unpickled object must
        provide predict_proba
    param_path is the path to the parameter csv on which we will perform predictions as a string
    index_col is the name of the identifier column written next to each prediction as a string;
        it may be a regular column of the csv or the name of its index
    label_col is the target label to be predicted as a string; it becomes the name
        of the prediction column
    drop_cols is a list of columns which will be dropped before predicting

    The output csv has two columns, index_col and label_col, where label_col holds
    the predicted probability of the second class. The file is named
    "Predictions" + the first five characters of the model file name + a
    month-day-hour-minute-second timestamp.

    raises KeyError if index_col is neither a column nor the index name of the csv

    returns nothing
    '''
    import os
    import pickle
    from datetime import datetime
    import pandas as pd

    #NOTE: pickle.load can execute arbitrary code; only load model files you created
    with open(model_file, 'rb') as file:
        model = pickle.load(file)

    new_Params = pd.read_csv(param_path, index_col = 0)

    #Keep the identifier values before dropping columns, because index_col is
    #commonly one of the dropped columns
    if index_col in new_Params.columns:
        index_values = new_Params[index_col].values
    elif new_Params.index.name == index_col:
        index_values = new_Params.index.values
    else:
        raise KeyError("index_col %r was not found in %r" % (index_col, param_path))

    #The model must only see the columns it was trained on
    features = new_Params.drop(drop_cols,axis = 1)

    #Probability of the second class (column 1) for every row
    probabilities = model.predict_proba(features)[:,1]

    results = pd.DataFrame({index_col: index_values, label_col: probabilities},
                           columns=[index_col, label_col])

    #Take the timestamp once so all of its parts belong to the same instant.
    #Only the base name of the model file is used, so a directory in model_file
    #cannot put path separators into the output file name.
    now = datetime.now()
    model_tag = os.path.basename(str(model_file))[0:5]
    predict_file_Path = "Predictions{}{}-{}-{}-{}-{}-.csv".format(
        model_tag, now.month, now.day, now.hour, now.minute, now.second)

    results.to_csv(predict_file_Path,index = False)

    return
  

In [None]:
#Final_scores is sorted by ascending log loss, so its first row is the best model.
#Models are only pickled when their loss beats the save threshold, so if the best
#row has no file path then no model was saved at all.
best_model_file = Final_scores['file Path'].iloc[0]
if not best_model_file:
    raise ValueError("No saved model file is listed in Final_scores; nothing to predict with.")
#Arguments follow the predict_New_Scores signature: index column before label column
predict_New_Scores(best_model_file, clean_new_params, index_column, label_column, drop_columns)