In [None]:
# Import required Libraries
import numpy as np
import pandas as pd
from pandas.core.common import flatten
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
import pickle

In [None]:
# Get (or create) a Hive-enabled Spark session for this notebook.
spark = (
  SparkSession.builder
  .appName('Pandas_Model')
  .enableHiveSupport()
  .getOrCreate()
)

# Read the modelling dataset from parquet into a Spark DataFrame.
df = spark.read.parquet('parquet_file_path')

In [None]:
#Columns selection
# Every column pulled from the Spark DataFrame: the target ('FLAG') plus the features.
usecols = [
  'FLAG',
  'ITEM_PRICE_AMT',
  'All_columns_that_needs_tobe_used_in_model',
]

# Features treated as categorical.
cat_cols = [
  'VISIT_DEVICE_TYPE',
  'All_categorical_cols',
]

# Features cast to numeric during preprocessing.
num_cols = [
  'ITEM_PRICE_AMT',
  'All_num_cols',
]

In [None]:
#Preprocessing function
def data_preprocessor(df):
  """
  Return the preprocessed dataframe after
  imputing the missing values and encoding
  categorical variables

  The columns listed in the module-level ``usecols`` are pulled from Spark
  into pandas and the module-level ``num_cols`` are cast to numeric.
  Missing numeric values are replaced by the median of their own column,
  missing (``None``) categorical values by the literal 'missing', and each
  categorical column is then replaced by the mean of ``FLAG`` observed for
  each of its categories.

  :param df: Spark dataframe to be processed for imputation and encoding
  :return: preprocessed pandas dataframe (feature columns followed by ``FLAG``)
  """
  pdf = df.select(usecols).toPandas()
  for col in num_cols:
    pdf[col] = pd.to_numeric(pdf[col])

  X = pdf.drop(['FLAG'], axis=1)
  # Positional index so the target lines up with the frame rebuilt after imputation.
  Y = pdf[['FLAG']].reset_index(drop=True)

  numeric_features = [var for var in X.columns if X[var].dtype != 'O']
  categorical_features = [var for var in X.columns if X[var].dtype == 'O']
  cols = numeric_features + categorical_features

  # Columns are selected by name, so any number of numeric / categorical
  # features is handled and the output order matches ``cols``.
  imputer = ColumnTransformer(transformers=[
    ('num_imputation', SimpleImputer(strategy='median'), numeric_features),
    ('cat_imputation',
     SimpleImputer(missing_values=None, strategy='constant', fill_value='missing'),
     categorical_features)])
  X = pd.DataFrame(imputer.fit_transform(X[cols]), columns=cols)
  # When numeric and string columns are stacked into one array the numeric
  # values may come back as object dtype; restore a float dtype for them.
  X = X.astype({var: float for var in numeric_features})

  # NOTE(review): the target encoding below is fitted on the full dataset,
  # i.e. before over_sampling() performs the train/test split, so the labels
  # of the future test rows influence the encoded features.
  labelled = X.join(Y)
  categorical_mappings = {
    var: labelled.groupby(var)['FLAG'].mean().to_dict()
    for var in categorical_features}

  for var in categorical_features:
    X[var] = X[var].map(categorical_mappings[var])

  return pd.concat([X, Y], axis=1)

preprocessed_df = data_preprocessor(df)

In [None]:
#Over Sampling
def over_sampling(preprocessed_df):
  """
  Split the data into train and test parts in a
  70:30 ratio and oversample the train part with
  SMOTE to handle the unbalanced classes

  :param preprocessed_df: preprocessed dataframe after imputations and encoding
  :return: train dataframe with balanced distribution of classes and test dataframe
  """
  def with_flag(features, labels):
    # Put the label array back next to the re-indexed features as the 'FLAG' column.
    flag_column = pd.DataFrame({'FLAG': labels})
    return pd.concat([features.reset_index(drop=True), flag_column], axis=1)

  features = preprocessed_df.drop(['FLAG'], axis=1)
  labels = preprocessed_df[['FLAG']].values.ravel()

  train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.3, random_state=0)

  # Only the training portion is resampled; the test rows are left as they are.
  balanced_features, balanced_labels = SMOTE(random_state=42).fit_resample(
    train_features, train_labels)

  train_df = with_flag(balanced_features, balanced_labels)
  test_df = with_flag(test_features, test_labels)
  return train_df, test_df

sampled_df, test_df = over_sampling(preprocessed_df)

In [None]:
# Hyper Parameter Tuning
def hyper_parameter_tuning(sampled_df):
  """
  Perform tuning using
  GridSearchCV for random forest classifier

  Every combination of the grid below is scored on recall with a
  5-fold stratified, shuffled cross-validation.

  :param sampled_df: train dataframe with balanced classes after oversampling
  :return: best set of hyperparameters (dict keyed by parameter name)
  """
  hyperF = dict(n_estimators=[300, 500, 750, 800],
                max_depth=[5, 8, None],
                min_samples_split=[2, 5],
                min_samples_leaf=[1, 2])
  forest = RandomForestClassifier(random_state=1)
  skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
  # n_jobs=-1 lets the search use all available cores.
  gridF = GridSearchCV(forest, hyperF, cv=skf, verbose=1, n_jobs=-1, scoring='recall')

  X_train_res = sampled_df.drop(['FLAG'], axis=1)
  Y_train_res = sampled_df[['FLAG']].values.ravel()
  # NOTE(review): sampled_df is the SMOTE output of over_sampling(), so the
  # validation folds presumably contain synthetic rows as well — confirm this
  # is the intended evaluation setup.
  gridF.fit(X_train_res, Y_train_res)
  return gridF.best_params_

best_ft_param = hyper_parameter_tuning(sampled_df)

In [1]:
# Model Training
def train_model(sampled_df, best_params=None, random_state=1):
  """
  Train a random Forest Classifier
  Using the best params out of hyperparameter tuning
  :param sampled_df: train dataframe for performing the training
  :param best_params: dict holding 'max_depth', 'min_samples_leaf',
                      'min_samples_split' and 'n_estimators'; when omitted the
                      module-level ``best_ft_param`` is used
  :param random_state: seed for the forest; the default matches the seed of
                       the estimator used in hyper_parameter_tuning
  :return: trained model object
  """
  if best_params is None:
    # Backward-compatible fallback to the result of the tuning cell.
    best_params = best_ft_param

  X_train_res = sampled_df.drop(['FLAG'], axis=1)
  Y_train_res = sampled_df[['FLAG']].values.ravel()

  # 'sqrt' is what the deprecated max_features='auto' meant for classifiers;
  # newer scikit-learn releases reject 'auto'.
  rf_model = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                    criterion='gini', max_depth=best_params['max_depth'],
                                    max_features='sqrt',
                                    min_samples_leaf=best_params['min_samples_leaf'],
                                    min_samples_split=best_params['min_samples_split'],
                                    n_estimators=best_params['n_estimators'],
                                    oob_score=False, random_state=random_state)
  return rf_model.fit(X_train_res, Y_train_res)

rf_model1 = train_model(sampled_df, best_params=best_ft_param)



In [1]:
#Metrics for predictions made on test set
def get_model_metrics_test_set(model=None, test_data=None, threshold=0.5):
  """
  Uses test set to evaluate model metrics and
  print accuracy, the confusion matrix and the
  classification report (precision, recall and F1)
  :param model: fitted classifier exposing ``predict_proba``; when omitted the
                module-level ``rf_model1`` is used
  :param test_data: dataframe with the feature columns and a 'FLAG' column;
                    when omitted the module-level ``test_df`` is used
  :param threshold: a row is labelled 1 when the probability in column 1 of
                    ``predict_proba`` is at least this value
  :return: None
  """
  if model is None:
    model = rf_model1
  if test_data is None:
    test_data = test_df

  X_test = test_data.drop(['FLAG'], axis=1)
  Y_test = test_data[['FLAG']].values.ravel()

  # NOTE(review): assumes FLAG is coded 0/1 so that column 1 of predict_proba
  # is the probability of class 1 — confirm against model.classes_.
  positive_proba = model.predict_proba(X_test)[:, 1]
  Y_pred = np.where(positive_proba >= threshold, 1, 0)

  # All three reports are computed from the same thresholded predictions.
  print('Accuracy on Test set: {:.2f}'.format(accuracy_score(Y_test, Y_pred)))
  print(confusion_matrix(Y_test, Y_pred))
  print(classification_report(Y_test, Y_pred))

get_model_metrics_test_set(rf_model1, test_df)

In [None]:
#Saving Serialized model
def save_serialised_model(model_object, pkl_file_name):
  """
  save a serialised object of the trained model

  :param model_object: trained model object
  :param pkl_file_name: string, name of the pickled file to serialise the object to
  :return: None
  """
  # The context manager flushes and closes the file even if pickling fails.
  with open(pkl_file_name, 'wb') as pkl_file:
    pickle.dump(model_object, pkl_file)

# Pickle the model returned by train_model to 'trained_model.pkl' (relative path).
save_serialised_model(rf_model1, 'trained_model.pkl')