In [None]:
import datetime
import pickle
from getpass import getuser
from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import (classification_report,precision_score,recall_score,roc_auc_score,f1_score,confusion_matrix)
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.ml.feature import (Bucketizer,QuantileDiscretizer,VectorAssembler,StandardScaler,OneHotEncoder,StringIndexer)
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.stat import Correlation
from pyspark.ml.classification import GBTClassifier,GBTClassificationModel
from pyspark.mllib.evaluation import (BinaryClassificationMetrics,MulticlassMetrics)
import tensorflow as tf


In [None]:
# Per-user HDFS directory where the fitted pipeline and the pickled
# imputation/feature metadata are written by save_objects() and read back by
# load_objects().
files_address = 'hdfs:///user/{}/acc_final/final_models/'.format(getuser())

# The notebook uses `spark` in later cells but never defines it, so a fresh
# kernel only works if the environment injects a session. getOrCreate() returns
# the already-active session when there is one and builds a default one otherwise.
spark = SparkSession.builder.getOrCreate()

# Data Prep

In [None]:
# Load the training feature table from parquet.
# NOTE(review): 'file_path' looks like a placeholder — replace with the real location.
# Alternative source kept for reference:
# train_df= spark.sql('select * from feature_table_data')
train_df = spark.read.parquet('file_path')
# Displayed as the cell output: number of training rows.
train_df.count()

# Pre Processing

In [None]:
def preprocessing(data_df, serialized_objects=True, prediction=True):
    """
    Preprocess a dataframe for model training or prediction.

    Steps, in order: lower-case every column name; replace null ``acc_flag``
    with 0.0; fill nulls in ``recency_acc_dig``, ``recency_dig_2yr`` and
    ``acc_recency_mf`` with that column's maximum; bucketize
    ``freq_acc_upg_acc_2yrs`` and ``tot_accsry_purchse`` into ``*_bkt`` columns
    and drop the two raw columns.

    :param data_df: Spark dataframe to be preprocessed
    :param serialized_objects: mapping holding 'max_recency_acc_dig',
        'max_recency_dig_2yr' and 'max_acc_recency_mf'; only read when
        ``prediction`` is True. The default (True) is a placeholder kept for
        backward compatibility and is not a usable value in prediction mode.
    :param prediction: True to reuse the stored max values, False to compute
        them from ``data_df`` (training)
    :return: tuple ``(preprocessed_df, max_recency_acc_dig,
        max_recency_dig_2yr, max_acc_recency_mf)``
    :raises TypeError: ``prediction`` is True and ``serialized_objects`` is not
        a mapping
    :raises KeyError: ``serialized_objects`` lacks one of the required keys
    :raises ValueError: in training mode, a recency column has no non-null
        values to take a maximum from
    """
    fill_cols = ('recency_acc_dig', 'recency_dig_2yr', 'acc_recency_mf')

    if prediction:
        # Fail early with a readable message instead of
        # "'bool' object is not subscriptable" when the default is left in place.
        if not hasattr(serialized_objects, '__getitem__'):
            raise TypeError(
                "serialized_objects must be a mapping of stored max values when "
                "prediction=True (got {}); pass prediction=False to compute them "
                "from data_df".format(type(serialized_objects).__name__))
        fill_values = {name: serialized_objects['max_' + name] for name in fill_cols}

    # Normalise column names first so every later lookup uses the lower-case
    # name; one projection instead of one rename per column.
    data_df = data_df.toDF(*[name.lower() for name in data_df.columns])

    if not prediction:
        fill_values = {}
        for name in fill_cols:
            # The 1.0 quantile is used as the column maximum.
            quantiles = data_df.approxQuantile(name, [1.0], 0.00001)
            if not quantiles:
                raise ValueError(
                    "cannot compute max of '{}': column has no non-null values".format(name))
            fill_values[name] = quantiles[0]

    data_df = data_df.withColumn(
        'acc_flag', F.when(F.col('acc_flag').isNull(), 0.0).otherwise(F.col('acc_flag')))

    data_df = data_df.na.fill(fill_values)

    # (raw column, split points); each raw column is replaced by '<raw>_bkt'.
    bucket_specs = [
        ('freq_acc_upg_acc_2yrs', [-float("inf"), 0, 1, 2, float("inf")]),
        ('tot_accsry_purchse', [-float("inf"), 0, 1, 2, 3, float("inf")]),
    ]
    for raw_col, splits in bucket_specs:
        bucketizer = Bucketizer(splits=splits, inputCol=raw_col, outputCol=raw_col + '_bkt')
        # handleInvalid='keep': invalid values are kept rather than raising.
        data_df = bucketizer.setHandleInvalid("keep").transform(data_df)

    data_df = data_df.drop(*[raw_col for raw_col, _ in bucket_specs])

    return (data_df,
            fill_values['recency_acc_dig'],
            fill_values['recency_dig_2yr'],
            fill_values['acc_recency_mf'])

# Save Object

In [None]:
def save_objects(pipelineModel, max_recency_acc_dig, max_recency_dig_2yr, max_acc_recency_mf,
                num_cols, cat_cols, bin_cols):
    """
    Persist the fitted pipeline and the values needed to preprocess new data.

    Writes two artefacts under ``files_address``: the pipeline model
    ('pipelineModel.file', overwriting any existing one) and a pickled dict
    ('pipelineModel_obj.file') with the feature lists and the three max values
    used for null imputation.

    :param pipelineModel: fitted pipeline model to store
    :param max_recency_acc_dig,max_recency_dig_2yr,max_acc_recency_mf: max values
        used to fill nulls at prediction time
    :param num_cols: numeric columns list
    :param cat_cols: categorical columns list
    :param bin_cols: ordinal categorical columns list
    :return: None; the objects are written to storage
    """
    # Store the trained pipeline.
    pipelineModel.write().overwrite().save(files_address + 'pipelineModel.file')

    # Store the feature lists and the max values used for imputation.
    serialized_objects = {
        'num_cols': num_cols,
        'cat_cols': cat_cols,
        'bin_cols': bin_cols,
        'max_recency_acc_dig': max_recency_acc_dig,
        'max_recency_dig_2yr': max_recency_dig_2yr,
        'max_acc_recency_mf': max_acc_recency_mf,
    }
    # Context manager guarantees the handle is closed even if the write fails.
    with tf.io.gfile.GFile(files_address + 'pipelineModel_obj.file', 'wb') as serialized_objects_file:
        serialized_objects_file.write(pickle.dumps(serialized_objects))
    print('Trained Objects Successfully Stored at :', files_address)
    

# Load Object

In [None]:
def load_objects():
    """
    Load the stored pipeline model and the pickled imputation dict.

    Reads 'pipelineModel.file' and 'pipelineModel_obj.file' from
    ``files_address`` (the artefacts written by ``save_objects``).

    :return: tuple ``(pipeline_model, serialized_objects)``
    """
    # Load the trained pipeline.
    pipeline_model = PipelineModel.load(files_address + 'pipelineModel.file')

    # Load the feature lists and the max values used for imputation.
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # point files_address at a location whose contents are trusted.
    # Context manager guarantees the handle is closed even if unpickling fails.
    with tf.io.gfile.GFile(files_address + 'pipelineModel_obj.file', 'rb') as serialized_objects_file:
        serialized_objects = pickle.load(serialized_objects_file)
    print('Required objects loaded from :', files_address)
    return pipeline_model, serialized_objects
    

# Train Model

In [None]:
def train_model(sampled_df, feat_cat, feat_num, feat_bin):
    """
    Build the feature pipeline, fit a GBT classifier and print train metrics.

    Pipeline stages: assemble numeric features; string-index each categorical
    feature (unseen labels are kept) and assemble the indices; assemble the
    binned features together with the numeric vector; assemble everything into
    'features'; fit a GBTClassifier on label 'acc_flag'.

    :param sampled_df: training dataset
    :param feat_cat: categorical feature list
    :param feat_num: numerical feature list
    :param feat_bin: ordinal categorical feature list
    :return: fitted pipeline model
    """
    # Vector for numerical features.
    vector_assembler_NumVars = VectorAssembler(inputCols=feat_num, outputCol='num_features_all')

    # One indexer per categorical feature, then a vector of the indexed columns.
    indexers_Cat = [
        StringIndexer(inputCol=tc, outputCol="{0}_index".format(tc)).setHandleInvalid('keep')
        for tc in feat_cat
    ]
    assembler_Cat = VectorAssembler(inputCols=[ict.getOutputCol() for ict in indexers_Cat],
                                    outputCol='cat_features')

    # Binned features plus the numeric vector.
    assembler_bin = VectorAssembler(inputCols=list(feat_bin) + ['num_features_all'],
                                    outputCol='bin_features')

    # All features into a single vector.
    assembler = VectorAssembler(inputCols=['cat_features', 'bin_features'], outputCol='features')

    # Model building.
    label = 'acc_flag'
    model = GBTClassifier(labelCol=label,
                          featuresCol='features',
                          maxDepth=4,
                          maxIter=50,
                          minInfoGain=0.0,
                          checkpointInterval=12,
                          lossType='logistic',
                          seed=1)

    stages = ([vector_assembler_NumVars] + indexers_Cat
              + [assembler_Cat, assembler_bin, assembler, model])
    pipelineModel = Pipeline(stages=stages).fit(sampled_df)
    train_scored_df = pipelineModel.transform(sampled_df)

    # Only the columns needed for the metrics are collected; the feature
    # vectors are not used on the driver.
    df_train = train_scored_df.select(['probability', 'prediction', label]).toPandas()

    # ROC AUC must be computed from a continuous score; using the hard
    # 0/1 prediction collapses the curve to a single operating point.
    positive_score = df_train['probability'].apply(lambda v: float(v[1]))

    print('Training Summary')
    print('AUC on train set: ', roc_auc_score(df_train[label], positive_score))
    print(classification_report(df_train[label], df_train['prediction']))

    return pipelineModel
    

In [None]:
def training(df):
    """
    Preprocess, oversample the positive class, train and persist the model.

    The positive class (``acc_flag == 1``) is sampled with replacement at a
    fraction of ``n_negative / n_positive`` so that both classes end up with
    roughly the same number of rows, then unioned with all negative rows.

    :param df: dataframe to be sampled and trained
    :return: None; the fitted pipeline and imputation values are saved via
        ``save_objects``
    :raises ValueError: the preprocessed data does not contain both classes
    """
    df, max_recency_acc_dig, max_recency_dig_2yr, max_acc_recency_mf = preprocessing(df, prediction=False)

    # Over sampling the training data
    target = 'acc_flag'
    zero = df.where(F.col(target) == 0)
    one = df.where(F.col(target) == 1)
    zero_count = zero.count()
    one_count = one.count()
    if zero_count == 0 or one_count == 0:
        # Without this guard an all-negative input fails with ZeroDivisionError
        # and an all-positive input yields an empty training set.
        raise ValueError(
            "training data must contain both classes of '{}' "
            "(found {} zeros and {} ones)".format(target, zero_count, one_count))
    ratio = float(zero_count) / one_count
    tr = one.sample(True, ratio, 42)
    sampled_df = tr.union(zero)

    cat_cols = ['categorical_columns']
    num_cols = ['numerical_col1', 'numerical_col2', 'all_num_cols']
    bin_cols = ['binned_columns']

    # Count once and reuse: every count() is a full Spark job.
    sampled_count = sampled_df.count()

    print('Sampling Data details:')
    print('Number of Records :', sampled_count)
    print('Number of features:', len(sampled_df.columns))
    print('Target Distribution :')

    sampled_df.groupBy('acc_flag').count().withColumnRenamed('count', 'cnt_per_group') \
        .withColumn('% of_total_count', (F.col('cnt_per_group') / sampled_count) * 100).show()

    pipelineModel = train_model(sampled_df, cat_cols, num_cols, bin_cols)

    save_objects(pipelineModel, max_recency_acc_dig, max_recency_dig_2yr, max_acc_recency_mf,
                 num_cols, cat_cols, bin_cols)
       

In [None]:
# Fit the model on the training data and persist the resulting artefacts.
training(train_df)

# Prediction

In [None]:
def prediction(df):
    """
    Preprocess the data with the stored objects, score it and print metrics.

    :param df: dataframe to be scored
    :return: tuple ``(scored_dataframe, pipeline_model)`` — the Spark dataframe
        with the model's output columns and the model object used for scoring
    """
    pipeline_model, serialized_objects = load_objects()
    df, _, _, _ = preprocessing(df, serialized_objects)

    # Count once and reuse: every count() is a full Spark job.
    total = df.count()

    print('Data Distribution:')
    print('Number of Records :', total)
    print('Number of Features :', len(df.columns))

    print('Target Distribution :')
    df.groupBy('acc_flag').count().withColumnRenamed('count', 'cnt_per_group') \
        .withColumn('% of_total_count', (F.col('cnt_per_group') / total) * 100).show()

    df_pred = pipeline_model.transform(df)
    # Only the columns needed for the metrics are collected; the feature
    # vectors are not used on the driver.
    df_test = df_pred.select(['probability', 'prediction', 'acc_flag']).toPandas()

    print('Testing Summary')
    if df_test['acc_flag'].nunique() < 2:
        # roc_auc_score raises when only one class is present (e.g. unlabeled
        # data whose null acc_flag was filled with 0); still return the scores.
        print('AUC on test set: ', 'undefined (only one class present in acc_flag)')
    else:
        # ROC AUC must be computed from a continuous score; using the hard
        # 0/1 prediction collapses the curve to a single operating point.
        positive_score = df_test['probability'].apply(lambda v: float(v[1]))
        print('AUC on test set: ', roc_auc_score(df_test['acc_flag'], positive_score))
    print(classification_report(df_test['acc_flag'], df_test['prediction']))
    return df_pred, pipeline_model
    

In [None]:
# Load the hold-out data and score it with the stored pipeline.
# NOTE(review): 'file path' looks like a placeholder — replace with the real location.
test_df = spark.read.parquet('file path')
test_scored_df, model = prediction(test_df)

# Lift for test data

In [None]:
# UDF returning element 1 of a vector column as a float; lift_cal applies it
# to 'probability' to get a plain numeric score column.
# Counterpart for element 0, kept for reference:
#firstelement = udf(lambda v:float(v[0]),FloatType())
secondelement = udf(lambda v:float(v[1]),FloatType())

def lift_cal(df_predictions_gbt):
    """
    Build a per-decile lift table from a scored dataframe.

    Element 1 of 'probability' is extracted into 'prob_1' and discretised into
    10 quantile buckets ('Decile'). For each bucket the sum of 'acc_flag'
    ('Buyer') and the number of rows ('decile_count') are computed; 'Lift' is
    the bucket's 'Buyer' divided by the average 'Buyer' across buckets, and
    'Base' is the row count of the input dataframe.

    :param df_predictions_gbt: scored dataframe with 'probability' and 'acc_flag'
    :return: tuple ``(decile_dataframe, average_buyers_per_decile)``
    """
    row_total = df_predictions_gbt.count()
    scored = df_predictions_gbt.withColumn('prob_1', secondelement('probability'))

    discretizer = QuantileDiscretizer(
        numBuckets=10,
        inputCol='prob_1',
        outputCol='Decile',
        relativeError=0.0001,
        handleInvalid='error'
    )
    # Fitted with handleInvalid='error'; switched to 'skip' for the transform.
    decile_model = discretizer.fit(scored).setHandleInvalid('skip')

    per_decile = (
        decile_model.transform(scored)
        .groupBy('Decile')
        .agg(F.sum('acc_flag').alias('Buyer'),
             F.count('acc_flag').alias('decile_count'))
    )

    mean_buyers = per_decile.agg(F.avg('Buyer')).collect()[0][0]

    lift_table = (
        per_decile
        .withColumn('Lift', F.col('Buyer') / F.lit(mean_buyers))
        .withColumn('Base', F.lit(row_total))
    )
    return lift_table, mean_buyers

In [None]:
# Lift table for the scored test data, shown from the highest decile down.
test_dec_df, avg_val = lift_cal(test_scored_df)
test_dec_df.orderBy(F.col('Decile').desc()).show()

# Train Metrics without sampling

In [None]:
# Score the original (not oversampled) training data with the stored pipeline.
# Note: this rebinds `model`, which was also assigned when the test data was scored.
train_scored, model = prediction(train_df)