In [1]:
# Imports Required Libraries
import io
import json
import os
import pickle

import numpy as np
import pandas as pd
import statsmodels.api as sm
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
def load_models():
    '''
    Returns the three pre-trained artefacts stored under '../models': the prediction
        model, the imputer and the scaler, in that order

    The paths are relative, so they are resolved against the current working
        directory of the kernel.

    Raises:
    FileNotFoundError -- If any of the pickle files cannot be found (raised by open;
                         presumably also by sm.load -- TODO confirm)
    '''

    mdl = sm.load('../models/glm_final_model.pickle')

    # NOTE(review): unpickling executes arbitrary code contained in the file, so
    #    these artefacts must only ever come from a trusted source
    # The context managers guarantee the file handles are closed, even on failure
    with open('../models/glm_simple_imputer.pickle', 'rb') as si_file:
        si = pickle.load(si_file)

    with open('../models/glm_standard_scalar.pickle', 'rb') as ss_file:
        ss = pickle.load(ss_file)

    return mdl, si, ss

In [3]:
def transform_json_to_df(json_str):
    '''
    Returns a DataFrame containing raw JSON data, one row per JSON record

    Both a single JSON object ('{...}') and a list of objects ('[{...}, {...}]')
        are accepted; a single object is treated as a list of one record.

    Keyword Arguments:
    json_str -- Raw JSON data

    Raises:
    json.JSONDecodeError -- If json_str is not valid JSON
    '''

    obj = json.loads(json_str)

    # The 'records' orientation expects a list, so a lone record gets wrapped
    if not isinstance(obj, list):
        json_str = '[' + json_str + ']'

    # pandas deprecated handing a literal JSON string to read_json and asks for a
    #    file-like object instead, hence the StringIO wrapper
    return pd.read_json(io.StringIO(json_str), orient = 'records')

In [4]:
def format_df_column_variables(df):
    '''
    Returns a new DataFrame in which the 'x12' and 'x63' columns, which hold
        quantitative data written as Strings, have been converted to the Float
        data type after removing any non mathematical symbols

    'x12' holds monetary values such as '$5,547.78' or '($1,000.50)'; the currency
        sign and thousands separators are removed and a parenthesised amount becomes
        a negative number. 'x63' holds percentages such as '36.29%'; the percent
        sign is removed (the value is NOT divided by 100).

    The DataFrame passed in is left unmodified.

    Keyword Arguments:
    df -- A DataFrame containing raw JSON data, with String 'x12' and 'x63' columns

    Raises:
    ValueError -- If a value still is not a valid number once the symbols are removed
    '''

    # Work on a copy so that the caller's DataFrame is not mutated as a side effect
    df = df.copy()

    # regex = False makes '$', '(' and ')' plain characters on every pandas version;
    #    the default for this flag differs between pandas releases and all three
    #    characters are special in a regular expression
    x12 = df['x12']
    for symbol, replacement in (('$', ''), (',', ''), (')', ''), ('(', '-')):
        x12 = x12.str.replace(symbol, replacement, regex = False)
    df['x12'] = x12.astype(float)

    df['x63'] = df['x63'].str.replace('%', '', regex = False).astype(float)

    return df

In [5]:
def impute_missing_df_data(si, df):
    '''
    Returns a new DataFrame holding the output of si.transform for every column of
        df except the qualitative ones ('x5', 'x31', 'x81' and 'x82'), which are
        dropped beforehand

    The result keeps the remaining column names but gets a fresh default index.
        How missing values are filled is decided entirely by the imputer passed in
        (presumably a mean based strategy -- confirm against the training code).

    Keyword Arguments:
    si -- A pre-trained, imported SimpleImputer
    df -- A formatted DataFrame
    '''

    # Only the quantitative columns are handed to the imputer
    quantitative_df = df.drop(columns = ['x5', 'x31', 'x81', 'x82'])
    imputed_values = si.transform(quantitative_df)

    return pd.DataFrame(imputed_values, columns = quantitative_df.columns)

In [6]:
def scale_df_data(ss, df):
    '''
    Returns a new DataFrame holding the output of ss.transform(df) under the
        original column names, i.e. the column variable values after feature
        scaling by the imported, pre-trained StandardScalar

    The result gets a fresh default index.

    Keyword Arguments:
    ss -- A pre-trained, imported StandardScalar
    df -- A formatted DataFrame without any blank, NaN, NULL or otherwise missing values
    '''

    # Scaling matters most for the monetary 'x12' column, whose values tend to be
    #    orders of magnitude larger than those of every other column
    scaled_values = ss.transform(df)
    column_names = df.columns

    return pd.DataFrame(scaled_values, columns = column_names)

In [7]:
def create_df_dummy_column_variables_new(df1, df2):
    '''
    Returns df2 extended, column-wise, with dummy variables for the qualitative
        column variables 'x5', 'x31', 'x81' and 'x82' taken from df1

    One dummy column is produced for EVERY category present in df1, plus one for
        missing values. No category is dropped here because which category sorts
        first depends on the batch: with drop_first = True a single-row payload
        would lose its only real dummy (e.g. 'x5_monday'), which would afterwards be
        back-filled with 0. Columns the model does not use are expected to be removed
        later by filter_df_column_variables.

    Keyword Arguments:
    df1 -- A DataFrame containing the original raw JSON data in order to retrieve those column
           variable values of a qualitative nature previously dropped and must now be
           dummified
    df2 -- A DataFrame with no missing column variable values and whose said values have already been scaled
    '''

    # A list of column variable names of a qualitative nature which require dummification
    vars_to_dummify = ['x5', 'x31', 'x81', 'x82']

    for var in vars_to_dummify:

        # dtype = float keeps the dummies numeric on every pandas version (newer
        #    releases default to bool, which would turn a mixed row into objects)
        var_dummy_vars = pd.get_dummies(df1[var],
                                        drop_first = False,
                                        prefix = var,
                                        prefix_sep = '_',
                                        dummy_na = True,
                                        dtype = float)

        df2 = pd.concat([df2, var_dummy_vars],
                        axis = 1,
                        sort = False)

    return df2

In [8]:
def create_df_dummy_column_variables_old(df1, df2):
    '''
    Returns df2 extended, column-wise, with dummy variables for the qualitative
        column variables 'x5', 'x31', 'x81' and 'x82' taken from df1, in that order

    One dummy column is produced for EVERY category present in df1, plus one for
        missing values. No category is dropped here because which category sorts
        first depends on the batch: with drop_first = True a single-row payload
        would lose its only real dummy (e.g. 'x81_October'), which would afterwards
        be back-filled with 0. Columns the model does not use are expected to be
        removed later by filter_df_column_variables.

    Keyword Arguments:
    df1 -- A DataFrame containing the original raw JSON data in order to retrieve those column
           variable values of a qualitative nature previously dropped and must now be
           dummified
    df2 -- A DataFrame with no missing column variable values and whose said values have already been scaled
    '''

    # df2 comes first so that the scaled quantitative columns stay on the left
    frames_to_join = [df2]

    for qualitative_var in ('x5', 'x31', 'x81', 'x82'):
        # dtype = float keeps the dummies numeric on every pandas version (newer
        #    releases default to bool, which would turn a mixed row into objects)
        frames_to_join.append(pd.get_dummies(df1[qualitative_var],
                                             drop_first = False,
                                             prefix = qualitative_var,
                                             prefix_sep = '_',
                                             dummy_na = True,
                                             dtype = float))

    return pd.concat(frames_to_join, axis = 1, sort = False)

In [9]:
def filter_df_column_variables(ordr_clmn_names_lst, df):
    '''
    Returns a new DataFrame containing only those column variables required by the
        pre-trained model for predictions, in exactly the order given

    Depending upon the type and amount of JSON data originally passed in, not all of
        the desired dummy variables will have been generated (a category that never
        occurs in the batch yields no column). Any such missing column is added and
        filled with 0.0. Rows, their index and all existing values are left as they
        are.

    Keyword Arguments:
    ordr_clmn_names_lst -- A list of DataFrame column variable names required by the pre-trained model
    df -- The DataFrame with dummy variables, and whose quantitative values have been scaled and
          have none missing
    '''

    # reindex selects, orders and back-fills in one step and only ever touches the
    #    column axis, so no rows can be added whatever index df happens to carry
    return df.reindex(columns = ordr_clmn_names_lst, fill_value = 0.0)

In [10]:
def extract_transform_input_data_pipeline(json, si, ss, ordr_clmn_names_lst):
    '''
    Returns the DataFrame obtained by running the raw JSON data through every
        preparation step in turn: parsing, formatting of the String encoded numbers,
        imputation, scaling, creation of dummy variables and, finally, selection of
        the column variables required by the model

    An empty DataFrame is returned when the JSON data holds no records.

    Keyword Arguments:
    json -- Raw JSON data (note that, inside this function, the name hides the json module)
    si -- A pre-trained, imported SimpleImputer
    ss -- A pre-trained, imported StandardScalar
    ordr_clmn_names_lst -- A list of DataFrame column variable names required by the pre-trained model
    '''

    parsed_df = transform_json_to_df(json)

    # Nothing to transform, so hand back an empty DataFrame straight away
    if parsed_df.shape[0] < 1:
        return pd.DataFrame()

    formatted_df = format_df_column_variables(parsed_df)

    # Quantitative columns: fill in the gaps first, then scale
    quantitative_df = scale_df_data(ss, impute_missing_df_data(si, formatted_df))

    # The qualitative columns are read back from the formatted data, as they were
    #    dropped ahead of the imputation
    combined_df = create_df_dummy_column_variables_old(formatted_df, quantitative_df)

    return filter_df_column_variables(ordr_clmn_names_lst, combined_df)

In [11]:
def predict_outcome(df, mdl, alphanum_ord_clmn_var_names_lst):
    '''
    Returns a list with one message (a dictionary) per row of df whose predicted
        probability is at least 75%; rows under that threshold are skipped without
        any message, so the list may well be empty

    Each message holds 'business_outcome' (always '1'), 'p_hat' (the predicted
        probability as a String) and then the row's inputs, keyed by column variable
        name, in the order of alphanum_ord_clmn_var_names_lst.

    If df has no rows at all, a single dictionary carrying an error message is
        returned instead of a list.

    Keyword Arguments:
    df -- A DataFrame containing only those 25 column variables required by the pre-trained model
    mdl -- A pre-trained, imported prediction model
    alphanum_ord_clmn_var_names_lst -- A list of DataFrame column variable names required by
                                       the pre-trained model order alphanumerically
    '''

    if df.shape[0] == 0:
        return {'message': 'ERROR - No valid JSON data available for prediction.'}

    output_msgs_lst = []

    for position in range(df.shape[0]):
        p_hat = mdl.predict(df.iloc[position])[0]

        # Written as a negated '>=' so that a NaN probability is skipped as well
        if not p_hat >= 0.75:
            continue

        row_values = df.iloc[position]
        inputs_by_name = {name: row_values[name]
                          for name in alphanum_ord_clmn_var_names_lst}

        output_msgs_lst.append({'business_outcome': str(1),
                                'p_hat': str(p_hat),
                                **inputs_by_name})

    return output_msgs_lst

In [None]:
def main(input_json):
    '''
    Returns the outcome of the whole workflow for the given raw JSON data: the
        models are loaded, the data is prepared and predict_outcome is run on it

    The result is whatever predict_outcome returns, i.e. a list with one message per
        event which met the minimum standard of 75% chance of a successful sale to a
        potential buying customer (business outcome, probability of said outcome and
        the input variables in alphanumerical order), or a single error message when
        the JSON data held no records.

    Keyword Arguments:
    input_json -- Raw JSON data
    '''

    # The column variables required by the pre-trained model, in the exact order
    #    in which it expects to receive them
    model_column_order = [
        'x5_saturday', 'x81_July', 'x81_December', 'x31_japan', 'x81_October',
        'x5_sunday', 'x31_asia', 'x81_February', 'x91', 'x81_May',
        'x5_monday', 'x81_September', 'x81_March', 'x53', 'x81_November',
        'x44', 'x81_June', 'x12', 'x5_tuesday', 'x81_August',
        'x81_January', 'x62', 'x31_germany', 'x58', 'x56']

    # The models are loaded anew on every call
    mdl, si, ss = load_models()

    model_ready_df = extract_transform_input_data_pipeline(input_json,
                                                           si,
                                                           ss,
                                                           model_column_order)

    # The messages list their inputs alphanumerically rather than in model order
    return predict_outcome(model_ready_df, mdl, sorted(model_column_order))

In [None]:
# For Debugging & Testing Purposes
# Reads the same testing CSV five times: in full (nrows = None) and then limited
#    to its first 1, 10, 100 and 1000 rows
(raw_testing_data1,
 raw_testing_data2,
 raw_testing_data3,
 raw_testing_data4,
 raw_testing_data5) = (
    pd.read_csv(os.path.join('../testing', 'exercise_26_test.csv'), nrows = row_limit)
    for row_limit in (None, 1, 10, 100, 1000)
)

In [None]:
# Hand-written payload for testing: ONE JSON object (not wrapped in a list) with the
#    keys 'x0' to 'x99'. 'x12' is a monetary String, 'x63' a percentage String,
#    'x5', 'x31', 'x81' and 'x82' are qualitative Strings and 'x80' is null.
sample_raw_json_1_row_v1 = "{\"x0\":0.042317,\"x1\":-3.344721,\"x2\":4.6351242122,\"x3\":-0.5983959993,\"x4\":-0.6477715046,\"x5\":\"monday\",\"x6\":0.184902,\"x7\":46.690015,\"x8\":3.034132,\"x9\":0.364704,\"x10\":14.260733,\"x11\":-1.559332,\"x12\":\"$5,547.78\",\"x13\":0.520324,\"x14\":31.212255,\"x15\":4.891671,\"x16\":0.357763,\"x17\":14.766366,\"x18\":-17.467243,\"x19\":0.224628,\"x20\":0.096752,\"x21\":1.305564,\"x22\":0.353632,\"x23\":3.909028,\"x24\":-91.273052,\"x25\":1.396952,\"x26\":4.401593,\"x27\":0.443086,\"x28\":14.048787,\"x29\":-0.932243,\"x30\":5.255472,\"x31\":\"germany\",\"x32\":0.54199153,\"x33\":2.98948039,\"x34\":-1.78334189,\"x35\":0.80127315,\"x36\":-2.60231221,\"x37\":3.39682926,\"x38\":-1.22322646,\"x39\":-2.20977636,\"x40\":-68.69,\"x41\":522.25,\"x42\":-428.69,\"x43\":381.37,\"x44\":0.0197503,\"x45\":0.75116479,\"x46\":0.8630479008,\"x47\":-1.0383166613,\"x48\":-0.2726187635,\"x49\":-0.3430207259,\"x50\":0.3109008666,\"x51\":-0.797841974,\"x52\":-2.0390175153,\"x53\":0.87182889,\"x54\":0.14373012,\"x55\":-1.15212514,\"x56\":-2.1703139704,\"x57\":-0.267842962,\"x58\":0.212110633,\"x59\":1.6926559407,\"x60\":-0.9522767913,\"x61\":-0.8625864974,\"x62\":0.0748487158,\"x63\":\"36.29%\",\"x64\":3.47125327,\"x65\":-3.16656509,\"x66\":0.65446814,\"x67\":14.60067029,\"x68\":-20.57521013,\"x69\":0.71083785,\"x70\":0.16983767,\"x71\":0.55082127,\"x72\":0.62814576,\"x73\":3.38608078,\"x74\":-112.45263714,\"x75\":1.48370808,\"x76\":1.77035368,\"x77\":0.75702363,\"x78\":14.75731742,\"x79\":-0.62550355,\"x80\":null,\"x81\":\"October\",\"x82\":\"Female\",\"x83\":-0.7116680715,\"x84\":-0.2653559892,\"x85\":0.5175495907,\"x86\":-1.0881027092,\"x87\":-1.8188638198,\"x88\":-1.3584469527,\"x89\":-0.654995195,\"x90\":-0.4933042262,\"x91\":0.373853,\"x92\":0.94143481,\"x93\":3.54679834,\"x94\":-99.8574882,\"x95\":0.403926,\"x96\":1.65378726,\"x97\":0.00771459,\"x98\":-32.02164582,\"x99\":-60.3127828}"

In [None]:
# Serialises each slice of the raw testing data into a JSON String holding a list
#    of records. The last name is fed by the un-limited read of the file, so the
#    '10000' in it is presumably the file's total row count -- TODO confirm
(sample_raw_json_1_row_v2,
 sample_raw_json_10_rows,
 sample_raw_json_100_rows,
 sample_raw_json_1000_rows,
 sample_raw_json_10000_rows) = (
    testing_slice.to_json(orient = 'records')
    for testing_slice in (raw_testing_data2,
                          raw_testing_data3,
                          raw_testing_data4,
                          raw_testing_data5,
                          raw_testing_data1)
)

In [None]:
# Saves raw JSON data files for proper testing when Docker becomes involved
#with open(os.path.join('../testing', 'sample_raw_json_10000_rows.json'), 'w') as file:
#    json.dump(sample_raw_json_10000_rows, file)

In [None]:
# Smoke run: the hand-written single JSON object (no enclosing list)
json_results1 = main(sample_raw_json_1_row_v1)
print(json_results1)

In [None]:
# Smoke run: a list holding the single record read with nrows = 1
json_results2 = main(sample_raw_json_1_row_v2)
print(json_results2)

In [None]:
# Smoke run: the records read with nrows = 10
json_results3 = main(sample_raw_json_10_rows)
print(json_results3)

In [None]:
# Smoke run: the records read with nrows = 100
json_results4 = main(sample_raw_json_100_rows)
print(json_results4)

In [None]:
# Smoke run: the records read with nrows = 1000; print shows every message returned
json_results5 = main(sample_raw_json_1000_rows)
print(json_results5)