In [41]:
# Imports required libraries
import json
import numpy as np
import os
import pandas as pd
import statsmodels.api as sm
import uvicorn

from fastapi import FastAPI

from pydantic import BaseModel

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Creates and instantiates the FastAPI application object
app = FastAPI(title = "GLM Model API",
              description = "API for GLM dataset", version = "1.0")

In [4]:
# Sets up the index route, which opens automatically upon http://127.0.0.1:1313
@app.get('/')
def index():
    return json.dumps({'message': 'Successfully connected to GLM Model Application.'})

In [None]:
# Carries out a prediction upon passed in data
@app.post('/predict')
def predict():
    pass

In [7]:
# Starts up an instance of a uvicorn server in order for the FastAPI application object to run upon
if __name__ == '__main__':
    uvicorn.run(app, host = '127.0.0.1', port = 1212)

RuntimeError: asyncio.run() cannot be called from a running event loop

In [80]:
raw_testing_data = pd.read_csv(os.path.join('../training', 'exercise_26_test.csv'))

In [81]:
raw_testing_data.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x90,x91,x92,x93,x94,x95,x96,x97,x98,x99
0,0.042317,-3.344721,4.635124,-0.598396,-0.647772,monday,0.184902,46.690015,3.034132,0.364704,...,-0.493304,0.373853,0.941435,3.546798,-99.857488,0.403926,1.653787,0.007715,-32.021646,-60.312783
1,-1.03316,-0.34014,5.871823,,0.122133,tuesday,0.997773,51.581411,1.709219,0.844079,...,0.521119,0.148424,0.925301,3.830426,-101.105748,0.055775,0.56489,0.051716,-32.540612,-266.725795
2,2.029367,-3.239301,4.724436,2.211831,0.551611,tuesday,0.492405,87.179042,4.333755,0.513789,...,0.154492,-0.034504,0.904042,3.642968,-107.476487,1.046718,1.494123,0.231084,-32.740954,-4.327887
3,-0.065676,1.892277,4.818741,0.640313,1.944562,friday,0.208718,73.573314,4.929132,0.116004,...,0.305243,-0.099213,0.712234,3.853489,-91.650053,0.499861,2.804358,0.627921,-32.190043,103.192597
4,-0.357126,-1.852161,5.367849,-0.069869,-0.641455,saturday,0.940286,72.773335,,0.191044,...,0.617258,0.307445,0.376738,3.306958,-99.55714,1.275527,1.476482,0.122798,-32.957087,-111.509168


In [264]:
sample_json = raw_testing_data.to_json(orient = 'records')

In [299]:
def load_model():
    
    mdl = sm.load('glm_final_model.pickle')
    
    return mdl

In [229]:
def transform_json_to_df(json):
    
    df = pd.read_json(json, orient = 'records')
    
    return df

In [230]:
'''
Filters out and retrives only those columns previously determined to be the most useful during the
    creation of the prediction model
'''
def filter_df_column_variables(df):
    
    df = df[['x5', 'x12', 'x31', 'x44', 'x53', 'x56', 'x58', 'x62', 'x81','x91']].copy()
    
    return df

In [231]:
def format_df_column_variables(df):
    
    # Formats the 'x12' column's String monetary value into a Float in order to apply maths upon it
    df['x12'] = df['x12'].str.replace('$', '')
    df['x12'] = df['x12'].str.replace(',', '')
    df['x12'] = df['x12'].str.replace(')', '')
    df['x12'] = df['x12'].str.replace('(', '-')
    df['x12'] = df['x12'].astype(float)
    
    return df

In [232]:
def impute_missing_df_data(df):
    
    # Creates and instantiates a simple imputer
    si = SimpleImputer(missing_values = np.nan, strategy = 'mean')
    
    # Imputes via a simple mean stategy those column values which are missing
    df = pd.DataFrame(si.fit_transform(df.drop(columns = ['x5', 'x31', 'x81'])), 
                      columns = df.drop(columns = ['x5', 'x31', 'x81']).columns)
    
    return df

In [233]:
def scale_df_data(df):
    
    # Creates and instantiates a standard scaler
    ss = StandardScaler()
    
    '''
    Scales all column values via a standardization method for feature scaling, 
        of particular interest and focus being being that of the monetary value column
    '''
    df = pd.DataFrame(ss.fit_transform(df), 
                      columns = df.columns)
    
    return df

In [272]:
def create_df_dummy_column_variables(df1, df2):
    
    '''
    Creates the dummy variables for the non-numeric, qualitative data type columns and then reconcatenates
        them back into the now imputed and standardized scaled dataframe
    '''
    vars_to_dummify = ['x5', 'x31', 'x81']
    
    for var in vars_to_dummify:

        var_dummy_vars = pd.get_dummies(df1[var], 
                                        drop_first = True, 
                                        prefix = var, 
                                        prefix_sep = '_', 
                                        dummy_na = True)


        df2 = pd.concat([df2, var_dummy_vars], 
                        axis = 1, 
                        sort = False)

    return df2

In [280]:
def drop_unwated_df_column_variables(df):
    
    '''
    Filters out and drops those dummy value columns previously unselected during the
        creation of the prediction model
    '''
    df = df.drop(['x5_wednesday', 'x5_thursday', 'x5_nan', 'x31_nan', 'x81_nan'], 
                 axis = 1, 
                 inplace = True)
    
    return df

In [300]:
def rearange_df_column_variable_order(df, ordr_clmn_names_lst):
    
    '''
    Rearranges the variable columns into the proper order as required by the pre-existing 
        and pre-trained model
    '''

    df = df[ordr_clmn_names_lst]
    
    return df

In [301]:
def extract_transform_input_data_pipeline(json_data, ordr_clmn_names_lst):
    
    df = transform_json_to_df(json_data)
    
    df = filter_df_column_variables(df)
    
    df = format_df_column_variables(df)
    
    imputed_df = impute_missing_df_data(df)
    
    scaled_imputed_df = scale_df_data(imputed_df)
    
    df = create_df_dummy_column_variables(df, scaled_imputed_df)
    
    df = drop_unwated_df_column_variables(df)
    
    df = rearange_df_column_variable_order(df, ordr_clmn_names_lst)
    
    return df

In [302]:
def predict_outcome(df, model, alphanum_ord_clmn_var_names_lst):
    
    num_rows_df = df.shape[0]
    
    for row in range(num_rows_df):
        predicted_outcome = 0
        model_inputs = {}
    
        for var in alphanum_ord_clmn_var_names_lst:
            model_inputs[var] = df.iloc[row][var]
    
        predicted_probability = model.predict(df.iloc[row])[0]

        if predicted_probability >= 0.75:
            predicted_outcome = 1

        model_predictions = {'business_outcome': str(predicted_outcome), 'p_hat': str(predicted_probability)}

        output = model_predictions | model_inputs

        return json.dumps(output)

In [305]:
def main():
    
    final_df_column_variable_names_order = [
        'x5_saturday', 'x81_July', 'x81_December', 'x31_japan', 'x81_October', 
        'x5_sunday', 'x31_asia', 'x81_February', 'x91', 'x81_May', 
        'x5_monday', 'x81_September', 'x81_March', 'x53', 'x81_November', 
        'x44', 'x81_June', 'x12', 'x5_tuesday', 'x81_August', 
        'x81_January', 'x62', 'x31_germany', 'x58', 'x56']
    
    alphanumerically_sorted_df_column_variable_names = sorted(final_df_column_variable_names_order)
    
    mdl = load_model()
    
    df = extract_transform_input_data_pipeline(sample_json, 
                                               final_df_column_variable_names_order)
    
    json_output_message = predict_outcome(df, mdl, alphanumerically_sorted_df_column_variable_names)
    
    print(type(json_output_message))

In [306]:
main()

<class 'str'>


  df['x12'] = df['x12'].str.replace('$', '')
  df['x12'] = df['x12'].str.replace(')', '')
  df['x12'] = df['x12'].str.replace('(', '-')


In [9]:
model = sm.load('glm_final_model.pickle')

In [108]:
'''
Filters out and retrives only those columns previously determined to be the most useful during the
    creation of the prediction model
'''
sample_testing_data = raw_testing_data[['x5', 
                                        'x12', 
                                        'x31', 
                                        'x44', 
                                        'x53', 
                                        'x56', 
                                        'x58', 
                                        'x62',
                                        'x81',
                                        'x91']].copy()

In [109]:
sample_testing_data.head()

Unnamed: 0,x5,x12,x31,x44,x53,x56,x58,x62,x81,x91
0,monday,"$5,547.78",germany,0.01975,0.871829,-2.170314,0.212111,0.074849,October,0.373853
1,tuesday,"$-5,483.24",america,0.495203,-0.057626,1.626547,0.59132,0.725599,November,0.148424
2,tuesday,"$5,515.72",germany,0.976415,0.411248,-0.763295,0.550376,1.292216,July,-0.034504
3,friday,"$-4,446.84",germany,0.265688,0.49825,0.763711,-0.254893,1.856975,October,-0.099213
4,saturday,"$6,523.20",,0.943232,0.099885,0.401473,1.979944,1.385314,May,0.307445


In [110]:
#sample_testing_data = sample_testing_data.iloc[:1]

In [111]:
sample_testing_data.head()

Unnamed: 0,x5,x12,x31,x44,x53,x56,x58,x62,x81,x91
0,monday,"$5,547.78",germany,0.01975,0.871829,-2.170314,0.212111,0.074849,October,0.373853
1,tuesday,"$-5,483.24",america,0.495203,-0.057626,1.626547,0.59132,0.725599,November,0.148424
2,tuesday,"$5,515.72",germany,0.976415,0.411248,-0.763295,0.550376,1.292216,July,-0.034504
3,friday,"$-4,446.84",germany,0.265688,0.49825,0.763711,-0.254893,1.856975,October,-0.099213
4,saturday,"$6,523.20",,0.943232,0.099885,0.401473,1.979944,1.385314,May,0.307445


In [112]:
print(sample_testing_data.shape)
print(type(sample_testing_data))

(10000, 10)
<class 'pandas.core.frame.DataFrame'>


In [113]:
# Formats the 'x12' column's String monetary value into a Float in order to apply maths upon it
sample_testing_data['x12'] = sample_testing_data['x12'].str.replace('$', '')
sample_testing_data['x12'] = sample_testing_data['x12'].str.replace(',', '')
sample_testing_data['x12'] = sample_testing_data['x12'].str.replace(')', '')
sample_testing_data['x12'] = sample_testing_data['x12'].str.replace('(', '-')
sample_testing_data['x12'] = sample_testing_data['x12'].astype(float)

  sample_testing_data['x12'] = sample_testing_data['x12'].str.replace('$', '')
  sample_testing_data['x12'] = sample_testing_data['x12'].str.replace(')', '')
  sample_testing_data['x12'] = sample_testing_data['x12'].str.replace('(', '-')


In [114]:
sample_testing_data.head()

Unnamed: 0,x5,x12,x31,x44,x53,x56,x58,x62,x81,x91
0,monday,5547.78,germany,0.01975,0.871829,-2.170314,0.212111,0.074849,October,0.373853
1,tuesday,-5483.24,america,0.495203,-0.057626,1.626547,0.59132,0.725599,November,0.148424
2,tuesday,5515.72,germany,0.976415,0.411248,-0.763295,0.550376,1.292216,July,-0.034504
3,friday,-4446.84,germany,0.265688,0.49825,0.763711,-0.254893,1.856975,October,-0.099213
4,saturday,6523.2,,0.943232,0.099885,0.401473,1.979944,1.385314,May,0.307445


In [115]:
# Creates and instantiates a simple imputer
simple_imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

In [116]:
# Creates and instantiates a standard scaler
standard_scaler = StandardScaler()

In [117]:
# Imputes via a simple mean stategy those column values which are missing
imputed_sample_testing_data = pd.DataFrame(
    simple_imputer.fit_transform(sample_testing_data.drop(columns=['x5', 'x31', 'x81'])), 
    columns = sample_testing_data.drop(columns=['x5', 'x31', 'x81']).columns)

In [118]:
imputed_sample_testing_data.head()

Unnamed: 0,x12,x44,x53,x56,x58,x62,x91
0,5547.78,0.01975,0.871829,-2.170314,0.212111,0.074849,0.373853
1,-5483.24,0.495203,-0.057626,1.626547,0.59132,0.725599,0.148424
2,5515.72,0.976415,0.411248,-0.763295,0.550376,1.292216,-0.034504
3,-4446.84,0.265688,0.49825,0.763711,-0.254893,1.856975,-0.099213
4,6523.2,0.943232,0.099885,0.401473,1.979944,1.385314,0.307445


In [120]:
'''
Scales all column values via a standardization method for feature scaling, 
    of particular interest and focus being being that of the monetary value column
'''
scaled_standardized_imputed_sample_testing_data = (
    pd.DataFrame(standard_scaler.fit_transform(imputed_sample_testing_data), 
                 columns = imputed_sample_testing_data.columns))

In [121]:
scaled_standardized_imputed_sample_testing_data.head()

Unnamed: 0,x12,x44,x53,x56,x58,x62,x91
0,0.95316,-1.692562,1.642021,-2.190166,0.198172,0.080677,0.449576
1,-0.957358,-0.011565,-1.602544,1.641571,0.579915,0.7454,-0.335268
2,0.947607,1.689795,0.034213,-0.770223,0.538696,1.324182,-0.972141
3,-0.777859,-0.823031,0.337922,0.770809,-0.271951,1.901067,-1.197429
4,1.122098,1.572476,-1.052698,0.405244,1.977814,1.419279,0.218373


In [122]:
'''
Creates the dummy variables for the non-numeric, qualitative data type columns and then reconcatenates
    them back into the now imputed and standardized scaled dataframe
'''
x5_dummy_variables = pd.get_dummies(sample_testing_data['x5'], 
                                    drop_first = True, 
                                    prefix = 'x5', 
                                    prefix_sep = '_', 
                                    dummy_na = True)

scaled_standardized_imputed_sample_testing_data = (
    pd.concat([scaled_standardized_imputed_sample_testing_data, x5_dummy_variables], 
              axis = 1, 
              sort = False))

x31_dummy_variables = pd.get_dummies(sample_testing_data['x31'], 
                                     drop_first = True, 
                                     prefix = 'x31', 
                                     prefix_sep = '_', 
                                     dummy_na = True)

scaled_standardized_imputed_sample_testing_data = (
    pd.concat([scaled_standardized_imputed_sample_testing_data, x31_dummy_variables], 
              axis = 1, 
              sort = False))

x81_dummy_variables = pd.get_dummies(sample_testing_data['x81'], 
                                     drop_first = True, 
                                     prefix = 'x81', 
                                     prefix_sep = '_', 
                                     dummy_na = True)

scaled_standardized_imputed_sample_testing_data = (
    pd.concat([scaled_standardized_imputed_sample_testing_data, x81_dummy_variables], 
              axis = 1, 
              sort = False))

In [123]:
scaled_standardized_imputed_sample_testing_data.head()

Unnamed: 0,x12,x44,x53,x56,x58,x62,x91,x5_monday,x5_saturday,x5_sunday,...,x81_February,x81_January,x81_July,x81_June,x81_March,x81_May,x81_November,x81_October,x81_September,x81_nan
0,0.95316,-1.692562,1.642021,-2.190166,0.198172,0.080677,0.449576,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,-0.957358,-0.011565,-1.602544,1.641571,0.579915,0.7454,-0.335268,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.947607,1.689795,0.034213,-0.770223,0.538696,1.324182,-0.972141,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,-0.777859,-0.823031,0.337922,0.770809,-0.271951,1.901067,-1.197429,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1.122098,1.572476,-1.052698,0.405244,1.977814,1.419279,0.218373,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [124]:
scaled_standardized_imputed_sample_testing_data.columns

Index(['x12', 'x44', 'x53', 'x56', 'x58', 'x62', 'x91', 'x5_monday',
       'x5_saturday', 'x5_sunday', 'x5_thursday', 'x5_tuesday', 'x5_wednesday',
       'x5_nan', 'x31_asia', 'x31_germany', 'x31_japan', 'x31_nan',
       'x81_August', 'x81_December', 'x81_February', 'x81_January', 'x81_July',
       'x81_June', 'x81_March', 'x81_May', 'x81_November', 'x81_October',
       'x81_September', 'x81_nan'],
      dtype='object')

In [125]:
'''
Filters out and drops those dummy value columns previously unselected during the
    creation of the prediction model
'''
scaled_standardized_imputed_sample_testing_data.drop(['x5_wednesday', 
                                                      'x5_thursday', 
                                                      'x5_nan', 
                                                      'x31_nan', 
                                                      'x81_nan'], 
                                                     axis = 1, 
                                                     inplace = True)

In [126]:
scaled_standardized_imputed_sample_testing_data.head()

Unnamed: 0,x12,x44,x53,x56,x58,x62,x91,x5_monday,x5_saturday,x5_sunday,...,x81_December,x81_February,x81_January,x81_July,x81_June,x81_March,x81_May,x81_November,x81_October,x81_September
0,0.95316,-1.692562,1.642021,-2.190166,0.198172,0.080677,0.449576,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,-0.957358,-0.011565,-1.602544,1.641571,0.579915,0.7454,-0.335268,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.947607,1.689795,0.034213,-0.770223,0.538696,1.324182,-0.972141,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,-0.777859,-0.823031,0.337922,0.770809,-0.271951,1.901067,-1.197429,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1.122098,1.572476,-1.052698,0.405244,1.977814,1.419279,0.218373,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [147]:
'''
Rearranges the variable columns into the proper order as required by the pre-existing 
    and pre-trained model
'''

final_variable_column_order = ['x5_saturday', 
                               'x81_July', 
                               'x81_December', 
                               'x31_japan', 
                               'x81_October', 
                               'x5_sunday', 
                               'x31_asia', 
                               'x81_February', 
                               'x91', 
                               'x81_May', 
                               'x5_monday', 
                               'x81_September', 
                               'x81_March', 
                               'x53', 
                               'x81_November', 
                               'x44', 
                               'x81_June', 
                               'x12', 
                               'x5_tuesday', 
                               'x81_August', 
                               'x81_January', 
                               'x62', 
                               'x31_germany', 
                               'x58', 
                               'x56']

final_dataset = scaled_standardized_imputed_sample_testing_data[final_variable_column_order]

In [148]:
final_dataset.head()

Unnamed: 0,x5_saturday,x81_July,x81_December,x31_japan,x81_October,x5_sunday,x31_asia,x81_February,x91,x81_May,...,x44,x81_June,x12,x5_tuesday,x81_August,x81_January,x62,x31_germany,x58,x56
0,0,0,0,0,1,0,0,0,0.449576,0,...,-1.692562,0,0.95316,0,0,0,0.080677,1,0.198172,-2.190166
1,0,0,0,0,0,0,0,0,-0.335268,0,...,-0.011565,0,-0.957358,1,0,0,0.7454,0,0.579915,1.641571
2,0,1,0,0,0,0,0,0,-0.972141,0,...,1.689795,0,0.947607,1,0,0,1.324182,1,0.538696,-0.770223
3,0,0,0,0,1,0,0,0,-1.197429,0,...,-0.823031,0,-0.777859,0,0,0,1.901067,1,-0.271951,0.770809
4,1,0,0,0,0,0,0,0,0.218373,1,...,1.572476,0,1.122098,0,0,0,1.419279,0,1.977814,0.405244


In [145]:
alphabetically_sorted_variable_columns = sorted(final_variable_column_order)

In [146]:
print(alphabetically_sorted_variable_columns)

['x12', 'x31_asia', 'x31_germany', 'x31_japan', 'x44', 'x53', 'x56', 'x58', 'x5_monday', 'x5_saturday', 'x5_sunday', 'x5_tuesday', 'x62', 'x81_August', 'x81_December', 'x81_February', 'x81_January', 'x81_July', 'x81_June', 'x81_March', 'x81_May', 'x81_November', 'x81_October', 'x81_September', 'x91']


In [149]:
length_of_dataframe = final_dataset.shape[0]

In [150]:
print(length_of_dataframe)

10000


In [154]:
for row in range(1):
    predicted_outcome = 0
    model_inputs = {}
    
    for variable in alphabetically_sorted_variable_columns:
        model_inputs[variable] = final_dataset.iloc[row][variable]
    
    predicted_probability = model.predict(final_dataset.iloc[row])[0]
    
    if predicted_probability >= 0.75:
        predicted_outcome = 1
    
    model_predictions = {'business_outcome': str(predicted_outcome), 'p_hat': str(predicted_probability)}
    
    output = model_predictions | model_inputs
    
    print(json.dumps(output))

{"business_outcome": "0", "p_hat": "0.3678794702940152", "x12": 0.9531599637548858, "x31_asia": 0.0, "x31_germany": 1.0, "x31_japan": 0.0, "x44": -1.6925617919945477, "x53": 1.6420211519912549, "x56": -2.1901657838528488, "x58": 0.1981719865158904, "x5_monday": 1.0, "x5_saturday": 0.0, "x5_sunday": 0.0, "x5_tuesday": 0.0, "x62": 0.08067698890615446, "x81_August": 0.0, "x81_December": 0.0, "x81_February": 0.0, "x81_January": 0.0, "x81_July": 0.0, "x81_June": 0.0, "x81_March": 0.0, "x81_May": 0.0, "x81_November": 0.0, "x81_October": 1.0, "x81_September": 0.0, "x91": 0.44957587712936303}
