In [1]:
# Imports required libraries
import json
import numpy as np
import os
import pandas as pd
import statsmodels.api as sm
import uvicorn

from fastapi import FastAPI

from pydantic import BaseModel

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [123]:
!pip3 list

Package                                           Version
------------------------------------------------- ---------
absl-py                                           1.1.0
aiohttp                                           3.8.1
aiosignal                                         1.2.0
ale-py                                            0.7.5
analytics-python                                  1.4.0
anyio                                             3.6.1
appnope                                           0.1.3
argon2-cffi                                       21.3.0
argon2-cffi-bindings                              21.2.0
asgiref                                           3.5.2
asttokens                                         2.0.5
astunparse                                        1.6.3
async-timeout                                     4.0.2
attrs                                             21.4.0
AutoROM                                           0.4.2
AutoROM.accept-rom-lic

In [2]:
# FastAPI application object; the route decorators in this notebook register against it
app = FastAPI(
    title="GLM Model Application",
    description="API Application for GLM Model",
    version="1.0",
)

In [3]:
# Index route, registered at the application root ('/'); reachable at
# http://127.0.0.1:1313 when the app is served on that host and port
@app.get('/')
def index():
    """Return a JSON-encoded string confirming that the application is reachable.

    NOTE(review): FastAPI serializes return values itself, so returning an
    already-dumped string presumably produces a doubly-encoded response body;
    confirm whether clients rely on that before changing it.
    """
    payload = {'message': 'Successfully connected to GLM Model Application.'}
    return json.dumps(payload)

In [4]:
# Prediction route (POST /predict); placeholder that is not implemented yet
@app.post('/predict')
def predict():
    """Placeholder for the prediction endpoint; performs no work and returns None."""
    return None

In [5]:
# Starts a uvicorn server that hosts the FastAPI application object (currently commented out)
#if __name__ == '__main__':
#    uvicorn.run(app, host = '127.0.0.1', port = 1313)

In [128]:
def load_model(model_path='../models/glm_final_model.pickle'):
    """Load a previously saved statsmodels results object from disk.

    Parameters
    ----------
    model_path : str, optional
        Location of the pickled model. Defaults to the path this notebook has
        always used, so existing ``load_model()`` calls behave as before.

    Returns
    -------
    The object returned by ``sm.load`` for ``model_path``.
    """
    # NOTE(review): sm.load unpickles the file, which can execute arbitrary
    # code; only point this at model files from a trusted source.
    return sm.load(model_path)

In [7]:
def transform_json_to_df(json):
    """Convert records-oriented JSON into a pandas DataFrame.

    Parameters
    ----------
    json : str or file-like
        Either a literal JSON document (e.g. ``'[{"a": 1}, ...]'``) or anything
        else ``pd.read_json`` accepts, such as a path or an open file.
        The parameter name shadows the ``json`` module inside this function only.

    Returns
    -------
    pandas.DataFrame
        One row per record.
    """
    # Local import so the function stays self-contained
    from io import StringIO

    source = json

    # Newer pandas versions deprecate passing a literal JSON string directly to
    # read_json; wrapping it in a buffer works on old and new versions alike.
    # Only strings that look like a JSON document are wrapped, so paths/URLs
    # are still handed to pandas unchanged.
    if isinstance(json, str) and json.lstrip()[:1] in ('[', '{'):
        source = StringIO(json)

    return pd.read_json(source, orient='records')

In [77]:
def drop_df_rows_missing_data(df):
    """Return a new DataFrame without the rows of ``df`` that contain any missing value."""
    return df.dropna()

In [8]:
def format_df_column_variables(df):
    """Convert the text-formatted 'x12' and 'x63' columns to floats.

    'x12' holds monetary strings: '$' and ',' are stripped and a leading '('
    (with its closing ')') is turned into a minus sign, e.g. '($1,234.50)'
    becomes -1234.5. 'x63' holds percentage strings: the '%' is stripped,
    e.g. '12.5%' becomes 12.5.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 'x12' and 'x63' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose 'x12' and 'x63' columns are floats.
    """
    # Work on a copy so the caller's frame is left untouched and the function
    # can safely be re-run on the same input
    df = df.copy()

    money = df['x12']
    # A column that is already numeric (e.g. every value was null in the input)
    # has no .str accessor, so the text clean-up is skipped for it
    if not pd.api.types.is_numeric_dtype(money):
        # regex=False: '$', '(' and ')' are regular-expression metacharacters
        # and must be treated as literal characters here
        for old, new in (('$', ''), (',', ''), (')', ''), ('(', '-')):
            money = money.str.replace(old, new, regex=False)
    df['x12'] = money.astype(float)

    percent = df['x63']
    if not pd.api.types.is_numeric_dtype(percent):
        percent = percent.str.replace('%', '', regex=False)
    df['x63'] = percent.astype(float)

    return df

In [121]:
def impute_missing_df_data(df, categorical_columns=('x5', 'x31', 'x81', 'x82')):
    """Drop the categorical columns and fill missing values in the remaining ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain every column named in ``categorical_columns``.
    categorical_columns : sequence of str, optional
        Columns removed before imputation. Defaults to the four columns this
        notebook has always excluded.

    Returns
    -------
    pandas.DataFrame
        The remaining columns with missing values filled: with 0 when ``df``
        has at most one row (a column mean is not meaningful there), otherwise
        with each column's mean via ``SimpleImputer``. In the mean-imputed case
        the result carries a fresh default index.
    """
    # Drop once and reuse, instead of recomputing the same frame for the data
    # and again for its column labels
    numeric_df = df.drop(columns=list(categorical_columns))

    if numeric_df.shape[0] <= 1:
        return numeric_df.fillna(0)

    # Mean imputation of every missing value in the remaining columns
    # NOTE(review): SimpleImputer may discard columns that are entirely missing,
    # which would make the column labels below mismatch -- confirm such input
    # cannot occur.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    return pd.DataFrame(imputer.fit_transform(numeric_df),
                        columns=numeric_df.columns)

In [10]:
def scale_df_data(df):
    """Standardize every column of ``df`` with a StandardScaler.

    The scaler is fitted on ``df`` itself and then applied to it. The original
    motivation noted in this notebook is the monetary value column, but all
    columns are scaled. The returned frame keeps the column labels of ``df``
    and carries a fresh default index.
    """
    scaled_values = StandardScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns)

In [36]:
def create_df_dummy_column_variables_new(df1, df2):
    """Append dummy (indicator) columns for the categorical variables to ``df2``.

    For each of 'x5', 'x31', 'x81' and 'x82', dummy columns are generated from
    ``df1`` (first category dropped, an extra column for missing values
    included, names prefixed with '<column>_') and concatenated column-wise
    to the right of ``df2``.

    Returns the widened frame; neither input is modified.
    """
    dummy_frames = [
        pd.get_dummies(df1[column],
                       prefix=column,
                       prefix_sep='_',
                       drop_first=True,
                       dummy_na=True)
        for column in ('x5', 'x31', 'x81', 'x82')
    ]

    # Single column-wise concatenation: df2 first, then the dummies in column order
    return pd.concat([df2, *dummy_frames], axis=1, sort=False)

In [37]:
def create_df_dummy_column_variables_old(df1, df2):
    """Append dummy (indicator) columns for 'x5', 'x31', 'x81' and 'x82' to ``df2``.

    The dummies are built from ``df1`` one column at a time (first category
    dropped, a column for missing values included, names prefixed with
    '<column>_') and each set is concatenated column-wise onto the running
    result, in the order x5, x31, x81, x82.

    Returns the widened frame; neither input is modified.
    """
    combined = df2

    for column in ('x5', 'x31', 'x81', 'x82'):
        dummies = pd.get_dummies(df1[column],
                                 prefix=column,
                                 prefix_sep='_',
                                 drop_first=True,
                                 dummy_na=True)
        combined = pd.concat([combined, dummies], axis=1, sort=False)

    return combined

In [98]:
def filter_df_column_variables(df, ordr_clmn_names_lst):
    """Select the model's input columns from ``df``, in the given order.

    Only the columns previously determined to be the most useful during the
    creation of the prediction model are kept. Depending on the type and
    amount of data passed in, not every dummy variable is always generated,
    so any requested column that is absent is added as a zero-filled column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding (a subset of) the required columns.
    ordr_clmn_names_lst : list of str
        Required column names, in the order the result must have.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly ``ordr_clmn_names_lst`` as columns and the
        same rows (and index) as ``df``.
    """
    available_columns = set(df.columns)
    missing_columns = [name for name in ordr_clmn_names_lst
                       if name not in available_columns]

    if missing_columns:
        # Add the absent columns directly on df's own index so no spurious rows
        # appear when that index is not 0..n-1. As before, any NaN left in the
        # frame is zero-filled whenever columns had to be added.
        df = df.assign(**{name: 0.0 for name in missing_columns}).fillna(0)

    return df[ordr_clmn_names_lst].copy(deep=True)

In [119]:
def extract_transform_input_data_pipeline(json_data, ordr_clmn_names_lst):
    """Turn raw JSON records into the frame of model input columns.

    Stages, in order: JSON -> DataFrame, string column formatting, imputation,
    scaling, dummy-variable creation, and selection of ``ordr_clmn_names_lst``.
    Progress (shape and frame previews) is printed along the way.

    Returns an empty DataFrame when the JSON holds no rows; otherwise the
    filtered frame whose columns are ``ordr_clmn_names_lst``.
    """
    raw_df = transform_json_to_df(json_data)
    print(raw_df.shape)

    # Nothing to transform: hand back an empty frame
    if raw_df.shape[0] < 1:
        return pd.DataFrame()

    formatted_df = format_df_column_variables(raw_df)
    print('Format')
    print(formatted_df.head())

    imputed_df = impute_missing_df_data(formatted_df)
    print('Impute')
    print(imputed_df.head())

    scaled_imputed_df = scale_df_data(imputed_df)

    # Dummies come from the formatted frame (it still has the categorical
    # columns) and are attached to the scaled numeric frame
    with_dummies_df = create_df_dummy_column_variables_old(formatted_df,
                                                           scaled_imputed_df)

    filtered_df = filter_df_column_variables(with_dummies_df, ordr_clmn_names_lst)
    print('Filter')
    print(filtered_df.head())
    print(filtered_df.columns)

    return filtered_df

In [81]:
def predict_outcome(df, model, alphanum_ord_clmn_var_names_lst, threshold=0.75):
    """Run the model on every row of ``df`` and return the results as JSON.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared model inputs, one row per record to score.
    model : object
        Anything exposing ``predict(row)`` whose result can be indexed with
        ``[0]`` to obtain the predicted probability for that row.
    alphanum_ord_clmn_var_names_lst : list of str
        Column names whose values are echoed back in the output, in this order.
    threshold : float, optional
        Probability at or above which ``business_outcome`` is 1 (default 0.75).

    Returns
    -------
    str
        JSON text. ``{"message": "Error"}`` when ``df`` has no rows; a single
        object when ``df`` has exactly one row; otherwise an array with one
        object per row. Each object holds 'business_outcome', 'p_hat' (both as
        strings) followed by the echoed input values.
    """
    num_rows_df = df.shape[0]

    if num_rows_df == 0:
        return json.dumps({'message': 'Error'})

    row_outputs = []

    for row in range(num_rows_df):
        row_values = df.iloc[row]

        model_inputs = {}
        for var in alphanum_ord_clmn_var_names_lst:
            value = row_values[var]
            # numpy scalars (e.g. numpy booleans/integers) are not all JSON
            # serializable; .item() converts them to plain Python values
            model_inputs[var] = value.item() if hasattr(value, 'item') else value

        predicted_probability = model.predict(row_values)[0]
        predicted_outcome = 1 if predicted_probability >= threshold else 0

        model_predictions = {'business_outcome': str(predicted_outcome),
                             'p_hat': str(predicted_probability)}

        # Predictions first, then the echoed inputs
        row_outputs.append({**model_predictions, **model_inputs})

    # Collect every row before returning; returning inside the loop would
    # report only the first row. A single row keeps the single-object shape.
    if len(row_outputs) == 1:
        return json.dumps(row_outputs[0])

    return json.dumps(row_outputs)

In [130]:
def main(json_data=None):
    """Run the full load -> transform -> predict flow and print the result.

    Parameters
    ----------
    json_data : str, optional
        Records-oriented JSON to score. When omitted, the notebook-level
        ``sample_json1`` is used, which must already be defined when this
        function is called.

    Prints the first 1000 characters of the JSON output; returns None.
    """
    # Named json_data rather than json so the json module is not shadowed
    if json_data is None:
        json_data = sample_json1

    # Model input columns, in the order passed to the filtering stage
    final_df_column_variable_names_order = [
        'x5_saturday', 'x81_July', 'x81_December', 'x31_japan', 'x81_October',
        'x5_sunday', 'x31_asia', 'x81_February', 'x91', 'x81_May',
        'x5_monday', 'x81_September', 'x81_March', 'x53', 'x81_November',
        'x44', 'x81_June', 'x12', 'x5_tuesday', 'x81_August',
        'x81_January', 'x62', 'x31_germany', 'x58', 'x56']

    # Same names, sorted, used for the order of the echoed inputs in the output
    alphanumerically_sorted_df_column_variable_names = sorted(final_df_column_variable_names_order)

    mdl = load_model()

    df = extract_transform_input_data_pipeline(json_data,
                                               final_df_column_variable_names_order)

    json_output_message = predict_outcome(df, mdl, alphanumerically_sorted_df_column_variable_names)

    # Truncated so a large payload does not flood the cell output
    print(json_output_message[:1000])

In [124]:
# Test CSV read in full and as 1-, 10-, 100- and 1000-row prefixes
TEST_CSV_PATH = os.path.join('../training', 'exercise_26_test.csv')

raw_testing_data1 = pd.read_csv(TEST_CSV_PATH)
raw_testing_data2 = pd.read_csv(TEST_CSV_PATH, nrows=1)
raw_testing_data3 = pd.read_csv(TEST_CSV_PATH, nrows=10)
raw_testing_data4 = pd.read_csv(TEST_CSV_PATH, nrows=100)
raw_testing_data5 = pd.read_csv(TEST_CSV_PATH, nrows=1000)

# Records-oriented JSON strings built from each frame
sample_json1 = raw_testing_data1.to_json(orient='records')
sample_json2 = raw_testing_data2.to_json(orient='records')
sample_json3 = raw_testing_data3.to_json(orient='records')
sample_json4 = raw_testing_data4.to_json(orient='records')
sample_json5 = raw_testing_data5.to_json(orient='records')

#raw_testing_data1.head()
#raw_testing_data2.head()
#raw_testing_data3.head()
#raw_testing_data4.head()
#raw_testing_data5.head()

In [126]:
# sample_json1 is already a JSON string (produced by DataFrame.to_json), so it is
# written verbatim; json.dump would encode it a second time and store the whole
# payload as one quoted, escaped JSON string instead of an array of records
with open(os.path.join('../testing', 'sample_json1.json'), 'w') as file:
    file.write(sample_json1)

In [131]:
# Runs the whole flow once: loads the model, transforms sample_json1 and prints
# the (truncated) JSON prediction output
main()

(10000, 100)
Format
         x0        x1        x2        x3        x4        x5        x6  \
0  0.042317 -3.344721  4.635124 -0.598396 -0.647772    monday  0.184902   
1 -1.033160 -0.340140  5.871823       NaN  0.122133   tuesday  0.997773   
2  2.029367 -3.239301  4.724436  2.211831  0.551611   tuesday  0.492405   
3 -0.065676  1.892277  4.818741  0.640313  1.944562    friday  0.208718   
4 -0.357126 -1.852161  5.367849 -0.069869 -0.641455  saturday  0.940286   

          x7        x8        x9  ...       x90       x91       x92       x93  \
0  46.690015  3.034132  0.364704  ... -0.493304  0.373853  0.941435  3.546798   
1  51.581411  1.709219  0.844079  ...  0.521119  0.148424  0.925301  3.830426   
2  87.179042  4.333755  0.513789  ...  0.154492 -0.034504  0.904042  3.642968   
3  73.573314  4.929132  0.116004  ...  0.305243 -0.099213  0.712234  3.853489   
4  72.773335       NaN  0.191044  ...  0.617258  0.307445  0.376738  3.306958   

          x94       x95       x96       x9

  df['x12'] = df['x12'].str.replace('$', '')
  df['x12'] = df['x12'].str.replace(')', '')
  df['x12'] = df['x12'].str.replace('(', '-')
