In [3]:
# Setup/Needed Imports

from google.cloud import bigquery
from IPython.display import display, HTML
from google.cloud import storage

import pandas as pd
import numpy as np

from shared_libs import data_prep as dp
from shared_libs import model_train as mt

# Name of the Cloud Storage bucket passed to storage_client.bucket() for uploads
BUCKET_NAME = 'python-testing-re'

  from numpy.core.umath_tests import inner1d


In [None]:
# Get the data: pull a sample of Iowa liquor sales from the BigQuery public
# dataset and replace the `date` column with year / month / day columns.

client = bigquery.Client(location='US')
print("Client creating using default project: {}".format(client.project))

# NOTE(review): LIMIT without ORDER BY does not guarantee which rows come
# back, so the sample may differ between runs.
query = """
    SELECT sale_dollars,
           city,
           county_number,
           category,
           store_number,
           item_number,
           date
      FROM `bigquery-public-data.iowa_liquor_sales.sales`
     LIMIT 1000000
"""

query_job = client.query(
    query,
    location="US",
)

df = query_job.to_dataframe()

# Parse the date column once and reuse it for every derived column instead
# of rebuilding a DatetimeIndex for each of year, month and day.
sale_dates = pd.DatetimeIndex(df['date'])
df['year'] = sale_dates.year
df['month'] = sale_dates.month
df['day'] = sale_dates.day

# The raw date is no longer needed once its components are extracted
df = df.drop(columns=['date'])

print('Length of dataframe:')
print(len(df))

In [None]:
# Categorical columns to encode
cat_vars = [
    'city',
    'category',
    'county_number',
    'store_number',
    'item_number'
]

# Create dataframe with encoded categorical variables, plus the mapping
# frame returned alongside it by dp.category_columns
df_cat, df_mapping = dp.category_columns(
    df,
    cat_vars
)

# Round the sale_dollars column to 2 decimal places
df_cat = df_cat.round({'sale_dollars': 2})

print('Sample of Iowa Liquor Sales Data: ')
display(HTML(df_cat.head().to_html()))


print('Sample of Mapping Data: ')
display(df_mapping.head())

# Single source of truth for the mapping file name (used for both the local
# file and the blob name in the bucket)
mapping_filename = 'categorical_mapping.hdf'

# Save categorical mapping file.
# `key` is passed by keyword: positional use is deprecated in pandas 2.1+
# and becomes keyword-only in pandas 3.0.
df_mapping.to_hdf(
    mapping_filename,
    key='df_cat_map',
    format='table',
    mode='w'
)

# Upload the mapping file to the Cloud Storage bucket
storage_client = storage.Client()

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(mapping_filename)
blob.upload_from_filename(mapping_filename)

In [None]:
# Split the encoded data into train and test partitions on the 'month' column.
# The trailing 3 is handed to mt.split_train_test as-is (presumably the number
# of months held out for testing — confirm against the helper).
df_train_rfr, df_test_rfr = mt.split_train_test(df_cat, 'month', 3)

# Report the size of each partition
train_rows, test_rows = len(df_train_rfr), len(df_test_rfr)
print('Length of Training Data: ', train_rows)
print('Length of Test Data: ', test_rows)

# Preview the first rows of both partitions as HTML tables (train, then test)
for partition in (df_train_rfr, df_test_rfr):
    display(HTML(partition.head().to_html()))

# Summary statistics for the training partition
display(df_train_rfr.describe())

In [None]:
import pickle

# Set variable we are predicting for and predictors
y_col = 'sale_dollars'
x_cols = [
    'city_enc',
    'county_number_enc',
    'category_enc',
    'store_number_enc',
    'item_number_enc',
    'year',
    'month',
    'day'
]

# Create model object and importances from the training partition
rfr_model, importances = mt.fit_model(
    y_col,
    x_cols,
    df_train_rfr,
    trees=150,
    leaves=5
)

# Local file the fitted model is pickled to before upload
model_filename = 'model_test.unicorn'

print('Save Model')
# Use a context manager so the file is flushed and closed before it is
# uploaded below, rather than relying on garbage collection of an anonymous
# file object.
with open(model_filename, 'wb') as model_file:
    pickle.dump(rfr_model, model_file)

print('Model saved')

# Upload the pickled model; `bucket` is the handle created in the cell that
# uploads the categorical mapping.
blob = bucket.blob(model_filename)
blob.upload_from_filename(model_filename)

print('Model saved to bucket')

In [None]:
# Run the fitted model against the held-out test partition
df_results = mt.model_predict(
    rfr_model,
    df_test_rfr,
    y_col,
    x_cols
)

file_name = 'model_test_results.hdf'

# Save results file.
# `key` is passed by keyword: positional use is deprecated in pandas 2.1+
# and becomes keyword-only in pandas 3.0.
df_results.to_hdf(
    file_name,
    key='df_results',
    format='table',
    mode='w'
)

# Upload the results file to the Cloud Storage bucket
storage_client = storage.Client()

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(file_name)
blob.upload_from_filename(file_name)

In [None]:
# Preview the first rows of the prediction results as an HTML table
results_preview = df_results.head()
display(HTML(results_preview.to_html()))