In [1]:
# !pip install --user -r requirements.txt

In [4]:
# Setup/Needed Imports
import os

from google.cloud import bigquery
from IPython.display import display, HTML
from google.cloud import storage

import pandas as pd
import numpy as np

import data_prep as dp
import model_train as mt

# The destination GCS bucket name is not hard-coded in the notebook; it is read
# from a text file in the user's home directory, with surrounding whitespace removed.
bucket_path = os.path.expanduser('~/bucket.txt')
with open(bucket_path) as bucket_file:
    BUCKET_NAME = bucket_file.read().strip()


In [5]:
# Pull a 1000-row sample of Iowa liquor sales from the BigQuery public dataset.
# NOTE(review): the query has no ORDER BY, so which 1000 rows come back is not
# pinned down by the query itself — confirm this is acceptable for reproducibility.
client = bigquery.Client()

query = """
    SELECT sale_dollars,
           city,
           county_number,
           category,
           store_number,
           item_number,
           date
      FROM `bigquery-public-data.iowa_liquor_sales.sales`
     LIMIT 1000
"""

query_job = client.query(query)
df = query_job.to_dataframe()

# Break the sale date into numeric year / month / day columns (parsing the
# column a single time), then discard the original date column.
sale_dates = pd.DatetimeIndex(df['date'])
df['year'] = sale_dates.year
df['month'] = sale_dates.month
df['day'] = sale_dates.day
df = df.drop(columns=['date'])

print('Length of dataframe:', len(df), sep='\n')

Length of dataframe:
1000


In [6]:
# Categorical columns to encode
cat_vars = [
    'city',
    'category',
    'county_number',
    'store_number',
    'item_number'
]

# Build the frame with encoded categorical columns (the *_enc columns shown in
# the output) together with the mapping between raw values and their codes
df_cat, df_mapping = dp.category_columns(
    df,
    cat_vars
)

# Round the sale_dollars column to two decimal places
df_cat = df_cat.round({'sale_dollars': 2})

print('Sample of Iowa Liquor Sales Data: ')
display(HTML(df_cat.head().to_html()))


print('Sample of Mapping Data: ')
display(df_mapping.head())

# Save categorical mapping file to the working directory.
# `key` is passed by keyword: recent pandas releases deprecate passing any
# to_hdf argument other than the path positionally.
df_mapping.to_hdf(
    'categorical_mapping.hdf',
    key='df_cat_map',
    format='table',
    mode='w'
)

# Upload the mapping file to Cloud Storage.
# NOTE(review): this object path has an 'ai_platform_test/' prefix, while the
# model and results uploads later in this notebook use
# 'iowa_forecasting_testing/...' without it — confirm which location is intended.
storage_client = storage.Client()

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob('ai_platform_test/iowa_forecasting_testing/categorical_mapping.hdf')
blob.upload_from_filename('categorical_mapping.hdf')

Sample of Iowa Liquor Sales Data: 


Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
0,324.54,Mount Vernon,57,1062300,5102,44266,2017,10,26,0,0,0,0,0
1,925.68,BETTENDORF,82,1081317,3952,82637,2012,11,6,1,1,1,1,1
2,91.44,COUNCIL BLUFFS,78,1081312,4312,82787,2013,7,2,2,2,2,2,2
3,132.78,Iowa Falls,42,1701100,4024,183,2016,6,13,3,3,3,3,3
4,992.04,Des Moines,77,1081330,2633,82847,2016,6,16,4,4,4,4,4


Sample of Mapping Data: 


Unnamed: 0,city,category,county_number,store_number,item_number,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
0,Mount Vernon,1062300,57,5102,44266,0,0,0,0,0
1,BETTENDORF,1081317,82,3952,82637,1,1,1,1,1
2,COUNCIL BLUFFS,1081312,78,4312,82787,2,2,2,2,2
3,Iowa Falls,1701100,42,4024,183,3,3,3,3,3
4,Des Moines,1081330,77,2633,82847,4,4,4,4,4


In [7]:
# Partition the encoded data into train and test sets, splitting on the
# 'month' column with a test size of 3.
# (presumably mt.split_train_test treats 3 as a number of months — TODO confirm)
df_train_rfr, df_test_rfr = mt.split_train_test(df_cat, 'month', 3)

print('Length of Training Data: ', len(df_train_rfr))
print('Length of Test Data: ', len(df_test_rfr))

# Preview the first rows of each partition: training first, then test
for partition in (df_train_rfr, df_test_rfr):
    display(HTML(partition.head().to_html()))

# Summary statistics of the training partition
display(df_train_rfr.describe())

Length of Training Data:  533
Length of Test Data:  467


Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
2,91.44,COUNCIL BLUFFS,78,1081312,4312,82787,2013,7,2,2,2,2,2,2
3,132.78,Iowa Falls,42,1701100,4024,183,2016,6,13,3,3,3,3,3
4,992.04,Des Moines,77,1081330,2633,82847,2016,6,16,4,4,4,4,4
5,174.24,Waterloo,7,1081400,2130,86251,2016,6,23,5,5,5,5,5
11,3596.4,Davenport,82,1012300,2614,5153,2018,3,14,10,10,1,11,11


Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
0,324.54,Mount Vernon,57,1062300,5102,44266,2017,10,26,0,0,0,0,0
1,925.68,BETTENDORF,82,1081317,3952,82637,2012,11,6,1,1,1,1,1
6,404.88,Mason City,17,1042100,2515,28088,2018,11,29,6,6,6,6,6
7,13.26,Cresco,45,1062100,5244,45247,2018,12,3,7,7,7,7,7
8,2740.5,WEST DES MOINES,77,1701100,3899,1400,2012,10,26,8,3,4,8,8


Unnamed: 0,sale_dollars,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
count,533.0,533.0,533.0,533.0,533.0,533.0,533.0,533.0,533.0
mean,769.783377,2016.093809,5.30394,17.478424,52.153846,13.033771,15.21576,135.302064,105.958724
std,1986.923378,2.200184,2.386035,9.458159,53.991878,10.362181,18.127068,121.447352,93.257013
min,0.0,2012.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,48.74,2014.0,3.0,9.0,13.0,5.0,4.0,24.0,25.0
50%,252.0,2016.0,6.0,18.0,29.0,10.0,6.0,103.0,81.0
75%,810.0,2018.0,7.0,26.0,74.0,18.0,25.0,225.0,173.0
max,36616.8,2019.0,9.0,31.0,216.0,48.0,84.0,421.0,341.0


In [5]:
import pickle

# Target column to predict and the predictor columns fed to the model
y_col = 'sale_dollars'
x_cols = [
    'city_enc',
    'county_number_enc',
    'category_enc',
    'store_number_enc',
    'item_number_enc',
    'year',
    'month',
    'day'
]

# Fit the model on the training partition; the call also returns importances
rfr_model, importances = mt.fit_model(
    y_col,
    x_cols,
    df_train_rfr,
    trees=150,
    leaves=5
)

# Serialize the fitted model to a file in the working directory.
# The context manager closes (and therefore flushes) the file before the
# upload below reads it; the previous bare open() never closed the handle.
print('Save Model')
with open('model_test.pkl', 'wb') as model_file:
    pickle.dump(rfr_model, model_file)

print('Model saved')

# Upload the pickled model to Cloud Storage. `bucket` is the handle created
# in the cell that uploads the categorical mapping, so that cell must run first.
blob = bucket.blob('iowa_forecasting_testing/model_test.pkl')
blob.upload_from_filename('model_test.pkl')

print('Model saved to bucket')

Save Model
Model saved
Model saved to bucket


In [6]:
# Run the fitted model against the test partition
df_results = mt.model_predict(
    rfr_model,
    df_test_rfr,
    y_col,
    x_cols
)

# Save results file to the working directory.
# `key` is passed by keyword: recent pandas releases deprecate passing any
# to_hdf argument other than the path positionally.
df_results.to_hdf(
    'model_test_results.hdf',
    key='df_results',
    format='table',
    mode='w'
)

# Upload the results file to Cloud Storage
storage_client = storage.Client()

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob('iowa_forecasting_testing/model_test_results.hdf')
blob.upload_from_filename('model_test_results.hdf')

In [7]:
# Render the first rows of the results frame as an HTML table
results_preview_html = df_results.head().to_html()
display(HTML(results_preview_html))

Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc,predicted_sale_dollars
0,1257.6,Des Moines,77,Unknown,2528,73684,2016,11,17,0,0,0,0,0,1026.998163
2,813.9,URBANDALE,77,1011500,4733,27102,2012,11,29,2,2,0,2,2,1324.817773
3,675.0,WATERLOO,7,1701100,3663,40614,2015,10,12,3,3,2,3,3,957.116948
7,26.26,Des Moines,77,1091300,2673,86739,2019,11,13,0,7,0,7,7,1033.139487
8,10.49,Sioux City,97,1042100,2621,28865,2019,11,19,6,8,4,8,8,458.829069
