In [1]:
# Setup/Needed Imports

from google.cloud import bigquery
from IPython.display import display, HTML
from google.cloud import storage

import pandas as pd
import numpy as np

import data_prep as dp
import model_train as mt

# Cloud Storage bucket that the cells below use when uploading the
# categorical mapping, the pickled model, and the prediction results.
BUCKET_NAME = 'python-testing-re'

  from numpy.core.umath_tests import inner1d


In [2]:
# Load a 1,000-row sample of Iowa liquor sales from BigQuery and replace the
# sale date with numeric year / month / day columns.

client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

# NOTE(review): LIMIT without ORDER BY does not pin which rows are returned,
# so the sample may differ between runs -- confirm that is acceptable.
query = """
    SELECT sale_dollars,
           city,
           county_number,
           category,
           store_number,
           item_number,
           date
      FROM `bigquery-public-data.iowa_liquor_sales.sales`
     LIMIT 1000
"""

query_job = client.query(query)
df = query_job.to_dataframe()

# Parse the date column once and reuse it for every calendar component.
sale_dates = pd.DatetimeIndex(df['date'])
for date_part in ('year', 'month', 'day'):
    df[date_part] = getattr(sale_dates, date_part)

# The raw date is redundant once its components exist as columns.
df = df.drop(columns=['date'])

print('Length of dataframe:', len(df), sep='\n')

Client creating using default project: data-science-sandbox-d3c168
Length of dataframe:
1000


In [3]:
# Integer-encode the categorical columns, preview the result, and persist the
# category -> code mapping both locally and in Cloud Storage.

# Columns to encode
cat_vars = ['city', 'category', 'county_number', 'store_number', 'item_number']

# Returns the sales frame plus a separate mapping frame; per the recorded
# previews, both carry a `<column>_enc` code for each encoded column.
# NOTE(review): the recorded preview shows 'Des Moines' and 'DES MOINES'
# receiving different city_enc codes -- consider normalizing case upstream.
df_cat, df_mapping = dp.category_columns(df, cat_vars)

# Round the sale_dollars column to two decimal places
df_cat = df_cat.round({'sale_dollars': 2})

print('Sample of Iowa Liquor Sales Data: ')
display(HTML(df_cat.head().to_html()))

print('Sample of Mapping Data: ')
display(df_mapping.head())

# Write the mapping to a local HDF5 file (table format, replacing any
# existing file of the same name).
mapping_file = 'categorical_mapping.hdf'
df_mapping.to_hdf(mapping_file, key='df_cat_map', format='table', mode='w')

# Copy the local mapping file into the bucket
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob('iowa_forecasting_testing/categorical_mapping.hdf')
blob.upload_from_filename(mapping_file)

Sample of Iowa Liquor Sales Data: 


Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
0,1257.6,Des Moines,77,Unknown,2528,73684,2016,11,17,0,0,0,0,0
1,442.8,Mount Vernon,57,1092100,5102,75087,2017,5,1,1,1,1,1,1
2,813.9,URBANDALE,77,1011500,4733,27102,2012,11,29,2,2,0,2,2
3,675.0,WATERLOO,7,1701100,3663,40614,2015,10,12,3,3,2,3,3
4,74.95,DES MOINES,77,1052100,5128,48105,2016,2,24,4,4,0,4,4


Sample of Mapping Data: 


Unnamed: 0,city,category,county_number,store_number,item_number,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
0,Des Moines,Unknown,77,2528,73684,0,0,0,0,0
1,Mount Vernon,1092100,57,5102,75087,1,1,1,1,1
2,URBANDALE,1011500,77,4733,27102,2,2,0,2,2
3,WATERLOO,1701100,7,3663,40614,3,3,2,3,3
4,DES MOINES,1052100,77,5128,48105,4,4,0,4,4


In [4]:
# Partition the encoded data into train and test sets via
# mt.split_train_test, keyed on the 'month' column with a window of 3.
# NOTE(review): 'month' is the calendar month (1-12) taken from the sale date,
# not a chronological period; the recorded describe() output shows training
# months topping out at 9 -- confirm a month-of-year split (rather than the
# most recent 3 months) is what is intended.
df_train_rfr, df_test_rfr = mt.split_train_test(df_cat, 'month', 3)

print('Length of Training Data: ', len(df_train_rfr))
print('Length of Test Data: ', len(df_test_rfr))

# Preview the first rows of each partition, training set first.
for partition in (df_train_rfr, df_test_rfr):
    display(HTML(partition.head().to_html()))

# Summary statistics for the training partition.
display(df_train_rfr.describe())

Length of Training Data:  516
Length of Test Data:  484


Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
1,442.8,Mount Vernon,57,1092100,5102,75087,2017,5,1,1,1,1,1,1
4,74.95,DES MOINES,77,1052100,5128,48105,2016,2,24,4,4,0,4,4
5,49.0,ONAWA,67,1051110,3723,55087,2015,3,11,5,5,3,5,5
6,793.44,DES MOINES,77,1081330,2633,82847,2012,5,24,4,6,0,6,6
9,46.5,Ottumwa,90,1042100,2543,28867,2019,5,8,7,8,5,9,9


Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
0,1257.6,Des Moines,77,Unknown,2528,73684,2016,11,17,0,0,0,0,0
2,813.9,URBANDALE,77,1011500,4733,27102,2012,11,29,2,2,0,2,2
3,675.0,WATERLOO,7,1701100,3663,40614,2015,10,12,3,3,2,3,3
7,26.26,Des Moines,77,1091300,2673,86739,2019,11,13,0,7,0,7,7
8,10.49,Sioux City,97,1042100,2621,28865,2019,11,19,6,8,4,8,8


Unnamed: 0,sale_dollars,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
count,516.0,516.0,516.0,516.0,516.0,516.0,516.0,516.0,516.0
mean,569.567771,2016.306202,5.418605,17.160853,51.591085,12.29845,16.139535,134.618217,92.837209
std,951.256646,2.206619,2.36626,9.562025,52.000125,9.743112,18.333902,114.290023,90.491863
min,1.34,2012.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,46.5,2015.0,4.0,8.0,12.0,3.0,1.0,39.25,18.75
50%,211.29,2016.0,6.0,17.0,36.0,10.0,11.0,108.0,62.0
75%,685.8,2019.0,7.0,26.0,86.0,18.0,25.0,200.5,147.5
max,8100.0,2019.0,9.0,31.0,208.0,47.0,83.0,414.0,335.0


In [5]:
import pickle

# Target column and the feature columns used to predict it.
y_col = 'sale_dollars'
x_cols = [
    'city_enc',
    'county_number_enc',
    'category_enc',
    'store_number_enc',
    'item_number_enc',
    'year',
    'month',
    'day'
]

# Fit on the training partition; mt.fit_model hands back the model object
# together with an `importances` value.
rfr_model, importances = mt.fit_model(
    y_col,
    x_cols,
    df_train_rfr,
    trees=150,
    leaves=5
)

# Serialize the model to a local file. The context manager guarantees the
# handle is flushed and closed before the upload below reads the file back,
# instead of relying on garbage collection to close it.
print('Save Model')
with open('model_test.pkl', 'wb') as model_file:
    pickle.dump(rfr_model, model_file)

print('Model saved')

# `bucket` is the Cloud Storage bucket handle created earlier in the notebook
# when the categorical mapping was uploaded.
blob = bucket.blob('iowa_forecasting_testing/model_test.pkl')
blob.upload_from_filename('model_test.pkl')

print('Model saved to bucket')

Save Model
Model saved
Model saved to bucket


In [6]:
# Score the held-out test partition with the fitted model. Per the recorded
# preview, the returned frame includes a predicted_sale_dollars column.
df_results = mt.model_predict(rfr_model, df_test_rfr, y_col, x_cols)

# Write the predictions to a local HDF5 file (table format, replacing any
# existing file of the same name).
results_file = 'model_test_results.hdf'
df_results.to_hdf(results_file, key='df_results', format='table', mode='w')

# Copy the local results file into the bucket
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob('iowa_forecasting_testing/model_test_results.hdf')
blob.upload_from_filename(results_file)

In [7]:
# Render the first rows of the prediction results as an HTML table.
results_preview_html = df_results.head().to_html()
display(HTML(results_preview_html))

Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc,predicted_sale_dollars
0,1257.6,Des Moines,77,Unknown,2528,73684,2016,11,17,0,0,0,0,0,1026.998163
2,813.9,URBANDALE,77,1011500,4733,27102,2012,11,29,2,2,0,2,2,1324.817773
3,675.0,WATERLOO,7,1701100,3663,40614,2015,10,12,3,3,2,3,3,957.116948
7,26.26,Des Moines,77,1091300,2673,86739,2019,11,13,0,7,0,7,7,1033.139487
8,10.49,Sioux City,97,1042100,2621,28865,2019,11,19,6,8,4,8,8,458.829069
