In [1]:
# !pip install --user -r requirements.txt

In [16]:
# Setup/Needed Imports
import os

from google.cloud import bigquery
from IPython.display import display, HTML
from google.cloud import storage

import pandas as pd
import numpy as np

import modeling.data_prep as dp
import modeling.model_train as mt

bucket_path = os.path.expanduser('~/bucket.txt')
with open(bucket_path) as f:
    BUCKET_NAME = f.read().strip()
    
project_path = os.path.expanduser('~/project.txt')
with open(project_path) as f:
    PROJECT_NAME = f.read().strip()


In [17]:
# Get the data
client = bigquery.Client()

query = """
    SELECT sale_dollars,
           city,
           county_number,
           category,
           store_number,
           item_number,
           date
      FROM `bigquery-public-data.iowa_liquor_sales.sales`
     LIMIT 1000
"""

query_job = client.query(
    query
)

df = query_job.to_dataframe()

df['year'] = pd.DatetimeIndex(df['date']).year
df['month'] = pd.DatetimeIndex(df['date']).month
df['day'] = pd.DatetimeIndex(df['date']).day

df = df.drop(['date'], axis=1)

print('Length of dataframe:')
print(len(df))

Length of dataframe:
1000


In [18]:
# Categorical Vars to encode
cat_vars = [
    'city',
    'category',
    'county_number',
    'store_number',
    'item_number'
]

# Create dataframe with encoded categorical variables
df_cat, df_mapping = dp.category_columns(
    df,
    cat_vars
)

# Round sales_dollar column
df_cat = df_cat.round({'sale_dollars': 2})

print('Sample of Iowa Liquor Sales Data: ')
display(HTML(df_cat.head().to_html()))


print('Sample of Mapping Data: ')
display(df_mapping.head())

# Save categorical mapping file
df_mapping.to_hdf(
    'categorical_mapping.hdf',
    'df_cat_map',
    format='table',
    mode='w'
)

# Save mapping to storage
storage_client = storage.Client()

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob('ai_platform_test/iowa_forecasting_testing/categorical_mapping.hdf')
blob.upload_from_filename('categorical_mapping.hdf')

Sample of Iowa Liquor Sales Data: 


Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
0,265.56,Indianola,91,1701100,3651,100037,2016,11,15,0,0,0,0,0
1,269.46,WEST DES MOINES,77,1701100,3806,424,2013,10,4,1,0,1,1,1
2,701.76,Des Moines,77,1701100,5145,2140,2017,10,13,2,0,1,2,2
3,132.0,Fort Dodge,94,1701100,4702,100353,2017,10,18,3,0,2,3,3
4,488.52,BETTENDORF,48,1011500,3838,27102,2013,12,2,4,1,3,4,4


Sample of Mapping Data: 


Unnamed: 0,city,category,county_number,store_number,item_number,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
0,Indianola,1701100,91,3651,100037,0,0,0,0,0
1,WEST DES MOINES,1701100,77,3806,424,1,0,1,1,1
2,Des Moines,1701100,77,5145,2140,2,0,1,2,2
3,Fort Dodge,1701100,94,4702,100353,3,0,2,3,3
4,BETTENDORF,1011500,48,3838,27102,4,1,3,4,4


In [19]:
# Split data into test and train with 3 months of test data
df_train_rfr, df_test_rfr = mt.split_train_test(
    df_cat,
    'month',
    3
)

print('Length of Training Data: ', len(df_train_rfr))
print('Length of Test Data: ', len(df_test_rfr))

display(HTML(df_train_rfr.head().to_html()))
display(HTML(df_test_rfr.head().to_html()))

display(df_train_rfr.describe())

Length of Training Data:  505
Length of Test Data:  495


Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
7,161.28,Davenport,82,1901200,2614,941294,2019,5,20,7,4,6,7,7
8,238.44,DeWitt,23,1701100,5298,22219,2019,5,20,8,0,7,8,8
9,522.72,DAVENPORT,82,1081400,3354,86251,2015,4,22,9,5,6,9,9
11,317.76,DES MOINES,77,1081400,2633,86251,2014,3,10,10,5,1,11,9
16,1727.04,West Des Moines,77,1081100,2619,67526,2016,9,6,14,2,1,16,15


Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
0,265.56,Indianola,91,1701100,3651,100037,2016,11,15,0,0,0,0,0
1,269.46,WEST DES MOINES,77,1701100,3806,424,2013,10,4,1,0,1,1,1
2,701.76,Des Moines,77,1701100,5145,2140,2017,10,13,2,0,1,2,2
3,132.0,Fort Dodge,94,1701100,4702,100353,2017,10,18,3,0,2,3,3
4,488.52,BETTENDORF,48,1011500,3838,27102,2013,12,2,4,1,3,4,4


Unnamed: 0,sale_dollars,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc
count,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0
mean,584.828871,2016.267327,5.487129,18.051485,53.407921,11.164356,18.293069,153.968317,101.861386
std,1268.672713,2.250622,2.439004,9.598142,51.947584,10.82173,20.054619,125.310626,91.674224
min,1.34,2012.0,1.0,1.0,0.0,0.0,0.0,5.0,4.0
25%,46.5,2015.0,4.0,9.0,12.0,2.0,4.0,44.0,27.0
50%,174.24,2016.0,6.0,18.0,33.0,9.0,9.0,112.0,65.0
75%,614.34,2019.0,8.0,28.0,81.0,16.0,27.0,254.0,159.0
max,17172.0,2019.0,9.0,31.0,215.0,52.0,90.0,442.0,336.0


In [20]:
import pickle

# Set variable we are predicting for and predictors
y_col = 'sale_dollars'
x_cols = [
    'city_enc',
    'county_number_enc',
    'category_enc',
    'store_number_enc',
    'item_number_enc',
    'year',
    'month',
    'day'
]

# Create model object and importances
rfr_model, importances = mt.fit_model(
    y_col,
    x_cols,
    df_train_rfr,
    trees=150,
    leaves=5
)

# Save model object to shared location on edge node
print('Save Model')
pickle.dump(
    rfr_model,
    open('model_test.pkl', 'wb')
)

print('Model saved')

blob = bucket.blob('ai_platform_test/iowa_forecasting_testing/model_test.pkl')
blob.upload_from_filename('model_test.pkl')

print('Model saved to bucket')

Save Model
Model saved
Model saved to bucket


In [21]:
df_results = mt.model_predict(
    rfr_model,
    df_test_rfr,
    y_col,
    x_cols
)

# Save results file
df_results.to_hdf(
    'model_test_results.hdf',
    'df_results',
    format='table',
    mode='w'
)

# Save mapping to storage
storage_client = storage.Client()

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob('ai_platform_test/iowa_forecasting_testing/model_test_results.hdf')
blob.upload_from_filename('model_test_results.hdf')





In [22]:
display(HTML(df_results.head().to_html()))

Unnamed: 0,sale_dollars,city,county_number,category,store_number,item_number,year,month,day,city_enc,category_enc,county_number_enc,store_number_enc,item_number_enc,predicted_sale_dollars
0,265.56,Indianola,91,1701100,3651,100037,2016,11,15,0,0,0,0,0,1722.707491
1,269.46,WEST DES MOINES,77,1701100,3806,424,2013,10,4,1,0,1,1,1,2602.377212
2,701.76,Des Moines,77,1701100,5145,2140,2017,10,13,2,0,1,2,2,1700.52647
3,132.0,Fort Dodge,94,1701100,4702,100353,2017,10,18,3,0,2,3,3,1633.175991
4,488.52,BETTENDORF,48,1011500,3838,27102,2013,12,2,4,1,3,4,4,2542.248551


In [23]:
#Save results to BigQuery
table_id = 'ai_platform_test.df_results_table'
project_id = PROJECT_NAME
df_results.to_gbq(table_id, project_id)


1it [00:03,  3.33s/it]


TableCreationError: Could not create the table because it already exists. Change the if_exists parameter to 'append' or 'replace' data.