# modeling

In [1]:

import pandas as pd
import wrangle as wr
import summaries as s
from importlib import reload
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
import statsmodels.api as sm
from math import sqrt
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("seaborn-whitegrid")
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
%config InlineBackend.figure_format = 'retina'

pd.options.display.float_format = '{:,.2f}'.format

In [2]:
# Load the cleaned order data through the project helper, then build the
# summary frame and add date features to it in a single expression.
# NOTE(review): the meaning of start2018=False is defined in wrangle.py —
# presumably it keeps orders placed before 2018; confirm.
df = wr.get_clean_data(start2018=False)
sdf = wr.add_date_features(s.get_summary_df(df))

In [3]:
# Dimensions (rows, columns) of the cleaned frame.
df.shape

(372837, 20)

In [8]:
# Number of rows per calendar year. Grouping on the index's vectorized
# `.year` accessor avoids calling a Python lambda once per index label.
df.groupby(df.index.year).size()

order_date
2015    20596
2016    39198
2017    51157
2018    55843
2019    63452
2020    66299
2021    49931
2022    26361
dtype: int64

### Split for modeling

In [9]:
# Two-way split via the project helper; `test` is divided again into
# validate/test in the next cell.
# NOTE(review): the split rule lives in wrangle.split_data and is not visible
# here — presumably chronological; confirm.
train, test = wr.split_data(df)

In [10]:
# Carve the held-out data into validate (through June 2022) and test
# (July 2022 onward) using partial-string date slicing on the index.
# NOTE: this rebinds `test`, so the cell is not idempotent — running it a
# second time slices the already-truncated `test` and leaves `validate` empty.
# Re-run the split cell above before re-running this one.
validate, test = test.loc[:'2022-06'].copy(), test.loc['2022-07':]

In [11]:
# Boundary check: last validate row next to the first test row.
pd.concat((validate.iloc[-1:], test.iloc[:1]))

Unnamed: 0_level_0,customer_name,customer_type,customer_city,reseller_name,reseller_city,customer_zip,order_quantity,unit_price,po_number,shipped_date,order_date_copy,month_name,day_name,year,quarter,month,week,day_of_week,day_of_year,purchase_amount
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2022-06-30,Texas Parks and Wildlife Department,State Agency,Austin,"Presidio Networked Solutions Group, LLC",Irving,78744.0,8.0,50.12,20520,2022-07-19,2022-06-30,June,Thursday,2022,2,6,26,3,181,400.96
2022-07-01,City of Cleburne,Local Government,Cleburne,"Sequel Data Systems, Inc.",Austin,76031.0,1.0,432.9,0138-06302022-01,2022-07-06,2022-07-01,July,Friday,2022,3,7,26,4,182,432.9


In [12]:
# Aggregate each split to one row per calendar day: the summed
# purchase_amount, kept as a single-column DataFrame.
X_train, X_validate, X_test = (
    split.purchase_amount.resample('D').sum().to_frame()
    for split in (train, validate, test)
)

In [None]:
# Work on a copy so X_train itself is left untouched, then plot the copy
# for a quick visual check.
ts = X_train.copy()
ts.plot()

In [None]:
# Mean of the daily purchase_amount totals over the training window.
baseline = X_train.purchase_amount.mean()

In [None]:
# Plot the three daily splits on one axis to confirm they are contiguous in
# time. `col` was previously used without being defined, which raised a
# NameError on a fresh kernel; it is the single target column of these frames.
col = 'purchase_amount'
plt.figure(figsize=(14,8))
plt.plot(X_train[col], label='train')
plt.plot(X_validate[col], label='validate')
plt.plot(X_test[col], label='test')
plt.ylabel(col)
plt.title(col)
plt.legend()
plt.show()


### evaluate

In [None]:
def evaluate(target_var):
    '''
    Compute the RMSE of the current predictions for one target variable.

    Compares the daily actuals in X_validate[target_var] with the predicted
    values in yhat_df[target_var]. yhat_df is built on X_validate's daily
    index, so the daily frame (not the transaction-level `validate` frame,
    which has a different number of rows) is the matching set of actuals.

    Parameters
    ----------
    target_var : str
        Column name present in both X_validate and yhat_df.

    Returns
    -------
    float
        Root mean squared error, rounded to 0 decimal places.
    '''
    actual = X_validate[target_var]
    predicted = yhat_df[target_var]
    rmse = round(sqrt(mean_squared_error(actual, predicted)), 0)
    return rmse

In [None]:
def plot_and_eval(target_var):
    '''
    Plot train, validate and predicted values for one target variable and
    print the RMSE of the predictions.

    The daily frames X_train and X_validate are plotted so the actuals are
    at the same daily granularity as the predictions held in yhat_df.

    Parameters
    ----------
    target_var : str
        Column name present in X_train, X_validate and yhat_df.

    Returns
    -------
    None
        The figure is shown and the RMSE is printed as side effects.
    '''
    plt.figure(figsize = (12,4))
    plt.plot(X_train[target_var], label='Train', linewidth=1)
    plt.plot(X_validate[target_var], label='Validate', linewidth=1)
    plt.plot(yhat_df[target_var], label='Prediction')
    plt.title(target_var)
    plt.legend()
    rmse = evaluate(target_var)
    print(target_var, '-- RMSE: {:.0f}'.format(rmse))
    plt.show()

In [None]:
# Empty results table with one row per evaluated model
# (model_type, target_var, rmse); rows are added through append_eval_df.
eval_df = pd.DataFrame(columns=['model_type', 'target_var', 'rmse'])
eval_df


In [None]:
# function to store the rmse so that we can compare
def append_eval_df(model_type, target_var):
    '''
    Return eval_df with one extra row holding the RMSE of the current
    predictions.

    The module-level eval_df is not modified; callers rebind it to the
    returned frame. pd.concat is used because DataFrame.append was removed
    in pandas 2.0.

    Parameters
    ----------
    model_type : str
        Label for the model that produced the current yhat_df.
    target_var : str
        Name of the target column that was evaluated.

    Returns
    -------
    pandas.DataFrame
        A new frame: the existing eval_df rows followed by the new row,
        with the index renumbered from 0.
    '''
    rmse = evaluate(target_var)
    new_row = pd.DataFrame({'model_type': [model_type],
                            'target_var': [target_var],
                            'rmse': [rmse]})
    return pd.concat([eval_df, new_row], ignore_index=True)

### Simple Average

In [None]:
# Simple-average model: the forecast value is the mean of the daily training
# totals, rounded to 2 decimals. make_predictions reads this module-level name.
purchase_amount = round(X_train.purchase_amount.mean(), 2)
purchase_amount

In [None]:
def make_predictions(sales=None, quantity=None):
    '''
    Build a constant-value prediction frame over the validate dates.

    Parameters
    ----------
    sales : float, optional
        Value to predict for every date. When omitted, the module-level
        `purchase_amount` is used, as before.
    quantity : optional
        Unused; kept so existing calls remain valid.

    Returns
    -------
    pandas.DataFrame
        One 'purchase_amount' column holding the constant value, indexed
        like X_validate.
    '''
    value = purchase_amount if sales is None else sales
    # Pass a scalar rather than a one-element list: a scalar is broadcast
    # across the index, while a length-1 list raises a length-mismatch
    # ValueError on current pandas.
    yhat_df = pd.DataFrame({'purchase_amount': value},
                           index=X_validate.index)
    return yhat_df

In [None]:
# Build the constant forecast over the validate dates and preview it.
yhat_df = make_predictions(purchase_amount)
yhat_df.head()

In [None]:
# Mean squared error (not RMSE) of the constant forecast against the daily
# validate totals.
mean_squared_error(X_validate.purchase_amount, yhat_df.purchase_amount)

In [None]:
# Plot the forecast against the actuals and print its RMSE.
plot_and_eval("purchase_amount")

In [None]:
# Record the simple-average RMSE in the comparison table. The label was
# misspelled 'simple_averave', which would break any later lookup by name.
# NOTE: re-running this cell appends a duplicate row.
eval_df = append_eval_df(model_type='simple_average',
                         target_var='purchase_amount')
eval_df

In [None]:
# Dimensions (rows, columns) of the daily training frame.
X_train.shape