In [88]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [28]:
# Load the dataset and drop every row that contains at least one missing value.
df = pd.read_csv("harlech_data_proportions.csv").dropna()

In [29]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Year,Poverty,Population,Bottles Sold PC,Sales Volume PC,Volume Sold PC,Vodka Sold PC,Gin Sold PC,...,Pacific Prop,Two+ Prop,Other Prop,HighIncome Prop,LowIncome Prop,MidIncome Prop,Middle-Old Prop,Middle-Young Prop,Old Prop,Young Prop
0,0,2012-01-03,2012,10.6,7466.0,0.045406,0.572024,0.042743,0.005224,0.000268,...,0.0,0.006464,0.001751,0.104878,0.262927,0.632195,0.290708,0.205457,0.391118,0.112716
1,1,2012-01-03,2012,10.2,3910.0,0.0289,0.381381,0.028885,0.004604,0.000767,...,0.0,0.016444,0.0,0.102477,0.222886,0.674637,0.308361,0.197993,0.366555,0.12709
2,2,2012-01-03,2012,9.7,5861.0,0.045897,0.496417,0.040203,0.012967,0.0,...,0.0,0.010223,0.0,0.126021,0.324971,0.549008,0.305332,0.186828,0.402554,0.105287
3,3,2012-01-03,2012,9.6,25829.0,0.004181,0.049647,0.003659,0.001161,0.0,...,0.0,0.010386,0.002519,0.173219,0.200028,0.626754,0.327041,0.246467,0.307464,0.119027
4,4,2012-01-03,2012,15.7,131904.0,0.00097,0.010843,0.000835,0.000273,0.0,...,0.000174,0.021593,0.015072,0.12784,0.284954,0.587206,0.253147,0.240095,0.287378,0.219379


In [30]:
# Parse the date column once and reuse the parsed series to derive the month.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df['Month'] = parsed_dates.dt.month

In [31]:
# Columns that never enter the predictive design matrix, grouped by the
# reason they are excluded.
excluded_columns = (
    # Those we want to remove for all analyses
    ['Unnamed: 0', 'Date']
    # Those we want to remove for the predictive model
    + ['Gin Sold PC', 'Rum Sold PC', 'Whiskey Sold PC', 'Tequila Sold PC', 'Other Alc Sold PC']
    # Targets
    + ['Sales Volume PC', 'Volume Sold PC', 'Bottles Sold PC', 'Vodka Sold PC']
    # Linearly dependent columns
    + ['DOW', 'Young Prop', 'LowIncome Prop', 'Other Prop']
)
predictive_df = df.drop(columns=excluded_columns)

In [32]:
# Inspect one of the target columns that was dropped from predictive_df.
df['Volume Sold PC']

0        0.042743
1        0.028885
2        0.040203
3        0.003659
4        0.000835
           ...   
69438    0.051261
69439    0.021340
69440    0.040369
69441    0.111520
69442    0.096138
Name: Volume Sold PC, Length: 69443, dtype: float64

In [33]:
# Preview the predictor frame after the exclusions above.
predictive_df.head()

Unnamed: 0,Year,Poverty,Population,is_weekend,White Prop,Black Prop,Native American Prop,Asian Prop,Pacific Prop,Two+ Prop,HighIncome Prop,MidIncome Prop,Middle-Old Prop,Middle-Young Prop,Old Prop,Month
0,2012,10.6,7466.0,0,0.984649,0.003636,0.000269,0.003232,0.0,0.006464,0.104878,0.632195,0.290708,0.205457,0.391118,1
1,2012,10.2,3910.0,0,0.973279,0.006937,0.000514,0.002826,0.0,0.016444,0.102477,0.674637,0.308361,0.197993,0.366555,1
2,2012,9.7,5861.0,0,0.976657,0.003237,0.006645,0.003237,0.0,0.010223,0.126021,0.549008,0.305332,0.186828,0.402554,1
3,2012,9.6,25829.0,0,0.977638,0.007015,0.00031,0.002132,0.0,0.010386,0.173219,0.626754,0.327041,0.246467,0.307464,1
4,2012,15.7,131904.0,0,0.852192,0.09155,0.002468,0.016951,0.000174,0.021593,0.12784,0.587206,0.253147,0.240095,0.287378,1


In [34]:
# One-hot encode the year.
# Column names are derived from the values actually present in df['Year']
# (prefix "year_") instead of a hard-coded 2012-2020 range, so a missing or
# additional year can neither mislabel a column nor break a positional rename.
year_dummies = pd.get_dummies(df['Year'].astype(int), prefix='year')
year_columns = list(year_dummies.columns)
predictive_df = pd.concat([predictive_df, year_dummies], axis=1)
# The earliest year (first dummy column) is the reference level; the raw
# 'Year' column is no longer needed once the indicators are attached.
predictive_df = predictive_df.drop(columns=[year_columns[0], 'Year'])

In [35]:
# One-hot encode the month.
# Column names come from the months actually present in df['Month']
# (prefix "month_") rather than a hard-coded 1-12 range, so an absent month
# cannot cause a length mismatch or shift the labels onto the wrong columns.
month_dummies = pd.get_dummies(df['Month'], prefix='month')
month_columns = list(month_dummies.columns)
predictive_df = pd.concat([predictive_df, month_dummies], axis=1)
# The earliest month (first dummy column) is the reference level; the raw
# 'Month' column is dropped alongside it.
predictive_df = predictive_df.drop(columns=[month_columns[0], 'Month'])

In [36]:
# List the final predictor columns. Note that 'Poverty ' carries a trailing
# space in its name, which later cells must reproduce exactly.
predictive_df.columns

Index(['Poverty ', 'Population', 'is_weekend', 'White Prop', 'Black Prop',
       'Native American Prop', 'Asian Prop', 'Pacific Prop', 'Two+ Prop',
       'HighIncome Prop', 'MidIncome Prop', 'Middle-Old Prop',
       'Middle-Young Prop', 'Old Prop', 'year_2013', 'year_2014', 'year_2015',
       'year_2016', 'year_2017', 'year_2018', 'year_2019', 'year_2020',
       'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9', 'month_10', 'month_11', 'month_12'],
      dtype='object')

In [37]:
# Standardise the continuous predictors; every other column (the 0/1
# indicators) is passed through unchanged.
continuous_columns = [
    'Poverty ', 'Population',
    'White Prop', 'Black Prop', 'Native American Prop', 'Asian Prop',
    'Pacific Prop', 'Two+ Prop',
    'HighIncome Prop', 'MidIncome Prop',
    'Middle-Old Prop', 'Middle-Young Prop', 'Old Prop',
]
ct = make_column_transformer(
    (StandardScaler(), continuous_columns),
    remainder='passthrough',
)

# NOTE(review): the scaler is fitted on all rows, i.e. before any
# train/validation split is made further down.
ct_array = ct.fit_transform(predictive_df)

In [38]:
# Wrap the transformed array as an np.matrix (not a plain ndarray); later
# cells rely on the matrix-only helpers .getT() and .getI().
x_matrix = np.asmatrix(ct_array)

In [39]:
# Check the dimensions of the matrix and keep the row count, which is
# needed to build the intercept column.
n_rows = x_matrix.shape[0]
x_matrix.shape

(69443, 33)

In [40]:
# Labels for the columns of the design matrix, in matrix order: intercept,
# the scaled continuous predictors, then the passthrough indicator columns.
x_mat_columns = (
    ['Intercept']
    + ['Poverty ', 'Population',
       'White Prop', 'Black Prop', 'Native American Prop', 'Asian Prop',
       'Pacific Prop', 'Two+ Prop',
       'HighIncome Prop', 'MidIncome Prop',
       'Middle-Old Prop', 'Middle-Young Prop', 'Old Prop']
    + ['is_weekend']
    + ['year_{}'.format(year) for year in range(2013, 2021)]
    + ['month_{}'.format(month) for month in range(2, 13)]
)

In [41]:
# Prepend a column of ones so that the first coefficient is the intercept.
intercept_array = np.ones(shape=(n_rows, 1))
x_mat = np.concatenate([intercept_array, x_matrix], axis=1)

In [42]:
# Confirm the intercept added exactly one column to x_matrix.
x_mat.shape

(69443, 34)

In [43]:
# Response variable as a single-column matrix (the Series becomes a row
# matrix first and is then transposed).
response_mat = np.asmatrix(df['Sales Volume PC']).T

In [44]:
# Closed-form OLS estimate: beta = (X'X)^-1 X' y
gram = x_mat.T @ x_mat
ols_betas = (gram.I @ x_mat.T) @ response_mat

In [45]:
# Pair each design-matrix column label with its estimated coefficient.
variable_df = pd.DataFrame(x_mat_columns, columns=['Variable'])
betas_df = pd.DataFrame(ols_betas)
ols_betas_df = pd.concat([variable_df, betas_df], axis=1).set_axis(
    ['Variable', 'Coefficient'], axis=1)

In [46]:
# Show the coefficients from largest to smallest.
ols_betas_df.sort_values(by='Coefficient', ascending=False)

Unnamed: 0,Variable,Coefficient
0,Intercept,0.669425
33,month_12,0.225157
31,month_10,0.219571
13,Old Prop,0.205819
26,month_5,0.200926
27,month_6,0.190728
29,month_8,0.162023
17,year_2015,0.161984
28,month_7,0.156141
16,year_2014,0.136068


In [53]:
# Hold out 10% of the rows as a test set, then split 22% of the remaining
# 90% (about 20% of all rows) off as a development set, leaving roughly 70%
# for training. Note that x_matrix has no intercept column.
# A fixed random_state makes both splits reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    x_matrix, response_mat, test_size=0.1, random_state=SPLIT_SEED)

X_train, X_dev, y_train, y_dev = train_test_split(
    X_train, y_train, test_size=0.22, random_state=SPLIT_SEED)

In [None]:
# Plan: k-fold cross-validation
# 1. Estimate beta on k-1 folds.
# 2. Use those estimates to predict the response on the held-out fold.
# 3. Compute the RMSE between the true and predicted responses.
# 4. Average the RMSE over the k folds.
# 5. Repeat steps 1-4 for each candidate lambda.

In [64]:
# Scratch check: every column of x_mat except the leading intercept column.
x_mat[:, 1:]

matrix([[-0.21971903, -0.51893582,  1.06346061, ...,  0.        ,
          0.        ,  0.        ],
        [-0.33437472, -0.55837635,  0.83756025, ...,  0.        ,
          0.        ,  0.        ],
        [-0.47769432, -0.5367373 ,  0.9046844 , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 1.32813272, -0.53059274,  0.56149785, ...,  0.        ,
          0.        ,  1.        ],
        [ 0.84084606, -0.20318975, -0.72613453, ...,  0.        ,
          0.        ,  1.        ],
        [-0.73566961, -0.38144852,  0.6336721 , ...,  0.        ,
          0.        ,  1.        ]])

In [67]:
# Scratch check: element-wise sum of two matrices of identical shape.
x_mat+x_mat

matrix([[ 2.        , -0.43943806, -1.03787164, ...,  0.        ,
          0.        ,  0.        ],
        [ 2.        , -0.66874943, -1.1167527 , ...,  0.        ,
          0.        ,  0.        ],
        [ 2.        , -0.95538864, -1.07347459, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 2.        ,  2.65626544, -1.06118547, ...,  0.        ,
          0.        ,  2.        ],
        [ 2.        ,  1.68169212, -0.40637949, ...,  0.        ,
          0.        ,  2.        ],
        [ 2.        , -1.47133923, -0.76289705, ...,  0.        ,
          0.        ,  2.        ]])

In [104]:
def run_cross_val(data, k, penalty="OLS", lam=100, seed=None):
    """Estimate out-of-sample RMSE of a linear model by k-fold cross-validation.

    Parameters
    ----------
    data : 2-D array-like (np.matrix or np.ndarray)
        Column 0 holds the response; the remaining columns are the design
        matrix exactly as it should enter the regression (an intercept
        column, if wanted, must already be present). The input is not
        modified.
    k : int
        Number of folds; must satisfy 2 <= k <= number of rows.
    penalty : {"OLS", "Ridge"}
        "OLS" solves the normal equations; "Ridge" adds ``lam`` to the
        diagonal of X'X first.
    lam : float
        Ridge penalty, ignored for OLS. It is applied to every design
        column, including an intercept column if one is present.
    seed : int or None
        Seed for the row shuffle. ``None`` draws from NumPy's global random
        state, so ``np.random.seed`` still controls the result.

    Returns
    -------
    list of float
        The validation RMSE of each fold, in fold order.

    Raises
    ------
    ValueError
        If ``penalty`` is not recognised, ``data`` is not 2-D with at least
        two columns, or ``k`` is outside ``[2, n_rows]``.
    """
    if penalty not in ("OLS", "Ridge"):
        raise ValueError("not a recognizable penalty")

    rows = np.asarray(data, dtype=float)
    if rows.ndim != 2 or rows.shape[1] < 2:
        raise ValueError("data must be 2-D with a response column and at least one predictor")
    n_rows = rows.shape[0]
    if not 2 <= k <= n_rows:
        raise ValueError("k must be between 2 and the number of rows")

    # Shuffle through an index permutation so the caller's matrix keeps its
    # row order (np.random.shuffle would reorder it in place).
    rng = np.random if seed is None else np.random.RandomState(seed)
    rows = rows[rng.permutation(n_rows)]
    folds = np.array_split(rows, k)

    rmse_list = []
    for i in range(k):
        # Fold i is held out; all other folds form the training set.
        validate = folds[i]
        train = np.concatenate(folds[:i] + folds[i + 1:])

        y_train, x_train = train[:, 0], train[:, 1:]
        y_val, x_val = validate[:, 0], validate[:, 1:]

        # Normal equations (X'X [+ lam*I]) beta = X'y, solved directly
        # instead of forming an explicit inverse.
        gram = x_train.T @ x_train
        if penalty == "Ridge":
            gram = gram + lam * np.eye(gram.shape[0])
        betas = np.linalg.solve(gram, x_train.T @ y_train)

        # RMSE between observed and predicted responses on the held-out fold.
        residuals = y_val - x_val @ betas
        rmse_list.append(float(np.sqrt(np.mean(residuals ** 2))))

    return rmse_list

In [79]:
# Put the response in column 0 in front of the design matrix; run_cross_val
# reads the response from column 0 and the predictors from the rest.
test_mat = np.concatenate([response_mat, x_mat], axis=1)

In [85]:
# Scratch check: display the response/design-matrix concatenation.
np.concatenate([response_mat, x_mat], 1)

matrix([[ 0.57202384,  1.        , -0.21971903, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.38138107,  1.        , -0.33437472, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.49641699,  1.        , -0.47769432, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.61455963,  1.        ,  1.32813272, ...,  0.        ,
          0.        ,  1.        ],
        [ 1.89978572,  1.        ,  0.84084606, ...,  0.        ,
          0.        ,  1.        ],
        [ 1.63848454,  1.        , -0.73566961, ...,  0.        ,
          0.        ,  1.        ]])

In [109]:
# 5-fold cross-validation of the ridge model with lambda = 10; the result
# is one RMSE per fold.
run_cross_val(test_mat, 5, 'Ridge', 10)

[0.7160186168814401,
 0.7275553905157041,
 0.7270133705980223,
 0.7536252317151977,
 0.7337816721797913]

In [102]:
# Diagonal ridge penalty matrix with lambda = 100, one diagonal entry per
# column of x_mat.
lambda_mat = np.diag(np.full(x_mat.shape[1], 100))

In [98]:
# NOTE(review): the recorded output below appears to come from an earlier
# run (this cell's execution count is lower than that of the cell defining
# lambda_mat from x_mat.shape[1]) - re-run to confirm the shape matches X'X.
lambda_mat.shape

(69443, 69443)

In [101]:
# Shape of X'X, which the penalty matrix must match to be added to it.
np.matmul(x_mat.getT(), x_mat).shape

(34, 34)

In [103]:
# Ridge estimate on the full data: beta = (X'X + lambda*I)^-1 X' y
((x_mat.T @ x_mat + lambda_mat).I @ x_mat.T) @ response_mat

matrix([[ 0.64775464],
        [-0.04290106],
        [ 0.01927644],
        [-0.09924198],
        [-0.04758646],
        [-0.023776  ],
        [-0.04547431],
        [ 0.03642179],
        [ 0.04857816],
        [ 0.00189941],
        [-0.05567671],
        [-0.06550173],
        [-0.0374891 ],
        [ 0.18543404],
        [ 0.01484429],
        [ 0.06237086],
        [ 0.15057238],
        [ 0.17635584],
        [-0.25494621],
        [-0.8380298 ],
        [-0.09046822],
        [-0.04234372],
        [ 0.02486694],
        [ 0.09170406],
        [ 0.06740273],
        [ 0.10235093],
        [ 0.18212432],
        [ 0.17199797],
        [ 0.13802218],
        [ 0.1437487 ],
        [ 0.10037995],
        [ 0.20089171],
        [ 0.09503907],
        [ 0.20614273]])