In [59]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import numpy as np

In [2]:
# Load the dataset, keeping the untouched frame around, then discard any row
# that has a missing value in any column.
raw_df = pd.read_csv("harlech_data_proportions.csv")
df = raw_df.dropna()

In [3]:
# Preview the first rows of the cleaned frame to sanity-check the load.
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Year,Poverty,Population,Bottles Sold PC,Sales Volume PC,Volume Sold PC,Vodka Sold PC,Gin Sold PC,...,Pacific Prop,Two+ Prop,Other Prop,HighIncome Prop,LowIncome Prop,MidIncome Prop,Middle-Old Prop,Middle-Young Prop,Old Prop,Young Prop
0,0,2012-01-03,2012,10.6,7466.0,0.045406,0.572024,0.042743,0.005224,0.000268,...,0.0,0.006464,0.001751,0.104878,0.262927,0.632195,0.290708,0.205457,0.391118,0.112716
1,1,2012-01-03,2012,10.2,3910.0,0.0289,0.381381,0.028885,0.004604,0.000767,...,0.0,0.016444,0.0,0.102477,0.222886,0.674637,0.308361,0.197993,0.366555,0.12709
2,2,2012-01-03,2012,9.7,5861.0,0.045897,0.496417,0.040203,0.012967,0.0,...,0.0,0.010223,0.0,0.126021,0.324971,0.549008,0.305332,0.186828,0.402554,0.105287
3,3,2012-01-03,2012,9.6,25829.0,0.004181,0.049647,0.003659,0.001161,0.0,...,0.0,0.010386,0.002519,0.173219,0.200028,0.626754,0.327041,0.246467,0.307464,0.119027
4,4,2012-01-03,2012,15.7,131904.0,0.00097,0.010843,0.000835,0.000273,0.0,...,0.000174,0.021593,0.015072,0.12784,0.284954,0.587206,0.253147,0.240095,0.287378,0.219379


In [11]:
# Parse the Date strings into datetimes, then derive the calendar month (1-12)
# from the parsed values. Later `assign` keywords see the earlier ones.
df = df.assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Month=lambda frame: frame['Date'].dt.month,
)

In [44]:
# Columns excluded from the predictive feature set, grouped by reason.

# Removed for all analyses
bookkeeping_columns = ['Unnamed: 0', 'Date']

# Removed for the predictive model only
other_liquor_columns = [
    'Gin Sold PC', 'Rum Sold PC', 'Whiskey Sold PC', 'Tequila Sold PC', 'Other Alc Sold PC',
]

# Candidate targets must not appear among the predictors
target_columns = ['Sales Volume PC', 'Volume Sold PC', 'Bottles Sold PC', 'Vodka Sold PC']

# Linearly dependent on columns that are kept
collinear_columns = ['DOW', 'Young Prop', 'LowIncome Prop', 'Other Prop']

predictive_df = df.drop(
    columns=bookkeeping_columns + other_liquor_columns + target_columns + collinear_columns
)

In [16]:
# Inspect one of the candidate target columns (values, length and dtype).
df['Volume Sold PC']

0        0.042743
1        0.028885
2        0.040203
3        0.003659
4        0.000835
           ...   
69438    0.051261
69439    0.021340
69440    0.040369
69441    0.111520
69442    0.096138
Name: Volume Sold PC, Length: 69443, dtype: float64

In [14]:
# Preview the predictor frame after the column drops.
predictive_df.head()

Unnamed: 0,Year,Poverty,Population,is_weekend,White Prop,Black Prop,Native American Prop,Asian Prop,Pacific Prop,Two+ Prop,HighIncome Prop,MidIncome Prop,Middle-Old Prop,Middle-Young Prop,Old Prop,Month
0,2012,10.6,7466.0,0,0.984649,0.003636,0.000269,0.003232,0.0,0.006464,0.104878,0.632195,0.290708,0.205457,0.391118,1
1,2012,10.2,3910.0,0,0.973279,0.006937,0.000514,0.002826,0.0,0.016444,0.102477,0.674637,0.308361,0.197993,0.366555,1
2,2012,9.7,5861.0,0,0.976657,0.003237,0.006645,0.003237,0.0,0.010223,0.126021,0.549008,0.305332,0.186828,0.402554,1
3,2012,9.6,25829.0,0,0.977638,0.007015,0.00031,0.002132,0.0,0.010386,0.173219,0.626754,0.327041,0.246467,0.307464,1
4,2012,15.7,131904.0,0,0.852192,0.09155,0.002468,0.016951,0.000174,0.021593,0.12784,0.587206,0.253147,0.240095,0.287378,1


In [45]:
# One-hot encode Year.
# The column labels are derived from the years actually present in the data
# (via `prefix`) instead of a hard-coded 2012-2020 range: a hard-coded list
# raises on a different number of years and silently mislabels the columns
# when the count matches but the years differ.
# `astype(int)` keeps the labels as 'year_2012' rather than 'year_2012.0'
# should Year ever be read as a float column.
year_dummies = pd.get_dummies(df['Year'].astype(int), prefix='year')
year_columns = list(year_dummies.columns)

# get_dummies orders the columns by category value, so the first column is the
# earliest year; it is dropped to serve as the reference level and avoid
# perfect collinearity with the intercept.
baseline_year_column = year_columns[0]

predictive_df = pd.concat([predictive_df, year_dummies], axis=1)
predictive_df = predictive_df.drop(columns=[baseline_year_column, 'Year'])

In [46]:
# One-hot encode Month.
# The column labels come from the months actually present in the data (via
# `prefix`) instead of a hard-coded 1-12 range: a hard-coded list raises when
# a month is absent and would mislabel the remaining columns if the count
# happened to match.
month_dummies = pd.get_dummies(df['Month'].astype(int), prefix='month')
month_columns = list(month_dummies.columns)

# get_dummies orders the columns by category value, so the first column is the
# earliest month; it is dropped to serve as the reference level and avoid
# perfect collinearity with the intercept.
baseline_month_column = month_columns[0]

predictive_df = pd.concat([predictive_df, month_dummies], axis=1)
predictive_df = predictive_df.drop(columns=[baseline_month_column, 'Month'])

In [47]:
# List the final predictor columns; note that 'Poverty ' carries a trailing
# space, which the column selections below must match exactly.
predictive_df.columns

Index(['Poverty ', 'Population', 'is_weekend', 'White Prop', 'Black Prop',
       'Native American Prop', 'Asian Prop', 'Pacific Prop', 'Two+ Prop',
       'HighIncome Prop', 'MidIncome Prop', 'Middle-Old Prop',
       'Middle-Young Prop', 'Old Prop', 'year_2013', 'year_2014', 'year_2015',
       'year_2016', 'year_2017', 'year_2018', 'year_2019', 'year_2020',
       'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9', 'month_10', 'month_11', 'month_12'],
      dtype='object')

In [74]:
# Standardise the continuous predictors. The remaining columns (the is_weekend
# flag and the year/month dummies) are passed through unscaled and end up to
# the right of the scaled columns in the output.
# NOTE(review): the scaler is fitted on every row, i.e. before the
# training/holdout/test split further down - confirm this is intended.
continuous_columns = [
    'Poverty ', 'Population', 'White Prop', 'Black Prop',
    'Native American Prop', 'Asian Prop', 'Pacific Prop', 'Two+ Prop',
    'HighIncome Prop', 'MidIncome Prop', 'Middle-Old Prop', 'Middle-Young Prop', 'Old Prop',
]

ct = make_column_transformer(
    (StandardScaler(), continuous_columns),
    remainder='passthrough',
)

ct_array = ct.fit_transform(predictive_df)

In [90]:
# Wrap the transformed features in an np.matrix so the matrix-style helpers
# used below (getT / getI) are available.
x_matrix = np.asmatrix(ct_array)

In [97]:
# Record the number of observations for later cells, then display the
# (rows, columns) shape of the feature matrix.
n_rows, n_cols = x_matrix.shape
x_matrix.shape

(69443, 33)

In [95]:
# Column labels for the design matrix, in matrix order: the intercept, the
# standardised features, the passed-through weekend flag, then the year and
# month dummies (reference levels 2012 and month 1 excluded).
scaled_feature_names = [
    'Poverty ', 'Population', 'White Prop', 'Black Prop', 'Native American Prop',
    'Asian Prop', 'Pacific Prop', 'Two+ Prop', 'HighIncome Prop', 'MidIncome Prop',
    'Middle-Old Prop', 'Middle-Young Prop', 'Old Prop',
]
year_dummy_names = ['year_{}'.format(year) for year in range(2013, 2021)]
month_dummy_names = ['month_{}'.format(month) for month in range(2, 13)]

x_mat_columns = (
    ['Intercept']
    + scaled_feature_names
    + ['is_weekend']
    + year_dummy_names
    + month_dummy_names
)

In [105]:
# Prepend a column of ones so the first coefficient acts as the intercept.
intercept_array = np.ones(shape=(n_rows, 1))
x_mat = np.concatenate([intercept_array, x_matrix], axis=1)

In [109]:
# Confirm the design matrix gained exactly one column (the intercept).
x_mat.shape

(69443, 34)

In [117]:
# Response as a column matrix: asmatrix turns the Series into a single row,
# so transpose it into one column.
response_row = np.asmatrix(df['Sales Volume PC'])
response_mat = response_row.T

In [119]:
# OLS coefficient estimates from the normal equations (X'X) b = X'y.
# The system is solved directly rather than by forming the explicit inverse of
# X'X and multiplying: solving is cheaper and loses less precision when X'X is
# poorly conditioned. The result is kept as a column np.matrix, as before.
gram_matrix = x_mat.T @ x_mat          # X'X
moment_vector = x_mat.T @ response_mat  # X'y
ols_betas = np.asmatrix(np.linalg.solve(gram_matrix, moment_vector))

In [130]:
# Pair each design-matrix column label with its estimated coefficient.
variable_df = pd.DataFrame(x_mat_columns, columns=['Variable'])
betas_df = pd.DataFrame(ols_betas)
# Both frames share the default 0..n-1 index, so an index join lines the
# labels up with the coefficients row by row.
ols_betas_df = variable_df.join(betas_df.rename(columns={0: 'Coefficient'}))

In [132]:
# Show the coefficients from largest to smallest.
ols_betas_df.sort_values(by='Coefficient', ascending=False)

Unnamed: 0,Variable,Coefficient
0,Intercept,0.669425
33,month_12,0.225157
31,month_10,0.219571
13,Old Prop,0.205819
26,month_5,0.200926
27,month_6,0.190728
29,month_8,0.162023
17,year_2015,0.161984
28,month_7,0.156141
16,year_2014,0.136068


In [139]:
# Shuffle the observations before splitting.
# - A fixed seed makes the shuffle (and so the split) reproducible.
# - One explicit permutation is drawn and kept in `shuffle_order`, so the
#   response can be reordered identically; shuffling only the features would
#   leave their rows misaligned with the targets.
# - The permutation is applied to a fresh view of `ct_array` instead of
#   shuffling in place, so `ct_array` is left untouched and re-running this
#   cell always yields the same ordering.
SHUFFLE_SEED = 42
shuffle_order = np.random.default_rng(SHUFFLE_SEED).permutation(n_rows)

x_matrix = np.asmatrix(ct_array)[shuffle_order]
# Response rows in the same shuffled order as x_matrix.
response_shuffled = response_mat[shuffle_order]

In [145]:
# Row positions of the 70% and 90% cut points, giving a
# 70/20/10 training/holdout/test split.
split70, split90 = (round(n_rows * fraction) for fraction in (0.7, 0.9))

In [149]:
# Rows before the 70% cut point form the training set.
training = x_matrix[:split70]

In [151]:
# Rows in [split70, split90) form the holdout set; everything from split90
# onwards is the test set.
holdout, test = x_matrix[split70:split90], x_matrix[split90:]

In [152]:
# Display the training matrix to eyeball the shuffled, scaled rows.
training

matrix([[ 0.32489547, -0.43005044,  0.02563374, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.32489547, -0.43005044,  0.02563374, ...,  0.        ,
          0.        ,  0.        ],
        [-0.8503253 , -0.42266366,  0.57275762, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.95550174, -0.35256687,  0.46714438, ...,  0.        ,
          0.        ,  0.        ],
        [-0.70700569, -0.47348382,  0.59398133, ...,  1.        ,
          0.        ,  0.        ],
        [-1.76757078,  0.47369894, -0.85108787, ...,  0.        ,
          0.        ,  0.        ]])

In [None]:
#### Create design matrix

```{r}
# Create X by converting predictive_scaled to a plain matrix
# NOTE(review): predictive_scaled is not defined in this notebook - presumably
# the scaled predictor table from the original R analysis; confirm.
X = as.matrix(predictive_scaled)

# Get number of rows (first dimension of X)
num_rows = dim(X)[1]

# Generate a single column of ones (num_rows x 1) for the intercept
one_col = matrix(rep(1, num_rows), nrow = num_rows)

# Add column of 1s to left side of X so the first coefficient is the intercept
X = cbind(one_col, X)
```

***

```{r}
# Identify the number of parameters (columns of X, counting the intercept column)
num_param = dim(X)[2]

# Calculate hat matrix for OLS regression: H = X (X'X)^-1 X'
# Note: H has one row and one column per observation.
OLS_hat_matrix = X %*% solve(t(X) %*% X) %*% t(X)

# Define lambda value (the ridge penalty strength used below)
lambda = 1000

# Calculate matrix version of lambda (lambda * identity matrix);
# diag(num_param) is the num_param x num_param identity, so the intercept
# column is penalised along with every other column.
lambda_matrix = lambda * diag(num_param)
```

```{r}
# Calculate hat matrix for RIDGE regression: H = X (X'X + lambda I)^-1 X'
RIDGE_hat_matrix = X %*% solve((t(X) %*% X) + lambda_matrix) %*% t(X)
```

```{r}
# Ridge coefficient estimates: (X'X + lambda I)^-1 X'y, with y taken from df$Rating
# NOTE(review): df$Rating does not match any column used in this notebook -
# presumably the response from the original R analysis; confirm before reuse.
betas = solve((t(X) %*% X) + lambda_matrix) %*% t(X) %*% as.matrix(df$Rating)

# Show the six largest coefficients (%>% and arrange presumably come from dplyr)
data.frame(betas) %>% arrange(desc(betas)) %>% head(6)
```