In [5]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

In [6]:
# Load the raw table, parse 'Date' into datetimes and append a numeric 'Month'
# column derived from it.  low_memory=False asks pandas to read the file in one
# pass instead of chunks, which avoids mixed-dtype column inference.
df = (
    pd.read_csv("harlech_data_proportions.csv", low_memory=False)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .assign(Month=lambda frame: frame['Date'].dt.month)
)

In [7]:
def drop_vars(df):
    """Return a copy of ``df`` without the columns excluded from this model.

    The input frame is not modified.  Every listed column must be present;
    ``DataFrame.drop`` raises ``KeyError`` otherwise.
    """
    # Removed for all analyses.
    always_excluded = ['Unnamed: 0', 'Date']

    # Removed for the interpretive model.
    interpretive_excluded = [
        'Bottles Sold PC', 'Vodka Sold PC', 'Gin Sold PC', 'Rum Sold PC',
        'Whiskey Sold PC', 'Tequila Sold PC', 'Other Alc Sold PC',
    ]

    # Target columns to remove.  'Sales Volume PC' is deliberately NOT listed
    # here, so it stays in the returned frame.
    target_excluded = ['Volume Sold PC']

    # Linearly dependent columns.
    dependent_excluded = ['DOW', 'Young Prop', 'LowIncome Prop', 'Other Prop']

    to_remove = (always_excluded + interpretive_excluded
                 + target_excluded + dependent_excluded)
    return df.drop(columns=to_remove)


def month_year_dummies(df):
    """Replace the 'Year' and 'Month' columns with indicator (dummy) columns.

    Each column is one-hot encoded with ``drop_first=True``, so its first level
    becomes the implicit baseline.  The new columns are named ``year_<level>``
    and ``month_<level>`` and are appended after the remaining columns of
    ``df``, years first.  The input frame is not modified.
    """
    encoded_blocks = []
    for source_column, prefix in (('Year', 'year_'), ('Month', 'month_')):
        indicators = pd.get_dummies(df[source_column], drop_first=True)
        indicators.columns = [prefix + str(level) for level in indicators.columns]
        encoded_blocks.append(indicators)

    remaining = df.drop(columns=['Year', 'Month'])
    return pd.concat([remaining] + encoded_blocks, axis=1)


def get_intermediate_matrix(df, target):
    """Build the regression matrices for ``target`` from ``df``.

    The columns listed in ``numeric_columns`` are standardised with
    ``StandardScaler``; every other predictor is passed through unchanged.
    A leading column of ones is added for the intercept.

    Returns a 4-tuple:
        matrix_with_target    -- response column followed by the design matrix
        matrix_without_target -- design matrix (intercept, scaled, passthrough)
        response_mat          -- the target as a column matrix
        column_order          -- labels for the design-matrix columns
    """
    # Predictors to standardise.  Names are matched literally against the frame
    # (note the trailing space in 'Poverty ' -- presumably how the CSV header
    # is spelled; confirm against the data file).
    numeric_columns = ['Poverty ', 'Population', 'White Prop', 'Black Prop',
                       'Native American Prop', 'Asian Prop', 'Pacific Prop', 'Two+ Prop',
                       'HighIncome Prop', 'MidIncome Prop', 'Middle-Old Prop','Middle-Young Prop', 'Old Prop']

    response = df[target]
    predictors = df.drop(columns=[target])

    passthrough_columns = [name for name in predictors.columns
                           if name not in numeric_columns]

    # Scale the numeric predictors; leave the remaining columns as they are.
    transformer = make_column_transformer(
        (StandardScaler(), numeric_columns),
        remainder='passthrough',
    )
    scaled = np.asmatrix(transformer.fit_transform(predictors))

    # ColumnTransformer emits the transformed columns first and appends the
    # passthrough columns on the right, so the labels are built the same way.
    column_order = ['Intercept', *numeric_columns, *passthrough_columns]

    ones_column = np.ones((scaled.shape[0], 1))
    matrix_without_target = np.concatenate((ones_column, scaled), 1)

    response_mat = np.asmatrix(response).getT()
    matrix_with_target = np.concatenate((response_mat, matrix_without_target), 1)

    return matrix_with_target, matrix_without_target, response_mat, column_order


def get_betas(x_mat, response_mat):
    """Ordinary least squares coefficient estimates.

    Solves the normal equations ``(X'X) b = X'y`` directly with
    ``np.linalg.solve`` rather than forming the explicit inverse of ``X'X``,
    which is cheaper and numerically better behaved.

    Parameters
    ----------
    x_mat : array-like, shape (n, p)
        Design matrix (``np.matrix``, ``ndarray`` or nested sequence).
    response_mat : array-like, shape (n, 1) or (n,)
        Response values; a 1-D input is treated as a single column.

    Returns
    -------
    np.matrix, shape (p, 1)
        Coefficient estimates, kept as ``np.matrix`` so callers can use ``.A1``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X'X`` is singular (e.g. linearly dependent columns).
    """
    design = np.asarray(x_mat, dtype=float)
    response = np.asarray(response_mat, dtype=float)
    if response.ndim == 1:
        # Accept a flat vector as a single response column.
        response = response.reshape(-1, 1)

    gram = np.matmul(design.T, design)
    cross = np.matmul(design.T, response)

    # ols beta estimates
    ols_betas = np.asmatrix(np.linalg.solve(gram, cross))

    return ols_betas


def show_beta_summary(betas, column_order):
    """Tabulate coefficients next to their labels, largest coefficient first.

    Parameters
    ----------
    betas : array-like
        Coefficient estimates; any shape is flattened to 1-D, so an
        ``np.matrix`` column, an ``ndarray`` or a plain list all work.
    column_order : sequence of str
        One label per coefficient, in the same order as ``betas``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable' and 'Coefficient', sorted by 'Coefficient'
        descending.  The index keeps each coefficient's original position.

    Raises
    ------
    ValueError
        If the number of coefficients differs from the number of labels.
    """
    coefficients = np.asarray(betas).ravel()
    if coefficients.size != len(column_order):
        raise ValueError(
            "got %d coefficients but %d column labels"
            % (coefficients.size, len(column_order))
        )

    summary = pd.DataFrame({'Variable' : column_order, 'Coefficient' : coefficients})
    return summary.sort_values(by = 'Coefficient', ascending = False)

In [8]:
# Prune unused columns, then swap Year/Month for indicator columns.
# NOTE: this rebinds `df`, so re-running the cell without reloading the data
# first makes drop_vars raise KeyError (its columns are already gone).
df = month_year_dummies(drop_vars(df))

# Assemble the design and response matrices with 'Sales Volume PC' as target.
(matrix_with_target,
 matrix_without_target,
 response_mat,
 column_order) = get_intermediate_matrix(df, 'Sales Volume PC')

# Fit the OLS coefficients on the design matrix.
betas = get_betas(matrix_without_target, response_mat)

# Display the coefficients, largest first.
show_beta_summary(betas, column_order)

Unnamed: 0,Variable,Coefficient
0,Intercept,0.669425
33,month_12,0.225157
31,month_10,0.219571
13,Old Prop,0.205819
26,month_5,0.200926
27,month_6,0.190728
29,month_8,0.162023
17,year_2015,0.161984
28,month_7,0.156141
16,year_2014,0.136068
