In [3]:
cd ../

c:\Users\Nicole\Desktop\MDS\capstone\canucks_mds_capstone


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [4]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Star import kept in its original position: names imported after it take
# precedence over anything it exports, exactly as before.
from src.cross_validation import *
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from scipy.stats import randint, uniform
import altair as alt

In [6]:
# Load the pre-split train/test feature and target tables in one pass.
X_train, y_train, X_test, y_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/output/X_train.parquet',
        'data/output/y_train.parquet',
        'data/output/X_test.parquet',
        'data/output/y_test.parquet',
    )
)


In [34]:
# Frequency of each distinct target value in the training set.
y_train.value_counts()

target_host_sold-today
0.0                       62744
2.0                        6215
3.0                        2934
4.0                        2574
1.0                        2367
                          ...  
64.0                          1
61.0                          1
60.0                          1
58.0                          1
246.0                         1
Name: count, Length: 77, dtype: int64

In [4]:
def polynomial_reg(numeric_feats, categorical_feats, binary_feats, degree=2, X=None, y=None):
    """Fit a polynomial-feature linear regression and tabulate its coefficients.

    Numeric columns are zero-imputed, expanded with ``PolynomialFeatures`` and
    standardized; binary columns are passed through unchanged; categorical
    columns are one-hot encoded (unknown categories are ignored at predict
    time). Every other column of ``X`` is dropped.

    Parameters
    ----------
    numeric_feats, categorical_feats, binary_feats : list of str
        Column names for each preprocessing branch.
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.
    X, y : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which keeps existing calls working.

    Returns
    -------
    model : Pipeline
        The fitted preprocessing + ``LinearRegression`` pipeline.
    df_coef : pandas.DataFrame
        Columns ``feature`` and ``coefficients``, sorted by coefficient in
        descending order.
    """
    # Fall back to the notebook globals only when no data is passed explicitly.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    selected = set(numeric_feats + categorical_feats + binary_feats)
    drop_feats = [col for col in X.columns if col not in selected]

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('poly', PolynomialFeatures(degree=degree)),
        ('scaler', StandardScaler())
    ])
    categorical_pipeline = Pipeline([
        ('onehot', OneHotEncoder(categories='auto', handle_unknown='ignore'))
    ])

    # Transformer order is kept (drop, numeric, boolean, categorical) so code
    # that indexes `transformers_` by position on the returned model still works.
    preprocessor = ColumnTransformer(
        transformers=[
            ('drop', 'drop', drop_feats),
            ('numeric', numeric_pipeline, numeric_feats),
            ('boolean', 'passthrough', binary_feats),
            ('categorical', categorical_pipeline, categorical_feats)
        ]
    )

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    model.fit(X, y)

    # Look fitted sub-transformers up by name rather than by position.
    fitted_pre = model.named_steps['preprocessor']
    poly = fitted_pre.named_transformers_['numeric'].named_steps['poly']
    onehot = fitted_pre.named_transformers_['categorical'].named_steps['onehot']

    poly_feature_names = [
        f'poly_{name}'
        for name in poly.get_feature_names_out(input_features=numeric_feats)
    ]
    cat_feature_names = onehot.get_feature_names_out(categorical_feats)

    # Same order as the ColumnTransformer output: numeric, boolean, categorical.
    feature_names = np.concatenate((poly_feature_names, binary_feats, cat_feature_names))
    # coef_ may be 2-D when the target is a single-column frame; flatten it.
    coefficients = (model.named_steps['regressor'].coef_).reshape(-1)

    df_coef = pd.DataFrame({'feature': feature_names, 'coefficients': coefficients})
    df_coef = df_coef.sort_values(by='coefficients', ascending=False)

    return model, df_coef



All Features

In [5]:
# Full feature set. Inside polynomial_reg every numeric column below is
# zero-imputed, expanded to polynomial terms and standardized.
numeric_feats = [
    'cap', 'revenue_to_date', 's/t-rate', 'opens', 'holds', 'holds_release', 
    'pcp_currentprice', 'pcp_forwardtix', 'pcp_forwardrev', 'prp_currentprice', 
    'prp_forwardtix', 'prp_forwardrev', 'prg_currentprice', 'prg_forwardtix', 
    'prg_forwardrev', 'pmt_currentprice', 'pmt_forwardtix', 'pmt_forwardrev', 
    'pio_currentprice', 'pio_forwardtix', 'pio_forwardrev', 'qualified_opens', 
    'unqualified_opens', 'pminventory', 'ticket_sold-total', 'ticket_sold-last_7days', 
    'ticket_sold-yesterday', 'host_sold-total', 'host_sold-last_7days', 'host_sold-yesterday', 
    'archtics_sold-total', 'archtics_sold-last_7days', 'archtics_sold-yesterday', 'comps-total', 
    'comps-last_7days', 'comps-yesterday', 'resale_sold-2_days_ago', 'resale_sold-last_7days', 
    'resale_sold-total', 'resale_asp-2_days_ago', 'resale_asp-last_7days', 'resale_asp-total', 
    'initial_price', 'last_price', 'number_of_postings', 'median_posting_price', 
    'posting_below_cp', 'lowest_post_price', 'highest_post_price', 'host_sold_at_current_price', 
    'prp_opportunity', 'prg_opportunity', 
    'pmt_opportunity', 'pio_opportunity',
    'tickets_sold_2_days_before_today', 'tickets_sold_3_days_before_today', 
    'tickets_sold_4_days_before_today', 'tickets_sold_5_days_before_today', 
    'tickets_sold_6_days_before_today', 'tickets_sold_7_days_before_today', 
    'days_until_game', 'opponent_rank', 'van_rank', 
    'host_sold_agg_last_day', 'unique_views',
]
# One-hot encoded inside polynomial_reg.
categorical_feats = ['month','opponent']
# Passed through untransformed.
# NOTE(review): assumes these columns are already numeric 0/1 - confirm.
binary_feats = ['bowl_location','weekend_game']

# Fit the unregularized degree-2 model and collect its coefficient table.
model, df_coef = polynomial_reg(numeric_feats, categorical_feats, binary_feats, degree=2)

In [6]:
# Validation RMSE for the all-features model. eval_model is not defined in
# this notebook; presumably it comes from the star import of src.cross_validation.
eval_model(model, X_train, y_train)

Training date range: 2022-01-20 00:00:00 - 2023-02-13 00:00:00
Validation date range: 2023-02-14 00:00:00 - 2023-09-06 00:00:00
Validation Root Mean Squared Error: 4.946595558274364
Training date range: 2022-01-20 00:00:00 - 2023-09-06 00:00:00
Validation date range: 2023-09-07 00:00:00 - 2023-11-05 00:00:00
Validation Root Mean Squared Error: 3.8587158481275434
Training date range: 2022-01-20 00:00:00 - 2023-11-05 00:00:00
Validation date range: 2023-11-06 00:00:00 - 2023-12-31 00:00:00
Validation Root Mean Squared Error: 3.348263652571731
All Validation Scores: [4.946595558274364, 3.8587158481275434, 3.348263652571731]
Mean Validation Score: 4.051191686324546
Standard Deviation of Validation Scores: 0.6665590403527565


In [7]:
# In-sample error of the fitted model, for comparison with the validation RMSE.
y_train_pred = model.predict(X_train)
# RMSE is the square root of the mean squared error.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Training RMSE: {rmse_train}")


Training RMSE: 3.0360370239571495


In [8]:
# In-sample R^2 for the same training predictions.
r2_score(y_train, y_train_pred)

0.3393449634259068

In [9]:
# Largest coefficients (df_coef is sorted in descending order).
# NOTE(review): the saved output shows magnitudes around 1e11-1e12, which
# suggests the unregularized fit is ill-conditioned - interpret with care.
df_coef.head(30)

Unnamed: 0,feature,coefficients
4,poly_opens,7276552000000.0
9,poly_pcp_forwardrev,2091605000000.0
893,poly_prg_forwardrev unqualified_opens,2008912000000.0
6,poly_holds_release,1791376000000.0
683,poly_prp_forwardtix unqualified_opens,841403600000.0
262,poly_opens pcp_forwardtix,836004000000.0
297,poly_opens initial_price,814614800000.0
678,poly_prp_forwardtix pmt_forwardrev,754925600000.0
298,poly_opens last_price,711473600000.0
1284,poly_unqualified_opens resale_asp-total,637797900000.0


In [10]:
# Most negative coefficients (bottom of the descending sort).
df_coef.tail(30)

Unnamed: 0,feature,coefficients
593,poly_pcp_forwardrev last_price,-509096100000.0
572,poly_pcp_forwardrev unqualified_opens,-510976400000.0
282,poly_opens host_sold-total,-514759300000.0
1331,poly_pminventory posting_below_cp,-520472900000.0
263,poly_opens pcp_forwardrev,-521706400000.0
1272,poly_unqualified_opens host_sold-yesterday,-536702000000.0
258,poly_opens^2,-539041400000.0
1179,poly_pio_forwardrev pminventory,-561723300000.0
86,poly_cap pio_forwardrev,-561748100000.0
993,poly_pmt_forwardtix pminventory,-562394100000.0


Reduced Features

In [11]:
# Reduced feature set: a subset of the numeric columns used above, with
# 'price_code' added as a categorical and only 'weekend_game' kept as binary.
numeric_feats = ['cap', 's/t-rate', 'opens', 'prp_forwardtix', 'ticket_sold-total', 'ticket_sold-last_7days', 
    'ticket_sold-yesterday', 'host_sold-total', 'host_sold-last_7days', 'host_sold-yesterday', 'resale_sold-last_7days', 
    'resale_sold-total', 'resale_asp-last_7days', 'resale_asp-total', 
    'initial_price', 'last_price', 'number_of_postings', 'median_posting_price', 
    'posting_below_cp', 'lowest_post_price', 'highest_post_price', 'host_sold_at_current_price', 
    'days_until_game', 'opponent_rank', 'van_rank', 
    'host_sold_agg_last_day', 'unique_views']

categorical_feats = ['month', 'opponent', 'price_code']
binary_feats = ['weekend_game']

# polynomial_reg derives the columns to drop from these three lists itself,
# so no separate drop list is built here.
model, df_coef = polynomial_reg(numeric_feats, categorical_feats, binary_feats, degree=2)

In [12]:
# Validation RMSE for the reduced-feature model.
eval_model(model, X_train, y_train)

Training date range: 2022-01-20 00:00:00 - 2023-02-13 00:00:00
Validation date range: 2023-02-14 00:00:00 - 2023-09-06 00:00:00
Validation Root Mean Squared Error: 2.763638004569992
Training date range: 2022-01-20 00:00:00 - 2023-09-06 00:00:00
Validation date range: 2023-09-07 00:00:00 - 2023-11-05 00:00:00
Validation Root Mean Squared Error: 3.592796397553405
Training date range: 2022-01-20 00:00:00 - 2023-11-05 00:00:00
Validation date range: 2023-11-06 00:00:00 - 2023-12-31 00:00:00
Validation Root Mean Squared Error: 3.3645605267062235
All Validation Scores: [2.763638004569992, 3.592796397553405, 3.3645605267062235]
Mean Validation Score: 3.2403316429432074
Standard Deviation of Validation Scores: 0.3497146663919628


In [13]:
# Validation RMSE for the reduced-feature model.
eval_model(model, X_train, y_train)

# In-sample RMSE of the same fitted pipeline.
y_train_pred = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Training RMSE: {rmse_train}")

# In-sample R^2; reuses the predictions above instead of predicting again.
r2_score(y_train, y_train_pred)

Training date range: 2022-01-20 00:00:00 - 2023-02-13 00:00:00
Validation date range: 2023-02-14 00:00:00 - 2023-09-06 00:00:00
Validation Root Mean Squared Error: 2.763638004569992
Training date range: 2022-01-20 00:00:00 - 2023-09-06 00:00:00
Validation date range: 2023-09-07 00:00:00 - 2023-11-05 00:00:00
Validation Root Mean Squared Error: 3.592796397553405
Training date range: 2022-01-20 00:00:00 - 2023-11-05 00:00:00
Validation date range: 2023-11-06 00:00:00 - 2023-12-31 00:00:00
Validation Root Mean Squared Error: 3.3645605267062235
All Validation Scores: [2.763638004569992, 3.592796397553405, 3.3645605267062235]
Mean Validation Score: 3.2403316429432074
Standard Deviation of Validation Scores: 0.3497146663919628
Training RMSE: 3.1544437696782834


0.2868084256178707

In [14]:
# 10-fold cross-validation of the current pipeline, scored with negative MSE
# (train scores are kept so fit and generalization can be compared per fold).
# NOTE(review): an integer `cv` does not take dates into account, unlike the
# date-ordered splits printed by eval_model - confirm this is intended.
scores = pd.DataFrame(
    cross_validate(
        model,
        X_train,
        y_train,
        return_train_score=True,
        scoring='neg_mean_squared_error',
        cv=10,
    )
)

In [15]:
# Convert the negative-MSE scores to RMSE in a new frame. `scores` itself is
# left untouched so this cell can be re-run safely: overwriting it in place
# would make a second run take the square root of negated RMSE values (NaN).
scores_rmse = scores.assign(
    test_score=np.sqrt(-scores['test_score']),
    train_score=np.sqrt(-scores['train_score']),
)

# Append the column means as a final summary row.
means = scores_rmse.mean().to_frame().transpose()

scores_with_mean = pd.concat([scores_rmse, means], ignore_index=True)
scores_with_mean


Unnamed: 0,fit_time,score_time,test_score,train_score
0,4.569424,0.100819,2.94981,3.189328
1,4.498721,0.084591,2.63246,3.157466
2,4.625869,0.090077,3.583957,3.058968
3,5.039016,0.096954,7.458909,2.631871
4,6.697103,0.08763,2.518143,3.168157
5,7.014241,0.084417,2.001271,3.205824
6,4.751561,0.096145,2.334346,3.184584
7,4.347068,0.102787,3.129604,3.111477
8,4.493299,0.101423,3.707375,3.055718
9,4.862108,0.091537,2.795161,3.162986


Ridge

In [16]:
def polynomial_ridge(numeric_feats, categorical_feats, binary_feats, degree=2, alpha=1.0, X=None, y=None):
    """Fit a polynomial-feature ridge regression and tabulate its coefficients.

    Numeric columns are zero-imputed, expanded with ``PolynomialFeatures`` and
    standardized; binary columns are passed through unchanged; categorical
    columns are one-hot encoded (unknown categories are ignored at predict
    time). Every other column of ``X`` is dropped.

    Parameters
    ----------
    numeric_feats, categorical_feats, binary_feats : list of str
        Column names for each preprocessing branch.
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.
    alpha : float, default 1.0
        Regularization strength passed to ``Ridge``.
    X, y : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which keeps existing calls working.

    Returns
    -------
    model : Pipeline
        The fitted preprocessing + ``Ridge`` pipeline.
    df_coef : pandas.DataFrame
        Columns ``feature`` and ``coefficients``, sorted by coefficient in
        descending order.
    """
    # Fall back to the notebook globals only when no data is passed explicitly.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    selected = set(numeric_feats + categorical_feats + binary_feats)
    drop_feats = [col for col in X.columns if col not in selected]

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('poly', PolynomialFeatures(degree=degree)),
        ('scaler', StandardScaler())
    ])
    categorical_pipeline = Pipeline([
        ('onehot', OneHotEncoder(categories='auto', handle_unknown='ignore'))
    ])

    # Transformer order is kept (drop, numeric, boolean, categorical) so code
    # that indexes `transformers_` by position on the returned model still works.
    preprocessor = ColumnTransformer(
        transformers=[
            ('drop', 'drop', drop_feats),
            ('numeric', numeric_pipeline, numeric_feats),
            ('boolean', 'passthrough', binary_feats),
            ('categorical', categorical_pipeline, categorical_feats)
        ]
    )

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', Ridge(alpha=alpha))
    ])

    model.fit(X, y)

    # Look fitted sub-transformers up by name rather than by position.
    fitted_pre = model.named_steps['preprocessor']
    poly = fitted_pre.named_transformers_['numeric'].named_steps['poly']
    onehot = fitted_pre.named_transformers_['categorical'].named_steps['onehot']

    poly_feature_names = [
        f'poly_{name}'
        for name in poly.get_feature_names_out(input_features=numeric_feats)
    ]
    cat_feature_names = onehot.get_feature_names_out(categorical_feats)

    # Same order as the ColumnTransformer output: numeric, boolean, categorical.
    feature_names = np.concatenate((poly_feature_names, binary_feats, cat_feature_names))
    # coef_ may be 2-D when the target is a single-column frame; flatten it.
    coefficients = (model.named_steps['regressor'].coef_).reshape(-1)

    df_coef = pd.DataFrame({'feature': feature_names, 'coefficients': coefficients})
    df_coef = df_coef.sort_values(by='coefficients', ascending=False)

    return model, df_coef



In [7]:
# Reduced feature set for the regularized models. The modelling helpers derive
# the columns to drop from these three lists themselves, so no separate drop
# list is built here.
numeric_feats = ['cap', 's/t-rate', 'opens', 'prp_forwardtix', 'ticket_sold-total', 'ticket_sold-last_7days', 
    'ticket_sold-yesterday', 'host_sold-total', 'host_sold-last_7days', 'host_sold-yesterday', 'resale_sold-last_7days', 
    'resale_sold-total', 'resale_asp-last_7days', 'resale_asp-total', 
    'initial_price', 'last_price', 'number_of_postings', 'median_posting_price', 
    'posting_below_cp', 'lowest_post_price', 'highest_post_price', 'host_sold_at_current_price', 
    'days_until_game', 'opponent_rank', 'van_rank', 
    'host_sold_agg_last_day', 'unique_views']

categorical_feats = ['month', 'opponent', 'price_code']
binary_feats = ['weekend_game']

In [18]:
# Ridge variant of the reduced-feature degree-2 model with a fixed penalty of alpha=1000.
model, df_coef = polynomial_ridge(numeric_feats, categorical_feats, binary_feats, degree=2, alpha=1000)

In [19]:
# Validation RMSE for the ridge model.
eval_model(model, X_train, y_train)

# In-sample RMSE of the same fitted pipeline.
y_train_pred = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Training RMSE: {rmse_train}")

# In-sample R^2 (displayed as the cell's output).
r2_score(y_train, y_train_pred)

Training date range: 2022-01-20 00:00:00 - 2023-02-13 00:00:00
Validation date range: 2023-02-14 00:00:00 - 2023-09-06 00:00:00
Validation Root Mean Squared Error: 2.664158593337854
Training date range: 2022-01-20 00:00:00 - 2023-09-06 00:00:00
Validation date range: 2023-09-07 00:00:00 - 2023-11-05 00:00:00
Validation Root Mean Squared Error: 3.3161589786394137
Training date range: 2022-01-20 00:00:00 - 2023-11-05 00:00:00
Validation date range: 2023-11-06 00:00:00 - 2023-12-31 00:00:00
Validation Root Mean Squared Error: 3.0995160336523124
All Validation Scores: [2.664158593337854, 3.3161589786394137, 3.0995160336523124]
Mean Validation Score: 3.026611201876527
Standard Deviation of Validation Scores: 0.2711241554227353
Training RMSE: 3.1296587312345934


0.29797174835418827

In [8]:
def polynomial_search(numeric_feats, categorical_feats, binary_feats,
                      n_iter=10, cv=5, random_state=42, X=None, y=None):
    """Randomized search over a polynomial ridge pipeline; tabulate the best model's coefficients.

    The pipeline zero-imputes, polynomially expands and standardizes the
    numeric columns, passes binary columns through, one-hot encodes the
    categorical columns and drops every other column of ``X`` before a
    ``Ridge`` regressor. Candidates are scored with negative mean squared error.

    Parameters
    ----------
    numeric_feats, categorical_feats, binary_feats : list of str
        Column names for each preprocessing branch.
    n_iter : int, default 10
        Number of parameter settings sampled by ``RandomizedSearchCV``.
    cv : int or cross-validation splitter, default 5
        Passed straight to ``RandomizedSearchCV``; supply a time-aware
        splitter here if the rows must be split in date order.
    random_state : int, default 42
        Seed for the parameter sampling.
    X, y : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which keeps existing calls working.

    Returns
    -------
    best_model : Pipeline
        ``best_estimator_`` of the search.
    df_coef : pandas.DataFrame
        Columns ``feature`` and ``coefficients``, sorted by coefficient in
        descending order.
    """
    # Fall back to the notebook globals only when no data is passed explicitly.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    selected = set(numeric_feats + categorical_feats + binary_feats)
    drop_feats = [col for col in X.columns if col not in selected]

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('poly', PolynomialFeatures()),
        ('scaler', StandardScaler())
    ])
    categorical_pipeline = Pipeline([
        ('onehot', OneHotEncoder(categories='auto', handle_unknown='ignore'))
    ])

    # Transformer order is kept (drop, numeric, boolean, categorical) so code
    # that indexes `transformers_` by position on the returned model still works.
    preprocessor = ColumnTransformer(
        transformers=[
            ('drop', 'drop', drop_feats),
            ('numeric', numeric_pipeline, numeric_feats),
            ('boolean', 'passthrough', binary_feats),
            ('categorical', categorical_pipeline, categorical_feats)
        ]
    )

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', Ridge())
    ])

    # Search space.
    param_dist = {
        # randint's upper bound is exclusive, so this only ever samples 2
        # (degree is deliberately forced to 2 for now).
        'preprocessor__numeric__poly__degree': randint(2, 3),
        # Regularization strength, sampled uniformly from [0, 10000].
        'regressor__alpha': uniform(0, 10000)
    }

    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_mean_squared_error',
        random_state=random_state,
    )
    random_search.fit(X, y)

    best_model = random_search.best_estimator_

    # Look fitted sub-transformers up by name rather than by position.
    fitted_pre = best_model.named_steps['preprocessor']
    poly = fitted_pre.named_transformers_['numeric'].named_steps['poly']
    onehot = fitted_pre.named_transformers_['categorical'].named_steps['onehot']

    poly_feature_names = [
        f'poly_{name}'
        for name in poly.get_feature_names_out(input_features=numeric_feats)
    ]
    cat_feature_names = onehot.get_feature_names_out(categorical_feats)

    # Same order as the ColumnTransformer output: numeric, boolean, categorical.
    feature_names = np.concatenate((poly_feature_names, binary_feats, cat_feature_names))
    # coef_ may be 2-D when the target is a single-column frame; flatten it.
    coefficients = (best_model.named_steps['regressor'].coef_).reshape(-1)

    df_coef = pd.DataFrame({'feature': feature_names, 'coefficients': coefficients})
    df_coef = df_coef.sort_values(by='coefficients', ascending=False)

    return best_model, df_coef


In [9]:
# Randomized search over the ridge penalty on the reduced feature set; returns
# the best pipeline found and its coefficient table.
model, df_coef = polynomial_search(numeric_feats, categorical_feats, binary_feats)

In [10]:
# Validation RMSE for the best pipeline found by the search.
eval_model(model, X_train, y_train)

# In-sample RMSE of the same fitted pipeline.
y_train_pred = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Training RMSE: {rmse_train}")

# In-sample R^2 (displayed as the cell's output).
r2_score(y_train, y_train_pred)

Training date range: 2022-01-20 00:00:00 - 2023-02-13 00:00:00
Validation date range: 2023-02-14 00:00:00 - 2023-09-06 00:00:00
Validation Root Mean Squared Error: 2.6549416636758654
Training date range: 2022-01-20 00:00:00 - 2023-09-06 00:00:00
Validation date range: 2023-09-07 00:00:00 - 2023-11-05 00:00:00
Validation Root Mean Squared Error: 3.26803053207042
Training date range: 2022-01-20 00:00:00 - 2023-11-05 00:00:00
Validation date range: 2023-11-06 00:00:00 - 2023-12-31 00:00:00
Validation Root Mean Squared Error: 3.0862336086223223
All Validation Scores: [2.6549416636758654, 3.26803053207042, 3.0862336086223223]
Mean Validation Score: 3.003068601456203
Standard Deviation of Validation Scores: 0.2571080239251386
Training RMSE: 3.131708995644922


0.29705163844715954

In [11]:
# Read the selected hyperparameters back from the best pipeline.
best_alpha = model.named_steps['regressor'].alpha
# Look the numeric sub-pipeline up by name instead of by its position in
# `transformers_`, so this keeps working if the transformer list changes.
best_degree = (
    model.named_steps['preprocessor']
    .named_transformers_['numeric']
    .named_steps['poly']
    .degree
)

In [12]:
# Ridge penalty of the selected model.
best_alpha

1559.9452033620264

In [31]:
# Polynomial degree of the selected model.
best_degree

2