# Regression Analysis
Taylor Swearingen

In [None]:
# Setting up the libraries and data
# import libraries 
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf 
from sklearn.model_selection import train_test_split 
import sklearn.linear_model 

# location of the Apprentice Chef workbook, relative to this notebook
file = './Apprentice_Chef_Dataset.xlsx'

# load the workbook into a DataFrame
Chef = pd.read_excel(file)

# preview the first five rows (last expression, so the cell displays it)
Chef.head(5)

In [None]:
# explanatory (x) columns used by every model in this notebook
continuous_x_variables = [
    'CROSS_SELL_SUCCESS',
    'TOTAL_MEALS_ORDERED',
    'UNIQUE_MEALS_PURCH',
    'CONTACTS_W_CUSTOMER_SERVICE',
    'PRODUCT_CATEGORIES_VIEWED',
    'AVG_TIME_PER_SITE_VISIT',
    'CANCELLATIONS_BEFORE_NOON',
    'CANCELLATIONS_AFTER_NOON',
    'TASTES_AND_PREFERENCES',
    'PC_LOGINS',
    'MOBILE_LOGINS',
    'WEEKLY_PLAN',
    'EARLY_DELIVERIES',
    'LATE_DELIVERIES',
    'PACKAGE_LOCKER',
    'REFRIGERATED_LOCKER',
    'AVG_PREP_VID_TIME',
    'LARGEST_ORDER_SIZE',
    'MASTER_CLASSES_ATTENDED',
    'MEDIAN_MEAL_RATING',
    'AVG_CLICKS_PER_VISIT',
    'TOTAL_PHOTOS_VIEWED',
]

# response (y) column, kept as a one-element list so that selecting with it
# yields a single-column DataFrame
y_variable = ['REVENUE']

In [None]:
# explanatory columns and the response column, both taken from the full dataset
x_data = Chef[continuous_x_variables]
y_data = Chef[y_variable]

# hold out 25% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.25, random_state=219)

# one DataFrame per split holding the predictors plus REVENUE, which is the
# shape the formula-based models further down take as their `data` argument
chef_train = x_train.assign(REVENUE=y_train['REVENUE'])
chef_test = x_test.assign(REVENUE=y_test['REVENUE'])

In [None]:
# Baseline: scikit-learn linear regression, scored with R-squared on both splits

# label used in the printed report
model_name = "Linear Regression Model"

# instantiate the estimator
model = sklearn.linear_model.LinearRegression()

# fit on the training split
model_fit = model.fit(x_train, y_train)

# predictions for the held-out split
model_pred = model.predict(x_test)

# R-squared on each split and the absolute gap between them.
# Built-in round() is used rather than the NumPy-only .round() method: the
# value returned by score() may be a plain Python float (newer scikit-learn
# releases), which has no .round attribute; round() handles both types.
model_train_score = round(model.score(x_train, y_train), 4)
model_test_score  = round(model.score(x_test, y_test), 4)
model_gap         = round(abs(model_train_score - model_test_score), 4)

# assemble and print the report
LR_model =  f"""\
Model Name:     {model_name}
Train_Score:    {model_train_score}
Test_Score:     {model_test_score}
Train-Test Gap: {model_gap}"""

print(LR_model)

In [None]:
# Baseline OLS regression through the statsmodels formula interface

# REVENUE regressed on every explanatory column.
# NOTE(review): this model is fitted on the full Chef DataFrame, not on the
# training split.
lm_fit = smf.ols(formula = """ REVENUE   ~  CROSS_SELL_SUCCESS +
                                            TOTAL_MEALS_ORDERED +
                                            UNIQUE_MEALS_PURCH +
                                            CONTACTS_W_CUSTOMER_SERVICE +
                                            PRODUCT_CATEGORIES_VIEWED +
                                            AVG_TIME_PER_SITE_VISIT +
                                            CANCELLATIONS_BEFORE_NOON +
                                            CANCELLATIONS_AFTER_NOON +
                                            TASTES_AND_PREFERENCES +
                                            PC_LOGINS +
                                            MOBILE_LOGINS +
                                            WEEKLY_PLAN +
                                            EARLY_DELIVERIES +
                                            LATE_DELIVERIES +
                                            PACKAGE_LOCKER +
                                            REFRIGERATED_LOCKER +
                                            AVG_PREP_VID_TIME +
                                            LARGEST_ORDER_SIZE +
                                            MASTER_CLASSES_ATTENDED +
                                            MEDIAN_MEAL_RATING +
                                            AVG_CLICKS_PER_VISIT +
                                            TOTAL_PHOTOS_VIEWED """,
                 data = Chef)

# estimate the coefficients and keep the R-squared rounded to four decimals
results_fit     = lm_fit.fit()
results_rsquare = np.round(results_fit.rsquared, 4)

# build the regression summary and report the baseline fit
results_fit_summary = results_fit.summary()
print(f"Original R-Squared Value : {results_rsquare}")

In [None]:
# Choosing which terms to leave out when the model is re-fitted

# Coefficient p-values, indexed by term name, read directly from the fitted
# results. This replaces parsing the summary's HTML table, which needed an HTML
# parser and only exposed the p-values as rounded to three decimals for
# display, so a term just under the threshold could read as 0.050.
model_pvals = results_fit.pvalues

# Terms whose p-value is at or above the 5% significance level.
# - Series.items() is used because Series.iteritems() was removed in pandas 2.0.
# - 'Intercept' is never a candidate: it is the model constant, not a feature,
#   and dropping it makes statsmodels report an uncentered R-squared that is
#   not comparable with the R-squared of a model that keeps the constant.
drop_features = [feature
                 for feature, pval in model_pvals.items()
                 if pval >= .05 and feature != 'Intercept']

# array form handed to the drop_cols argument of the tuned models
drop_cols_param = np.array(drop_features)

# show which terms will be dropped
print(drop_features)

In [None]:
# Tuned OLS model, fitted on the training split

# label for this model
model_name_tuned_train = "OLS Model Tuned (train)"

# same formula as the baseline; the terms listed in drop_cols_param are removed
# from the design matrix through the drop_cols argument
OLS_tuned_train = smf.ols(
    formula   = """REVENUE ~  CROSS_SELL_SUCCESS +
                                            TOTAL_MEALS_ORDERED +
                                            UNIQUE_MEALS_PURCH +
                                            CONTACTS_W_CUSTOMER_SERVICE +
                                            PRODUCT_CATEGORIES_VIEWED +
                                            AVG_TIME_PER_SITE_VISIT +
                                            CANCELLATIONS_BEFORE_NOON +
                                            CANCELLATIONS_AFTER_NOON +
                                            TASTES_AND_PREFERENCES +
                                            PC_LOGINS +
                                            MOBILE_LOGINS +
                                            WEEKLY_PLAN +
                                            EARLY_DELIVERIES +
                                            LATE_DELIVERIES +
                                            PACKAGE_LOCKER +
                                            REFRIGERATED_LOCKER +
                                            AVG_PREP_VID_TIME +
                                            LARGEST_ORDER_SIZE +
                                            MASTER_CLASSES_ATTENDED +
                                            MEDIAN_MEAL_RATING +
                                            AVG_CLICKS_PER_VISIT +
                                            TOTAL_PHOTOS_VIEWED""",
    data      = chef_train,
    drop_cols = drop_cols_param)

# fit on the training rows and keep the R-squared rounded to four decimals
OLS_fit_tuned_train     = OLS_tuned_train.fit()
OLS_tuned_train_rsquare = np.round(OLS_fit_tuned_train.rsquared, 4)

In [None]:
# Tuned OLS model, fitted on the testing split

# label for this model
model_name_test = "OLS Model Tuned (test)"

# same formula and dropped terms as the tuned training model, but built on the
# testing rows.
# NOTE(review): this fits a separate model on chef_test, so the R-squared below
# is that model's own fit to the testing rows, not a score of the model that
# was fitted on chef_train.
OLS_tuned_test = smf.ols(
    formula   = """REVENUE ~  CROSS_SELL_SUCCESS +
                                            TOTAL_MEALS_ORDERED +
                                            UNIQUE_MEALS_PURCH +
                                            CONTACTS_W_CUSTOMER_SERVICE +
                                            PRODUCT_CATEGORIES_VIEWED +
                                            AVG_TIME_PER_SITE_VISIT +
                                            CANCELLATIONS_BEFORE_NOON +
                                            CANCELLATIONS_AFTER_NOON +
                                            TASTES_AND_PREFERENCES +
                                            PC_LOGINS +
                                            MOBILE_LOGINS +
                                            WEEKLY_PLAN +
                                            EARLY_DELIVERIES +
                                            LATE_DELIVERIES +
                                            PACKAGE_LOCKER +
                                            REFRIGERATED_LOCKER +
                                            AVG_PREP_VID_TIME +
                                            LARGEST_ORDER_SIZE +
                                            MASTER_CLASSES_ATTENDED +
                                            MEDIAN_MEAL_RATING +
                                            AVG_CLICKS_PER_VISIT +
                                            TOTAL_PHOTOS_VIEWED""",
    data      = chef_test,
    drop_cols = drop_cols_param)

# fit on the testing rows and keep the R-squared rounded to four decimals
OLS_fit_tuned_test     = OLS_tuned_test.fit()
OLS_tuned_test_rsquare = np.round(OLS_fit_tuned_test.rsquared, 4)

In [None]:
# Final comparison: baseline OLS versus the tuned OLS on each split

# label for the tuned model and the absolute difference between its two
# R-squared values, rounded to four decimals
model_name = "Tuned OLS Model"
model_gap  = np.round(abs(OLS_tuned_train_rsquare - OLS_tuned_test_rsquare), 4)

# assemble and print the report
OLS_model =  f"""\
Model Name:     Original OLS Model
Original Score: {results_rsquare}

Model Name:     {model_name}
Train_Score:    {OLS_tuned_train_rsquare}
Test_Score:     {OLS_tuned_test_rsquare}
Train-Test Gap: {model_gap}"""

print(OLS_model)

# Analyzing the Results

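After tuning the model, we can see that the R-squared value went up significantly: the original model explained roughly 64% of the variance in revenue, while the tuned model explains roughly 92%. Note that R-squared measures the share of variance explained by the model, not the accuracy of individual predictions.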