
# Project 5: Hackathon
Emily & Prab

DSB-318

May 31, 2024

Restaurant Revenue Predictions

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Function Definitions
# Imports
import pandas as pd
import time
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date, time
from string import capwords

# Define my_date()
def my_date():
  return datetime.now().strftime('%Y-%m-%d_h%H-m%M-s%S')
my_date()

# Define autoplots()
def autoplots(d, y, line = False):
  '''a function to make a ton of graphs.
  Each plot is based on a subset of d where all variables in the
  plot have no null values.  The size of this subset (n) is 
  displayed in the subtitle of the plot, and can be used 
  similarly to d.isnull().sum(), if desired.
  
  args:
    d: dataframe, the dataframe of the information
    y: string, the name of the column within the dataframe that is the target
    line: bool, whether to plot line graphs, default = False
  return:
    a ton of plots in a FOLDER
  raise:
    pls no'''
  
  # Need these
  from string import capwords
  import matplotlib.pyplot as plt
  import seaborn as sns
  
  # Make a folder 
  try: 
    os.mkdir('images')
  except:
    pass
  a = f'images/plots_{my_date()}'
  os.mkdir(a)
  
  # Define this once
  n_grand = len(d[y])
  print(n_grand)
  # in future versions, I'd like to raise a warning
  # if len(d[y])!=len(d[d[y].notna()])    
  # (i.e., if there are nulls in the target)
  
  # Give y a good(ish) name
  ty = capwords(y.replace('_', ' '))
  print(ty)
  
  # Plot the distributions of all variables
  for i in d.columns:
    print(i)
    # Give it a good(ish) name
    t = capwords(i.replace('_', ' '))
    print(t)
    
    # Extract the subset dataframe, drop NAs, get n
    df = d[i]
    print(df.shape)
    df.dropna(inplace = True)
    n = len(df)
    
    # Plot a histogram of it
    plt.figure(figsize = (16, 9));
    plt.hist(df, bins = 'auto', color = 'purple');
    plt.suptitle(f'Distribution of {t}', size = 24)
    plt.title(f'Based on {n} Observations out of {n_grand}', size = 18)
    plt.xlabel(f'{t}', size = 20);
    plt.ylabel('Frequency', size = 20);
    plt.xticks(size = 16, rotation = 60);
    plt.yticks(size = 16)
    #plt.tight_layout()
    plt.savefig(f'./{a}/{i}_histogram.png')
    plt.close()
    
    # Plot a boxplot of it
    plt.figure(figsize = (16, 9))
    sns.boxplot(data = df, color = 'purple', orient = 'h')
    plt.suptitle(f'Distribution of {t}', size = 24)
    plt.title(f'Based on {n} Observations out of {n_grand}', size = 18)
    plt.xlabel(f'{t}', size = 20);
    plt.xticks(size = 16, rotation = 60)
    #plt.tight_layout()
    plt.savefig(f'./{a}/{i}_boxplot.png')
    plt.close()
    
  # Drop y from the list
  X = [col for col in list(d.drop(columns = [y]).columns)]
  
  # Make plots of each x against y
  for i in X:
    # Give it a good(ish) name
    t = capwords(i.replace('_', ' '))
    print(t)
    
    # Extract the subset dataframe, drop NAs, get n
    df = d[[i, y]]
    df.dropna(inplace = True)
    n = len(df[y])
    
    # Plot a scatterplot of it against y
    plt.figure(figsize = (16, 9))
    plt.scatter(df[i], df[y], alpha = 0.5, color = 'purple')
    plt.suptitle(f'Relationship between {t} and {ty}', size = 24)
    plt.title(f'Based on {n} Observations out of {n_grand}', size = 18)
    plt.xlabel(f'{t}', size = 20);
    plt.ylabel(f'{ty}', size = 20);
    plt.xticks(size = 16, rotation = 60)
    plt.yticks(size = 16)
    # plt.tight_layout()
    plt.savefig(f'./{a}/{t}-by-{y}_scatterplot.png')
    plt.close()
    
    # Plot a line plot of it against y
    if line==True:
      plt.figure(figsize = (16, 9))
      plt.plot(i, y, data = df, color = 'purple')
      plt.suptitle(f'Relationship between {t} and {ty}', size = 24)
      plt.title(f'Based on {n} Observations out of {n_grand}', size = 18)
      plt.xlabel(f'{t}', size = 20);
      plt.ylabel(f'{ty}', size = 20);
      plt.xticks(size = 16, rotation = 60)
      plt.yticks(size = 16)
      # plt.tight_layout()
      plt.savefig(f'./{a}/{i}-by-{y}_lineplot.png')
      plt.close()
    
  # All together now
  n = len(d[y])
  
  # Plot a line plot of everything against y
  if line==True:
    plt.figure(figsize = (16, 9))
    for i in X:
      print(i)
      plt.plot(i, y, data = d)
    plt.suptitle(f'Relationship between Predictors and {ty}', size = 24)
    plt.title(f'Based on {n_grand} Observations out of {n_grand}', size = 18)
    plt.xlabel(f'{t}', size = 20);
    plt.ylabel(f'{ty}', size = 20);
    plt.xticks(size = 16, rotation = 60)
    plt.yticks(size = 16)
    plt.legend();
    # plt.tight_layout()
    plt.savefig(f'./{a}/all-by-{y}_lineplot.png')
    plt.close()
  
  # Get some correlations
  corr = round(d.corr(numeric_only = True), 2)
  
  # Plot a heatmap
  mask = np.zeros_like(corr)
  mask[np.triu_indices_from(mask)] = True
  plt.figure(figsize = (16, 9))
  sns.heatmap(corr, square = True, 
    annot = True, cmap = 'coolwarm', mask = mask);
  plt.suptitle(f'Relationships Between Variables', size = 24)
  plt.title(f'Based on {n_grand} Observations out of {n_grand}', size = 18)
  # plt.tight_layout()
  plt.savefig(f'./{a}/all_heatmap.png')
  plt.close()
  
  # Plot a heatmap column on y
  if y in corr:
    plt.figure(figsize = (16, 9))
    sns.heatmap(np.asarray([corr[y].sort_values(ascending = False)]).T, 
      vmin = 0, vmax = 1, annot = True, cmap = 'coolwarm')
    plt.suptitle(f'Relationship between Predictors and {ty}', size = 24)
    plt.title(f'Based on {n_grand} Observations out of {n_grand}', size = 18)
    plt.xlabel(f'{ty}', size = 20)
    plt.yticklabels = True
    # plt.tight_layout()
    plt.savefig(f'./{a}/all-by-{y}_heatmap.png')
    plt.close()

In [3]:
# Data 
data = pd.read_csv('train.csv')

In [4]:
# Data Cleaning
data.isna().sum().sum() # 0, hooray!
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 43 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Id          137 non-null    int64  
 1   Open Date   137 non-null    object 
 2   City        137 non-null    object 
 3   City Group  137 non-null    object 
 4   Type        137 non-null    object 
 5   P1          137 non-null    int64  
 6   P2          137 non-null    float64
 7   P3          137 non-null    float64
 8   P4          137 non-null    float64
 9   P5          137 non-null    int64  
 10  P6          137 non-null    int64  
 11  P7          137 non-null    int64  
 12  P8          137 non-null    int64  
 13  P9          137 non-null    int64  
 14  P10         137 non-null    int64  
 15  P11         137 non-null    int64  
 16  P12         137 non-null    int64  
 17  P13         137 non-null    float64
 18  P14         137 non-null    int64  
 19  P15         137 non-null    i

In [5]:
# Column names
col_names = [c.lower().replace(' ', '_') for c in list(data.columns)]
data.columns = col_names

In [6]:
# Dtype conversions
data['open_date'] = pd.to_datetime(data['open_date'])

In [7]:
# Are these worth including?
data['city'].nunique() # 37 is too many for the number of rows we have
data['city_group'].nunique()
data['type'].nunique()

3

In [8]:
data['city'].unique()
data['city_group'].unique()
data['type'].unique()
# FC: Food Court, IL: Inline, DT: Drive Thru, MB: Mobile
# inline = in store, like a freestanding one
# mobile = burger truck or something (we don't have any of these)

array(['IL', 'FC', 'DT'], dtype=object)

In [9]:
# The city column is not useful for us
# data_backup = data.copy()  # make a copy, we're going to drop city for modelling
data.drop(columns = 'city', inplace = True)

In [10]:
# What are the distributions here?
data['city_group'].value_counts(dropna = False) #fine
data['type'].value_counts(dropna = False) # bad!

type
FC    76
IL    60
DT     1
Name: count, dtype: int64

In [11]:
# Binarize type because of weird proportions
data['fc_type'] = [1 if t=='FC' else 0 for t in data['type']]
data[['type', 'fc_type']] #check
# Drop the old column
data.drop(columns = 'type', inplace = True)

In [12]:
# Binarize city_group because why import OHE and all that for just this
data['city_group_ohe'] = [1 if t=='Big Cities' else 0 for t in data['city_group']]
data[['city_group', 'city_group_ohe']] #check
# Drop the old column
data.drop(columns = 'city_group', inplace = True)

In [13]:
# Check this again
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 42 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              137 non-null    int64         
 1   open_date       137 non-null    datetime64[ns]
 2   p1              137 non-null    int64         
 3   p2              137 non-null    float64       
 4   p3              137 non-null    float64       
 5   p4              137 non-null    float64       
 6   p5              137 non-null    int64         
 7   p6              137 non-null    int64         
 8   p7              137 non-null    int64         
 9   p8              137 non-null    int64         
 10  p9              137 non-null    int64         
 11  p10             137 non-null    int64         
 12  p11             137 non-null    int64         
 13  p12             137 non-null    int64         
 14  p13             137 non-null    float64       
 15  p14   

In [14]:
# Visualizations
autoplots(data, 'revenue')

137
Revenue
id
Id
(137,)
open_date
Open Date
(137,)
p1
P1
(137,)
p2
P2
(137,)
p3
P3
(137,)
p4
P4
(137,)
p5
P5
(137,)
p6
P6
(137,)
p7
P7
(137,)
p8
P8
(137,)
p9
P9
(137,)
p10
P10
(137,)
p11
P11
(137,)
p12
P12
(137,)
p13
P13
(137,)
p14
P14
(137,)
p15
P15
(137,)
p16
P16
(137,)
p17
P17
(137,)
p18
P18
(137,)
p19
P19
(137,)
p20
P20
(137,)
p21
P21
(137,)
p22
P22
(137,)
p23
P23
(137,)
p24
P24
(137,)
p25
P25
(137,)
p26
P26
(137,)
p27
P27
(137,)
p28
P28
(137,)
p29
P29
(137,)
p30
P30
(137,)
p31
P31
(137,)
p32
P32
(137,)
p33
P33
(137,)
p34
P34
(137,)
p35
P35
(137,)
p36
P36
(137,)
p37
P37
(137,)
revenue
Revenue
(137,)
fc_type
Fc Type
(137,)
city_group_ohe
City Group Ohe
(137,)
Id


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


Open Date


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P7


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P8


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P9


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P11


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P12


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P13


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P14


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P15


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P16


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P17


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P18


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P19


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P20


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P21


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P22


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P23


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P24
P25


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P26


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P27


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P28


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P29


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P30


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P31


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P32


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P33


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P34


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P35


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P36


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


P37


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


Fc Type


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


City Group Ohe


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


In [15]:
# lr won't take a datetime
data['year'] = data['open_date'].dt.year
data['month'] = data['open_date'].dt.month

In [16]:
data.info() # awesome, they're just integers

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 44 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              137 non-null    int64         
 1   open_date       137 non-null    datetime64[ns]
 2   p1              137 non-null    int64         
 3   p2              137 non-null    float64       
 4   p3              137 non-null    float64       
 5   p4              137 non-null    float64       
 6   p5              137 non-null    int64         
 7   p6              137 non-null    int64         
 8   p7              137 non-null    int64         
 9   p8              137 non-null    int64         
 10  p9              137 non-null    int64         
 11  p10             137 non-null    int64         
 12  p11             137 non-null    int64         
 13  p12             137 non-null    int64         
 14  p13             137 non-null    float64       
 15  p14   

In [17]:
# Drop the old column
data.drop(columns = 'open_date', inplace = True)

In [18]:
# Train Test Split
# With more time, we might have checked each column in case 
# we needed to stratify, but we have short time and it's already
# weird, so we're just plowing ahead.
X = data.drop(columns = ['revenue', 'id'])
y = data['revenue']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size = 0.2, random_state = 19)  
# 3 rings for the elven kings
# 7 for the dwarf lords in their halls of stone
# 9 for mortal men doomed to die

In [20]:
X_train.shape
X_test.shape
y_train.shape
y_test.shape

(28,)

In [21]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train)
lr.score(X_test, y_test)

-0.0564884312967624

In [22]:
# Well that was terrible.  Let's try regularization
ss = StandardScaler()
a = np.linspace(0.1, 50, 200)
lasso = LassoCV()
pipe = Pipeline([('ss', ss), ('lasso', lasso)])
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)
pipe.score(X_test, y_test)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

0.016688192898308585

In [23]:
# Coefficients??
list(zip(X_train.columns, lasso.coef_))
# Good grief, everything is 0 except year, 
# and year is negative.

[('p1', 0.0),
 ('p2', 0.0),
 ('p3', -0.0),
 ('p4', 0.0),
 ('p5', -0.0),
 ('p6', 0.0),
 ('p7', 0.0),
 ('p8', -0.0),
 ('p9', -0.0),
 ('p10', -0.0),
 ('p11', 0.0),
 ('p12', -0.0),
 ('p13', -0.0),
 ('p14', -0.0),
 ('p15', -0.0),
 ('p16', -0.0),
 ('p17', 0.0),
 ('p18', -0.0),
 ('p19', 0.0),
 ('p20', -0.0),
 ('p21', 0.0),
 ('p22', 0.0),
 ('p23', 0.0),
 ('p24', -0.0),
 ('p25', 0.0),
 ('p26', -0.0),
 ('p27', -0.0),
 ('p28', 0.0),
 ('p29', -0.0),
 ('p30', -0.0),
 ('p31', -0.0),
 ('p32', -0.0),
 ('p33', -0.0),
 ('p34', -0.0),
 ('p35', -0.0),
 ('p36', -0.0),
 ('p37', -0.0),
 ('fc_type', 0.0),
 ('city_group_ohe', 0.0),
 ('year', -173242.13321384642),
 ('month', 0.0)]

In [24]:
# SLRs
# year
lr_year = lr.fit(X_train[['year']], y_train)
lr_year.score(X_train[['year']], y_train)
lr_year.score(X_test[['year']], y_test)
lr_year.coef_

array([-171386.19105246])

In [25]:
# food court or not
logr = LogisticRegression()
logr_fc = logr.fit(X_train[['fc_type']], y_train)
logr_fc.score(X_train[['fc_type']], y_train)
logr_fc.score(X_test[['fc_type']], y_test)

0.0

In [26]:
# Get some predictions
# full model
pipe_preds_X_train = pipe.predict(X_train)
pipe_preds_X_test = pipe.predict(X_test)

In [27]:
# SLR
lr_year_preds_X_train = lr_year.predict(X_train[['year']])
lr_year_preds_X_test = lr_year.predict(X_test[['year']])

In [28]:
# RMSE
print(f'''
Full model, training set: {mean_squared_error(y_train, pipe_preds_X_train)}
Full model, testing set: {mean_squared_error(y_test, pipe_preds_X_test)}
Year only, training set: {mean_squared_error(y_train, lr_year_preds_X_train)}
Year only, testing set: {mean_squared_error(y_test, lr_year_preds_X_test)}''')


Full model, training set: 4891874257132.587
Full model, testing set: 11995032646679.521
Year only, training set: 4602303922794.159
Year only, testing set: 11045386798843.324


In conclusion, the best way to make money in the restaurant industry is to master time travel first.