In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [2]:
# Load seaborn's "tips" example dataset (one row per restaurant party).
data = sns.load_dataset("tips")

# Peek at a few hand-picked rows (by position) that between them cover Thur, Fri, Sat and Sun.
data.iloc[[193, 90, 25, 26, 190]]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
193,15.48,2.02,Male,Yes,Thur,Lunch,2
90,28.97,3.0,Male,Yes,Fri,Dinner,2
25,17.81,2.34,Male,No,Sat,Dinner,4
26,13.37,2.0,Male,No,Sat,Dinner,2
190,15.69,1.5,Male,Yes,Sun,Dinner,2


In [3]:
# Keep things simple: work with only three features of each party.
three_features = ['total_bill', 'size', 'day']

# Select just those columns and take an explicit, independent copy of them.
# (Wrapping the selection in pd.DataFrame(...) does not copy by default, so
# .copy() is what actually matches the "copy of the dataset" intent.)
three_feature_data = data[three_features].copy()
three_feature_data.head(5)

Unnamed: 0,total_bill,size,day
0,16.99,2,Sun
1,10.34,3,Sun
2,21.01,3,Sun
3,23.68,2,Sun
4,24.59,4,Sun


In [4]:
# One-hot encode the day of the week: one indicator column per day
# (Thur, Fri, Sat, Sun), True on the rows whose `day` matches that column.
dummies = pd.get_dummies(three_feature_data.loc[:, 'day'])

# Inspect a few hand-picked rows so every day's indicator shows up at least once.
dummies.iloc[[193, 90, 25, 26, 190]]

Unnamed: 0,Thur,Fri,Sat,Sun
193,True,False,False,False
90,False,True,False,False
25,False,False,True,False
26,False,False,True,False
190,False,False,False,True


In [5]:
# Attach the day indicator columns to the right of the three-feature table
# (column-wise concatenation; rows are matched up by index).
data_w_dummies = pd.concat(
    [three_feature_data, dummies],
    axis="columns",
)
data_w_dummies.iloc[[193, 90, 25, 26, 190]]

Unnamed: 0,total_bill,size,day,Thur,Fri,Sat,Sun
193,15.48,2,Thur,True,False,False,False
90,28.97,2,Fri,False,True,False,False
25,17.81,4,Sat,False,False,True,False
26,13.37,2,Sat,False,False,True,False
190,15.69,2,Sun,False,False,False,True


In [6]:
# Drop the non-numeric "day" column; the Thur/Fri/Sat/Sun indicators now carry
# that information.
# Rebinding the result of .drop(...) (instead of `del`) keeps this cell safe to
# re-run: with errors="ignore" a second execution is a no-op rather than a KeyError.
data_w_dummies = data_w_dummies.drop(columns=["day"], errors="ignore")
data_w_dummies.head(5)

Unnamed: 0,total_bill,size,Thur,Fri,Sat,Sun
0,16.99,2,False,False,False,True
1,10.34,3,False,False,False,True
2,21.01,3,False,False,False,True
3,23.68,2,False,False,False,True
4,24.59,4,False,False,False,True


In [8]:
# Linear regression with the intercept switched off: each row has exactly one
# day indicator set, so the four day coefficients play the role of per-day offsets.
f_with_day = linear_model.LinearRegression(fit_intercept=False)

# Inputs: total_bill, size and the day indicators; target: the tip amount.
f_with_day.fit(X=data_w_dummies, y=data["tip"])

In [9]:
# Fitted weights, one per column of the frame passed to fit(), in column order:
# total_bill, size, Thur, Fri, Sat, Sun.
f_with_day.coef_

array([0.09299361, 0.18713231, 0.66829361, 0.74578683, 0.62112858,
       0.73228865])

In [10]:
## Prediction for a party of 3, with a $50 total bill, eating on a thursday
# Spell out every feature by name so the day encoding is unambiguous and the
# input carries the same column names (in the same order) the model was fitted
# with; a bare list of numbers makes scikit-learn warn that
# "X does not have valid feature names".
f_with_day.predict(
    pd.DataFrame(
        [{"total_bill": 50, "size": 3, "Thur": 1, "Fri": 0, "Sat": 0, "Sun": 0}]
    )
)



array([5.87937107])

In [11]:
## Prediction for a party of 3 with a $50 total bill, eating on a sunday
# The indicator columns are ordered Thur, Fri, Sat, Sun, so a 1 in the last
# position selects Sunday (not Saturday). Naming each feature makes that
# explicit and matches the column names the model was fitted with, which also
# avoids scikit-learn's "X does not have valid feature names" warning.
f_with_day.predict(
    pd.DataFrame(
        [{"total_bill": 50, "size": 3, "Thur": 0, "Fri": 0, "Sat": 0, "Sun": 1}]
    )
)



array([5.94336612])

In [12]:
## Let's compute the MSE
# Mean squared error between the observed tips and the model's predictions on
# the very rows it was fitted to, i.e. an in-sample (training) error.
mean_squared_error(data['tip'], f_with_day.predict(data_w_dummies))

1.0121298853078433

In [13]:
# Tip against total bill for every party, with one OLS trend line through all points.
px.scatter(
    data,
    x="total_bill",
    y="tip",
    trendline="ols",
)

In [14]:
# Tip against total bill, coloured by day of the week, with an OLS trend line requested.
px.scatter(
    data,
    x="total_bill",
    y="tip",
    color="day",
    trendline="ols",
)