# Data Modeling — Chapter 12: Linear Models

In [1]:
import pandas as pd

import seaborn as sns


In [2]:
# Load seaborn's "tips" example dataset and preview its first rows.
tips = sns.load_dataset('tips')
print(tips.head(n=5))  # n=5 is head()'s default, written out for clarity

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [3]:
import statsmodels.formula.api as smf

In [4]:
# Specify an OLS regression of tip on total_bill via the formula interface.
# Nothing is estimated here; that happens when .fit() is called on the model.
model = smf.ols('tip ~ total_bill', data=tips)

In [5]:
# Estimate the model; the returned results object (not `model` itself) is what
# the following cells query for the summary, parameters and intervals.
results = model.fit()

In [6]:
# Show the classes involved: fitting a model object yields a separate results object.
tuple(type(obj) for obj in (results, model))

(statsmodels.regression.linear_model.RegressionResultsWrapper,
 statsmodels.regression.linear_model.OLS)

In [7]:
# Print the regression summary table (fit statistics and coefficient estimates).
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.457
Model:                            OLS   Adj. R-squared:                  0.454
Method:                 Least Squares   F-statistic:                     203.4
Date:                Tue, 26 Jul 2022   Prob (F-statistic):           6.69e-34
Time:                        08:53:42   Log-Likelihood:                -350.54
No. Observations:                 244   AIC:                             705.1
Df Residuals:                     242   BIC:                             712.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9203      0.160      5.761      0.0

In [8]:
# Estimated coefficients, labelled by term (Intercept and total_bill).
print(results.params)

Intercept     0.920270
total_bill    0.105025
dtype: float64


In [9]:
# Lower (column 0) and upper (column 1) confidence bounds for each coefficient.
print(results.conf_int())

                   0         1
Intercept   0.605622  1.234918
total_bill  0.090517  0.119532


In [10]:
from sklearn import linear_model

In [11]:
# Create a scikit-learn linear regression estimator; it is fitted in a later cell.
lr = linear_model.LinearRegression()

In [12]:
# Deliberate failure: scikit-learn expects X to be 2-D, but selecting a single
# column gives a 1-D Series. Catch the resulting ValueError and display it so
# the notebook still survives Restart & Run All; the next cell repeats the fit
# with a reshaped X.
try:
    predicted = lr.fit(X=tips['total_bill'], y=tips['tip'])
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Expected 2D array, got 1D array instead:
array=[16.99 10.34 21.01 23.68 24.59 25.29  8.77 26.88 15.04 14.78 10.27 35.26
 15.42 18.43 14.83 21.58 10.33 16.29 16.97 20.65 17.92 20.29 15.77 39.42
 19.82 17.81 13.37 12.69 21.7  19.65  9.55 18.35 15.06 20.69 17.78 24.06
 16.31 16.93 18.69 31.27 16.04 17.46 13.94  9.68 30.4  18.29 22.23 32.4
 28.55 18.04 12.54 10.29 34.81  9.94 25.56 19.49 38.01 26.41 11.24 48.27
 20.29 13.81 11.02 18.29 17.59 20.08 16.45  3.07 20.23 15.01 12.02 17.07
 26.86 25.28 14.73 10.51 17.92 27.2  22.76 17.29 19.44 16.66 10.07 32.68
 15.98 34.83 13.03 18.28 24.71 21.16 28.97 22.49  5.75 16.32 22.75 40.17
 27.28 12.03 21.01 12.46 11.35 15.38 44.3  22.42 20.92 15.36 20.49 25.21
 18.24 14.31 14.    7.25 38.07 23.95 25.71 17.31 29.93 10.65 12.43 24.08
 11.69 13.42 14.26 15.95 12.48 29.8   8.52 14.52 11.38 22.82 19.08 20.27
 11.17 12.26 18.26  8.51 10.33 14.15 16.   13.16 17.47 34.3  41.19 27.05
 16.43  8.35 18.64 11.87  9.78  7.51 14.07 13.13 17.26 24.55 19.77 29.85
 48.17 25.   13.39 16.49 21.5  12.66 16.21 13.81 17.51 24.52 20.76 31.71
 10.59 10.63 50.81 15.81  7.25 31.85 16.82 32.9  17.89 14.48  9.6  34.63
 34.65 23.33 45.35 23.17 40.55 20.69 20.9  30.46 18.15 23.1  15.69 19.81
 28.44 15.48 16.58  7.56 10.34 43.11 13.   13.51 18.71 12.74 13.   16.4
 20.53 16.47 26.59 38.73 24.27 12.76 30.06 25.89 48.33 13.27 28.17 12.9
 28.15 11.59  7.74 30.14 12.16 13.42  8.58 15.98 13.42 16.27 10.09 20.45
 13.28 22.12 24.01 15.69 11.61 10.77 15.53 10.07 12.6  32.83 35.83 29.03
 27.18 22.67 17.82 18.78].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [13]:
# scikit-learn needs a 2-D feature matrix, so reshape the total_bill column
# into a single-column array before fitting. Later cells read coef_ and
# intercept_ from the object that fit() returns.
predicted = lr.fit(
    X=tips['total_bill'].to_numpy().reshape(-1, 1),
    y=tips['tip'],
)

In [14]:
# Inspect the reshaped predictor: reshape(-1, 1) turns the 1-D column into a
# single-column 2-D array, the shape passed to lr.fit in the previous cell.
tips['total_bill'].to_numpy().reshape(-1, 1)

array([[16.99],
       [10.34],
       [21.01],
       [23.68],
       [24.59],
       [25.29],
       [ 8.77],
       [26.88],
       [15.04],
       [14.78],
       [10.27],
       [35.26],
       [15.42],
       [18.43],
       [14.83],
       [21.58],
       [10.33],
       [16.29],
       [16.97],
       [20.65],
       [17.92],
       [20.29],
       [15.77],
       [39.42],
       [19.82],
       [17.81],
       [13.37],
       [12.69],
       [21.7 ],
       [19.65],
       [ 9.55],
       [18.35],
       [15.06],
       [20.69],
       [17.78],
       [24.06],
       [16.31],
       [16.93],
       [18.69],
       [31.27],
       [16.04],
       [17.46],
       [13.94],
       [ 9.68],
       [30.4 ],
       [18.29],
       [22.23],
       [32.4 ],
       [28.55],
       [18.04],
       [12.54],
       [10.29],
       [34.81],
       [ 9.94],
       [25.56],
       [19.49],
       [38.01],
       [26.41],
       [11.24],
       [48.27],
       [20.29],
       [13.81],
       [

In [15]:
# Slope estimate(s) as an array; there is a single entry for this one-feature fit.
print(predicted.coef_)

[0.10502452]


In [16]:
# Intercept of the fitted line, shown as the cell's output value.
predicted.intercept_

0.9202696135546735

In [18]:
# Regress tip on two predictors. Note that `model` here holds the fitted
# results (the return value of .fit()), not the unfitted model specification.
model = smf.ols(formula='tip ~ total_bill + size', data=tips).fit()
# The method is `summary`; the previous spelling `sumamry` raised AttributeError.
print(model.summary())

In [20]:
# List the distinct values of the categorical `day` column.
print(tips['day'].unique())

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']


In [21]:
# Fit tip against all remaining columns, numeric and categorical alike.
# As before, `model` ends up holding the fitted results because .fit() is chained.
model = smf.ols(
    'tip ~ total_bill + size + sex + smoker + day + time',
    data=tips,
).fit()

In [22]:
# `model` holds the fitted results from the previous cell, so summary() is available.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.470
Model:                            OLS   Adj. R-squared:                  0.452
Method:                 Least Squares   F-statistic:                     26.06
Date:                Tue, 26 Jul 2022   Prob (F-statistic):           1.20e-28
Time:                        09:16:00   Log-Likelihood:                -347.48
No. Observations:                 244   AIC:                             713.0
Df Residuals:                     235   BIC:                             744.4
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.5908      0.256      2.

In [23]:
# Start from a fresh estimator for the multi-predictor fit in the next cell.
lr = linear_model.LinearRegression()

In [24]:
# Two-predictor fit: selecting with a list of column names yields a 2-D
# DataFrame, so no reshape is needed here.
predicted = lr.fit(
    X=tips[['total_bill', 'size']],
    y=tips['tip'],
)
# One coefficient per feature column.
print(predicted.coef_)

[0.09271334 0.19259779]


In [25]:
# Intercept of the two-predictor fit from the previous cell.
print(predicted.intercept_)

0.6689447408125031


In [26]:
# One-hot encode the categorical predictors. With the default settings every
# level gets its own indicator column (e.g. both sex_Male and sex_Female).
predictor_columns = ['total_bill', 'size', 'sex', 'smoker', 'day', 'time']
tips_dummy = pd.get_dummies(tips[predictor_columns])

print(tips_dummy.head())

   total_bill  size  sex_Male  sex_Female  smoker_Yes  smoker_No  day_Thur  \
0       16.99     2         0           1           0          1         0   
1       10.34     3         1           0           0          1         0   
2       21.01     3         1           0           0          1         0   
3       23.68     2         1           0           0          1         0   
4       24.59     4         0           1           0          1         0   

   day_Fri  day_Sat  day_Sun  time_Lunch  time_Dinner  
0        0        0        1           0            1  
1        0        0        1           0            1  
2        0        0        1           0            1  
3        0        0        1           0            1  
4        0        0        1           0            1  


In [27]:
# Same encoding, but drop_first=True omits the first level of each categorical,
# leaving that level as the reference the remaining indicators are compared to.
predictor_columns = ['total_bill', 'size', 'sex', 'smoker', 'day', 'time']
x_tips_dummy_ref = pd.get_dummies(tips[predictor_columns], drop_first=True)

print(x_tips_dummy_ref.head())

   total_bill  size  sex_Female  smoker_No  day_Fri  day_Sat  day_Sun  \
0       16.99     2           1          1        0        0        1   
1       10.34     3           0          1        0        0        1   
2       21.01     3           0          1        0        0        1   
3       23.68     2           0          1        0        0        1   
4       24.59     4           1          1        0        0        1   

   time_Dinner  
0            1  
1            1  
2            1  
3            1  
4            1  


In [28]:
# Fit scikit-learn's linear regression on the reference-coded predictors.
lr = linear_model.LinearRegression()
predicted = lr.fit(x_tips_dummy_ref, tips['tip'])

# One coefficient per column of x_tips_dummy_ref; the intercept is stored separately.
print(predicted.coef_)

[ 0.09448701  0.175992    0.03244094  0.08640832  0.1622592   0.04080082
  0.13677854 -0.0681286 ]


In [31]:
import numpy as np

# Build a fresh estimator and fit it on the reference-coded predictors.
lr = linear_model.LinearRegression()
predicted = lr.fit(x_tips_dummy_ref, tips['tip'])

# Gather every estimate into one flat array: the intercept first, followed by
# the per-column coefficients.
values = np.hstack((predicted.intercept_, predicted.coef_))

In [33]:
# Column names of the predictor frame; the next cell pairs them with the fitted coefficients.
x_tips_dummy_ref.columns

Index(['total_bill', 'size', 'sex_Female', 'smoker_No', 'day_Fri', 'day_Sat',
       'day_Sun', 'time_Dinner'],
      dtype='object')

In [34]:
# Row labels for the estimates: 'intercept' followed by the predictor column names.
names = np.append('intercept', x_tips_dummy_ref.columns)

# Assemble a one-column table of estimates; the dict key becomes the column label.
# Note: this rebinds `results`, which earlier held the statsmodels fit.
results = pd.DataFrame({'coef': values}, index=names)
print(results)

                 coef
intercept    0.590837
total_bill   0.094487
size         0.175992
sex_Female   0.032441
smoker_No    0.086408
day_Fri      0.162259
day_Sat      0.040801
day_Sun      0.136779
time_Dinner -0.068129
