In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns

from scipy import stats

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
# from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

In [2]:
from patsy import dmatrices
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
import sklearn.metrics as metrics

In [3]:
from sklearn.model_selection import cross_val_score, train_test_split

In [4]:
carseats = sm.datasets.get_rdataset(dataname="Carseats", package="ISLR", cache=True)

In [5]:
carseats_df = carseats.data

In [6]:
carseats_df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [7]:
carseats_df = pd.get_dummies(carseats_df, columns = ["ShelveLoc", "Urban", "US"], drop_first=True)

In [8]:
X2, y = carseats_df.iloc[:, carseats_df.columns.isin(["Advertising", "Price"])], carseats_df["Sales"]

In [9]:
x2 = np.array(X2)
lm2 = LinearRegression()
lm2.fit(x2, y)
lm2.score(x2, y)

0.28185560334302273

In [10]:
X3, y = carseats_df.iloc[:, carseats_df.columns != "Sales"], carseats_df["Sales"]

In [11]:
x3 = np.array(X3)
lm3 = LinearRegression()
lm3.fit(x3, y)
lm3.score(x3, y)

0.8734133434127013

In [12]:
Xvar = carseats_df.loc[:, ~carseats_df.columns.isin(['Sales'])]
features = Xvar.columns[np.arange(0, 7)]
correlation_matrix = Xvar[features].corr()
print(correlation_matrix)

             CompPrice    Income  Advertising  Population     Price       Age  \
CompPrice     1.000000 -0.080653    -0.024199   -0.094707  0.584848 -0.100239   
Income       -0.080653  1.000000     0.058995   -0.007877 -0.056698 -0.004670   
Advertising  -0.024199  0.058995     1.000000    0.265652  0.044537 -0.004557   
Population   -0.094707 -0.007877     0.265652    1.000000 -0.012144 -0.042663   
Price         0.584848 -0.056698     0.044537   -0.012144  1.000000 -0.102177   
Age          -0.100239 -0.004670    -0.004557   -0.042663 -0.102177  1.000000   
Education     0.025197 -0.056855    -0.033594   -0.106378  0.011747  0.006488   

             Education  
CompPrice     0.025197  
Income       -0.056855  
Advertising  -0.033594  
Population   -0.106378  
Price         0.011747  
Age           0.006488  
Education     1.000000  


In [13]:
Xvif = add_constant(np.array(Xvar.values, dtype=float))
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(Xvif, i) for i in range(1, Xvif.shape[1])]
vif['features'] = Xvar.columns
vif

Unnamed: 0,VIF Factor,features
0,1.554618,CompPrice
1,1.024731,Income
2,2.103136,Advertising
3,1.145534,Population
4,1.537068,Price
5,1.021051,Age
6,1.026342,Education
7,1.511411,ShelveLoc_Good
8,1.517882,ShelveLoc_Medium
9,1.022705,Urban_Yes


In [14]:
y = carseats.data.Sales
X = carseats.data.iloc[:, 1]
X = pd.get_dummies(X, columns=["ShelveLoc", "Urban", "US"], drop_first=True)
print(X.head())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

   85   86   88   89   93   94   95   96   97   98   ...  152  153  154  155  \
0    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
1    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
2    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
3    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
4    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   

   156  157  159  161  162  175  
0    0    0    0    0    0    0  
1    0    0    0    0    0    0  
2    0    0    0    0    0    0  
3    0    0    0    0    0    0  
4    0    0    0    0    0    0  

[5 rows x 72 columns]


In [15]:
print(X_train.shape[0] / X.shape[0], X_test.shape[0] / X.shape[0])

0.7 0.3


In [16]:
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

In [17]:
print("Train R2: {}".format(lm.score(X_train, y_train)))


Train R2: 0.15187317834905856
