## 1. Load Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.formula.api as sm
import sklearn
from sklearn import metrics

## 2. Load Dataset

In [3]:
# Read the CSV into a DataFrame; index_col=0 uses the file's first column as the row index
data = pd.read_csv('Advertising.csv', index_col=0)
# Preview the first five rows to sanity-check the columns that were loaded
data.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
data.describe()

Unnamed: 0,TV,radio,newspaper,sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


## 3. Train Test Split

In [6]:
# Columns fed to the model as predictors
features = [
    'TV',
    'radio',
    'newspaper',
]

# Column to predict; kept as a one-element list so data[target] is a 2-D DataFrame
target = ['sales']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data[target],
    test_size=0.05,
    random_state=5000,
)

In [9]:
# Report the (rows, columns) shape of every piece produced by the split
print('Train cases as below')
for _label, _part in (('X_train shape: ', X_train), ('y_train shape: ', y_train)):
    print(_label, _part.shape)

print('\nTest cases as below')
for _label, _part in (('X_test shape: ', X_test), ('y_test shape: ', y_test)):
    print(_label, _part.shape)

Train cases as below
X_train shape:  (190, 3)
y_train shape:  (190, 1)

Test cases as below
X_test shape:  (10, 3)
y_test shape:  (10, 1)


## 4. Linear regression in scikit-learn

In [10]:
# Instantiate an (as yet unfitted) linear regression estimator.
# fit_intercept=True asks the model to estimate an intercept in addition to the coefficients.
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression(fit_intercept=True)

In [11]:
# Fit the model on the training split only; the test rows are reserved for evaluation
lr_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
# Show the fitted parameters: the intercept first, then one coefficient per feature
for _title, _value in (
    ('Intercept:', lr_model.intercept_),
    ('Coefficients:', lr_model.coef_),
):
    print(_title, _value)

Intercept: [2.98804537]
Coefficients: [[ 0.04574295  0.18812717 -0.00031962]]


In [13]:
# Transpose the single-row coefficient array into a column vector so that each
# row corresponds to one entry of X_train.columns, in the same order.
# A bare `X_train.columns` above this line would be evaluated and discarded,
# because a cell only renders its last expression, so it is not included here.
lr_model.coef_.T

array([[ 0.04574295],
       [ 0.18812717],
       [-0.00031962]])

In [14]:
# Label each coefficient with its feature name and list the largest values first
(
    pd.DataFrame(
        lr_model.coef_.T,
        index=X_train.columns,
        columns=['Co-efficients'],
    )
    .sort_values(by='Co-efficients', ascending=False)
)

Unnamed: 0,Co-efficients
radio,0.188127
TV,0.045743
newspaper,-0.00032


## 5. Using the Model for Prediction

In [16]:
# Make predictions on the training set (the same rows the model was fitted on)
y_pred_train = lr_model.predict(X_train) 
# NOTE(review): this renders every training prediction (190 rows per the shape check);
# consider previewing a slice such as y_pred_train[:5] to keep the output short.
y_pred_train

array([[17.02755225],
       [13.85715676],
       [17.98486857],
       [17.38773256],
       [13.95430738],
       [18.00624166],
       [15.00293682],
       [20.925685  ],
       [12.60984156],
       [ 8.94223668],
       [18.68431423],
       [20.88297629],
       [ 9.18845319],
       [22.95622551],
       [12.02277   ],
       [17.122956  ],
       [ 4.56793713],
       [ 9.96950398],
       [10.65892863],
       [21.30848509],
       [14.81321194],
       [16.60213615],
       [ 9.53961681],
       [17.669057  ],
       [14.47856617],
       [15.03281197],
       [ 7.62665857],
       [ 6.62163142],
       [13.27147858],
       [20.49960471],
       [23.31680333],
       [21.15520632],
       [11.8009309 ],
       [10.64681483],
       [15.68391965],
       [17.35562597],
       [ 9.87985222],
       [23.34302283],
       [ 6.65368948],
       [14.42946875],
       [15.4506998 ],
       [ 7.09505724],
       [ 9.97134351],
       [ 8.20238456],
       [23.84092868],
       [19

In [18]:
# Make predictions on the held-out testing set
y_pred_test = lr_model.predict(X_test)
# Display the predictions, one row per test case
y_pred_test

array([[15.22009711],
       [19.45981854],
       [15.12423951],
       [12.40258987],
       [ 7.91426381],
       [11.41350004],
       [15.66607952],
       [12.38771194],
       [15.66955342],
       [16.44776759]])

## 6. Model Evaluation

In [20]:
# Mean absolute error of the sales predictions, computed separately for each split
MAE_train, MAE_test = (
    metrics.mean_absolute_error(actual, predicted)
    for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)

print('MAE for training set is {}'.format(MAE_train))
print('MAE for test set is {}'.format(MAE_test))

MAE for training set is 1.2370315486253916
MAE for test set is 1.3491035416575239


In [21]:
# Mean squared error of the sales predictions, computed separately for each split
MSE_train, MSE_test = (
    metrics.mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)

print('MSE for training set is {}'.format(MSE_train))
print('MSE for test set is {}'.format(MSE_test))

MSE for training set is 2.7655529731609403
MSE for test set is 3.209816499356171


In [22]:
# Root mean squared error: the square root of the MSE, so it is in the same units as sales
RMSE_train, RMSE_test = (
    np.sqrt(metrics.mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)

print('RMSE for training set is {}'.format(RMSE_train))
print('RMSE for test set is {}'.format(RMSE_test))

RMSE for training set is 1.6629951813402648
RMSE for test set is 1.7915960759490883


In [23]:
# Express the test RMSE as a fraction of the mean of the `sales` column, giving a
# scale-free view of the typical prediction error.
# The mean is computed inline: a bare `data['sales'].mean()` on its own line would be
# evaluated and discarded, because a cell only renders its last expression.
RMSE_test / data['sales'].mean()

0.1277658103725504