**Problem Statement:  
The Advertising dataset captures sales revenue generated with respect to advertisement spends  
across multiple channels like radio, tv, and newspaper.  
Objective:  
Build a linear regression model to:  
•Interpret the coefficients of the model  
•Make predictions  
•Find and analyze model residuals  
•Evaluate model efficiency using RMSE and R Square values  
Access:**

In [1]:
# import required libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import joblib as jb
from sklearn import metrics

In [2]:
# load the data
data = pd.read_csv(r'C:\Users\USER\Downloads\Advertising.csv')
data.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [3]:
# view the size of the dataset
data.shape

(200, 4)

In [4]:
data.values

array([[230.1,  37.8,  69.2,  22.1],
       [ 44.5,  39.3,  45.1,  10.4],
       [ 17.2,  45.9,  69.3,   9.3],
       [151.5,  41.3,  58.5,  18.5],
       [180.8,  10.8,  58.4,  12.9],
       [  8.7,  48.9,  75. ,   7.2],
       [ 57.5,  32.8,  23.5,  11.8],
       [120.2,  19.6,  11.6,  13.2],
       [  8.6,   2.1,   1. ,   4.8],
       [199.8,   2.6,  21.2,  10.6],
       [ 66.1,   5.8,  24.2,   8.6],
       [214.7,  24. ,   4. ,  17.4],
       [ 23.8,  35.1,  65.9,   9.2],
       [ 97.5,   7.6,   7.2,   9.7],
       [204.1,  32.9,  46. ,  19. ],
       [195.4,  47.7,  52.9,  22.4],
       [ 67.8,  36.6, 114. ,  12.5],
       [281.4,  39.6,  55.8,  24.4],
       [ 69.2,  20.5,  18.3,  11.3],
       [147.3,  23.9,  19.1,  14.6],
       [218.4,  27.7,  53.4,  18. ],
       [237.4,   5.1,  23.5,  12.5],
       [ 13.2,  15.9,  49.6,   5.6],
       [228.3,  16.9,  26.2,  15.5],
       [ 62.3,  12.6,  18.3,   9.7],
       [262.9,   3.5,  19.5,  12. ],
       [142.9,  29.3,  12.6,  15. ],
 

In [5]:
# view the fields in this dataset
data.columns

Index(['TV', 'radio', 'newspaper', 'sales'], dtype='object')

In [6]:
# let's get the full details
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   radio      200 non-null    float64
 2   newspaper  200 non-null    float64
 3   sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


In [7]:
data.describe()

Unnamed: 0,TV,radio,newspaper,sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


In [9]:
data.query('sales < 10 and newspaper > 30')

Unnamed: 0,TV,radio,newspaper,sales
2,17.2,45.9,69.3,9.3
5,8.7,48.9,75.0,7.2
12,23.8,35.1,65.9,9.2
22,13.2,15.9,49.6,5.6
44,25.1,25.7,43.3,8.5
49,66.9,11.7,36.8,9.7
56,7.3,28.1,41.4,5.5
75,16.9,43.7,89.4,8.7
91,28.6,1.5,33.0,7.3
121,18.8,21.7,50.4,7.0


In [11]:
data['result'] = data['sales'] > 20

In [12]:
data

Unnamed: 0,TV,radio,newspaper,sales,result
0,230.1,37.8,69.2,22.1,True
1,44.5,39.3,45.1,10.4,False
2,17.2,45.9,69.3,9.3,False
3,151.5,41.3,58.5,18.5,False
4,180.8,10.8,58.4,12.9,False
...,...,...,...,...,...
195,38.2,3.7,13.8,7.6,False
196,94.2,4.9,8.1,9.7,False
197,177.0,9.3,6.4,12.8,False
198,283.6,42.0,66.2,25.5,True


In [10]:
data.isnull().sum()

TV           0
radio        0
newspaper    0
sales        0
dtype: int64

In [9]:
feature_names = ["TV", "radio", "newspaper"]
X = data[feature_names]
y = data["sales"]

# Build Model and Make Predictions

In [24]:
# Define the model to be used
model = LinearRegression()
model.fit(X, y)

prediction = model.predict([[50.0, 64.2, 50.0]])
prediction



array([17.27887408])

# Interpret the coefficients of the model

In [25]:
print(model.coef_)
print(model.intercept_)

[ 0.04576465  0.18853002 -0.00103749]
2.938889369459405


# Analyze model residuals

In [26]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)
model.fit(Xtrain, ytrain)

LinearRegression()

In [13]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(ytrain, model.predict(Xtrain)))
print(mean_squared_error(ytest, model.predict(Xtest)))
print("R2/Coefficient of determination: {}".format(model.score(Xtest, ytest)))

3.016830607659677
1.9918855518287892
R2/Coefficient of determination: 0.8927605914615384


In [14]:
residual = pd.DataFrame({"Actual_value": ytest, "Predicted_value": model.predict(Xtest)})
residual.head(10)

Unnamed: 0,Actual_value,Predicted_value
58,23.8,21.735772
40,16.6,16.456938
34,9.5,7.659932
102,14.8,17.892027
184,17.6,18.677307
198,25.5,23.862719
95,16.9,16.336236
4,12.9,13.456492
29,10.5,9.177296
168,17.1,17.360562
