#### 1: Import the dataset

In [1]:
#Import the required libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
#Load the advertising budget / sales records from the CSV file in the working directory
csv_name = 'Advertising Budget and Sales.csv'
advertising_data = pd.read_csv(csv_name)
#Echo the loaded frame as the cell output
advertising_data

Unnamed: 0.1,Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
5,6,8.7,48.9,75.0,7.2
6,7,57.5,32.8,23.5,11.8
7,8,120.2,19.6,11.6,13.2
8,9,8.6,2.1,1.0,4.8
9,10,199.8,2.6,21.2,10.6


#### 2: Analyze the dataset

In [3]:
#Preview the first eight records of the dataset (positional slice, same rows as head(8))
advertising_data.iloc[:8]

Unnamed: 0.1,Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
5,6,8.7,48.9,75.0,7.2
6,7,57.5,32.8,23.5,11.8
7,8,120.2,19.6,11.6,13.2


In [4]:
#Count the records (rows) in the dataset
advertising_data.shape[0]

200

#### 3: Find the features or media channels used by the firm

In [6]:
#List the column labels of the loaded dataset
advertising_data.columns

Index(['Unnamed: 0', 'TV Ad Budget ($)', 'Radio Ad Budget ($)',
       'Newspaper Ad Budget ($)', 'Sales ($)'],
      dtype='object')

In [11]:
#Drop the redundant 'Unnamed: 0' column, which only repeats the row number (1, 2, 3, ...).
#The result is rebound instead of using inplace=True, and errors='ignore' is passed, so the
#cell is safe to re-run: the original raised KeyError on a second execution because the
#column had already been removed.
advertising_data = advertising_data.drop(columns='Unnamed: 0' , errors='ignore')

In [12]:
#Every column except the last one (the sales figure) is treated as a media channel
media_channels = advertising_data.columns[:-1]
print("Media Channels used =" , media_channels)

Media Channels used = Index(['TV Ad Budget ($)', 'Radio Ad Budget ($)', 'Newspaper Ad Budget ($)'], dtype='object')


In [15]:
#Report how many observations (rows) the dataset holds
n_observations = len(advertising_data)
print("No of rows( observations ) in the advertinsing dataset =" , n_observations)

No of rows( observations ) in the advertinsing dataset = 200


In [17]:
#View the names of each of the attributes, without the trailing " ($)" unit marker
UNIT_SUFFIX = " ($)"
print("Attributes :")
for attr in advertising_data.columns :
    #Strip the suffix only when it is really present; the original attr[:-4] blindly cut the
    #last four characters and would mangle any column name that lacks the unit marker
    print(attr[:-len(UNIT_SUFFIX)] if attr.endswith(UNIT_SUFFIX) else attr)

Attributes :
TV Ad Budget
Radio Ad Budget
Newspaper Ad Budget
Sales


#### 4: Create objects to train and test the model; find the sales figures for each channel

In [20]:
#Build the feature matrix: every column except the last one, as a NumPy array
feature_frame = advertising_data.iloc[: , :-1]
X = feature_frame.values

In [22]:
#Peek at the first ten rows of the feature matrix (all columns)
X[:10]

array([[230.1,  37.8,  69.2],
       [ 44.5,  39.3,  45.1],
       [ 17.2,  45.9,  69.3],
       [151.5,  41.3,  58.5],
       [180.8,  10.8,  58.4],
       [  8.7,  48.9,  75. ],
       [ 57.5,  32.8,  23.5],
       [120.2,  19.6,  11.6],
       [  8.6,   2.1,   1. ],
       [199.8,   2.6,  21.2]])

In [34]:
#Build the target object from the sales column (the response variable of the dataset)
target = advertising_data['Sales ($)'].values
#Turn the flat vector into a single-column 2-D array; -1 lets NumPy infer the row count
y = target.reshape(-1 , 1)

In [35]:
#Peek at the first ten rows of the target column vector
y[:10]

array([[22.1],
       [10.4],
       [ 9.3],
       [18.5],
       [12.9],
       [ 7.2],
       [11.8],
       [13.2],
       [ 4.8],
       [10.6]])

In [37]:
#Confirm the feature object has one row per observation
X.shape[0]

200

In [38]:
#Confirm the target object has one row per observation
y.shape[0]

200

#### 5: Split the original dataset into training and testing datasets for the model

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
#Split the dataset into 75% training data and 25% testing data (test_size is set explicitly).
#random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

In [45]:
#Verify that the training and testing datasets are split correctly by printing the
#.shape attribute (rows, columns) of each piece
split_parts = (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
)
for part_name, part in split_parts:
    print(part_name + " =" , part.shape)

X_train = (150, 3)
y_train = (150, 1)
X_test = (50, 3)
y_test = (50, 1)


#### 6: Create a model to predict the sales outcome

In [47]:
#Create a linear regression model and fit it on the training split
from sklearn.linear_model import LinearRegression

#fit() returns the estimator itself, so construction and training can be chained
lin_reg = LinearRegression().fit(X_train , y_train)
#Echo the fitted estimator (its parameter summary) as the cell output
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [48]:
#Show the fitted intercept only; this cell does not display the coefficients
#NOTE(review): scikit-learn exposes the per-feature coefficients as lin_reg.coef_ - add a cell if they are needed
lin_reg.intercept_

array([2.9617275])

In [50]:
#Predict the outcome for the testing dataset
predictions = lin_reg.predict(X_test)

#Echo the full prediction array as the cell output
predictions

array([[15.7263887 ],
       [19.55970806],
       [11.35638413],
       [17.00154366],
       [ 9.05640744],
       [ 6.88256478],
       [20.25202883],
       [17.23637795],
       [ 9.63243796],
       [19.19775722],
       [12.33684688],
       [13.78919583],
       [13.60946471],
       [21.31349216],
       [18.42170403],
       [ 9.88302868],
       [15.45083867],
       [ 7.53200526],
       [ 7.42033885],
       [20.3890307 ],
       [ 7.66977854],
       [18.22207646],
       [24.71977128],
       [22.843015  ],
       [ 7.83227551],
       [12.54236433],
       [21.42803762],
       [ 7.93472305],
       [12.31244402],
       [12.48247057],
       [10.7244511 ],
       [19.22531219],
       [ 9.93329519],
       [ 6.59231873],
       [17.28054591],
       [ 7.62464387],
       [ 9.13268517],
       [ 8.13034377],
       [10.5171423 ],
       [10.49809833],
       [13.00081752],
       [ 9.63933072],
       [10.11131993],
       [ 7.94723108],
       [11.4796586 ],
       [ 9

#### 7: Calculate the Mean Squared Error (MSE)

In [51]:
#Compute the mean squared error by hand: the average of the squared residuals
residuals = predictions - y_test
MSE = (residuals ** 2).mean()

In [52]:
#Display the value currently held in the MSE variable
MSE

2.1086081578175144

In [53]:
from sklearn.metrics import mean_squared_error

#Cross-check the hand-computed MSE against scikit-learn's implementation.
#The documented signature is mean_squared_error(y_true, y_pred), so the ground truth goes
#first. The original passed the arguments reversed; the value is the same because the
#squared error is symmetric, but the documented order avoids mistakes if the metric is swapped.
mean_squared_error(y_test , predictions)

2.1086081578175144