##  Project 1 : Machine Learning Regression : Advertising Dataset

In [1]:
# Import all Libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Step 1: Extract Data 

df = pd.read_csv('Advertising.csv')

In [3]:
df.shape

(200, 5)

In [4]:
df.columns

Index(['Unnamed: 0', 'TV Ad Budget ($)', 'Radio Ad Budget ($)',
       'Newspaper Ad Budget ($)', 'Sales ($)'],
      dtype='object')

In [5]:
#  Tv    |     Radio    |      Newspaper    |                    Sales 

In [6]:
df

Unnamed: 0.1,Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
...,...,...,...,...,...
195,196,38.2,3.7,13.8,7.6
196,197,94.2,4.9,8.1,9.7
197,198,177.0,9.3,6.4,12.8
198,199,283.6,42.0,66.2,25.5


In [7]:
# Drop  column Unnamed: 0.

df.drop('Unnamed: 0', axis = 1, inplace= True )

In [8]:
df.columns

Index(['TV Ad Budget ($)', 'Radio Ad Budget ($)', 'Newspaper Ad Budget ($)',
       'Sales ($)'],
      dtype='object')

In [9]:
# Rename all Columns 

df.rename( columns= {'TV Ad Budget ($)' : 'TV',
                     'Radio Ad Budget ($)' : 'Radio', 
                     'Newspaper Ad Budget ($)' : 'Newspaper', 
                     'Sales ($)' : 'Sales'}, inplace= True )

In [10]:
# Identify data types 
# Identify null values
# Identify duplicate rows
# Do statistical Analysis. Decsribe Data. 

In [11]:
# Identify data types 

df.dtypes

TV           float64
Radio        float64
Newspaper    float64
Sales        float64
dtype: object

In [12]:
# Identify null values

df.isnull().sum()

TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64

In [13]:
# Identify duplicate rows

df.duplicated().sum()

0

In [14]:
# Do statistical Analysis. Decsribe Data. 

df.describe().round(2)

Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0
mean,147.04,23.26,30.55,14.02
std,85.85,14.85,21.78,5.22
min,0.7,0.0,0.3,1.6
25%,74.38,9.98,12.75,10.38
50%,149.75,22.9,25.75,12.9
75%,218.82,36.52,45.1,17.4
max,296.4,49.6,114.0,27.0


In [15]:
# Step : 5  : Divide dataset into Independant (Input) and Dependant (Output) Features.

In [16]:
X = df.drop('Sales', axis = 1 )    # Input 
Y = df['Sales']                    # Output

In [17]:
X.shape, Y.shape

((200, 3), (200,))

In [18]:
# Step : 6 :  Split dataset into training and testing.     ratio = 80 : 20 

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train , X_test, Y_train, Y_test = train_test_split(X, Y, test_size= .20 , random_state= 123 )

In [21]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((160, 3), (40, 3), (160,), (40,))

#### Linear Regression : 

In [22]:
# Sales (Y) =  F(TV (X1), Radio (X2), Newspaper(X3) )  

In [23]:
# Y  =  w1 * X1  +   w2 * X2   +   w3 * X3   +  C 

In [24]:
# Step : 7 : Apply Linear Regression 

In [25]:
from sklearn.linear_model import LinearRegression 

In [26]:
 LR = LinearRegression()

In [27]:
# Train Model with training Dataset. 

In [28]:
model = LR.fit(X_train , Y_train )

In [29]:
# Y  =  0.04558897 * X1  +   0.18653389 * X2   +   -0.00063032 * X3   +  2.9242267872501255

In [30]:
LR.coef_

array([ 0.04558897,  0.18653389, -0.00063032])

In [31]:
LR.intercept_

2.924226787250122

In [32]:
# Predicting Sales for Tv = 198.8   Radio = 3.1   Newspaper = 34.6   [50]

In [33]:
Y  =  0.04558897 * 198.8  +   0.18653389 * 3.1   +   -0.00063032 * 34.6   +  2.9242267872501255
Y   # Predicted Outcome

12.543760010250127

In [34]:
Y_test[50]   # Actual Outcome 

11.4

In [35]:
# Predict outcomes for Testing Data 

Y_predict = LR.predict(X_test)

In [36]:
pd.DataFrame({'Actual' : Y_test , 
             'Predicted' :  Y_predict})

Unnamed: 0,Actual,Predicted
50,11.4,12.589349
127,8.8,6.574663
37,14.7,15.515691
149,10.1,9.761643
19,14.6,14.085603
104,20.7,20.178291
179,12.6,12.328005
53,21.2,19.829638
162,14.9,14.873316
158,7.3,10.312228


In [37]:
# Save your trained model 
import joblib

In [38]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [39]:
# Mean Absolute Error 

print(" Mean Absolute Error : ", mean_absolute_error(Y_test, Y_predict).round(2))

 Mean Absolute Error :  1.15


In [40]:
# Mean Squared Error 

print(" Mean Squared Error : ", mean_squared_error(Y_test, Y_predict)  ** (1/2) )

 Mean Squared Error :  1.4842550249545559


In [41]:
# r2_score

r2_score(Y_test, Y_predict)

0.9288609285248163

In [42]:
# Save your trained model 
import joblib

In [43]:
# model is your trained model
joblib.dump(model, 'sales_model.pkl')

['sales_model.pkl']