##1. Mounting the drive with Google Colab

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##2. Importing the necessary libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
#to ignore warnings
import warnings
warnings.filterwarnings('ignore')

##3. Loading the data

In [4]:
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Advertising.csv")
df = pd.DataFrame(data)

##4. Performing Exploratory Data Analysis

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  200 non-null    int64  
 1   TV          200 non-null    float64
 2   Radio       200 non-null    float64
 3   Newspaper   200 non-null    float64
 4   Sales       200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


In [None]:
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0,200.0
mean,100.5,147.0425,23.264,30.554,14.0225
std,57.879185,85.854236,14.846809,21.778621,5.217457
min,1.0,0.7,0.0,0.3,1.6
25%,50.75,74.375,9.975,12.75,10.375
50%,100.5,149.75,22.9,25.75,12.9
75%,150.25,218.825,36.525,45.1,17.4
max,200.0,296.4,49.6,114.0,27.0


In [None]:
data.isnull().sum()

Unnamed: 0    0
TV            0
Radio         0
Newspaper     0
Sales         0
dtype: int64

--->there are no null values


In [None]:
data.shape

(200, 5)

In [None]:
data.size

1000

In [None]:
data.columns

Index(['Unnamed: 0', 'TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')

--->Unique value counts of all the columns

In [None]:
cols= ['Unnamed: 0', 'TV', 'Radio', 'Newspaper', 'Sales']
header=["value","value count"]
for i in cols:
  arr = pd.DataFrame(data[i].value_counts())
  print(arr)
  print("_____________________________")

     Unnamed: 0
1             1
138           1
128           1
129           1
130           1
..          ...
70            1
71            1
72            1
73            1
200           1

[200 rows x 1 columns]
_____________________________
       TV
199.8   2
109.8   2
17.2    2
177.0   2
222.4   2
...    ..
139.3   1
216.8   1
199.1   1
26.8    1
232.1   1

[190 rows x 1 columns]
_____________________________
      Radio
4.1       3
5.7       3
13.9      2
14.3      2
36.9      2
...     ...
42.8      1
14.5      1
30.6      1
33.0      1
8.6       1

[167 rows x 1 columns]
_____________________________
      Newspaper
9.3           3
25.6          3
8.7           3
34.6          2
8.5           2
...         ...
27.2          1
31.7          1
19.3          1
31.3          1
66.2          1

[172 rows x 1 columns]
_____________________________
      Sales
9.7       5
11.7      4
12.9      4
15.9      4
20.7      3
...     ...
17.0      1
18.3      1
22.3      1
14.0      1
25.5

In [None]:
data.dtypes

Unnamed: 0      int64
TV            float64
Radio         float64
Newspaper     float64
Sales         float64
dtype: object

##5. Data Visualisation

In [10]:
fig = px.scatter(data, x="Sales", y="TV",title = 'Effect of TV advertisments on Sales')
fig.show()

In [11]:
fig = px.scatter(data, x="Sales", y="Radio",title = 'Effect of Radio advertisments on Sales')
fig.show()

In [12]:
fig = px.scatter(data, x="Sales", y="Newspaper",title = 'Effect of Newspaper advertisments on Sales')
fig.show()

##6. Splitting the data into test dataset and train dataset

In [None]:
from sklearn.model_selection import train_test_split
X = data.drop("Sales",axis=1)
y = data['Sales']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=30)

##7. Implementing Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


model = LinearRegression()
model.fit(X_train, y_train)
print(f'accuracy is: {model.score(X_test, y_test)}')
print()
print('predicted')
print(model.predict(X_test))


accuracy is: 0.9134450736738814

predicted
[ 8.91951647 14.78160327 14.94990697 18.20702974 19.42035123  8.42649211
 10.62277142 18.98021569  8.10502747 14.28908438 10.38377041 15.24503122
 14.9102453  12.06730347  6.37313205 18.22733628 20.77086471 13.12339727
 21.17119671 23.77911458 11.56504914 10.35316768 19.05275484 12.87557713
 16.81847961 14.87872151  6.0963072  23.27295152 14.06424773 14.672587
 19.14092951 12.8135812  22.98807105 15.10880576 20.14879781 15.73393477
 12.54763209  9.7540892  17.90034211  8.06353385  9.22697837  6.09395895
 10.08904912 16.07494069 16.41471173 23.15384413 20.64653571  7.56292416
 20.23852551 10.95292549 15.6353146  17.71863358 22.81745363 12.09464707
 17.92081752 19.10313032 12.61427299 21.65035947 18.21987593 17.084871  ]


In [None]:
model.intercept_

3.382440082396931

In [None]:
model.coef_

array([-0.0019716 ,  0.04447439,  0.1777563 ,  0.00452776])

##8. Implementing Support Vector Regression

In [None]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)
print(f'accuracy is: {regressor.score(X_test, y_test)}')
print()
print('predicted')
print(regressor.predict(X_test))

accuracy is: 0.654874024838006

predicted
[11.10964744 16.51910248 14.58794232 15.89215045 17.8140778   9.4303022
  8.88731406 17.03604482  9.60328811 14.2678364   8.6854275  16.03320174
 16.48104027 13.42953713  7.71721924 15.18988252 18.17429718 12.4803087
 18.75285996 16.25920429 10.93684458  9.06537632 17.42658452 15.69375631
 16.1146988  13.62733235  8.31633526 17.48006997 16.09152263 15.61254507
 16.34449494 13.79233651 18.55064865 12.74550372 18.20432055 16.64998683
 10.30225798  9.31836313 14.3350012   8.45549578 10.5971086   8.34932702
 12.28108908 15.15171553 17.15701932 18.36473793 17.95588546  8.3291118
 15.80262936  9.57858499 15.00221909 16.04778398 18.0136658   9.92687002
 17.46659096 18.48568515  9.86323186 17.70116176 17.84596056 15.70067628]


##9. Best fit model
#    Linear regression model predicts the number of sales with the highest accuracy of 91.3%