In [1]:
# Import essential python libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# To read csv file 
df = pd.read_csv('Advertising.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [3]:
# To display/discribe stats about data from the dataset
df.describe()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0,200.0
mean,100.5,147.0425,23.264,30.554,14.0225
std,57.879185,85.854236,14.846809,21.778621,5.217457
min,1.0,0.7,0.0,0.3,1.6
25%,50.75,74.375,9.975,12.75,10.375
50%,100.5,149.75,22.9,25.75,12.9
75%,150.25,218.825,36.525,45.1,17.4
max,200.0,296.4,49.6,114.0,27.0


In [4]:
# To draw information about datatype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  200 non-null    int64  
 1   TV          200 non-null    float64
 2   Radio       200 non-null    float64
 3   Newspaper   200 non-null    float64
 4   Sales       200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


In [5]:
#checking for null values
df.isnull().sum()

Unnamed: 0    0
TV            0
Radio         0
Newspaper     0
Sales         0
dtype: int64

In [6]:
x = df.iloc[:,0:-1]
x

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper
0,1,230.1,37.8,69.2
1,2,44.5,39.3,45.1
2,3,17.2,45.9,69.3
3,4,151.5,41.3,58.5
4,5,180.8,10.8,58.4
...,...,...,...,...
195,196,38.2,3.7,13.8
196,197,94.2,4.9,8.1
197,198,177.0,9.3,6.4
198,199,283.6,42.0,66.2


In [7]:
y = df.iloc[:,-1]
y

0      22.1
1      10.4
2       9.3
3      18.5
4      12.9
       ... 
195     7.6
196     9.7
197    12.8
198    25.5
199    13.4
Name: Sales, Length: 200, dtype: float64

In [8]:
# For training
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=40)
print('x_train: ',x_train)

x_train:       Unnamed: 0     TV  Radio  Newspaper
86           87   76.3   27.5       16.0
182         183   56.2    5.7       29.7
69           70  216.8   43.9       27.2
125         126   87.2   11.8       25.9
42           43  293.6   27.7        1.8
..          ...    ...    ...        ...
50           51  199.8    3.1       34.6
184         185  253.8   21.3       30.0
165         166  234.5    3.4       84.8
7             8  120.2   19.6       11.6
70           71  199.1   30.6       38.7

[160 rows x 4 columns]


In [9]:
print('x_test:',x_test)

x_test:      Unnamed: 0     TV  Radio  Newspaper
96           97  197.6    3.5        5.9
5             6    8.7   48.9       75.0
116         117  139.2   14.3       25.6
35           36  290.7    4.1        8.5
183         184  287.6   43.0       71.8
160         161  172.5   18.1       30.7
54           55  262.7   28.8       15.9
134         135   36.9   38.6       65.6
90           91  134.3    4.9        9.3
191         192   75.5   10.8        6.0
139         140  184.9   43.9        1.7
142         143  220.5   33.2       37.9
177         178  170.2    7.8       35.2
26           27  142.9   29.3       12.6
89           90  109.8   47.8       51.4
140         141   73.4   17.0       12.9
171         172  164.5   20.9       47.4
23           24  228.3   16.9       26.2
132         133    8.4   27.2        2.1
37           38   74.7   49.4       45.7
151         152  121.0    8.4       48.7
28           29  248.8   27.1       22.9
85           86  193.2   18.4       65.7
93      

In [10]:
print('y_train:',y_train)

y_train: 86     12.0
182     8.7
69     22.3
125    10.6
42     20.7
       ... 
50     11.4
184    17.6
165    11.9
7      13.2
70     18.3
Name: Sales, Length: 160, dtype: float64


In [11]:
print('y_test:',y_test)

y_test: 96     11.7
5       7.2
116    12.2
35     12.8
183    26.2
160    14.4
54     20.2
134    10.8
90     11.2
191     9.9
139    20.7
142    20.1
177    11.7
26     15.0
89     16.7
140    10.9
171    14.5
23     15.5
132     5.7
37     14.7
151    11.6
28     18.9
85     15.2
93     22.2
174    11.5
75      8.7
18     11.3
105    19.2
121     7.0
130     1.6
33     17.4
46     10.6
168    17.1
169    15.0
11     17.4
166     8.0
81     12.3
111    21.8
67     13.4
147    25.4
Name: Sales, dtype: float64


In [12]:
#Changeing the datatype to integer
x_train = x_train.astype(int)
x_test = x_test.astype(int)
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [13]:
#Standard Scaler
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train)
x_test_sc = sc.fit_transform(x_test)

In [14]:
#Linear Regression
lr = LinearRegression()
lr.fit(x_train_sc,y_train)

In [15]:
# Prediction Accuracy
pred = lr.predict(x_test_sc)
r2_score(y_test, pred)

0.7973514481041017