In [16]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# load the dataset into a DataFrame; 'online.csv' is a relative path, so it is
# resolved against the current working directory
df = pd.read_csv('online.csv')

In [3]:
# preview the first five rows
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [4]:
# dataset dimensions as (rows, columns)
df.shape

(50, 5)

In [5]:
# summary statistics; only the numeric columns appear, so Area is left out
df.describe()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [6]:
# count the missing values in each column
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

## Separate x, y

In [7]:
# separate the independent variables: every column except the Profit target
x = df.drop(columns='Profit')
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,Dhaka
1,162597.7,151377.59,443898.53,Ctg
2,153441.51,101145.55,407934.54,Rangpur
3,144372.41,118671.85,383199.62,Dhaka
4,142107.34,91391.77,366168.42,Rangpur


In [8]:
# separate the dependent variable (the Profit column the model is trained to predict)
y = df.loc[:, 'Profit']
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

## One hot encoding

In [9]:
# One-hot encode the categorical Area column into 0/1 indicator columns.
# Area is read from df rather than x: a later cell removes Area from x, so
# x['Area'] would raise KeyError if this cell were re-run afterwards.
# drop_first=True omits the first category's column; that category is then
# encoded as all zeros, which avoids carrying a redundant column.
city = pd.get_dummies(df['Area'], drop_first=True, dtype=int)

In [10]:
# inspect the indicator columns; Ctg has no column of its own and shows as 0/0
city.head()

Unnamed: 0,Dhaka,Rangpur
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1


In [11]:
# Keep only the numeric feature columns (drop the target and the raw Area text).
# Built from df instead of the self-referential `x = x.drop(['Area'], ...)`,
# which raises KeyError on a second run because Area is already gone from x.
x = df.drop(columns=['Profit', 'Area'])
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [12]:
# Append the one-hot Area columns to the feature matrix.
# Any indicator columns already present in x are dropped first, so re-running
# this cell does not pile up duplicate Dhaka/Rangpur columns.
x = pd.concat([x.drop(columns=city.columns, errors='ignore'), city], axis=1)

In [13]:
# confirm the indicator columns now sit alongside the numeric features
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,1,0
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,0,1
3,144372.41,118671.85,383199.62,1,0
4,142107.34,91391.77,366168.42,0,1


## Split the dataset into Train & Test

In [15]:
# hold out 25% of the rows for testing and train on the remaining 75%;
# random_state pins the shuffle so the same split is produced on every run
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [18]:
# create the linear regression model with its default settings
reg = LinearRegression()

In [19]:
# train the model on the training split only
reg.fit(xtrain, ytrain)

In [22]:
# show the held-out test features
xtest

Unnamed: 0,Marketing Spend,Administration,Transport,Dhaka,Rangpur
28,66051.52,182645.56,118148.2,0,1
11,100671.96,91790.61,249744.55,0,0
10,101913.08,110594.11,229160.95,0,1
41,27892.92,84710.77,164470.71,0,1
2,153441.51,101145.55,407934.54,0,1
27,72107.6,127864.55,353183.81,1,0
38,20229.59,65947.93,185265.1,1,0
31,61136.38,152701.92,88218.23,1,0
22,73994.56,122782.75,303319.26,0,1
4,142107.34,91391.77,366168.42,0,1


In [23]:
# show the held-out test targets (actual Profit values)
ytest

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
33     96778.92
35     96479.51
26    105733.54
Name: Profit, dtype: float64

In [27]:
# predict Profit for the held-out test features
pred = reg.predict(xtest)

In [28]:
# predicted Profit values, in the same row order as xtest
pred

array([103501.0825284 , 128011.28068627, 126695.43891127,  70573.91718775,
       173381.96874259, 124238.07860872,  69298.09250304,  98399.41936876,
       116419.1480864 , 161430.98134847,  94740.73303076,  89920.22800514,
       105956.86065332])

In [32]:
# evaluate on the test set: for a regressor, score() returns R²
# (the coefficient of determination), not a classification accuracy
reg.score(xtest, ytest)

0.884097862392347

## R-squared (R²) value

In [29]:
from sklearn.metrics import r2_score

In [30]:
# R² from the true vs. predicted test values; this is the same quantity that
# reg.score(xtest, ytest) reports
score = r2_score(ytest, pred)

In [31]:
# display the R² value
score

0.884097862392347