# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import dataset

In [2]:
dataset= pd.read_csv("investment_data.csv")

In [3]:
dataset

Unnamed: 0,Capital investment,Employee salary,Advertisement expenditure,City,Turn over
0,165361.76,136897.8,471784.1,Kolkata,192274.39
1,162610.26,151377.59,443898.53,Bengaluru,191804.62
2,153454.07,101145.55,407934.54,Chennai,191062.95
3,144384.97,118671.85,383199.62,Kolkata,182914.55
4,142119.9,91391.77,366168.42,Chennai,166200.5
5,131889.46,99814.71,362861.36,Kolkata,157003.68
6,134628.02,147198.87,127716.82,Bengaluru,156135.07
7,130310.69,145530.06,323876.68,Chennai,155765.16
8,120555.08,148718.95,311613.29,Kolkata,152224.33
9,123347.44,108679.17,304981.62,Bengaluru,149772.52


# Creating feature matrix X and dependent variable vector Y

In [4]:
X= dataset.iloc[:,:-1].values
Y= dataset.iloc[:,-1].values

In [5]:
X

array([[165361.76, 136897.8, 471784.1, 'Kolkata'],
       [162610.26, 151377.59, 443898.53, 'Bengaluru'],
       [153454.07, 101145.55, 407934.54, 'Chennai'],
       [144384.97, 118671.85, 383199.62, 'Kolkata'],
       [142119.9, 91391.77, 366168.42, 'Chennai'],
       [131889.46, 99814.71, 362861.36, 'Kolkata'],
       [134628.02, 147198.87, 127716.82, 'Bengaluru'],
       [130310.69, 145530.06, 323876.68, 'Chennai'],
       [120555.08, 148718.95, 311613.29, 'Kolkata'],
       [123347.44, 108679.17, 304981.62, 'Bengaluru'],
       [101925.64, 110594.11, 229160.95, 'Chennai'],
       [100684.52, 91790.61, 249744.55, 'Bengaluru'],
       [93876.31, 127320.38, 249839.44, 'Chennai'],
       [92004.95, 135495.07, 252664.93, 'Bengaluru'],
       [119955.8, 156547.42, 256512.92, 'Chennai'],
       [114536.17, 122616.84, 261776.23, 'Kolkata'],
       [78025.67, 121597.55, 264346.06, 'Bengaluru'],
       [94669.72, 145077.58, 282574.31, 'Kolkata'],
       [91761.72, 114175.79, 294919.57, 'Chen

In [6]:
Y

array([192274.39, 191804.62, 191062.95, 182914.55, 166200.5 , 157003.68,
       156135.07, 155765.16, 152224.33, 149772.52, 146134.51, 144271.96,
       141598.08, 134319.91, 132615.21, 129929.6 , 127005.49, 125382.93,
       124279.46, 122789.42, 118486.59, 111325.58, 110364.81, 108746.55,
       108564.6 , 107416.9 , 105746.1 , 105020.87, 103294.94, 101017.2 ,
        99950.15,  97496.12,  97440.4 ,  96791.48,  96725.36,  96492.07,
        90720.75,  89961.7 ,  81241.62,  81018.32,  78252.47,  77811.39,
        71511.05,  69771.54,  65212.89,  64938.64,  49503.31,  42572.29,
        35685.97,  14693.96])

# Replacing missing data

In [7]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
imputer.fit(X[:,0:3])
X[:,0:3]= imputer.transform(X[:,0:3])

In [8]:
X

array([[165361.76, 136897.8, 471784.1, 'Kolkata'],
       [162610.26, 151377.59, 443898.53, 'Bengaluru'],
       [153454.07, 101145.55, 407934.54, 'Chennai'],
       [144384.97, 118671.85, 383199.62, 'Kolkata'],
       [142119.9, 91391.77, 366168.42, 'Chennai'],
       [131889.46, 99814.71, 362861.36, 'Kolkata'],
       [134628.02, 147198.87, 127716.82, 'Bengaluru'],
       [130310.69, 145530.06, 323876.68, 'Chennai'],
       [120555.08, 148718.95, 311613.29, 'Kolkata'],
       [123347.44, 108679.17, 304981.62, 'Bengaluru'],
       [101925.64, 110594.11, 229160.95, 'Chennai'],
       [100684.52, 91790.61, 249744.55, 'Bengaluru'],
       [93876.31, 127320.38, 249839.44, 'Chennai'],
       [92004.95, 135495.07, 252664.93, 'Bengaluru'],
       [119955.8, 156547.42, 256512.92, 'Chennai'],
       [114536.17, 122616.84, 261776.23, 'Kolkata'],
       [78025.67, 121597.55, 264346.06, 'Bengaluru'],
       [94669.72, 145077.58, 282574.31, 'Kolkata'],
       [91761.72, 114175.79, 294919.57, 'Chen

# Encoding

# Feature matrix using OneHotEncoder

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [10]:
ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[3])],remainder='passthrough')
X=np.array(ct.fit_transform(X))

In [11]:
X

array([[0.0, 0.0, 1.0, 165361.76, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162610.26, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153454.07, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144384.97, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142119.9, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131889.46, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134628.02, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130310.69, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120555.08, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123347.44, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101925.64, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100684.52, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93876.31, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 92004.95, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119955.8, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114536.17, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78025.67, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94669.72, 145077.5

# Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
sc= StandardScaler()
X[:,3:]= sc.fit_transform(X[:,3:])

In [14]:
X

array([[0.0, 0.0, 1.0, 2.016411493158463, 0.5607529145307771,
        2.1539430885717437],
       [1.0, 0.0, 0.0, 1.9558603364325031, 1.0828065830760836,
        1.923600395642144],
       [0.0, 1.0, 0.0, 1.7543637361407838, -0.7282570276886139,
        1.6265276693147557],
       [0.0, 0.0, 1.0, 1.5547836905426, -0.09636463069766295,
        1.4222102362410107],
       [0.0, 1.0, 0.0, 1.5049372036935105, -1.0799193536742695,
        1.2815277086174903],
       [0.0, 0.0, 1.0, 1.2798000145910104, -0.7762390705391548,
        1.2542104579362325],
       [1.0, 0.0, 0.0, 1.3400664059278975, 0.9321472084702611,
        -0.6881499302965387],
       [0.0, 1.0, 0.0, 1.2450566565302281, 0.8719800111141467,
        0.932185978099574],
       [0.0, 0.0, 1.0, 1.03036886075798, 0.9869521013922136,
        0.8308869091888099],
       [1.0, 0.0, 0.0, 1.091819207112805, -0.45664024606220305,
        0.7761074398639786],
       [0.0, 1.0, 0.0, 0.6203982479442911, -0.3875990892467969,
        0.1498072

# Spliting the dataset into training and testing

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
Xtrain,Xtest,Ytrain,Ytest= train_test_split(X,Y,test_size=0.2,random_state=1)

In [17]:
Xtrain

array([[1.0, 0.0, 0.0, -0.226948675490503, 0.28392381291480906,
        -1.3624497800708548],
       [1.0, 0.0, 0.0, -0.7738203591339539, -1.3831215613922057,
        -0.2975832762317315],
       [0.0, 0.0, 1.0, 0.10272359925435644, 1.1691860859036511,
        0.7327877912137449],
       [0.0, 1.0, 0.0, -0.9915701530696946, 0.2059246909295515,
        -0.08176257339610735],
       [0.0, 0.0, 1.0, 0.2794416495089463, 1.1598365748038926,
        -1.743126975815244],
       [1.0, 0.0, 0.0, -1.1021055579535226, -0.9069375345894082,
        -0.520595959451942],
       [1.0, 0.0, 0.0, -1.6223620208201208, -0.1572255061100761,
        -1.3699847273953474],
       [0.0, 1.0, 0.0, 0.03537020282496413, 0.8217179162739844,
        -0.6358354945508684],
       [0.0, 1.0, 0.0, 0.006006577918180455, 0.05184956481365981,
        0.7623758762266404],
       [1.0, 0.0, 0.0, 0.40207760283447386, 0.5101789529969049,
        0.34395678776100586],
       [0.0, 1.0, 0.0, -1.008533719724479, -1.3207958138681

In [18]:
Xtest

array([[0.0, 0.0, 1.0, -0.03551899384108417, 0.2350685433218798,
        1.1742711585795813],
       [0.0, 0.0, 1.0, -0.6097499413683858, -1.3086575295015461,
        -0.04549315867841826],
       [1.0, 0.0, 0.0, -0.9895770154121732, -0.10090021798884612,
        -0.31578588311480577],
       [0.0, 0.0, 1.0, -1.1771775488623126, -1.997270365938937,
        -0.2127848655456062],
       [0.0, 1.0, 0.0, 1.7543637361407838, -0.7282570276886139,
        1.6265276693147557],
       [0.0, 0.0, 1.0, 1.5547836905426, -0.09636463069766295,
        1.4222102362410107],
       [0.0, 0.0, 1.0, -1.610433343915412, -2.5094088368614424,
        -1.743126975815244],
       [0.0, 0.0, 1.0, -0.1786085403324519, 1.1424567667457468,
        -0.8581336633129413],
       [0.0, 1.0, 0.0, -1.5934132203015923, -0.1993217411284577,
        0.7111224738005978],
       [0.0, 0.0, 1.0, -0.2769582312370445, 1.1305539146923995,
        -1.0144194520428105]], dtype=object)

In [19]:
Ytrain

array([ 97440.4 ,  81018.32, 111325.58,  90720.75, 122789.42,  71511.05,
        14693.96, 105746.1 , 110364.81, 134319.91,  77811.39, 125382.93,
        64938.64, 108564.6 , 108746.55, 166200.5 ,  96791.48, 132615.21,
        99950.15, 146134.51, 103294.94,  65212.89,  96725.36, 124279.46,
       118486.59, 107416.9 , 156135.07, 155765.16,  42572.29, 191804.62,
       127005.49, 192274.39, 129929.6 , 157003.68, 144271.96, 149772.52,
       152224.33, 141598.08,  69771.54,  89961.7 ])

In [20]:
Ytest

array([105020.87,  96492.07,  78252.47,  81241.62, 191062.95, 182914.55,
        35685.97, 101017.2 ,  49503.31,  97496.12])

# Building multiple regression model

# Training the model

In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
lr = LinearRegression()
lr.fit(Xtrain,Ytrain)

LinearRegression()

# Testing the model

In [23]:
Yestimated = lr.predict(Xtest)

In [24]:
Yestimated

array([114676.97715867,  90605.7153162 ,  75705.40151575,  70234.44679651,
       179802.81514873, 171589.48018521,  49766.1475203 , 102289.21888936,
        58661.93795761,  98284.58561131])

In [25]:
Ytest

array([105020.87,  96492.07,  78252.47,  81241.62, 191062.95, 182914.55,
        35685.97, 101017.2 ,  49503.31,  97496.12])

In [26]:
# we use reshape to get the transpose of row vector.
Yestimated.reshape(len(Yestimated),1)

array([[114676.97715867],
       [ 90605.7153162 ],
       [ 75705.40151575],
       [ 70234.44679651],
       [179802.81514873],
       [171589.48018521],
       [ 49766.1475203 ],
       [102289.21888936],
       [ 58661.93795761],
       [ 98284.58561131]])

In [27]:
Ytest.reshape(len(Ytest),1)

array([[105020.87],
       [ 96492.07],
       [ 78252.47],
       [ 81241.62],
       [191062.95],
       [182914.55],
       [ 35685.97],
       [101017.2 ],
       [ 49503.31],
       [ 97496.12]])

In [28]:
# Concatenate function is used to combine the estimated values and true values
np.concatenate((Yestimated.reshape(len(Yestimated),1),Ytest.reshape(len(Ytest),1)),1)

array([[114676.97715867, 105020.87      ],
       [ 90605.7153162 ,  96492.07      ],
       [ 75705.40151575,  78252.47      ],
       [ 70234.44679651,  81241.62      ],
       [179802.81514873, 191062.95      ],
       [171589.48018521, 182914.55      ],
       [ 49766.1475203 ,  35685.97      ],
       [102289.21888936, 101017.2       ],
       [ 58661.93795761,  49503.31      ],
       [ 98284.58561131,  97496.12      ]])

# coefficient and intercept

In [29]:
lr.coef_

array([-2.85177769e+02,  2.97560876e+02, -1.23831070e+01,  3.51868131e+04,
       -2.61932328e+02,  3.50088661e+03])

In [30]:
lr.intercept_

111889.74233952185

# find the net turnover for the following row vector 
# CI=50661,ES=115641,AE=92496,CITY=BENGALURU

In [31]:
print(lr.predict([[50661,115641,92496,1.0, 0.0, 0.0]]))

[18964535.02635903]
