# Implementation of Multiple Regression



'''
Multiple linear regression (MLR), also known simply as multiple regression, is a statistical technique that uses several 
explanatory variables to predict the outcome of a response variable. 
'''

![](index.png)

In [None]:
    # y = m1*x1 + m2*x2 + c
    # We have two independent variables, x1 and x2.
    # m1 and m2 are the coefficients of x1 and x2, respectively.
    # c is the y-intercept.

    # General form with n independent variables:
    # y = m1*x1 + m2*x2 + m3*x3 + ... + mn*xn + c
    
# Multiple regression is like simple linear regression, but with more than one independent variable,
# meaning that we try to predict a value based on two or more variables.

#import libraries

import pandas as pd # used to represent the data in dataframe
import numpy as np # used for numerical operations
from sklearn import linear_model #linear
from sklearn.model_selection import train_test_split 
#used to split the data into training data and testing data


In [None]:
# Load the cars dataset from the CSV file into a DataFrame, then show it.
df = pd.read_csv(filepath_or_buffer="cars.csv")
print(df)

           Car       Model  Volume  Weight  CO2
0       Toyoty        Aygo    1000     790   99
1   Mitsubishi  Space Star    1200    1160   95
2        Skoda      Citigo    1000     929   95
3         Fiat         500     900     865   90
4         Mini      Cooper    1500    1140  105
5           VW         Up!    1000     929  105
6        Skoda       Fabia    1400    1109   90
7     Mercedes     A-Class    1500    1365   92
8         Ford      Fiesta    1500    1112   98
9         Audi          A1    1600    1150   99
10     Hyundai         I20    1100     980   99
11      Suzuki       Swift    1300     990  101
12        Ford      Fiesta    1000    1112   99
13       Honda       Civic    1600    1252   94
14      Hundai         I30    1600    1326   97
15        Opel       Astra    1600    1330   97
16         BMW           1    1600    1365   99
17       Mazda           3    2200    1280  104
18       Skoda       Rapid    1600    1119  104
19        Ford       Focus    2000    13

In [None]:
# Pick the predictor columns (Weight, Volume) and the target column (CO2).
X = df.loc[:, ['Weight', 'Volume']]  # independent variables
y = df.loc[:, 'CO2']  # dependent variable

# Show the predictors first, then the target.
for selection in (X, y):
    print(selection)

    Weight  Volume
0      790    1000
1     1160    1200
2      929    1000
3      865     900
4     1140    1500
5      929    1000
6     1109    1400
7     1365    1500
8     1112    1500
9     1150    1600
10     980    1100
11     990    1300
12    1112    1000
13    1252    1600
14    1326    1600
15    1330    1600
16    1365    1600
17    1280    2200
18    1119    1600
19    1328    2000
20    1584    1600
21    1428    2000
22    1365    2100
23    1415    1600
24    1415    2000
25    1465    1500
26    1490    2000
27    1725    2000
28    1523    1600
29    1705    2000
30    1605    2100
31    1746    2000
32    1235    1600
33    1390    1600
34    1405    1600
35    1395    2500
0      99
1      95
2      95
3      90
4     105
5     105
6      90
7      92
8      98
9      99
10     99
11    101
12     99
13     94
14     97
15     97
16     99
17    104
18    104
19    105
20     94
21     99
22     99
23     99
24     99
25    102
26    104
27    114
28    109
29    1

In [None]:
# List every column label in the dataset. Of these, the cells in this
# notebook use 'Weight' and 'Volume' as predictors and 'CO2' as the target.
df.columns

Index(['Car', 'Model', 'Volume', 'Weight', 'CO2'], dtype='object')

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

# Show each partition in turn: training features, test features,
# training target, test target.
for subset in (X_train, X_test, y_train, y_test):
    print(subset)

    Weight  Volume
35    1395    2500
34    1405    1600
1     1160    1200
30    1605    2100
16    1365    1600
2      929    1000
14    1326    1600
7     1365    1500
12    1112    1000
20    1584    1600
32    1235    1600
6     1109    1400
31    1746    2000
28    1523    1600
9     1150    1600
25    1465    1500
11     990    1300
10     980    1100
19    1328    2000
29    1705    2000
21    1428    2000
0      790    1000
8     1112    1500
3      865     900
24    1415    2000
    Weight  Volume
4     1140    1500
26    1490    2000
18    1119    1600
22    1365    2100
17    1280    2200
23    1415    1600
15    1330    1600
27    1725    2000
13    1252    1600
33    1390    1600
5      929    1000
35    120
34    109
1      95
30    115
16     99
2      95
14     97
7      92
12     99
20     94
32    104
6      90
31    117
28    109
9      99
25    102
11    101
10     99
19    105
29    114
21     99
0      99
8      98
3      90
24     99
Name: CO2, dtype: int64
4   

In [None]:
# Building the model
from sklearn.linear_model import LinearRegression

# Create the linear regression estimator and fit it on the training split
# in one step; fit() returns the fitted estimator itself.
model = LinearRegression().fit(X_train, y_train)

# Predict CO2 for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# Predict the CO2 emission of a car where the weight is 10000 kg and the
# volume is 5405 cm3.
# NOTE: both values lie far outside the range of the data the model was
# trained on (the printed dataset has weights up to 1746 and volumes up to
# 2500), so this is an extrapolation and should be treated with caution.
#
# The model was fitted on a DataFrame, so the new sample is passed as a
# DataFrame with the same column names and order ('Weight', 'Volume') rather
# than a bare nested list. The prediction is numerically the same, but this
# avoids scikit-learn's "X does not have valid feature names" warning and
# makes it explicit which number is the weight and which is the volume.
new_car = pd.DataFrame([[10000, 5405]], columns=['Weight', 'Volume'])
predictedCO2 = model.predict(new_car)  # array with one predicted CO2 value

print(predictedCO2)

[188.9334561]


In [None]:
import sklearn.metrics as sm

# Score the test-set predictions with three regression metrics, printing
# each one rounded to two decimal places.
for label, metric in (
    ("Mean absolute error =", sm.mean_absolute_error),
    ("Mean squared error =", sm.mean_squared_error),
    ("R2 score =", sm.r2_score),
):
    print(label, round(metric(y_test, y_pred), 2))

# R2 score (pronounced "R-squared") is the coefficient of determination.
# It indicates how well unseen samples are likely to be predicted by the model.
# The best possible score is 1.0; the score can also be negative, which means
# the model does worse than always predicting the mean of the test targets.

Mean absolute error = 5.91
Mean squared error = 41.51
R2 score = -0.49
