# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error

## Importing Dataset

In [2]:
df = pd.read_csv("competition_edu_dataset.csv")
df

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Car Name
0,13.0,8,400.0,190,4422,12.5,72,1,chrysler newport royal
1,13.0,8,350.0,145,3988,13.0,73,1,chevrolet malibu
2,15.5,8,350.0,170,4165,11.4,77,1,chevrolet monte carlo landau
3,17.0,6,231.0,110,3907,21.0,75,1,buick century
4,13.0,8,318.0,150,3755,14.0,76,1,dodge d100
...,...,...,...,...,...,...,...,...,...
295,22.0,6,232.0,112,2835,14.7,82,1,ford granada l
296,18.0,6,250.0,78,3574,21.0,76,1,ford granada ghia
297,18.1,8,302.0,139,3205,11.2,78,1,ford futura
298,15.0,8,318.0,150,4135,13.5,72,1,plymouth fury iii


In [3]:
df.info

<bound method DataFrame.info of       MPG  Cylinders  Displacement Horsepower  Weight  Acceleration  \
0    13.0          8         400.0        190    4422          12.5   
1    13.0          8         350.0        145    3988          13.0   
2    15.5          8         350.0        170    4165          11.4   
3    17.0          6         231.0        110    3907          21.0   
4    13.0          8         318.0        150    3755          14.0   
..    ...        ...           ...        ...     ...           ...   
295  22.0          6         232.0        112    2835          14.7   
296  18.0          6         250.0         78    3574          21.0   
297  18.1          8         302.0        139    3205          11.2   
298  15.0          8         318.0        150    4135          13.5   
299  40.9          4          85.0          ?    1835          17.3   

     Model Year  Origin                      Car Name  
0            72       1        chrysler newport royal  
1  

In [4]:
# Converting all the '?' with the mean of their respective columns
df.Horsepower = df.Horsepower.str.replace('?','NaN', regex = False).astype(float)
df.Horsepower.fillna(df.Horsepower.mean(),inplace = True)
df.Horsepower = df.Horsepower.astype(int)

In [5]:
df = df.drop("Car Name",axis = 1)
df

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,13.0,8,400.0,190,4422,12.5,72,1
1,13.0,8,350.0,145,3988,13.0,73,1
2,15.5,8,350.0,170,4165,11.4,77,1
3,17.0,6,231.0,110,3907,21.0,75,1
4,13.0,8,318.0,150,3755,14.0,76,1
...,...,...,...,...,...,...,...,...
295,22.0,6,232.0,112,2835,14.7,82,1
296,18.0,6,250.0,78,3574,21.0,76,1
297,18.1,8,302.0,139,3205,11.2,78,1
298,15.0,8,318.0,150,4135,13.5,72,1


In [6]:
# Checking duplicates and null values
print('sum of duplicated values: {}\n'.format(df.duplicated().sum()))
print('sum of null values: {}'.format(df.isnull().sum()))


sum of duplicated values: 0

sum of null values: MPG             0
Cylinders       0
Displacement    0
Horsepower      0
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64


In [7]:
df.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,23.679333,5.43,192.118333,104.35,2952.82,15.567,76.043333,1.563333
std,7.803218,1.703361,104.71123,38.064645,836.196298,2.811469,3.782744,0.792517
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.0,4.0,98.0,75.75,2219.75,13.875,73.0,1.0
50%,23.0,4.0,144.5,94.0,2789.5,15.5,76.0,1.0
75%,29.0,8.0,263.25,125.0,3565.75,17.125,79.0,2.0
max,44.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [8]:
# Checking multicollinearity using Variance Inflation Factor
# A variable/feature affected by multicollinearity will have a value greater than 10
x1 = sm.tools.add_constant(df) 

series1 = pd.Series([variance_inflation_factor(x1.values,i) for i in range(x1.shape[1])],
                  index = x1.columns)

series1

const           778.530178
MPG               5.738554
Cylinders        11.682240
Displacement     24.478012
Horsepower        9.195574
Weight           12.498465
Acceleration      2.575253
Model Year        1.975163
Origin            1.825230
dtype: float64

In [9]:
# Doing Feature Selection to get rid of the multicollinearity by dropping 'Cylinder','Displacement','Weight' 
# since they have VIF value greater than 10'

df1 = df.drop(['Cylinders','Displacement','Weight'],axis = 1)
#df1['Car Name'] = Car Name
x2 = sm.tools.add_constant(df1)

series2 = pd.Series([variance_inflation_factor(x2.values,i) for i in range(x2.shape[1])],
                  index = x2.columns)

series2

const           696.413687
MPG               3.992778
Horsepower        4.234574
Acceleration      2.076946
Model Year        1.615907
Origin            1.550678
dtype: float64

In [10]:
# Preparing Dataset for Training
x = df1.drop("MPG",axis = 1)
y = df1.MPG

In [11]:
x

Unnamed: 0,Horsepower,Acceleration,Model Year,Origin
0,190,12.5,72,1
1,145,13.0,73,1
2,170,11.4,77,1
3,110,21.0,75,1
4,150,14.0,76,1
...,...,...,...,...
295,112,14.7,82,1
296,78,21.0,76,1
297,139,11.2,78,1
298,150,13.5,72,1


In [12]:
y

0      13.0
1      13.0
2      15.5
3      17.0
4      13.0
       ... 
295    22.0
296    18.0
297    18.1
298    15.0
299    40.9
Name: MPG, Length: 300, dtype: float64

## Encoding Categorical Data

In [13]:
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [3])],  remainder = 'passthrough')
ct


In [14]:
x_scaled = np.array(ct.fit_transform(x))

In [15]:
x_scaled

array([[  1. ,   0. ,   0. , 190. ,  12.5,  72. ],
       [  1. ,   0. ,   0. , 145. ,  13. ,  73. ],
       [  1. ,   0. ,   0. , 170. ,  11.4,  77. ],
       ...,
       [  1. ,   0. ,   0. , 139. ,  11.2,  78. ],
       [  1. ,   0. ,   0. , 150. ,  13.5,  72. ],
       [  0. ,   1. ,   0. , 104. ,  17.3,  80. ]])

## Splitting Dataset into Training dataset and Test dataset

In [16]:
x_train,x_test,y_train,y_test = train_test_split(x_scaled,y,test_size = .3,random_state = 0)

In [17]:
x_train

array([[  1. ,   0. ,   0. , 198. ,  10. ,  70. ],
       [  0. ,   1. ,   0. ,  54. ,  23.5,  72. ],
       [  1. ,   0. ,   0. ,  84. ,  12.9,  81. ],
       ...,
       [  0. ,   0. ,   1. ,  61. ,  19. ,  74. ],
       [  0. ,   1. ,   0. , 104. ,  15.8,  81. ],
       [  1. ,   0. ,   0. , 150. ,  11.5,  73. ]])

In [18]:
y_train

282    15.0
44     23.0
272    30.0
189    28.0
152    35.7
       ... 
251    15.0
192    34.1
117    32.0
47     34.5
172    14.0
Name: MPG, Length: 210, dtype: float64

In [19]:
x_test

array([[  0. ,   0. ,   1. ,  70. ,  16.8,  77. ],
       [  1. ,   0. ,   0. , 170. ,  10. ,  70. ],
       [  1. ,   0. ,   0. , 175. ,  12. ,  71. ],
       [  1. ,   0. ,   0. , 110. ,  18. ,  74. ],
       [  1. ,   0. ,   0. , 225. ,  10. ,  70. ],
       [  0. ,   0. ,   1. ,  67. ,  13.8,  80. ],
       [  0. ,   0. ,   1. ,  70. ,  17. ,  76. ],
       [  1. ,   0. ,   0. , 230. ,   9.5,  73. ],
       [  0. ,   1. ,   0. ,  90. ,  15.5,  73. ],
       [  1. ,   0. ,   0. , 100. ,  15. ,  71. ],
       [  0. ,   0. ,   1. ,  96. ,  13.9,  82. ],
       [  1. ,   0. ,   0. , 145. ,  14. ,  75. ],
       [  1. ,   0. ,   0. ,  88. ,  18.6,  82. ],
       [  0. ,   1. ,   0. ,  60. ,  19. ,  71. ],
       [  0. ,   1. ,   0. ,  67. ,  19.9,  80. ],
       [  1. ,   0. ,   0. ,  63. ,  14.7,  82. ],
       [  0. ,   1. ,   0. , 110. ,  12.8,  77. ],
       [  1. ,   0. ,   0. , 100. ,  13. ,  71. ],
       [  0. ,   0. ,   1. ,  88. ,  14.5,  70. ],
       [  1. ,   0. ,   0. ,  8

In [20]:
y_test

208    33.5
188    15.0
12     13.0
221    16.0
239    14.0
       ... 
76     16.5
212    23.0
225    16.5
255    21.0
237    15.0
Name: MPG, Length: 90, dtype: float64

## Training the Regression model using Linear Regression

In [21]:
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [22]:
# Predicting the Test set results
y_pred = regressor.predict(x_test)
y_pred

array([31.02347829, 12.09113757, 11.11751235, 19.00187049,  4.66253933,
       34.77259506, 30.28562166,  6.15368416, 25.54099125, 19.83448076,
       32.09592135, 16.80215915, 26.84000808, 26.65720445, 31.0808117 ,
       32.0534634 , 26.68597022, 20.77643961, 25.16992816, 21.4993556 ,
       24.6175139 , 21.91644258, 12.85929958, 24.98150297, 20.74582193,
       20.54147425, 33.36315589, 33.58918059, 30.83078419, 25.4352577 ,
       14.02316849, 21.32614994, 34.92956593, 26.65913053, 25.7881763 ,
       26.85246489, 27.54350245, 27.77276376, 27.30793757, 19.0008218 ,
       28.893483  , 33.92264417, 31.85577627, 31.60812565, 19.10992492,
       29.44390042, 25.04081226, 25.11144746, 24.80538133, 21.60713114,
       23.94684216, 28.05559688, 28.45996456, 32.79448148, 19.17245443,
       27.30175897, 21.73432097, 21.59169957, 23.78410041, 24.73953939,
       31.85733291, 27.49398751, 29.055667  , 16.89715825, 25.2354578 ,
       14.60297215, 25.61079786, 12.20048615, 33.85254165, 27.33

In [23]:
# Calculating overall accuracy and mean squared error of the model
print(f"Overall Model Accuracy: {r2_score(y_test,y_pred)}")
print(f"Mean Squared Error: {mean_squared_error(y_test,y_pred)}")

Overall Model Accuracy: 0.7234800241016119
Mean Squared Error: 17.25800804111719


## Training the Regression model using Random Forest Regressor

In [24]:
r_forest = RandomForestRegressor(n_estimators = 20,random_state = 0)
r_forest.fit(x_train,y_train)

In [25]:
# Predicting the Test set results
ry_predict = r_forest.predict(x_test)
ry_predict

array([31.99      , 14.55      , 13.5       , 17.225     , 14.2       ,
       35.215     , 32.165     , 13.2       , 25.95      , 18.73333333,
       29.055     , 14.1       , 30.55      , 29.85      , 34.925     ,
       35.875     , 23.175     , 20.03333333, 24.11      , 21.815     ,
       23.21      , 22.55      , 15.275     , 23.775     , 23.84      ,
       24.95      , 38.87      , 33.18      , 29.67      , 26.425     ,
       14.96666667, 20.6       , 35.935     , 24.85      , 30.285     ,
       31.64      , 31.25      , 30.005     , 30.205     , 14.9       ,
       27.65      , 34.76      , 37.225     , 30.35      , 16.895     ,
       33.945     , 24.125     , 26.825     , 28.185     , 19.235     ,
       22.61      , 30.725     , 25.69      , 32.99      , 17.63      ,
       25.6       , 20.125     , 18.095     , 26.6       , 24.825     ,
       41.95      , 27.395     , 26.9       , 14.79166667, 28.455     ,
       13.6       , 25.35      , 13.125     , 34.4       , 27.19

In [26]:
# Calculating overall accuracy and mean squared error of the model
print(f"Overall Model Accuracy: {r2_score(y_test,ry_predict)}")
print(f"Mean Squared Error: {mean_squared_error(y_test,ry_predict)}")

Overall Model Accuracy: 0.8058247731041526
Mean Squared Error: 12.118754228395055
