In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load seaborn's bundled 'mpg' example dataset into a DataFrame.
df = sns.load_dataset(name='mpg')

In [3]:
# Display the dataset returned by sns.load_dataset('mpg').
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [4]:
# Drop the 'name' column so it is not carried into the feature set.
# df is rebound (no inplace=True) and errors='ignore' is passed so that this
# cell is idempotent: re-running it after 'name' is already gone no longer
# raises KeyError.
df = df.drop(columns=['name'], errors='ignore')

In [5]:
# Display df again now that the 'name' column has been dropped.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [6]:
# Target: the 'mpg' column. Features: every remaining column of df.
y = df['mpg']
X = df.drop(columns='mpg')

In [7]:
# Display the feature frame (df without the 'mpg' column).
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,usa
1,8,350.0,165.0,3693,11.5,70,usa
2,8,318.0,150.0,3436,11.0,70,usa
3,8,304.0,150.0,3433,12.0,70,usa
4,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,usa
394,4,97.0,52.0,2130,24.6,82,europe
395,4,135.0,84.0,2295,11.6,82,usa
396,4,120.0,79.0,2625,18.6,82,usa


In [8]:
# Display the target series (the 'mpg' column of df).
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [10]:
# Display the training features produced by train_test_split.
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
298,8,350.0,125.0,3900,17.4,79,usa
13,8,455.0,225.0,3086,10.0,70,usa
376,4,91.0,68.0,2025,18.2,82,japan
61,4,122.0,86.0,2226,16.5,72,usa
346,4,97.0,67.0,2065,17.8,81,japan
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,usa
255,4,140.0,88.0,2720,15.4,78,usa
72,8,304.0,150.0,3892,12.5,72,usa
235,4,97.0,75.0,2265,18.2,77,japan


In [11]:
# Show the (rows, columns) shape of the training and test feature sets.
(X_train.shape, X_test.shape)

((318, 7), (80, 7))

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [13]:
# Partition df's columns by dtype: numeric columns vs. everything else.
# select_dtypes is used instead of comparing each dtype against 'object' so
# that text columns stored with a dedicated string (or categorical) dtype are
# still routed to cat_cols rather than being mistaken for numeric features.
# Column order follows df.columns, so num_cols still starts with the target
# column 'mpg' at this point.
num_cols = df.select_dtypes(include='number').columns.tolist()
cat_cols = df.select_dtypes(exclude='number').columns.tolist()

In [14]:
# Remove the target column 'mpg' from the numeric feature list.
# Filtering by name (rather than `del num_cols[0]`) makes the cell safe to
# re-run: a positional delete would strip a real feature on the second run.
num_cols = [col for col in num_cols if col != 'mpg']

In [15]:
# Display the numeric column list after its first entry was removed.
num_cols

['cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'model_year']

In [16]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
cat_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder()),
    ]
)

# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler()),
    ]
)

In [17]:
# Apply each pipeline to its own group of columns and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ]
)

In [18]:
# Fit the preprocessor on the training features only, then apply the fitted
# transformation to both sets. The right-hand side is evaluated left to right,
# so fit_transform runs before transform.
# NOTE: X_train / X_test are overwritten with the transformed output, so this
# cell expects the untransformed split as its input.
X_train, X_test = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [20]:
# Base regressors that are later combined by the voting ensemble.
lr = LinearRegression()
svr = SVR()
# The tree permutes features at every split, so without a seed its fit (and
# therefore the ensemble's predictions and metrics) can change between runs.
# Use the same seed as the train/test split to keep the notebook reproducible.
dtr = DecisionTreeRegressor(random_state=1)

In [24]:
from sklearn.ensemble import VotingRegressor

# Ensemble of the three base regressors; n_jobs=-1 requests all available
# cores when fitting them.
base_estimators = [('LR', lr), ('SVR', svr), ('DTR', dtr)]
vr = VotingRegressor(estimators=base_estimators, n_jobs=-1)

In [25]:
# Fit the voting ensemble on the preprocessed training data.
vr.fit(X=X_train, y=y_train)

In [26]:
# Predict mpg for the held-out test features.
y_pred = vr.predict(X=X_test)

In [27]:
# Display the predictions returned by vr.predict(X_test).
y_pred

array([20.98078144, 27.28930575, 18.37435016, 25.03415909, 22.70600916,
       14.79210603, 28.01344313, 38.66960017, 15.62719839, 12.53824957,
       30.37388943, 17.54015464, 20.33185742, 26.01801736, 35.97803624,
       22.95981098, 12.89218065, 20.71507578, 11.65158676, 34.62292776,
       25.08749968, 30.6194439 , 20.47782622, 26.01963913, 25.57595386,
       27.81719864, 33.13680143, 36.3898234 , 16.6329338 , 26.95319352,
       28.13439695, 13.34133747, 22.06789998, 25.72572519, 24.84955999,
       14.06354348, 26.60898304, 12.16505446, 28.22188276, 23.57771407,
       24.63244247, 24.76264758, 18.87510652, 33.11358387, 23.55983607,
       20.86692889, 19.03606391, 13.47765347, 28.49887413, 17.90061309,
       25.62846717, 25.93398911, 15.84258307, 13.28868244, 29.15603891,
       25.04551574, 12.33206542, 13.68434977, 31.78719644, 34.75262977,
       35.07097055, 35.20427066, 16.16259137, 26.0255723 , 17.77630946,
       32.25785803, 23.85874943, 29.65610878, 31.44592826, 13.37

In [28]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [29]:
# Mean absolute error of the ensemble on the test set.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

1.7865541924007253

In [30]:
# Mean squared error of the ensemble on the test set.
mean_squared_error(y_true=y_test, y_pred=y_pred)

6.672618979029951

In [32]:
# Coefficient of determination (R^2) on the test set.
score = r2_score(y_true=y_test, y_pred=y_pred)

In [33]:
# Display the R^2 value computed by r2_score.
score

0.8815675199229568

In [35]:
# Adjusted R^2: 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number of
# test samples and p is the number of columns in the transformed test matrix.
n_obs = len(y_test)
n_features = X_test.shape[1]
adjusted_score = 1 - (1 - score) * (n_obs - 1) / (n_obs - n_features - 1)

In [36]:
# Display the adjusted R^2 value.
adjusted_score

0.866340486770194