## Importing Basic Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.regressionplots import influence_plot
import statsmodels.formula.api as smf
import numpy as np

from sklearn. compose import ColumnTransformer

import warnings
warnings.filterwarnings("ignore")

## Loading and Understanding the Data
### Performing Basic EDA

In [None]:
# Load the semicolon-delimited emissions dataset into a DataFrame.
df = pd.read_csv("co2_emissions.csv", sep=";")
# NOTE: a notebook cell only renders its final expression, so this preview is
# evaluated but not shown; the visible output of the cell is the shape below.
df.head()
df.shape

(7385, 12)

In [None]:
# Shorten the verbose source column names to the abbreviations used in the
# rest of the notebook.
df = df.rename(
    columns={
        'engine_size': 'eng_sz',
        'cylinders': 'cyln',
        'fuel_consumption_comb(l/100km)': 'fuel_comb',
        'fuel_consumption_city': 'fuel_city',
        'fuel_consumption_hwy': 'fuel_hwy',
        'fuel_consumption_comb(mpg)': 'fuel_mpg',
        'co2_emissions': 'co2_emi',
    }
)
df

Unnamed: 0,make,model,vehicle_class,eng_sz,cyln,transmission,fuel_type,fuel_city,fuel_hwy,fuel_comb,fuel_mpg,co2_emi
0,ACURA,ILX,COMPACT,2.0,4,AS,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...
7380,VOLVO,XC40 T5 AWD,SUV - SMALL,2.0,4,AS,Z,10.7,7.7,9.4,30,219
7381,VOLVO,XC60 T5 AWD,SUV - SMALL,2.0,4,AS,Z,11.2,8.3,9.9,29,232
7382,VOLVO,XC60 T6 AWD,SUV - SMALL,2.0,4,AS,Z,11.7,8.6,10.3,27,240
7383,VOLVO,XC90 T5 AWD,SUV - STANDARD,2.0,4,AS,Z,11.2,8.3,9.9,29,232


In [None]:
# Remove fully duplicated rows, then renumber the index so it is contiguous.
df = (
    df
    .drop_duplicates()
    .reset_index(drop=True)
)
df

Unnamed: 0,make,model,vehicle_class,eng_sz,cyln,transmission,fuel_type,fuel_city,fuel_hwy,fuel_comb,fuel_mpg,co2_emi
0,ACURA,ILX,COMPACT,2.0,4,AS,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...
6268,VOLVO,XC40 T5 AWD,SUV - SMALL,2.0,4,AS,Z,10.7,7.7,9.4,30,219
6269,VOLVO,XC60 T5 AWD,SUV - SMALL,2.0,4,AS,Z,11.2,8.3,9.9,29,232
6270,VOLVO,XC60 T6 AWD,SUV - SMALL,2.0,4,AS,Z,11.7,8.6,10.3,27,240
6271,VOLVO,XC90 T5 AWD,SUV - STANDARD,2.0,4,AS,Z,11.2,8.3,9.9,29,232


In [None]:
# Collapse the individual manufacturers into four coarse market segments.
# Each replace() acts on the output of the previous one; the segment labels
# never appear in a later manufacturer list, so the order of the steps does
# not affect the result.
df['Make_Type'] = (
    df['make']
    .replace(
        ['BUGATTI', 'PORSCHE', 'MASERATI', 'ASTON MARTIN', 'LAMBORGHINI', 'JAGUAR', 'SRT'],
        'Sports',
    )
    .replace(
        ['ALFA ROMEO', 'AUDI', 'BMW', 'BUICK', 'CADILLAC', 'CHRYSLER', 'DODGE', 'GMC',
         'INFINITI', 'JEEP', 'LAND ROVER', 'LEXUS', 'MERCEDES-BENZ', 'MINI', 'SMART', 'VOLVO'],
        'Premium',
    )
    .replace(
        ['ACURA', 'BENTLEY', 'LINCOLN', 'ROLLS-ROYCE', 'GENESIS'],
        'Luxury',
    )
    .replace(
        ['CHEVROLET', 'FIAT', 'FORD', 'KIA', 'HONDA', 'HYUNDAI', 'MAZDA', 'MITSUBISHI',
         'NISSAN', 'RAM', 'SCION', 'SUBARU', 'TOYOTA', 'VOLKSWAGEN'],
        'General',
    )
)

In [None]:
# The raw manufacturer column is superseded by Make_Type; drop it in place and
# list the distinct segments that remain.
df.drop(columns=['make'], inplace=True)
df['Make_Type'].unique()

array(['Luxury', 'Premium', 'Sports', 'General'], dtype=object)

In [None]:
# Drop the free-text model name column in place.
df.drop(columns=['model'], inplace=True)

In [None]:
# Collapse the detailed vehicle classes into four body-style groups.
# Each replace() acts on the output of the previous one; the group labels
# never appear in a later class list, so the steps do not interfere.
df['Vehicle_Class_Type'] = (
    df['vehicle_class']
    .replace(
        ['COMPACT', 'MINICOMPACT', 'SUBCOMPACT'],
        'Hatchback',
    )
    .replace(
        ['MID-SIZE', 'TWO-SEATER', 'FULL-SIZE', 'STATION WAGON - SMALL', 'STATION WAGON - MID-SIZE'],
        'Sedan',
    )
    .replace(
        ['SUV - SMALL', 'SUV - STANDARD', 'MINIVAN'],
        'SUV',
    )
    .replace(
        ['VAN - CARGO', 'VAN - PASSENGER', 'PICKUP TRUCK - STANDARD', 'SPECIAL PURPOSE VEHICLE', 'PICKUP TRUCK - SMALL'],
        'Truck',
    )
)

In [None]:
# The raw class column is superseded by Vehicle_Class_Type; drop it in place
# and list the distinct groups that remain.
df.drop(columns=['vehicle_class'], inplace=True)
df['Vehicle_Class_Type'].unique()

array(['Hatchback', 'SUV', 'Sedan', 'Truck'], dtype=object)

In [None]:
# Show the rows whose fuel_type code is 'N'.
df.loc[df['fuel_type'] == 'N']

Unnamed: 0,eng_sz,cyln,transmission,fuel_type,fuel_city,fuel_hwy,fuel_comb,fuel_mpg,co2_emi,Make_Type,Vehicle_Class_Type
2232,3.6,6,AS,N,15.2,9.5,12.7,22,213,General,Sedan


## Importing necessary Libraries for Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Defining Function for calculating MAPE

In [None]:
def mape(actual, predicted):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|actual - predicted| / |actual|) * 100``.

    Parameters
    ----------
    actual : array-like of numbers
        Ground-truth values. Must not contain zeros, because each error is
        divided by the corresponding actual value.
    predicted : array-like of numbers
        Predicted values; combined with ``actual`` element-wise following
        NumPy broadcasting rules.

    Returns
    -------
    float
        The MAPE as a percentage (e.g. ``10.0`` means 10 %).

    Raises
    ------
    ValueError
        If ``actual`` contains a zero. Without this check the division would
        silently produce ``inf``/``nan``, which is easy to miss when warnings
        are suppressed.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    if np.any(actual == 0):
        raise ValueError("mape is undefined when `actual` contains zeros")
    return np.mean(np.abs((actual - predicted) / actual)) * 100

## Dropping irrelevant columns and then removing the duplicates again

In [None]:
# Build the modelling frame: keep every engineered column except the two
# combined fuel-consumption measures.
# NOTE(review): presumably dropped because they are derived from the city and
# highway figures that are kept -- confirm.
df_final = df.drop(['fuel_comb', 'fuel_mpg'], axis=1)

In [None]:
# Inspect the low end of the target: rows with co2_emi below 100.
df_final.loc[df_final['co2_emi'] < 100]

Unnamed: 0,eng_sz,cyln,transmission,fuel_type,fuel_city,fuel_hwy,co2_emi,Make_Type,Vehicle_Class_Type
3577,1.6,4,AM,X,4.2,4.0,96,General,Sedan
5221,1.6,4,AM,X,4.2,4.0,96,General,Sedan
6055,1.6,4,AM,X,4.2,4.2,99,General,Sedan


In [None]:
# Inspect the high end of the target: rows with co2_emi above 400.
df_final.loc[df_final['co2_emi'] > 400]

Unnamed: 0,eng_sz,cyln,transmission,fuel_type,fuel_city,fuel_hwy,co2_emi,Make_Type,Vehicle_Class_Type
46,4.2,8,M,Z,21.2,13.4,407,Premium,Sedan
48,5.2,10,M,Z,21.1,13.8,409,Premium,Sedan
50,4.2,8,M,Z,21.2,13.4,407,Premium,Sedan
52,5.2,10,M,Z,21.1,13.8,409,Premium,Sedan
75,6.8,8,AS,Z,22.3,14.9,437,Luxury,Sedan
...,...,...,...,...,...,...,...,...,...
5286,6.2,8,A,Z,20.9,13.8,413,Premium,SUV
5321,6.5,12,AM,Z,26.3,15.6,487,Sports,Sedan
5322,6.5,12,AM,Z,26.6,15.8,493,Sports,Sedan
5784,6.0,12,AS,Z,20.0,13.8,401,Luxury,SUV


In [None]:
# Dropping columns can make previously distinct rows identical, so
# de-duplicate again and renumber the index.
df_final = (
    df_final
    .drop_duplicates()
    .reset_index(drop=True)
)
df_final

Unnamed: 0,eng_sz,cyln,transmission,fuel_type,fuel_city,fuel_hwy,co2_emi,Make_Type,Vehicle_Class_Type
0,2.0,4,AS,Z,9.9,6.7,196,Luxury,Hatchback
1,2.4,4,M,Z,11.2,7.7,221,Luxury,Hatchback
2,1.5,4,AV,Z,6.0,5.8,136,Luxury,Hatchback
3,3.5,6,AS,Z,12.7,9.1,255,Luxury,SUV
4,3.5,6,AS,Z,12.1,8.7,244,Luxury,SUV
...,...,...,...,...,...,...,...,...,...
4920,2.0,4,AS,X,10.2,7.5,210,Premium,SUV
4921,2.0,4,AS,Z,10.7,7.7,219,Premium,SUV
4922,2.0,4,AS,Z,11.2,8.3,232,Premium,SUV
4923,2.0,4,AS,Z,11.7,8.6,240,Premium,SUV


## Performing Train test split

In [None]:
# Target: the CO2 emission figure.
y_data = df_final['co2_emi']

# Features: every remaining column.
x_data = df_final.drop('co2_emi', axis=1)

In [None]:
# Preview the feature matrix (df_final without the co2_emi target column).
x_data

Unnamed: 0,eng_sz,cyln,transmission,fuel_type,fuel_city,fuel_hwy,Make_Type,Vehicle_Class_Type
0,2.0,4,AS,Z,9.9,6.7,Luxury,Hatchback
1,2.4,4,M,Z,11.2,7.7,Luxury,Hatchback
2,1.5,4,AV,Z,6.0,5.8,Luxury,Hatchback
3,3.5,6,AS,Z,12.7,9.1,Luxury,SUV
4,3.5,6,AS,Z,12.1,8.7,Luxury,SUV
...,...,...,...,...,...,...,...,...
4920,2.0,4,AS,X,10.2,7.5,Premium,SUV
4921,2.0,4,AS,Z,10.7,7.7,Premium,SUV
4922,2.0,4,AS,Z,11.2,8.3,Premium,SUV
4923,2.0,4,AS,Z,11.7,8.6,Premium,SUV


In [None]:
# Step 1 -> train/test/split
X_train, X_test,y_train,y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=42)
X_train

Unnamed: 0,eng_sz,cyln,transmission,fuel_type,fuel_city,fuel_hwy,Make_Type,Vehicle_Class_Type
252,2.4,4,A,X,12.7,9.1,Premium,SUV
3138,2.5,4,AS,X,10.2,8.3,General,SUV
720,3.8,6,AM,Z,12.3,8.9,Sports,Hatchback
2584,2.0,4,AS,Z,11.8,9.1,Premium,SUV
4567,3.0,6,AS,Z,11.3,8.7,Premium,SUV
...,...,...,...,...,...,...,...,...
4426,2.5,4,AV,X,9.1,7.1,General,SUV
466,2.4,4,A,X,10.9,7.7,Premium,SUV
3092,2.4,4,AS,X,11.3,9.5,General,SUV
3772,3.5,6,AS,X,14.9,11.3,Luxury,SUV


## Applying Transformations on columns

In [None]:
# One-hot encode the categorical columns and pass the numeric ones through.
# Columns are addressed by position (not by name) so the fitted pipeline also
# accepts plain NumPy rows at prediction time:
#   2 = transmission, 3 = fuel_type, 6 = Make_Type, 7 = Vehicle_Class_Type
# drop='first' removes one dummy per feature to avoid perfectly collinear
# indicator columns in the linear model.
# NOTE(review): the encoder is left at its default handling of categories not
# seen during fit; with very rare categories (e.g. a single 'N' fuel_type row)
# a different split could surface an unseen category at predict time.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    _ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    _ohe = OneHotEncoder(sparse=False, drop='first')

transformer = ColumnTransformer([
                                  ('ohe_transform', _ohe, [2, 3, 6, 7]),
                                ], remainder='passthrough')

## Executing Pipe

In [None]:
# Train the model: chain the column transformer with an ordinary
# least-squares linear regression and fit both on the training split.
trf6 = LinearRegression()

pipe = Pipeline(steps=[
    ('EDA', transformer),
    ('Model_Building', trf6),
])

pipe.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features with the fitted pipeline.
y_pred = pipe.predict(X_test)

In [None]:
# Evaluate the fitted pipeline on the held-out test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# mape() returns the error in percent; lower is better.
mape_pct = mape(y_test, y_pred)

print ('Linear Regression::')
print('MAE     = ',mae)
print('RMSE    = ',rmse)
# Report the error itself under the MAPE label, and its complement under an
# explicit accuracy label, so the two figures cannot be confused.
print('MAPE    = ', mape_pct)
print('Accuracy (100 - MAPE) = ', 100 - mape_pct)
# Coefficient of determination of the predictions on the test split.
print('R^2     = ', r2_score(y_test, y_pred))

Linear Regression::
MAE     =  3.511074508501738
RMSE    =  5.661226083781312
MAPE    =  98.58400743909012


## Saving Models and Pickling them

In [None]:
import pickle

# Persist the fitted pipeline to disk. The context manager guarantees the file
# handle is flushed and closed even if serialisation raises.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
import numpy as np

# Reload the pipeline from disk; the context manager closes the file handle
# as soon as deserialisation finishes.
# SECURITY: pickle.load can execute arbitrary code -- only load pickle files
# that you created yourself or otherwise trust.
with open('pipe.pkl', 'rb') as model_file:
    pipe = pickle.load(model_file)

In [None]:
# One sample row in the training column order:
#   eng_sz, cyln, transmission, fuel_type, fuel_city, fuel_hwy,
#   Make_Type, Vehicle_Class_Type
# dtype=object keeps the numeric and string values as-is instead of coercing
# everything to strings. These values match a dataset row whose recorded
# co2_emi is 493.
test_input = np.array(
    [[6.5, 12, 'AM', 'Z', 26.6, 15.8, 'Sports', 'Sedan']],
    dtype=object,
)

prediction = pipe.predict(test_input)

print (prediction)

[498.95198866]
