In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## CONTEXT
#### Hi, I am currently pursuing a PGP in Data Science. Recently our Statistics department assigned us a project on regression and hypothesis testing. While looking for a dataset relevant to my project, I stumbled upon this one.

## CONTENT
#### This dataset captures how the CO2 emissions of a vehicle vary with its different features. The dataset has been taken from the Government of Canada's official open data website. This is a compiled version containing data over a period of 7 years.
#### There are 7385 rows and 12 columns in total. A few abbreviations have been used to describe the features. I am listing them here; the same can be found in the Data Description sheet.

## Model
#### 4WD/4X4 = Four-wheel drive
#### AWD = All-wheel drive
#### FFV = Flexible-fuel vehicle
#### SWB = Short wheelbase
#### LWB = Long wheelbase
#### EWB = Extended wheelbase

## Transmission
#### A = Automatic
#### AM = Automated manual
#### AS = Automatic with select shift
#### AV = Continuously variable
#### M = Manual
#### 3 - 10 = Number of gears

## Fuel type
#### X = Regular gasoline
#### Z = Premium gasoline
#### D = Diesel
#### E = Ethanol (E85)
#### N = Natural gas

## Fuel Consumption
#### City and highway fuel consumption ratings are shown in litres per 100 kilometres (L/100 km); the combined rating (55% city, 45% hwy) is shown in L/100 km and in miles per gallon (mpg).

## CO2 Emissions
#### The tailpipe emissions of carbon dioxide (in grams per kilometre) for combined city and highway driving

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Reading the CSV file

In [None]:
# Path of the CO2 emissions CSV inside the Kaggle input directory.
file = "/kaggle/input/co2-emission-by-vehicles/CO2 Emissions_Canada.csv"
# Load the CSV into the DataFrame `df` that the following cells operate on.
df = pd.read_csv(file)

### Checking the information from the dataset.

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

### Having a clear look at attributes.

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

####  There are 7385 entries of data and 12 columns. There are 7 numerical variables and 5 categorical variables in the dataset.

### Understanding the distributions of data attributes.

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
df.describe(include='all')

### Plotting the histogram of all numerical features.

In [None]:
# One 50-bin histogram per numeric column, drawn on a 20x10 figure.
df.hist(figsize=(20,10),bins=50)

In [None]:
# Count the missing values in each column.
df.isna().sum()

#### There are no missing values in the dataset.

In [None]:
# List the distinct raw transmission codes before they are grouped.
df['Transmission'].unique()

### Replacing similar labels with a single label.

In [None]:
# Collapse the raw transmission codes (e.g. 'A6', 'AS10', 'AV') into five
# readable families. The family is given by the letter prefix; the trailing
# digits are only the number of gears, so they are stripped before the lookup.
# This handles any gear count instead of a hard-coded list of codes.
TRANSMISSION_LABELS = {
    'A': 'Automatic',
    'AM': 'Automated Manual',
    'AS': 'Automatic with Select Shift',
    'AV': 'Continuously Variable',
    'M': 'Manual',
}
transmission_prefix = df['Transmission'].str.rstrip('0123456789')
# Values with no matching prefix (including labels that were already mapped
# when the cell is re-run) are left untouched, so the cell stays idempotent.
df['Transmission'] = transmission_prefix.map(TRANSMISSION_LABELS).fillna(df['Transmission'])

In [None]:
# Check the distinct transmission labels after the relabelling step.
df['Transmission'].unique()

In [None]:
# Distribution plot of the target column.
# NOTE(review): `distplot` is reported as deprecated in newer seaborn releases — confirm against the installed version.
sns.distplot(df['CO2 Emissions(g/km)'])

In [None]:
# Violin plot of the target column. The series is passed explicitly as `x=`
# so the call does not depend on seaborn's positional-argument order, which
# differs between seaborn releases.
sns.violinplot(x=df['CO2 Emissions(g/km)'])

In [None]:
# Bar chart of the number of rows per make, most frequent first.
plt.figure(figsize=(20, 5))
make_counts = df.groupby(['Make'])['Make'].count().sort_values(ascending=False)
make_counts.plot.bar()

#### We can see that the FORD, CHEVROLET and BMW brands have the highest frequencies and are dominant in our dataset.

In [None]:
# Row count per model, then the ten largest counts.
model_counts = df.groupby(['Model'])['Model'].count()
model_counts.sort_values(ascending=False).head(10)

#### These are the 10 most frequent vehicle models in our dataset.

In [None]:
# Bar chart of the number of rows per vehicle class, least frequent first.
plt.figure(figsize=(15, 5))
class_counts = df.groupby(['Vehicle Class'])['Vehicle Class'].count().sort_values()
class_counts.plot.bar()
plt.tight_layout()
plt.xticks(rotation=45, fontsize=8)

#### From this plot we can see that many vehicles are of the SUV-SMALL, MID-SIZE and COMPACT types, and that VAN_CARGO is the least frequent vehicle class.

In [None]:
# Bar chart of the number of rows per cylinder count, most frequent first.
plt.figure(figsize=(15, 5))
cylinder_counts = df.groupby(['Cylinders'])['Cylinders'].count().sort_values(ascending=False)
cylinder_counts.plot.bar()

#### It looks like vehicles with 4 and 6 cylinders are the most common.

In [None]:
# Bar chart of the number of rows per transmission label, most frequent first.
transmission_counts = df.groupby(['Transmission'])['Transmission'].count()
transmission_counts.sort_values(ascending=False).plot.bar()

#### Automatic with Select Shift is the most frequent transmission type.

In [None]:
# Bar chart of the number of rows per fuel type, most frequent first.
fuel_counts = df.groupby(['Fuel Type'])['Fuel Type'].count()
fuel_counts.sort_values(ascending=False).plot.bar()

### Fuel type
#### X = Regular gasoline, Z = Premium gasoline, D = Diesel, E = Ethanol (E85), N = Natural gas
#### Gasoline is the most used type of fuel and natural gas is the least used type of fuel⛽.

In [None]:
# Mean CO2 emissions per make, highest first.
plt.figure(figsize=(20, 5))
make_mean_co2 = df.groupby(['Make'])['CO2 Emissions(g/km)'].mean()
make_mean_co2.sort_values(ascending=False).plot.bar()
plt.ylabel('CO2 Emissions(g/km)')

#### We can see that most of the costly car brands also emit a lot of carbon, while brands like Honda and Smart have the lowest-emitting vehicles.

In [None]:
# Mean CO2 emissions per vehicle class, highest first.
plt.figure(figsize=(10, 3))
class_mean_co2 = df.groupby(['Vehicle Class'])['CO2 Emissions(g/km)'].mean()
class_mean_co2.sort_values(ascending=False).plot.bar()
plt.xticks(fontsize=8)
plt.ylabel("CO2 Emissions(g/km)")

#### VAN-Passenger vehicles are the highest carbon-emitting type of vehicle and Station Wagon-Small is the lowest carbon-emitting type of vehicle.

In [None]:
# Median CO2 emissions for each engine size.
# The bars are ordered by engine size (the group key), not by the emission
# value: sorting by value would make the chart monotone by construction and
# hide whether emissions actually grow with engine size.
plt.figure(figsize=(15,3))
df.groupby(['Engine Size(L)'])['CO2 Emissions(g/km)'].median().sort_index().plot(kind='bar')
plt.xlabel('Engine Size')
plt.ylabel('CO2 Emissions(g/km)')

#### We can observe that as engine size increases, CO2 emissions also rise.

In [None]:
# Mean CO2 emissions for each cylinder count.
# Bars are ordered by the number of cylinders (the group key) so the chart
# shows the real trend; sorting by the emission value would force an
# increasing shape regardless of the data.
df.groupby(['Cylinders'])['CO2 Emissions(g/km)'].mean().sort_index().plot(kind='bar')
plt.ylabel('CO2 Emissions(g/km)')

#### We can observe that as the number of cylinders increases, carbon emissions also increase.

In [None]:
# Mean CO2 emissions per transmission label, lowest first.
transmission_mean_co2 = df.groupby(['Transmission'])['CO2 Emissions(g/km)'].mean()
transmission_mean_co2.sort_values().plot.bar()
plt.ylabel('CO2 Emissions(g/km)')

#### From the plot, vehicles with the Automatic transmission type emit a large amount of carbon dioxide.

In [None]:
# Mean CO2 emissions per fuel type, lowest first.
fuel_mean_co2 = df.groupby(['Fuel Type'])['CO2 Emissions(g/km)'].mean()
fuel_mean_co2.sort_values().plot.bar()
plt.ylabel('CO2 Emissions(g/km)')

#### We see that natural gas vehicles emit the least carbon and ethanol vehicles emit the most.

In [None]:
# Mean CO2 emissions for each city fuel-consumption value.
# Bars are ordered by fuel consumption (the group key) rather than by the
# emission value, so the x-axis is monotone and the chart shows how emissions
# change as consumption grows.
plt.figure(figsize=(25,5))
df.groupby(['Fuel Consumption City (L/100 km)'])['CO2 Emissions(g/km)'].mean().sort_index().plot(kind='bar')
plt.xticks(rotation=90, horizontalalignment='center', fontweight='light', fontsize='7')
plt.ylabel("CO2 Emissions(g/km)")

#### We can observe that as the fuel consumption of vehicles on city roads increases, carbon emissions also increase.

In [None]:
# Mean CO2 emissions for each highway fuel-consumption value.
# Bars are ordered by fuel consumption (the group key) rather than by the
# emission value, so the chart shows the real relationship between the two.
plt.figure(figsize=(20,5))
df.groupby(['Fuel Consumption Hwy (L/100 km)'])['CO2 Emissions(g/km)'].mean().sort_index().plot(kind='bar')
plt.xticks(rotation=90, horizontalalignment='center', fontweight='light', fontsize='7')
plt.ylabel("CO2 Emissions(g/km)")

#### Even on the highway, as the fuel consumption of vehicles increases, carbon emissions also increase.

In [None]:
# Mean CO2 emissions for each combined mpg value.
# Bars are ordered by mpg (the group key) rather than by the emission value,
# so reading left to right corresponds to increasing mileage.
plt.figure(figsize=(20,5))
df.groupby(['Fuel Consumption Comb (mpg)'])['CO2 Emissions(g/km)'].mean().sort_index().plot(kind='bar')
plt.xticks(rotation=90, horizontalalignment='center', fontweight='light', fontsize='7')
plt.ylabel("CO2 Emissions(g/km)")

#### As the number of miles a vehicle travels per gallon increases, carbon emissions decrease. This implies that vehicles with lower fuel consumption emit less carbon.

In [None]:
# City fuel consumption vs. CO2 emissions, coloured by fuel type.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x=df['Fuel Consumption City (L/100 km)'], y=df['CO2 Emissions(g/km)'], hue=df['Fuel Type'])

#### Fuel consumption of vehicles on city roads is positively correlated with carbon emissions.

In [None]:
# Highway fuel consumption vs. CO2 emissions, coloured by fuel type.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x=df['Fuel Consumption Hwy (L/100 km)'], y=df['CO2 Emissions(g/km)'], hue=df['Fuel Type'])

#### Fuel consumption of vehicles on the highway is positively correlated with carbon emissions.

In [None]:
# Combined fuel consumption (L/100 km) vs. CO2 emissions, coloured by engine size.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x=df['Fuel Consumption Comb (L/100 km)'], y=df['CO2 Emissions(g/km)'], hue=df['Engine Size(L)'])

#### We can see that large engines consume more fuel and emit a large amount of carbon.

In [None]:
# Combined fuel economy (mpg) vs. CO2 emissions, coloured by engine size.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x=df['Fuel Consumption Comb (mpg)'], y=df['CO2 Emissions(g/km)'], hue=df['Engine Size(L)'])

#### As the number of miles a vehicle can travel on one gallon increases, carbon emissions decrease. Fuel Consumption Comb (mpg) is negatively correlated with CO2 Emissions.

In [None]:
# Box plot of CO2 emissions for each fuel type.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
plt.figure(figsize=(8,6))
sns.boxplot(x=df['Fuel Type'], y=df['CO2 Emissions(g/km)'])

In [None]:
# Point plot of CO2 emissions against the number of cylinders.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.pointplot(x=df['Cylinders'], y=df['CO2 Emissions(g/km)'])

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds string columns (Make, Model, Vehicle Class, ...).
# pandas >= 2.0 no longer drops them silently in DataFrame.corr() and raises
# instead, so the numeric columns are selected explicitly first.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr,annot=True)

#### From the correlation plot we can see that all the features are positively correlated with carbon emissions except Fuel Consumption Comb (mpg), which is negatively correlated with carbon emissions.
#### We can see that Fuel Consumption Comb (L/100 km) is highly correlated with Fuel Consumption City (L/100 km) and Fuel Consumption Hwy (L/100 km), with correlations of 0.99 and 0.98 respectively. This shows that Fuel Consumption Comb (L/100 km) is a redundant feature and we can drop this column from the dataset.

### One hot Encoding of Features Fuel Type and Transmission.

In [None]:
# One-hot encode 'Fuel Type' (first level dropped, columns prefixed with
# 'Fuel') and swap the original column for the dummy columns.
Ft = pd.get_dummies(df['Fuel Type'], prefix='Fuel', drop_first=True)
df = pd.concat(objs=[df.drop(columns=['Fuel Type']), Ft], axis='columns')

In [None]:
# One-hot encode 'Transmission' (first level dropped, no prefix) and swap the
# original column for the dummy columns.
Tr = pd.get_dummies(df['Transmission'], drop_first=True)
df = pd.concat(objs=[df.drop(columns=['Transmission']), Tr], axis='columns')

In [None]:
# Inspect the frame after both one-hot encodings have been appended.
df.head()

#### Dropping the redundant feature Fuel Consumption Comb (L/100 km)

In [None]:
# Split the frame into the target series and the feature matrix; the combined
# L/100 km consumption column is left out of the features.
target_col = 'CO2 Emissions(g/km)'
y = df[target_col]
X = df.drop(columns=[target_col, 'Fuel Consumption Comb (L/100 km)'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Preview the training features right after the split.
x_train.head()

### Encoding the remaining categorical Features with category_encoders.

In [None]:
# Categorical columns that still need encoding (passed to the encoder below).
cat_cols = ['Make','Model','Vehicle Class']

In [None]:
import category_encoders as ce

In [None]:
# Build a CatBoost encoder for the categorical columns and fit it on the
# training rows and training target only (the test split is not used here).
target_enc = ce.CatBoostEncoder(cols = cat_cols)
target_enc.fit(x_train[cat_cols],y_train)

In [None]:
# Encode the training rows together with their targets.
# category_encoders treats a transform() call without `y` as "unseen data" and
# encodes each category with statistics computed from the whole training
# target, which leaks every row's own target into its features. Passing
# y_train makes the encoder use its training-time encoding instead.
train_enc = target_enc.transform(x_train[cat_cols], y_train)

In [None]:
# Encode the test rows with the encoder fitted on the training split; no target is passed.
test_enc = target_enc.transform(x_test[cat_cols])

In [None]:
# Preview the encoded categorical columns of the training split.
train_enc.head()

In [None]:
# Remove the raw categorical columns (the ones listed in `cat_cols`) from both splits.
x_train = x_train.drop(columns=cat_cols)
x_test = x_test.drop(columns=cat_cols)

In [None]:
# Append the encoded categorical columns to the remaining features of each split.
x_train = pd.concat(objs=[x_train, train_enc], axis='columns')
x_test = pd.concat(objs=[x_test, test_enc], axis='columns')

In [None]:
# Preview the final training feature matrix.
x_train.head()

In [None]:
# Preview the final test feature matrix.
x_test.head()

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Univariate feature scoring for a regression target.
# chi2 is a classification score (it treats every distinct target value as a
# class and scales with feature magnitude), so it is not meaningful for the
# continuous CO2 target. f_regression scores each feature by its univariate
# linear relationship with the target instead.
from sklearn.feature_selection import f_regression

best_features = SelectKBest(score_func=f_regression)

In [None]:
# Score every training feature against the training target.
fit = best_features.fit(X=x_train, y=y_train)

In [None]:
# Collect the per-feature scores in a one-column frame.
best = pd.DataFrame({'scores': fit.scores_})

In [None]:
# Label each score with the name of the feature it belongs to.
best = best.assign(var=x_train.columns)

In [None]:
# Show the features ranked from highest to lowest score.
best.sort_values(by='scores' ,ascending=False)

#### These scores show us which features are important for our model. It seems that the vehicle model and make are the most important features.

### Scaling the features for further usage of data in models.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used to standardise the features before they are fed to the models below.
scaler = StandardScaler()

In [None]:
# Standardise the features. The scaler learns its mean/variance from the
# training split only; the test split is transformed with those same
# statistics. Re-fitting on the test set would scale the two splits
# differently and leak test-set information into the evaluation.
xs_train = scaler.fit_transform(x_train)
xs_test = scaler.transform(x_test)

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear regression with default settings.
lr = LinearRegression()

In [None]:
# Fit the linear regression on the (unscaled) training features.
model1 = lr.fit(X=x_train, y=y_train)

In [None]:
# Predict CO2 emissions for the held-out test rows.
y_pred1 = model1.predict(X=x_test)

In [None]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [None]:
# R² of the linear regression on the test set.
r2_score(y_true=y_test, y_pred=y_pred1)

In [None]:
# Mean squared error of the linear regression on the test set.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred1)
mse

In [None]:
# Root mean squared error: square root of the MSE computed in the previous cell.
rmse = np.sqrt(mse)
rmse

In [None]:
# Mean absolute error of the linear regression on the test set.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred1)
mae

#### Our linear regression model performs very well on the test set, with an R² score of about 0.99 and a mean absolute error of about 3.

## KNeighbors Regressor model

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# k-nearest-neighbours regressor that uses the 3 closest training points.
knn = KNeighborsRegressor(n_neighbors=3)

In [None]:
# Fit the KNN regressor on the standardised training features.
model2 = knn.fit(X=xs_train, y=y_train)

In [None]:
# Predict on the standardised test features.
y_pred2 = model2.predict(X=xs_test)

In [None]:
# R² of the KNN regressor on the test set.
r2_score(y_true=y_test, y_pred=y_pred2)

In [None]:
# Mean squared error of the KNN regressor on the test set.
mean_squared_error(y_true=y_test, y_pred=y_pred2)

#### The KNeighbors model also performs well, with an R² score of about 0.985.

## Support Vector Regressor model

In [None]:
from sklearn.svm import LinearSVR

In [None]:
# Linear support vector regressor. The solver shuffles the data internally,
# so the seed is fixed (same value as the train/test split) to make the
# reported scores reproducible across runs.
svr = LinearSVR(random_state=42)

In [None]:
# Fit the linear SVR on the standardised training features.
model3 = svr.fit(X=xs_train, y=y_train)

In [None]:
# Predict on the standardised test features.
y_pred3 = model3.predict(X=xs_test)

In [None]:
# R² of the linear SVR on the test set.
r2_score(y_true=y_test, y_pred=y_pred3)

In [None]:
# Mean squared error of the linear SVR on the test set.
mean_squared_error(y_true=y_test, y_pred=y_pred3)

#### The Linear SVR model also performs well, with an R² score of about 0.988.

## Decision Tree Regressor Model

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Decision tree regressor. Split selection involves randomness, so the seed
# is fixed (same value as the train/test split) to make the reported scores
# reproducible across runs.
dtree = DecisionTreeRegressor(random_state=42)

In [None]:
# Fit the decision tree on the (unscaled) training features.
model4 = dtree.fit(X=x_train, y=y_train)

In [None]:
# Predict CO2 emissions for the held-out test rows.
y_pred4 = model4.predict(X=x_test)

In [None]:
# R² of the decision tree on the test set.
r2_score(y_true=y_test, y_pred=y_pred4)

In [None]:
# Mean squared error of the decision tree on the test set.
mean_squared_error(y_true=y_test, y_pred=y_pred4)

#### The decision tree model performs very well, with an R² score of about 0.99.

## Random Forest Regressor model

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest of 100 trees. Bootstrapping and feature sampling are random,
# so the seed is fixed (same value as the train/test split) to make the
# reported scores reproducible across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Fit the random forest on the (unscaled) training features.
model5 = rf.fit(X=x_train, y=y_train)

In [None]:
# Predict CO2 emissions for the held-out test rows.
y_pred5 = model5.predict(X=x_test)

In [None]:
# R² of the random forest on the test set.
r2_score(y_true=y_test, y_pred=y_pred5)

In [None]:
# Mean squared error of the random forest on the test set.
mean_squared_error(y_true=y_test, y_pred=y_pred5)

#### The Random Forest Regressor model performs better than any other model so far, with an R² score of about 0.993 and a mean squared error of only 22.3, which is lower than that of all the other models.

## Gradient Boosting Regressor Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting regressor with default hyper-parameters. The seed is
# fixed (same value as the train/test split) so the reported scores are
# reproducible across runs.
gb = GradientBoostingRegressor(random_state=42)

In [None]:
# Fit the gradient boosting model on the (unscaled) training features.
model6 = gb.fit(X=x_train, y=y_train)

In [None]:
# Predict CO2 emissions for the held-out test rows.
y_pred6 = model6.predict(X=x_test)

In [None]:
# R² of the gradient boosting model on the test set.
r2_score(y_true=y_test, y_pred=y_pred6)

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# AdaBoost regressor with default hyper-parameters. Its resampling is random,
# so the seed is fixed (same value as the train/test split) to make the
# reported scores reproducible across runs.
ada = AdaBoostRegressor(random_state=42)

In [None]:
# Fit the AdaBoost model on the (unscaled) training features.
model7 = ada.fit(X=x_train, y=y_train)

In [None]:
# Predict CO2 emissions for the held-out test rows.
y_pred7 = model7.predict(X=x_test)

In [None]:
# R² of the AdaBoost model on the test set.
r2_score(y_true=y_test, y_pred=y_pred7)

In [None]:
# Mean squared error of the AdaBoost model on the test set.
mean_squared_error(y_true=y_test, y_pred=y_pred7)

#### The boosting models have also done a good job of predicting the target variable on the test set.

### We see that all our models perform well in predicting the carbon emissions of vehicles. From the observations we can see that 95% of the vehicles generate carbon emissions between 133.5 g/km and 367.5 g/km. The most influential factor in increasing carbon emissions is the fuel consumption of the vehicle: vehicles with good mileage generate less carbon dioxide. Even the top brands do no better; they are among the highest carbon emitters. Heavy vehicles with large engine sizes and a higher number of cylinders show higher carbon emissions.