# CO2 EMISSION PROJECT

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from collections import Counter as c
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,mean_squared_error,mean_absolute_error
import pickle
from sklearn.linear_model import LinearRegression

In [2]:
# Load the raw dataset; the relative path is resolved against the current
# working directory, so the notebook must be run from its own folder.
data=pd.read_csv("../Dataset/Sample.csv")
# Preview the first few rows to sanity-check the load.
data.head()

# Renaming columns

In [None]:
# Replace the CSV headers with identifier-friendly names. The list is
# positional: it must contain exactly one name per column, in file order.
column_names = [
    "Model",
    "Make",
    "Model.1",
    "Vehicle_Class",
    "Engine_size",
    "Cylinder",
    "Transmission",
    "Fuel_Type",
    "Fuel_Consumption_City",
    "Fuel_Consumption_Hwy",
    "Fuel_Consumption_Comb(L/100 km)",
    "Fuel_Consumption_Comb(mpg)",
    "CO2_Emissions",
]
data.columns = column_names

In [None]:
# Remove the "Model" column in place; it is not used by anything below.
data.drop(columns=["Model"], inplace=True)

In [None]:
# Print a structural summary of the frame (DataFrame.info).
data.info()

In [None]:
# Show summary statistics for the frame (DataFrame.describe).
data.describe()

In [None]:
# Distinct column dtypes together with how many columns have each one.
np.unique(data.dtypes,return_counts=True)    # returns (unique dtypes, counts)

# Finding categorical columns

In [None]:
# Names of the categorical columns, i.e. those stored with object dtype ("O").
column_dtypes = data.dtypes
cat = column_dtypes[column_dtypes == "O"].index.values
cat

In [None]:
# For every categorical column report its name, the number of distinct
# classes, and the frequency of each class, separated by a ruler line.
ruler = "*" * 120
for column_name in cat:
    column_values = data[column_name]
    print("column : ", column_name)
    print("count of classes: ", column_values.nunique())
    print(c(column_values))
    print(ruler)

In [None]:
# Display the frame as it stands after the rename and the "Model" drop.
data

In [None]:
# Here we merge related category codes into broader classes using np.where and isin

In [None]:
# Collapse individual transmission codes into broader families.
# Codes that are not listed in any group are left unchanged.
transmission_groups = {
    "Automatic": ["A4", "A5", "A3"],
    "Manual": ["M5", "M6"],
    "Automatic with Select Shift": ["AS4", "AS5"],
    "Continuously Variable": ["AV"],
}
for family, codes in transmission_groups.items():
    belongs_to_family = data["Transmission"].isin(codes)
    data["Transmission"] = np.where(belongs_to_family, family, data["Transmission"])
# Class frequencies after grouping.
c(data["Transmission"])

In [None]:
# Expand the single-letter fuel codes into readable labels.
# Codes without an entry in the table are left unchanged.
fuel_labels = {
    "Z": "Premium Gasoline",
    "X": "Regular Gasoline",
    "D": "Diesel",
    "E": "Ethanol(E85)",
    "N": "Natural Gas",
}
for fuel_code, fuel_label in fuel_labels.items():
    has_code = data["Fuel_Type"] == fuel_code
    data["Fuel_Type"] = np.where(has_code, fuel_label, data["Fuel_Type"])
# Class frequencies after relabelling.
c(data["Fuel_Type"])

# Find numerical columns

In [None]:
# Names of the columns whose dtype is anything other than object ("O").
all_dtypes = data.dtypes
all_dtypes[all_dtypes != "O"].index.values

In [None]:
# Total number of missing cells: count nulls per column, then add them up.
nulls_per_column = data.isna().sum()
nulls_per_column.sum()

# Label encoding for categorical data

In [None]:

# Keep an untouched copy that still holds the original string labels; the
# count plots further down read from data1 so their axes show names.
data1=data.copy()
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per categorical column, keyed by column name. Keeping
# them preserves the string -> integer mapping, which would otherwise be
# lost when `le` is rebound on the next iteration.
# NOTE(review): persist `encoders` next to the model if raw string inputs
# must be encoded at prediction time.
encoders={}
separator="*"*100
for i in cat:
    print("LABEL ENCODING OF:",i)
    le=LabelEncoder()
    print(c(data[i]))                      # class frequencies before encoding
    data[i]=le.fit_transform(data[i])      # replace labels with integer codes
    encoders[i]=le
    print(c(data[i]))                      # frequencies of the integer codes
    print(separator)

# VISUALISING THE DATA

# Feature:Make

In [None]:
# Number of rows per make, most frequent first, drawn on a wide figure.
make_counts = data1.groupby("Make")["Make"].count()
make_counts = make_counts.sort_values(ascending=False)
plt.figure(figsize=(19, 5))
make_counts.plot(kind="bar")

# Feature:Model.1

In [None]:
# Number of rows per model name; only the 20 most frequent are plotted.
model_counts = data1.groupby("Model.1")["Model.1"].count()
most_common_models = model_counts.sort_values(ascending=False).iloc[:20]
most_common_models.plot(kind="bar")

# Feature:Vehicle class

In [None]:
# Number of rows per vehicle class, most frequent first.
class_counts = data1.groupby("Vehicle_Class")["Vehicle_Class"].count()
class_counts = class_counts.sort_values(ascending=False)
class_counts.plot(kind="bar")

# Feature:Transmission

In [None]:
# Number of rows per transmission family, most frequent first.
transmission_counts = data1.groupby("Transmission")["Transmission"].count()
transmission_counts = transmission_counts.sort_values(ascending=False)
transmission_counts.plot(kind="bar")

# Feature:fuel_type

In [None]:
# Number of rows per fuel type, most frequent first.
fuel_counts = data1.groupby("Fuel_Type")["Fuel_Type"].count()
fuel_counts = fuel_counts.sort_values(ascending=False)
fuel_counts.plot(kind="bar")

# Make vs CO2_Emissions

In [None]:
#grouping the make and co2 emissions columns and storing top 20 classes
MCO2=data1.groupby(["Make"])["CO2_Emissions"].mean().sort_values()[:20].reset_index()
plt.figure(figsize=(25,6))
sns.barplot(x="Make",y="CO2_Emissions",data=MCO2)

# Vehicle class vs CO2_Emissions

In [None]:
# Mean CO2 emissions per vehicle class; keep the 10 highest-emitting classes.
class_means = data1.groupby(["Vehicle_Class"])["CO2_Emissions"].mean()
VC = class_means.sort_values(ascending=False).iloc[:10].reset_index()
plt.figure(figsize=(20, 6))
sns.barplot(data=VC, x="Vehicle_Class", y="CO2_Emissions")


# Finding correlation between the columns

In [None]:
# Pairwise correlation matrix of the encoded frame, drawn as an annotated
# heatmap with square cells. `corr` is reused by the next cell.
corr = data.corr()
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(corr, annot=True, square=True, ax=ax)
plt.title("Corelation matrix of numerical features")
plt.tight_layout()
plt.show()


# Correlation with target column

In [None]:
# Correlation of every column with the target, sorted ascending. The last
# entry is dropped: it is expected to be the target's own correlation with
# itself, which sorts to the end.
target_corr = corr["CO2_Emissions"].sort_values(ascending=True)
plt.figure(figsize=(16, 16))
target_corr.iloc[:-1].plot(kind="barh")


# Splitting data into independent and dependent variables

In [None]:
# Feature matrix: everything except the target and two excluded columns.
excluded_columns = ["CO2_Emissions", "Fuel_Consumption_Comb(L/100 km)", "Model.1"]
x = pd.DataFrame(data.drop(columns=excluded_columns))
# Target kept as a single-column DataFrame rather than a Series.
y = pd.DataFrame(data["CO2_Emissions"])

# Splitting data into train and test set

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2
)
# Report the shapes of the training features and training target.
for training_part in (x_train, y_train):
    print(training_part.shape)

# Working with the Linear Regression model

In [None]:
# Build the linear regression model and fit it on the training split.
# The result of fit() is bound to `lr`, exactly as before.
lr = LinearRegression().fit(x_train, y_train)


# predict the values

In [None]:
# Predict CO2 emissions for the held-out test features.
y_pred=lr.predict(x_test)
# Display the predictions.
y_pred

# Model Evaluation

In [None]:
from sklearn import metrics

# Error metrics on the test split. RMSE is the square root of the MSE, so
# the MSE is computed once and reused.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

# Dumping our model

In [None]:
# Serialise the fitted model to CO2.pkl. The context manager guarantees the
# file is flushed and closed even if pickling raises, instead of leaving an
# open handle for the garbage collector.
with open("CO2.pkl","wb") as model_file:
    pickle.dump(lr,model_file)