In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split 
import time
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle

from matplotlib import style
style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the raw dataset from a CSV file in the current working directory.
co2_data = pd.read_csv('co2.csv')

In [3]:
# Display the loaded frame (pandas abbreviates the middle rows in the rendering).
co2_data 

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...
7380,VOLVO,XC40 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,10.7,7.7,9.4,30,219
7381,VOLVO,XC60 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,11.2,8.3,9.9,29,232
7382,VOLVO,XC60 T6 AWD,SUV - SMALL,2.0,4,AS8,Z,11.7,8.6,10.3,27,240
7383,VOLVO,XC90 T5 AWD,SUV - STANDARD,2.0,4,AS8,Z,11.2,8.3,9.9,29,232


In [4]:
# Print each column's dtype and non-null count.
co2_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Make                              7385 non-null   object 
 1   Model                             7385 non-null   object 
 2   Vehicle Class                     7385 non-null   object 
 3   Engine Size(L)                    7385 non-null   float64
 4   Cylinders                         7385 non-null   int64  
 5   Transmission                      7385 non-null   object 
 6   Fuel Type                         7385 non-null   object 
 7   Fuel Consumption City (L/100 km)  7385 non-null   float64
 8   Fuel Consumption Hwy (L/100 km)   7385 non-null   float64
 9   Fuel Consumption Comb (L/100 km)  7385 non-null   float64
 10  Fuel Consumption Comb (mpg)       7385 non-null   int64  
 11  CO2 Emissions(g/km)               7385 non-null   int64  
dtypes: flo

Step 1: Checking for duplicates

In [5]:
# Count the rows that are flagged as duplicates of another row.
co2_data.duplicated().sum()

1103

In [6]:
# Drop the duplicate rows. This mutates co2_data in place, so every later cell
# works on the de-duplicated frame; the row index is not reset.
co2_data.drop_duplicates(inplace = True)
# Re-count the duplicates to confirm that none remain.
co2_data.duplicated().sum()

0

Step 2: Checking for missing values

In [7]:
# Count the missing (null) values in each column.
co2_data.isnull().sum()

Make                                0
Model                               0
Vehicle Class                       0
Engine Size(L)                      0
Cylinders                           0
Transmission                        0
Fuel Type                           0
Fuel Consumption City (L/100 km)    0
Fuel Consumption Hwy (L/100 km)     0
Fuel Consumption Comb (L/100 km)    0
Fuel Consumption Comb (mpg)         0
CO2 Emissions(g/km)                 0
dtype: int64

Step 3: Checking for outliers
    
### First, split the columns into quantitative [quan] and qualitative [qual]

In [9]:
# List the column labels that will be split into quantitative and qualitative groups.
co2_data.columns

Index(['Make', 'Model', 'Vehicle Class', 'Engine Size(L)', 'Cylinders',
       'Transmission', 'Fuel Type', 'Fuel Consumption City (L/100 km)',
       'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
       'Fuel Consumption Comb (mpg)', 'CO2 Emissions(g/km)'],
      dtype='object')

In [10]:
def quanQual(co2_data):
    """Split the column labels of a DataFrame by dtype.

    Parameters
    ----------
    co2_data : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    tuple[list, list]
        ``(quan, qual)``: ``qual`` holds the labels of the columns whose dtype
        is object (``'O'``), ``quan`` holds every other label. Both lists keep
        the original column order.
    """
    labels = list(co2_data.columns)
    # Object dtype is treated as qualitative; anything else is quantitative.
    qual = [label for label in labels if co2_data[label].dtype == 'O']
    quan = [label for label in labels if not (co2_data[label].dtype == 'O')]
    return quan, qual

In [11]:
# Split the column labels into quantitative and qualitative lists.
# NOTE: 'qaul' is a misspelling of 'qual'; later cells refer to the list by this name.
quan,qaul=quanQual(co2_data)

In [12]:
# Show the labels classified as quantitative (non-object dtype).
quan

['Engine Size(L)',
 'Cylinders',
 'Fuel Consumption City (L/100 km)',
 'Fuel Consumption Hwy (L/100 km)',
 'Fuel Consumption Comb (L/100 km)',
 'Fuel Consumption Comb (mpg)',
 'CO2 Emissions(g/km)']

In [13]:
# Show the labels classified as qualitative (object dtype).
qaul

['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']

In [14]:
# Display only the quantitative columns.
co2_data[quan]

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,2.0,4,9.9,6.7,8.5,33,196
1,2.4,4,11.2,7.7,9.6,29,221
2,1.5,4,6.0,5.8,5.9,48,136
3,3.5,6,12.7,9.1,11.1,25,255
4,3.5,6,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...
7380,2.0,4,10.7,7.7,9.4,30,219
7381,2.0,4,11.2,8.3,9.9,29,232
7382,2.0,4,11.7,8.6,10.3,27,240
7383,2.0,4,11.2,8.3,9.9,29,232


In [15]:
# Display only the qualitative columns.
co2_data[qaul]

Unnamed: 0,Make,Model,Vehicle Class,Transmission,Fuel Type
0,ACURA,ILX,COMPACT,AS5,Z
1,ACURA,ILX,COMPACT,M6,Z
2,ACURA,ILX HYBRID,COMPACT,AV7,Z
3,ACURA,MDX 4WD,SUV - SMALL,AS6,Z
4,ACURA,RDX AWD,SUV - SMALL,AS6,Z
...,...,...,...,...,...
7380,VOLVO,XC40 T5 AWD,SUV - SMALL,AS8,Z
7381,VOLVO,XC60 T5 AWD,SUV - SMALL,AS8,Z
7382,VOLVO,XC60 T6 AWD,SUV - SMALL,AS8,Z
7383,VOLVO,XC90 T5 AWD,SUV - STANDARD,AS8,Z


In [16]:
# Build a table of summary statistics and IQR outlier fences, with one column
# per quantitative feature and one row per statistic:
#   IQR     = Q3 - Q1
#   1.5rule = 1.5 * IQR
#   Lesser  = Q1 - 1.5 * IQR   (lower fence)
#   Greater = Q3 + 1.5 * IQR   (upper fence)
descriptive=pd.DataFrame(index=["Mean","Median","Mode","Q1:25%","Q2:50%",
                               "Q3:75%","99%","Q4:100%","IQR","1.5rule","Lesser","Greater","Min","Max"],columns=quan)

# describe() does not depend on the loop variable, so compute it once instead
# of four times per column.
describe_table=co2_data.describe()

for columnName in quan:
    values=co2_data[columnName]
    q1=describe_table.loc["25%",columnName]
    q3=describe_table.loc["75%",columnName]
    iqr=q3-q1
    fence=1.5*iqr

    # Each cell is written with a single .loc[row, column] call. The chained
    # form descriptive[column][row] = ... assigns through a temporary Series
    # and leaves the table untouched when pandas Copy-on-Write is active.
    descriptive.loc["Mean",columnName]=values.mean()
    descriptive.loc["Median",columnName]=values.median()
    descriptive.loc["Mode",columnName]=values.mode()[0]  # first mode if several tie
    descriptive.loc["Q1:25%",columnName]=q1
    descriptive.loc["Q2:50%",columnName]=describe_table.loc["50%",columnName]
    descriptive.loc["Q3:75%",columnName]=q3
    descriptive.loc["99%",columnName]=np.percentile(values,99)
    descriptive.loc["Q4:100%",columnName]=describe_table.loc["max",columnName]
    descriptive.loc["IQR",columnName]=iqr
    descriptive.loc["1.5rule",columnName]=fence
    descriptive.loc["Lesser",columnName]=q1-fence
    descriptive.loc["Greater",columnName]=q3+fence
    descriptive.loc["Min",columnName]=values.min()
    descriptive.loc["Max",columnName]=values.max()
    

In [17]:
# Display the statistics table computed from the data before outlier capping.
descriptive

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
Mean,3.161812,5.618911,12.61022,9.070583,11.017876,27.411016,251.157752
Median,3.0,6.0,12.1,8.7,10.6,27.0,246.0
Mode,2.0,4.0,11.9,7.8,9.4,25.0,221.0
Q1:25%,2.0,4.0,10.1,7.5,8.9,22.0,208.0
Q2:50%,3.0,6.0,12.1,8.7,10.6,27.0,246.0
Q3:75%,3.7,6.0,14.7,10.3,12.7,32.0,289.0
99%,6.6,12.0,22.3,16.1,19.519,50.0,414.0
Q4:100%,8.4,16.0,30.6,20.6,26.1,69.0,522.0
IQR,1.7,2.0,4.6,2.8,3.8,10.0,81.0
1.5rule,2.55,3.0,6.9,4.2,5.7,15.0,121.5


In [18]:
# Checking for Outliers
#
# A column has low-side outliers when its minimum lies below the lower fence
# ("Lesser") and high-side outliers when its maximum lies above the upper
# fence ("Greater").

lesser=[name for name in quan
        if descriptive[name]["Min"]<descriptive[name]["Lesser"]]
greater=[name for name in quan
         if descriptive[name]["Max"]>descriptive[name]["Greater"]]
        

In [19]:
# Columns whose minimum is below the lower fence.
lesser

[]

In [20]:
# Columns whose maximum is above the upper fence.
greater

['Engine Size(L)',
 'Cylinders',
 'Fuel Consumption City (L/100 km)',
 'Fuel Consumption Hwy (L/100 km)',
 'Fuel Consumption Comb (L/100 km)',
 'Fuel Consumption Comb (mpg)',
 'CO2 Emissions(g/km)']

In [21]:
# Cap the outliers at the IQR fences: values below "Lesser" are raised to the
# lower fence and values above "Greater" are lowered to the upper fence.
#
# Each column is replaced as a whole by its clipped version. The chained form
# co2_data[column][mask] = value assigns through a temporary Series: it raises
# SettingWithCopyWarning (hidden in this notebook because all warnings are
# silenced in the import cell) and does not modify co2_data at all when pandas
# Copy-on-Write is active.

for column in lesser:
    co2_data[column]=co2_data[column].clip(lower=descriptive.loc["Lesser",column])
for column in greater:
    co2_data[column]=co2_data[column].clip(upper=descriptive.loc["Greater",column])

In [22]:
# Rebuild the statistics table from the current contents of co2_data so the
# effect of the outlier capping can be inspected. Rows:
#   IQR     = Q3 - Q1
#   1.5rule = 1.5 * IQR
#   Lesser  = Q1 - 1.5 * IQR   (lower fence)
#   Greater = Q3 + 1.5 * IQR   (upper fence)
descriptive=pd.DataFrame(index=["Mean","Median","Mode","Q1:25%","Q2:50%",
                               "Q3:75%","99%","Q4:100%","IQR","1.5rule","Lesser","Greater","Min","Max"],columns=quan)

# describe() does not depend on the loop variable, so compute it once instead
# of four times per column.
describe_table=co2_data.describe()

for columnName in quan:
    values=co2_data[columnName]
    q1=describe_table.loc["25%",columnName]
    q3=describe_table.loc["75%",columnName]
    iqr=q3-q1
    fence=1.5*iqr

    # Each cell is written with a single .loc[row, column] call. The chained
    # form descriptive[column][row] = ... assigns through a temporary Series
    # and leaves the table untouched when pandas Copy-on-Write is active.
    descriptive.loc["Mean",columnName]=values.mean()
    descriptive.loc["Median",columnName]=values.median()
    descriptive.loc["Mode",columnName]=values.mode()[0]  # first mode if several tie
    descriptive.loc["Q1:25%",columnName]=q1
    descriptive.loc["Q2:50%",columnName]=describe_table.loc["50%",columnName]
    descriptive.loc["Q3:75%",columnName]=q3
    descriptive.loc["99%",columnName]=np.percentile(values,99)
    descriptive.loc["Q4:100%",columnName]=describe_table.loc["max",columnName]
    descriptive.loc["IQR",columnName]=iqr
    descriptive.loc["1.5rule",columnName]=fence
    descriptive.loc["Lesser",columnName]=q1-fence
    descriptive.loc["Greater",columnName]=q3+fence
    descriptive.loc["Min",columnName]=values.min()
    descriptive.loc["Max",columnName]=values.max()
    

In [23]:
# Display the recomputed table to check that the outlier capping is reflected in it.
descriptive

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
Mean,3.154099,5.545845,12.571792,9.026377,10.980086,27.302133,250.832697
Median,3.0,6.0,12.1,8.7,10.6,27.0,246.0
Mode,2.0,4.0,21.6,7.8,18.4,25.0,410.5
Q1:25%,2.0,4.0,10.1,7.5,8.9,22.0,208.0
Q2:50%,3.0,6.0,12.1,8.7,10.6,27.0,246.0
Q3:75%,3.7,6.0,14.7,10.3,12.7,32.0,289.0
99%,6.25,9.0,21.6,14.5,18.4,47.0,410.5
Q4:100%,6.25,9.0,21.6,14.5,18.4,47.0,410.5
IQR,1.7,2.0,4.6,2.8,3.8,10.0,81.0
1.5rule,2.55,3.0,6.9,4.2,5.7,15.0,121.5


In [24]:
# Re-run the fence check against the recomputed table: a column is listed in
# "lesser" when its minimum is below the lower fence and in "greater" when its
# maximum is above the upper fence.
lesser=[feature for feature in quan
        if descriptive[feature]["Min"]<descriptive[feature]["Lesser"]]

greater=[feature for feature in quan
         if descriptive[feature]["Max"]>descriptive[feature]["Greater"]]
        

In [25]:
# Columns that still have a minimum below the lower fence.
lesser

[]

In [26]:
# Columns that still have a maximum above the upper fence.
greater

[]

In [27]:
# Write the processed frame to disk; index=False leaves the row index out of the CSV.
co2_data.to_csv("Preprocessedco2_data.csv",index=False)

In [28]:
# Display the processed frame that was written to the CSV.
co2_data

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196.0
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221.0
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,47,136.0
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255.0
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7380,VOLVO,XC40 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,10.7,7.7,9.4,30,219.0
7381,VOLVO,XC60 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,11.2,8.3,9.9,29,232.0
7382,VOLVO,XC60 T6 AWD,SUV - SMALL,2.0,4,AS8,Z,11.7,8.6,10.3,27,240.0
7383,VOLVO,XC90 T5 AWD,SUV - STANDARD,2.0,4,AS8,Z,11.2,8.3,9.9,29,232.0
