In [9]:
import numpy as np
import pandas as pd

In [2]:
# Load the emissions/demographic CSV into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer=r"E:\IIT Chicago\Sem 4\Data Science Practicum - CSP-572\Datasets\Odometer Data\dataset_emissions_demographic_corrected.csv",
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,vin,zip,make,model,my,purchase_date,Combination MPG,1st Test Date,2nd Test Date,1st Odometer,...,Date Difference (Days),VMT,Fuel Used,Fuel_Type,zipcode,blkgrp,population,households,median_income,housing_units
0,WBAEV33452KL68783,60655,BMW,3 Series,2002,5/1/2021,21.0,2020-06-01,,125000,...,,,,Gasoline,606551055.0,170317204001,776.0,291.0,158859.0,309.0
1,1GBFG15R6Y1100200,60632,CHEV,Express Cargo,2000,5/21/2017,,2020-06-01,2024-03-26,206000,...,1394.0,7000.0,,Gasoline,606321626.0,170315801002,1097.0,422.0,47778.0,494.0
2,1GCDT136548185796,60501,CHEV,Colorado,2004,6/18/2013,18.0,2020-06-01,2020-06-04,109000,...,3.0,0.0,0.0,Gasoline,605011310.0,170318203004,987.0,307.0,46047.0,366.0
3,2GKALMEK0C6364495,60453,GMC,Terrain,2012,4/26/2018,20.0,2020-06-01,2024-05-03,80000,...,1432.0,20000.0,1000.0,Gasoline,604534400.0,170318226013,520.0,278.0,61786.0,278.0
4,1HGCG1652YA094701,60459,HOND,Accord,2000,7/19/2014,23.0,2020-06-01,,167000,...,,,,Gasoline,604591106.0,170318209013,750.0,235.0,91036.0,235.0


In [4]:
# Print the frame summary: index range, column names, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5195673 entries, 0 to 5195672
Data columns (total 21 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   vin                     object 
 1   zip                     int64  
 2   make                    object 
 3   model                   object 
 4   my                      int64  
 5   purchase_date           object 
 6   Combination MPG         float64
 7   1st Test Date           object 
 8   2nd Test Date           object 
 9   1st Odometer            int64  
 10  2nd Odometer            float64
 11  Date Difference (Days)  float64
 12  VMT                     float64
 13  Fuel Used               float64
 14  Fuel_Type               object 
 15  zipcode                 float64
 16  blkgrp                  int64  
 17  population              float64
 18  households              float64
 19  median_income           float64
 20  housing_units           float64
dtypes: float64(10), int64(4), objec

In [6]:
# Count missing values per column before the Emissions column is derived.
df.isna().sum()

vin                             0
zip                             0
make                            0
model                           0
my                              0
purchase_date                   0
Combination MPG            439998
1st Test Date                   0
2nd Test Date             2718298
1st Odometer                    0
2nd Odometer              2718298
Date Difference (Days)    2718298
VMT                       2718298
Fuel Used                 2931452
Fuel_Type                    9144
zipcode                       273
blkgrp                          0
population                2960182
households                2960182
median_income             3101037
housing_units             2960182
dtype: int64

In [7]:
# Number of distinct values in the blkgrp column (missing values excluded).
df['blkgrp'].nunique(dropna=True)

6289

In [8]:
# Row count per fuel-type label; these labels drive calculate_emission.
df['Fuel_Type'].value_counts(dropna=True)

Fuel_Type
Gasoline                       4889313
Flexible Fuel Vehicle (FFV)     287301
Electric                          7486
Ethanol (E85)                     2025
Diesel                             400
Not Applicable                       4
Name: count, dtype: int64

# Calculate the Emission Column 

In [10]:
# Emission factors used by calculate_emission, already scaled by 1e-3 so that
#     emission = factor * VMT * (1 / MPG) * (1 / 0.994)
# NOTE(review): the constants look like kg CO2 per gallon (8.89 gasoline,
# 10.180 diesel, 5.75 ethanol) converted to metric tons, and 0.994 looks like
# a CO2-to-total-GHG ratio -- confirm against the methodology being followed.
_GASOLINE_FACTOR = 8.89e-3
_DIESEL_FACTOR = 10.180e-3
# NOTE(review): this blend weights gasoline at 0.85 and ethanol at 0.15, while
# E85 nominally denotes a blend of up to 85% ethanol. The weights are kept
# exactly as they were -- confirm the intended ratio before relying on them.
_E85_FACTOR = ((8.89 * 0.85) + (5.75 * 0.15)) * 1e-3
_GHG_ADJUSTMENT = 0.994

# Fuel_Type label -> emission factor. Labels absent from this table
# (e.g. 'Electric', 'Not Applicable') yield NaN.
_EMISSION_FACTOR_BY_FUEL = {
    'Gasoline': _GASOLINE_FACTOR,
    'Diesel': _DIESEL_FACTOR,
    'E85': _E85_FACTOR,
    # The dataset spells this label 'Ethanol (E85)'; the bare 'E85' spelling
    # is kept as well for backward compatibility.
    'Ethanol (E85)': _E85_FACTOR,
    'Flexible Fuel Vehicle (FFV)': _E85_FACTOR,
}


def calculate_emission(row):
    """Compute the emissions value for one vehicle record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Fuel_Type', 'VMT' and 'Combination MPG'.

    Returns
    -------
    float
        ``factor * VMT * (1 / MPG) * (1 / 0.994)`` where ``factor`` depends on
        the fuel type, or NaN when:

        * 'VMT' or 'Fuel_Type' is missing,
        * the fuel type has no emission factor (e.g. 'Electric'),
        * 'Combination MPG' is missing or not strictly positive.
    """
    fuel_type = row['Fuel_Type']
    vmt = row['VMT']
    mpg = row['Combination MPG']

    # Without mileage or a fuel type there is nothing to compute.
    if pd.isna(vmt) or pd.isna(fuel_type):
        return np.nan

    factor = _EMISSION_FACTOR_BY_FUEL.get(fuel_type)
    if factor is None:
        return np.nan

    # A missing or non-positive MPG cannot be turned into fuel consumption;
    # guarding here also avoids a division by zero.
    if pd.isna(mpg) or mpg <= 0:
        return np.nan

    return factor * vmt * (1 / mpg) * (1 / _GHG_ADJUSTMENT)

In [11]:
# calculate_emission reads only these three columns, so the row-wise apply is
# restricted to them: each row object is then built from 3 columns rather
# than the whole frame. The computed values are unchanged.
df['Emissions'] = df[['Fuel_Type', 'VMT', 'Combination MPG']].apply(
    calculate_emission, axis=1
)

In [12]:
# Preview the first five rows again, now including the Emissions column.
df.head(n=5)

Unnamed: 0,vin,zip,make,model,my,purchase_date,Combination MPG,1st Test Date,2nd Test Date,1st Odometer,...,VMT,Fuel Used,Fuel_Type,zipcode,blkgrp,population,households,median_income,housing_units,Emissions
0,WBAEV33452KL68783,60655,BMW,3 Series,2002,5/1/2021,21.0,2020-06-01,,125000,...,,,Gasoline,606551055.0,170317204001,776.0,291.0,158859.0,309.0,
1,1GBFG15R6Y1100200,60632,CHEV,Express Cargo,2000,5/21/2017,,2020-06-01,2024-03-26,206000,...,7000.0,,Gasoline,606321626.0,170315801002,1097.0,422.0,47778.0,494.0,
2,1GCDT136548185796,60501,CHEV,Colorado,2004,6/18/2013,18.0,2020-06-01,2020-06-04,109000,...,0.0,0.0,Gasoline,605011310.0,170318203004,987.0,307.0,46047.0,366.0,0.0
3,2GKALMEK0C6364495,60453,GMC,Terrain,2012,4/26/2018,20.0,2020-06-01,2024-05-03,80000,...,20000.0,1000.0,Gasoline,604534400.0,170318226013,520.0,278.0,61786.0,278.0,8.943662
4,1HGCG1652YA094701,60459,HOND,Accord,2000,7/19/2014,23.0,2020-06-01,,167000,...,,,Gasoline,604591106.0,170318209013,750.0,235.0,91036.0,235.0,


In [13]:
# Recount missing values per column, now covering the Emissions column too.
df.isna().sum()

vin                             0
zip                             0
make                            0
model                           0
my                              0
purchase_date                   0
Combination MPG            439998
1st Test Date                   0
2nd Test Date             2718298
1st Odometer                    0
2nd Odometer              2718298
Date Difference (Days)    2718298
VMT                       2718298
Fuel Used                 2931452
Fuel_Type                    9144
zipcode                       273
blkgrp                          0
population                2960182
households                2960182
median_income             3101037
housing_units             2960182
Emissions                 2938446
dtype: int64

# Save the dataset

In [14]:
# Write the frame (with the Emissions column) to CSV without the index.
# The path is a raw string so the Windows backslashes stay literal; the
# non-raw form relied on invalid escape sequences such as "\I" and "\S".
df.to_csv(
    r"E:\IIT Chicago\Sem 4\Data Science Practicum - CSP-572\Datasets\Odometer Data\dataset_emissions_demographic.csv",
    index=False,
)