Data Acquisition and Inspection

In [4]:
import pandas as pd

df = pd.read_csv('vehiclefueleconomies.csv', low_memory=False)

In [6]:
mixed_type_columns_indices = [72, 74, 75, 77]

for column_index in mixed_type_columns_indices:
    column_name = df.columns[column_index]
    unique_values = df.iloc[:, column_index].unique()
    print(f"Column '{column_name}':")
    print(unique_values)
    print()

Column 'evMotor':
[nan '49 kW DC Brushless' '85 kW AC Induction' '102kW AC Induction'
 '49kW DC Brushless' '67 KW AC  Induction' '67 KW AC' '56kW AC Induction'
 '62 KW AC Induction' '50 KW DC' '27 KW AC Induction' '67 KW AC Induction'
 '24 KW AC Synchronous' '288V Ni-MH' '330V Ni-MH' '36V Ni-MH' '245V Ni-MH'
 '158V Ni-MH' '202V Ni-MH' '150 kW' '300V Ni-MH' '101V Ni-MH' '275V Ni-MH'
 '126V Li-Ion' '312V Ni-MH' '80 kW DCPM' '111 kW' '30 kW DCPM'
 '270V Li-Ion' '125 kW AC Induction' '144V Li-Ion' '346V Li-Ion'
 '115V Li-Ion' '49 kW DCPM' '52 kW AC Induction' '144V Ni-MH'
 '100 kW DCPM' '107 kW AC Induction' '18 kW' '2 @ 150 kw (300 kw)'
 '260 kW AC Induction' '115 kW AC Induction' '374V Li-Ion' '75 kW AC PMSM'
 '280V Li-Ion' '107 kW AC PMSM' '92 kW DCPM' '266V Li-Ion' '55 kW DCPM'
 '68 kW' '220V Li-Ion' '225 kW AC Induction' '270 kW AC Induction'
 '82 kW ACIPM' '124 kW' '104 kW AC Induction' '259V Li-Ion' '260V Li-Ion'
 '126 kW' '270 kW AC Induction (85 kW-hr battery pack)'
 '225 kW AC In

In [8]:
print("Shape Of the dataset:", df.shape)

Shape Of the dataset: (40704, 83)


In [10]:
print("Preview of the dataset:")
print(df.head())

Preview of the dataset:
   barrels08  barrelsA08  charge120  charge240  city08  city08U  cityA08  \
0  15.695714         0.0          0        0.0      18      0.0        0   
1  14.982273         0.0          0        0.0      20      0.0        0   
2  21.974000         0.0          0        0.0      13      0.0        0   
3  21.974000         0.0          0        0.0      13      0.0        0   
4  19.388824         0.0          0        0.0      15      0.0        0   

   cityA08U  cityCD  cityE  ...  mfrCode  c240Dscr  charge240b  c240bDscr  \
0       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
1       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
2       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
3       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
4       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   

                      createdOn                    modif

In [12]:
print("Summary statistics for numerical columns:")
print(df.describe())

Summary statistics for numerical columns:
          barrels08    barrelsA08  charge120     charge240        city08  \
count  40704.000000  40704.000000    40704.0  40704.000000  40704.000000   
mean      17.325764      0.221045        0.0      0.037817     18.276533   
std        4.602734      1.144019        0.0      0.545918      7.555437   
min        0.060000      0.000000        0.0      0.000000      6.000000   
25%       14.330870      0.000000        0.0      0.000000     15.000000   
50%       16.480500      0.000000        0.0      0.000000     17.000000   
75%       19.388824      0.000000        0.0      0.000000     20.000000   
max       47.087143     18.311667        0.0     13.000000    150.000000   

            city08U       cityA08      cityA08U        cityCD         cityE  \
count  40704.000000  40704.000000  40704.000000  40704.000000  40704.000000   
mean       5.752379      0.635294      0.487652      0.000464      0.284498   
std       11.267965      4.917537   

Handling Missing Values

In [20]:
print("columns with missing values in the dataset:")
missing_columns = {}

for column in df.columns:
    missing_count = df[column].isnull().sum()
    if missing_count > 0:
        missing_columns[columns] = missing_count

sorted_missing_columns = sorted(missing_columns.items(), key=lambda x: x[1])

for columns, missing_count in sorted_missing_columns:
    print(f"{column}: {missing_count} missing values")

columns with missing values in the dataset:
phevComb: 31704 missing values


In [22]:
columns_with_missing = ['trany', 'displ', 'drive']
df_cleaned = df.dropna(subset = columns_with_missing)
print("Shape of the cleaned DataFrame after dropping rows:", df_cleaned.shape)

Shape of the cleaned DataFrame after dropping rows: (39343, 83)


In [24]:
if 'EV' in df_cleaned['atvType'].unique():
    print("Type 'EV' exists in the 'atvType' column.")
else:
    print("Type 'EV' does not exists in the 'atvType' column.")

Type 'EV' exists in the 'atvType' column.


In [26]:
ev_df = df_cleaned[df_cleaned['atvType'] == 'EV']
print("Cylinder values where 'atvType' is 'EV':")
print(ev_df['cylinders'].unique())

Cylinder values where 'atvType' is 'EV':
[nan]


In [32]:
ev_null_cylinders = df_cleaned[(df_cleaned['atvType'] == 'EV') & (df_cleaned['cylinders'].isnull())]
df_cleaned.loc[(df_cleaned['atvType'] == 'EV') & (df_cleaned['cylinders'].isnull()), 'cylinders'] == 0
                                                                 

35738    False
Name: cylinders, dtype: bool

In [34]:
ev_df = df_cleaned[df_cleaned['atvType'] == 'EV']
print("Cylinder values where 'atvType' is 'EV':")
print(ev_df['cylinders'].unique())

Cylinder values where 'atvType' is 'EV':
[nan]


In [36]:
print("Distinct values in the 'cylinders' column:")
print(df_cleaned['cylinders'].unique())

Distinct values in the 'cylinders' column:
[ 6.  8.  4. 12.  2.  5.  3. nan 10. 16.]


In [38]:
nan_cylinders_df = df_cleaned[df_cleaned['cylinders'].isnull()]
print("Values in the 'atvType' column where 'cylinders' is NaN:")
print(nan_cylinders_df['atvType'])

Values in the 'atvType' column where 'cylinders' is NaN:
4867     NaN
35738     EV
Name: atvType, dtype: object
