In [7]:
!pip install pycountry

Defaulting to user installation because normal site-packages is not writeable


In [32]:
# Cell 1: Imports & Load Raw Data
import pandas as pd
import os

# Ensure cleaned folder exists
os.makedirs(r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis\data/cleaned", exist_ok=True)

# Load raw datasets
ev_sales = pd.read_csv(r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis\data//raw/ev_sales.csv")
charging = pd.read_csv(r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis\data//raw/charging.csv")
gdp = pd.read_csv(r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis\data//raw/gdp.csv")

print("✅ Raw datasets loaded")


✅ Raw datasets loaded


In [33]:
# Cell 2: Inspect Missing Values
print("EV Sales missing values:\n", ev_sales.isnull().sum())
print("\nCharging missing values:\n", charging.isnull().sum())
print("\nGDP missing values:\n", gdp.isnull().sum())


EV Sales missing values:
 Entity                                  0
Code                                   45
Year                                    0
Share of new cars that are electric     0
dtype: int64

Charging missing values:
 the_geom                                      0
GIS Object ID                                 0
Alternative Fuel Type                         0
Station Name                                  0
Street Address                                0
Intersection Information                    246
City                                          0
State                                         0
ZIP Code                                      0
ZIP Plus4 Information                       466
Phone Number                                 25
Status Code                                   0
Date Expected to Open                       465
Groups with Access                            0
Hours of Operation                           41
Payments Accepted                           46

In [34]:
# Cell 3: Rename Columns for Consistency

# EV Sales
ev_sales = ev_sales.rename(columns={
    "Entity": "country",
    "Code": "iso_code",
    "Year": "year",
    "Share of new cars that are electric": "ev_share"
})

# Charging Stations (keep only useful columns for now)
charging = charging.rename(columns={
    "Latitude": "latitude",
    "Longitude": "longitude",
    "Number of Level 1 EVSE Outlets": "level1_outlets",
    "Number of EVSE Level 2 Outlets": "level2_outlets",
    "Number of DC Fast Chargers": "fast_chargers",
    "City": "city",
    "State": "state",
    "ZIP Code": "zip_code",
    "Station Name": "station_name"
})

# GDP/Population
gdp = gdp.rename(columns={
    "Entity": "country",
    "Code": "iso_code",
    "Year": "year",
    "Population (historical)": "population"
})

print("✅ Column names standardized")


✅ Column names standardized


In [30]:
# Cell 3: Rename Columns for Consistency

# EV Sales
ev_sales = ev_sales.rename(columns={
    "Entity": "country",
    "Code": "iso_code",
    "Year": "year",
    "Share of new cars that are electric": "ev_share"
})

# Charging Stations (keep only useful columns for now)
charging = charging.rename(columns={
    "Latitude": "latitude",
    "Longitude": "longitude",
    "Number of Level 1 EVSE Outlets": "level1_outlets",
    "Number of EVSE Level 2 Outlets": "level2_outlets",
    "Number of DC Fast Chargers": "fast_chargers",
    "City": "city",
    "State": "state",
    "ZIP Code": "zip_code",
    "Station Name": "station_name"
})

# GDP/Population
gdp = gdp.rename(columns={
    "Entity": "country",
    "Code": "iso_code",
    "Year": "year",
    "Population (historical)": "population"
})

print("✅ Column names standardized")


✅ Column names standardized


In [35]:
# Cell 4: Convert Year to Integer
ev_sales["year"] = ev_sales["year"].astype(int)
gdp["year"] = gdp["year"].astype(int)

print(ev_sales[["country", "year"]].head())
print(gdp[["country", "year"]].head())


     country  year
0  Australia  2011
1  Australia  2012
2  Australia  2013
3  Australia  2014
4  Australia  2015
       country   year
0  Afghanistan -10000
1  Afghanistan  -9000
2  Afghanistan  -8000
3  Afghanistan  -7000
4  Afghanistan  -6000


In [36]:
# Cell 5: Handle Missing Values

# For EV share: fill NaN with 0 (assume no adoption if missing)
ev_sales["ev_share"] = ev_sales["ev_share"].fillna(0)

# For charging dataset: fill missing chargers with 0
for col in ["level1_outlets", "level2_outlets", "fast_chargers"]:
    if col in charging.columns:
        charging[col] = charging[col].fillna(0)

# For GDP population: forward fill (as population changes slowly)
gdp["population"] = gdp["population"].fillna(method="ffill")

print("✅ Missing values handled")


✅ Missing values handled


  gdp["population"] = gdp["population"].fillna(method="ffill")


In [37]:
# Cell 6: Quick Summary
print("EV Sales:\n", ev_sales.info())
print("\nCharging:\n", charging.info())
print("\nGDP:\n", gdp.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   country   492 non-null    object 
 1   iso_code  447 non-null    object 
 2   year      492 non-null    int64  
 3   ev_share  492 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 15.5+ KB
EV Sales:
 None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466 entries, 0 to 465
Data columns (total 40 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   the_geom                                  466 non-null    object 
 1   GIS Object ID                             466 non-null    int64  
 2   Alternative Fuel Type                     466 non-null    object 
 3   station_name                              466 non-null    object 
 4   Street Address                            466 n

In [38]:
# Cell 7: Save Cleaned Datasets
ev_sales.to_csv(r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis\data/cleaned/ev_sales_cleaned.csv", index=False)
charging.to_csv(r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis\data/cleaned/charging_cleaned.csv", index=False)
gdp.to_csv(r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis\data/cleaned/gdp_cleaned.csv", index=False)

print("✅ Cleaned datasets saved in ../data/cleaned/")


✅ Cleaned datasets saved in ../data/cleaned/


In [39]:
# Cell 8: Standardize Country Names (optional)
# This ensures all country names are consistent

try:
    import pycountry

    def standardize_country(name):
        try:
            return pycountry.countries.lookup(name).name
        except:
            return name

    ev_sales["country"] = ev_sales["country"].apply(standardize_country)
    gdp["country"] = gdp["country"].apply(standardize_country)

    print("✅ Country names standardized")
except ImportError:
    print("⚠️ pycountry not installed. Skipping country name standardization.")


✅ Country names standardized
