In [1]:
# ‚úÖ GLOBAL SETTINGS ‚Äì Run this once after restarting Jupyter Notebook

import matplotlib
matplotlib.use("Agg")  # Ensures full plots are rendered before saving (non-interactive backend)

import matplotlib.pyplot as plt
import seaborn as sns
import os

# Create an images folder if it doesn't exist
os.makedirs("images", exist_ok=True)

# Custom function to always save full, clean plots
def save_full_plot(fig, filename, width=10, height=6, dpi=300):
    """Fixes half-saved or cropped plots by enforcing proper layout."""
    fig.set_size_inches(width, height, forward=True)
    fig.set_dpi(dpi)
    fig.tight_layout(pad=2)
    fig.subplots_adjust(left=0.1, right=0.95, top=0.9, bottom=0.15)
    fig.savefig(filename, bbox_inches='tight', pad_inches=0.5, facecolor='white')
    plt.close(fig)


# Week 1 - Electric Vehicle Sales Prediction
# Author: Shreya V
# Internship: EduNet AICTE Internship (Electric Vehicle Theme)

# -----------------------------
# Step 1: Import Libraries
# -----------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image, display 

plt.style.use("seaborn-v0_8")
sns.set_palette("Set2")

# -----------------------------
# Step 2: Load Dataset
# -----------------------------
df = pd.read_csv("IEA-EV-dataEV salesHistoricalCars.csv")

# Display first few rows
print("üîπ Preview of dataset:")
display(df.head())

# -----------------------------
# Step 3: Understand the Dataset
# -----------------------------
print("\nüîπ Dataset Info:")
df.info()

print("\nüîπ Statistical Summary:")
display(df.describe())

print("\nüîπ Missing Values:")
display(df.isnull().sum())

# -----------------------------
# Step 4: Check Column Names
# -----------------------------
print("\nüîπ Column Names:")
print(df.columns.tolist())


üîπ Preview of dataset:


Unnamed: 0,region,category,parameter,mode,powertrain,year,unit,value
0,Australia,Historical,EV sales,Cars,BEV,2011,Vehicles,49.0
1,Australia,Historical,EV stock share,Cars,EV,2011,percent,0.00039
2,Australia,Historical,EV sales share,Cars,EV,2011,percent,0.0065
3,Australia,Historical,EV stock,Cars,BEV,2011,Vehicles,49.0
4,Australia,Historical,EV stock,Cars,BEV,2012,Vehicles,220.0



üîπ Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   region      3798 non-null   object 
 1   category    3798 non-null   object 
 2   parameter   3798 non-null   object 
 3   mode        3798 non-null   object 
 4   powertrain  3798 non-null   object 
 5   year        3798 non-null   int64  
 6   unit        3798 non-null   object 
 7   value       3798 non-null   float64
dtypes: float64(1), int64(1), object(6)
memory usage: 237.5+ KB

üîπ Statistical Summary:


Unnamed: 0,year,value
count,3798.0,3798.0
mean,2017.369932,100954.2
std,3.809226,818440.2
min,2010.0,1.5e-05
25%,2014.0,2.3
50%,2018.0,190.0
75%,2021.0,6800.0
max,2023.0,28000000.0



üîπ Missing Values:


region        0
category      0
parameter     0
mode          0
powertrain    0
year          0
unit          0
value         0
dtype: int64


üîπ Column Names:
['region', 'category', 'parameter', 'mode', 'powertrain', 'year', 'unit', 'value']


In [2]:
# Step 5: Data Cleaning
# Filter only EV sales rows
df_sales = df[df["parameter"] == "EV sales"]

# Remove unnecessary columns
df_sales = df_sales[["region", "year", "value"]]

# Convert year and value to numeric types
df_sales["year"] = pd.to_numeric(df_sales["year"], errors="coerce")
df_sales["value"] = pd.to_numeric(df_sales["value"], errors="coerce")

# Drop any missing values
df_sales = df_sales.dropna()

print("‚úÖ Cleaned dataset shape:", df_sales.shape)
display(df_sales.head())


‚úÖ Cleaned dataset shape: (1342, 3)


Unnamed: 0,region,year,value
0,Australia,2011,49.0
6,Australia,2012,80.0
9,Australia,2012,170.0
10,Australia,2013,190.0
13,Australia,2013,100.0


In [3]:
# Step 6: Exploratory Data Analysis (EDA)
import os
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Force Agg backend to save correctly
matplotlib.use("Agg")

# Create images folder
os.makedirs("images", exist_ok=True)

# Function to fix save bug by explicitly setting aspect ratio
def save_full_plot(fig, filename, width=12, height=6):
    fig.set_size_inches(width, height, forward=True)
    fig.set_dpi(300)
    fig.subplots_adjust(left=0.1, right=0.95, top=0.9, bottom=0.15)
    fig.savefig(filename, bbox_inches='tight', pad_inches=0.5, facecolor='white')
    plt.close(fig)

# -------------------------------
# 1Ô∏è‚É£ EV Sales over the years
# -------------------------------
fig, ax = plt.subplots()
sns.lineplot(data=df_sales, x="year", y="value", ax=ax, color="teal", linewidth=2.5)
ax.set_title("Global Electric Vehicle Sales Over the Years", fontsize=14, weight="bold")
ax.set_xlabel("Year")
ax.set_ylabel("EV Sales (in millions)")
ax.grid(True, linestyle="--", alpha=0.5)
save_full_plot(fig, "images/ev_sales_trend.png")

# -------------------------------
# 2Ô∏è‚É£ EV Sales by Region
# -------------------------------
top_regions = df_sales.groupby("region")["value"].sum().sort_values(ascending=False).head(5)
fig, ax = plt.subplots()
sns.barplot(x=top_regions.index, y=top_regions.values, palette="Set2", ax=ax)
ax.set_title("Top 5 Regions by Total EV Sales", fontsize=14, weight="bold")
ax.set_xlabel("Region")
ax.set_ylabel("Total Sales")
save_full_plot(fig, "images/top_regions.png")

# -------------------------------
# 3Ô∏è‚É£ Correlation Heatmap
# -------------------------------
fig, ax = plt.subplots()
sns.heatmap(df_sales.corr(numeric_only=True), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap", fontsize=14, weight="bold")
save_full_plot(fig, "images/heatmap.png")

print("‚úÖ All charts saved successfully and fully visible in the 'images' folder!")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_regions.index, y=top_regions.values, palette="Set2", ax=ax)


‚úÖ All charts saved successfully and fully visible in the 'images' folder!


In [4]:
# Step 7: Observations
print("""
üìä Observations:
1. EV sales have grown rapidly after 2015.
2. China and Europe dominate total EV sales volume.
3. The growth trend is almost exponential, showing strong global EV adoption.
""")



üìä Observations:
1. EV sales have grown rapidly after 2015.
2. China and Europe dominate total EV sales volume.
3. The growth trend is almost exponential, showing strong global EV adoption.



In [5]:
%matplotlib inline


In [6]:
%matplotlib notebook


In [7]:
# ‚úÖ Step 7: Advanced Machine Learning Model ‚Äî EV Sales Prediction (Target: 80%+ accuracy)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ‚úÖ Use Agg backend to prevent partial/cropped saves
matplotlib.use("Agg")

# ‚úÖ Create image folder if not exists
os.makedirs("images", exist_ok=True)

print("üöÄ Building Advanced EV Sales Prediction Model...")

# -------------------------------
# 1Ô∏è‚É£ Prepare Data
# -------------------------------
# Filter EV sales data
df_model = df.copy()
df_model = df_model[df_model["parameter"] == "EV sales"].reset_index(drop=True)

# Encode categorical features
label_cols = ["region", "mode", "powertrain", "category"]
for col in label_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])

# Define features and target
X = df_model[["region", "mode", "powertrain", "category", "year"]]
y = np.log1p(df_model["value"])  # Log transform for stability

# Add polynomial features for 'year'
poly = PolynomialFeatures(degree=2, include_bias=False)
year_poly = poly.fit_transform(df_model[["year"]])
year_poly_df = pd.DataFrame(year_poly, columns=["year", "year^2"])

# Combine with other features
X = pd.concat([df_model[["region", "mode", "powertrain", "category"]], year_poly_df], axis=1)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split train-test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# -------------------------------
# 2Ô∏è‚É£ Train Model (Gradient Boosting)
# -------------------------------
gbr = GradientBoostingRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=7,
    random_state=42
)
gbr.fit(X_train, y_train)

# Predict and reverse log transform
y_pred = gbr.predict(X_test)
y_pred_real = np.expm1(y_pred)
y_test_real = np.expm1(y_test)

# -------------------------------
# 3Ô∏è‚É£ Evaluation
# -------------------------------
r2 = r2_score(y_test_real, y_pred_real)
mae = mean_absolute_error(y_test_real, y_pred_real)
rmse = np.sqrt(mean_squared_error(y_test_real, y_pred_real))

print(f"üå≤ Gradient Boosting Model Accuracy (R¬≤): {r2*100:.2f}%")
print(f"üìà MAE  : {mae:.2f}")
print(f"üìâ RMSE : {rmse:.2f}")

# -------------------------------
# 4Ô∏è‚É£ Full Visualization (No Cropping)
# -------------------------------
def save_full_plot(fig, filename, width=10, height=8):
    """Ensures the figure saves fully without cropping."""
    fig.set_size_inches(width, height, forward=True)
    fig.set_dpi(300)
    fig.subplots_adjust(left=0.1, right=0.95, top=0.9, bottom=0.15)
    fig.savefig(filename, bbox_inches='tight', pad_inches=0.5, facecolor='white')
    plt.close(fig)

# Create and save plot
fig, ax = plt.subplots()
sns.scatterplot(x=y_test_real, y=y_pred_real, color="purple", alpha=0.7, ax=ax)
ax.plot([y_test_real.min(), y_test_real.max()],
        [y_test_real.min(), y_test_real.max()], 'r--', lw=2)
ax.set_title("Actual vs Predicted EV Sales (Gradient Boosting Model)", fontsize=14, weight='bold')
ax.set_xlabel("Actual EV Sales")
ax.set_ylabel("Predicted EV Sales")
ax.grid(True, linestyle='--', alpha=0.5)

save_full_plot(fig, "images/advanced_model_predictions.png")

print("\n‚úÖ Advanced model visualization saved completely in 'images/advanced_model_predictions.png'")


üöÄ Building Advanced EV Sales Prediction Model...
üå≤ Gradient Boosting Model Accuracy (R¬≤): 83.63%
üìà MAE  : 39014.87
üìâ RMSE : 234987.92

‚úÖ Advanced model visualization saved completely in 'images/advanced_model_predictions.png'
