# EV Sales Prediction & Adoption Analysis — Google Colab

This Colab-ready notebook contains an end-to-end pipeline to load an EV sales dataset, do basic cleaning/EDA, train regression models for sales prediction and a classifier for adoption analysis, and save trained models.

**How to use in Google Colab**
1. Open Colab: https://colab.research.google.com
2. Click **File → Upload notebook** and upload this file, or open it from your Google Drive.
3. Upload `ev_sales.csv` into Colab (left sidebar → Files → Upload) or mount Google Drive and place the file there.

Run cells sequentially. The notebook installs required packages automatically.

---


In [None]:
# Install required packages (runs once in Colab runtime)
!pip install -q pandas numpy scikit-learn matplotlib seaborn xgboost joblib
print('Packages installed')

## Upload dataset

You can upload your `ev_sales.csv` directly into Colab's temporary filesystem (left Files pane → Upload) or mount your Google Drive and load the file from there.

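If you prefer Drive, uncomment the `drive.mount('/content/drive')` line in the next cell, run it, and follow the authentication steps; then set `DATAFILE` (defined further down) to the file's path under `/content/drive`.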

In [None]:
# Option A — manual upload through the browser file picker
from google.colab import files
uploaded = files.upload()
# The keys of `uploaded` are the uploaded filenames; check that your CSV
# (e.g. 'ev_sales.csv') is among them.
print('Uploaded files:', [fname for fname in uploaded])

# Option B — Google Drive (files saved there persist across sessions)
from google.colab import drive
print('If you want to use Google Drive instead, uncomment drive.mount below and provide path to file in the code cells')
# drive.mount('/content/drive')

In [None]:
# Imports
# NOTE(review): the filter below silences every warning for the rest of the
# session, including library deprecation notices. It is installed before the
# third-party imports, so warnings emitted at import time are hidden as well.
# Comment it out when debugging version-related problems.
import warnings
warnings.filterwarnings('ignore')

# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Splitting, preprocessing, pipelines and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix

# Estimators used by the regression and classification cells
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

# Model persistence
import joblib

print('Libraries imported')

In [None]:
# Dataset location — edit this if your CSV has another name or lives elsewhere
DATAFILE = 'ev_sales.csv'

import os

# Give an early hint when the file is missing; the read is still attempted
# below so that the underlying error message is reported too.
if not os.path.exists(DATAFILE):
    print(f"{DATAFILE} not found in current working directory. Use the Files pane to upload or mount Drive and update DATAFILE path.")

# `df` stays None when loading fails; the next cell checks for that.
df = None
try:
    df = pd.read_csv(DATAFILE)
except Exception as e:
    print('Error loading file:', e)
else:
    print('Loaded', DATAFILE)
    print('Shape:', df.shape)

# Preview the first rows and the column names once data is available
if df is not None:
    display(df.head())
    print('\nColumns:', df.columns.to_list())

In [None]:
# Quick inspection and basic cleaning.
# Produces: `sales_col` (target column name), `orig_df` (untouched backup),
# a cleaned `df`, and the feature lists `cat_cols` / `num_cols`.
if df is None:
    raise SystemExit('Upload the dataset first')

print('\n-- INFO --')
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
df.info()
print('\n-- NULL counts --')
print(df.isnull().sum().sort_values(ascending=False).head(20))

# Attempt to create 'year' from the first matching column.
YEAR_CANDIDATES = ['Year', 'year', 'sale_year', 'date', 'Date']
for candidate in YEAR_CANDIDATES:
    if candidate in df.columns:
        raw_year = df[candidate]
        try:
            if pd.api.types.is_numeric_dtype(raw_year):
                # Numeric values are taken as years directly: pd.to_datetime
                # would read e.g. 2021 as 2021 ns after the epoch -> year 1970.
                derived_year = raw_year
            else:
                derived_year = pd.to_datetime(raw_year, errors='coerce').dt.year
                if derived_year.isna().all():
                    # Not parseable as dates; last resort is a numeric cast.
                    derived_year = pd.to_numeric(raw_year, errors='coerce')
            if derived_year.notna().any():
                df['year'] = derived_year
        except Exception as e:
            # Best effort: a missing 'year' only disables the yearly plot/feature.
            print(f"Could not derive 'year' from column '{candidate}':", e)
        break

# Find sales column: first well-known name present in the data.
sales_col = next((c for c in ['sales', 'units_sold', 'units', 'quantity'] if c in df.columns), None)

if sales_col is None:
    # Fall back to the first numeric column, but never to a year/date column,
    # which would otherwise be picked whenever it appears first in the file.
    numeric_cols = [
        c for c in df.select_dtypes(include=[np.number]).columns.tolist()
        if c not in YEAR_CANDIDATES
    ]
    if numeric_cols:
        sales_col = numeric_cols[0]
        print(f"No explicit sales column found — using numeric column '{sales_col}' as target. Double-check this.")
    else:
        raise ValueError('No numeric column found to use as sales target')

print('\nUsing sales column:', sales_col)

# Make copies
orig_df = df.copy()

# Drop rows without a usable sales target. The target is coerced to numeric
# first so text values cannot break the float cast in the regression cell.
sales_numeric = pd.to_numeric(df[sales_col], errors='coerce')
keep_rows = sales_numeric.notna()
n_dropped = int((~keep_rows).sum())
if n_dropped:
    print(f"Dropping {n_dropped} rows with a missing or non-numeric '{sales_col}' value")
# .copy() makes the cleaned frame independent, so the column assignments
# below are not writes into a slice of the loaded frame.
df = df.loc[keep_rows].copy()
df[sales_col] = sales_numeric.loc[keep_rows]

# Create example categorical and numeric lists (modify as needed).
# The target itself is excluded so it can never leak into the features.
cat_cols = [
    c for c in ['country', 'region', 'make', 'manufacturer', 'model']
    if c in df.columns and c != sales_col
]
num_cols = [
    c for c in ['battery_kWh', 'battery_capacity', 'range_km', 'range_miles',
                'price', 'list_price', 'co2_emissions', 'year']
    if c in df.columns and c != sales_col
]

# Fill missing numeric with median and categorical with 'Unknown'
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')
    if df[c].isnull().any():
        df[c] = df[c].fillna(df[c].median())
for c in cat_cols:
    # Replace missing values BEFORE the string cast: astype(str) turns NaN
    # into the literal text 'nan', which a later fillna() can no longer see.
    df[c] = df[c].astype(object).where(df[c].notna(), 'Unknown').astype(str)

print('\nCategorical features:', cat_cols)
print('Numerical features:', num_cols)


In [None]:
# Create adoption label (binary): 1 when sales exceed the threshold, else 0.
# Raise `adoption_threshold` (e.g. to the median of the sales column) if the
# default of 0 puts every row in the same class.
adoption_threshold = 0
adopt_col = 'adopted'
df[adopt_col] = (df[sales_col] > adoption_threshold).astype(int)

# A single-class label makes the adoption classifier further down trivial,
# so say so here instead of letting it pass silently.
if df[adopt_col].nunique() < 2:
    print(f"Warning: all rows fall on the same side of adoption_threshold={adoption_threshold}; "
          f"'{adopt_col}' has a single class. Consider a higher threshold.")

# Basic EDA plots
import matplotlib
plt.style.use('default')

fig, ax = plt.subplots(figsize=(8, 4))
try:
    sns.histplot(df[sales_col], bins=50, ax=ax)
except Exception:
    # Fall back to plain matplotlib if seaborn cannot draw the histogram.
    ax.clear()
    ax.hist(df[sales_col].dropna(), bins=50)
ax.set_title('Distribution of sales')
ax.set_xlabel(str(sales_col))
ax.set_ylabel('Count')
plt.show()

if 'year' in df.columns:
    # Total of the sales column per calendar year.
    yearly = df.groupby('year')[sales_col].sum().reset_index()
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.lineplot(data=yearly, x='year', y=sales_col, ax=ax)
    ax.set_title('Total sales by year')
    ax.set_xlabel('year')
    ax.set_ylabel(str(sales_col))
    plt.show()


In [None]:
# ---- Regression: Sales prediction ----
# Trains three regressors on the cleaned frame, reports RMSE / MAE / R2 on a
# 20% hold-out split, keeps the pipeline with the lowest RMSE in
# `best_pipeline` and saves it to 'best_ev_regressor.joblib'.
from sklearn.base import clone
from sklearn.impute import SimpleImputer

# Neither the target nor the label derived from it may be used as a feature.
excluded_cols = {sales_col, adopt_col}
feature_cols = [c for c in num_cols + cat_cols if c not in excluded_cols]
if not feature_cols:
    feature_cols = [c for c in df.columns if c not in excluded_cols]

X = df[feature_cols].copy()
y = df[sales_col].astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Route columns by dtype rather than by membership in num_cols / cat_cols.
# With the fallback feature list both of those are empty, which used to leave
# the ColumnTransformer with no columns at all.
numeric_features = [c for c in feature_cols if pd.api.types.is_numeric_dtype(X[c])]
categorical_features = [c for c in feature_cols if c not in numeric_features]


def _make_onehot():
    """Build a dense-output OneHotEncoder that works across scikit-learn versions."""
    try:
        # scikit-learn >= 1.2 spelling; `sparse` was removed in 1.4.
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Imputers are fitted on the training split only; they matter mainly for the
# fallback feature list, whose columns were not filled by the cleaning cell.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', _make_onehot()),
])

# Unfitted template; every pipeline below gets its own clone.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', cat_transformer, categorical_features)
])

models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    'XGBoost': XGBRegressor(n_estimators=200, random_state=42, verbosity=0)
}

results = {}
best_rmse = float('inf')
best_pipeline = None
best_name = None

for name, model in models.items():
    # clone() gives each pipeline a private preprocessor, so a later fit
    # cannot refit the one inside an already trained (or saved) pipeline.
    pipe = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    # Square root taken explicitly: the `squared=False` argument of
    # mean_squared_error is gone from recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    results[name] = {'rmse': rmse, 'mae': mae, 'r2': r2}
    print(f"{name} -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R2: {r2:.3f}")
    if rmse < best_rmse:
        best_rmse = rmse
        best_pipeline = pipe
        best_name = name

# Report which model won, not just its score.
print(f'\nBest model by RMSE: {best_name} ({best_rmse:.3f})')

# Save the best regressor
joblib.dump(best_pipeline, 'best_ev_regressor.joblib')
print('Saved best_ev_regressor.joblib')

# Show sample predictions
print('\nSample actual vs predicted:')
print(pd.DataFrame({'actual': y_test.iloc[:8].values, 'pred': best_pipeline.predict(X_test.iloc[:8])}))


In [None]:
# ---- Classification: Adoption prediction ----
# Fits a random-forest classifier for the binary adoption label on the same
# feature columns as the regression, prints its hold-out metrics and saves it
# to 'ev_adoption_classifier.joblib'.
from sklearn.base import clone

Xc = df[feature_cols].copy()
yc = df[adopt_col]

# Prefer a stratified split so both classes are represented in train and test.
# train_test_split raises ValueError when that is impossible (e.g. a class
# with a single row); in that case use the plain random split.
try:
    Xc_train, Xc_test, yc_train, yc_test = train_test_split(
        Xc, yc, test_size=0.2, random_state=42, stratify=yc)
except ValueError:
    Xc_train, Xc_test, yc_train, yc_test = train_test_split(
        Xc, yc, test_size=0.2, random_state=42)

# clone() keeps this fit from refitting a preprocessor object that may be
# shared with the regression pipelines.
clf_pipe = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
])
clf_pipe.fit(Xc_train, yc_train)

# Separate name so the regression cell's `preds` is not overwritten.
clf_preds = clf_pipe.predict(Xc_test)
print('\nClassification report:\n')
print(classification_report(yc_test, clf_preds))
print('\nConfusion matrix:\n', confusion_matrix(yc_test, clf_preds))

# Save classifier
joblib.dump(clf_pipe, 'ev_adoption_classifier.joblib')
print('Saved ev_adoption_classifier.joblib')


In [None]:
# Feature importance (if available)
# Shows the 30 largest feature importances of the best regressor, when the
# winning model exposes `feature_importances_`.
try:
    model = best_pipeline.named_steps['model']
    if hasattr(model, 'feature_importances_'):
        # Ask the fitted ColumnTransformer for its output column names rather
        # than rebuilding them by hand: this also works when there are no
        # categorical features, in which case the 'cat' encoder is never fitted.
        fitted_preprocessor = best_pipeline.named_steps['preprocessor']
        feat_names = []
        for raw_name in fitted_preprocessor.get_feature_names_out():
            # Drop the "<transformer>__" prefix added by ColumnTransformer.
            for prefix in ('num__', 'cat__'):
                if raw_name.startswith(prefix):
                    raw_name = raw_name[len(prefix):]
                    break
            feat_names.append(raw_name)
        importances = model.feature_importances_
        fi = pd.Series(importances, index=feat_names).sort_values(ascending=False).head(30)
        display(fi)
        fig, ax = plt.subplots(figsize=(8, 6))
        fi.plot(kind='barh', ax=ax)
        ax.invert_yaxis()  # largest importance at the top
        ax.set_xlabel('Importance')
        ax.set_title('Top feature importances (best regressor)')
        plt.show()
    else:
        print('Best regressor does not provide feature_importances_')
except Exception as e:
    # Best effort: this cell is purely informational.
    print('Could not compute feature importances:', e)

print('\nNotebook complete. Models saved in this runtime: best_ev_regressor.joblib and ev_adoption_classifier.joblib')
