In [4]:
!pip install -q pmdarima statsmodels matplotlib seaborn

In [5]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pmdarima import auto_arima
from statsmodels.tsa.seasonal import STL, seasonal_decompose
import zipfile

In [6]:
# ------------------------------
# Configuration
# ------------------------------
# Input CSV consumed by the "Load dataset" cell. Defined here so the notebook
# survives Restart Kernel -> Run All; adjust when running outside Kaggle.
DATA_PATH = "/kaggle/input/ba-dataset-with-sentiment-score/BA_dataset_with_sentiment-score.csv"

# Every figure / CSV / JSON produced below is written into this folder.
OUT_DIR = "forecast_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

MIN_MONTHS = 18            # minimum months to run ARIMA
FORECAST_PERIODS = 12      # forecast horizon (months)
TOP_N = 8                  # top brands/series to process
SEASONAL_M = 12            # seasonal period (months) used for STL and seasonal ARIMA
PLOT_DPI = 150             # resolution of saved figures

# Standardized plot axes (shared x-range so plots are visually comparable)
XMIN = pd.to_datetime("2016-01-01")
XMAX = pd.to_datetime("2026-12-31")
SALES_STD_Y = (0.0, 1.0)   # standardized sales (normalized to historical max)
SENT_STD_Y = (-1.0, 1.0)   # sentiment VADER range

In [7]:
# ------------------------------
# Helpers
# ------------------------------
def safe_name(x):
    """Turn any value into a filename-friendly token.

    The value is stringified, every "/", "\\" and space becomes "_", and the
    result is cut to at most 80 characters.
    """
    cleaned = str(x)
    for unsafe_char in ("/", "\\", " "):
        cleaned = cleaned.replace(unsafe_char, "_")
    return cleaned[:80]

def save_fig(fig, fname):
    """Write ``fig`` to ``OUT_DIR/fname`` and release the figure.

    Parameters
    ----------
    fig : matplotlib figure to save.
    fname : file name (relative to ``OUT_DIR``).

    Returns
    -------
    str
        The path the figure was written to.

    Any exception raised by ``savefig`` propagates to the caller, but the
    figure is closed first so a failed save does not leave it open.
    """
    path = os.path.join(OUT_DIR, fname)
    try:
        fig.savefig(path, dpi=PLOT_DPI, bbox_inches='tight')
    finally:
        # Close even on failure: this helper is called in loops that create many figures.
        plt.close(fig)
    return path

def monthly_series(df, col, agg="sum"):
    """Aggregate ``df[col]`` to a gap-free monthly series (month-start index).

    Parameters
    ----------
    df : DataFrame with a datetime ``purchase_date`` column.
    col : name of the column to aggregate.
    agg : ``"sum"`` sums each month; any other value averages each month.

    Returns
    -------
    pandas.Series
        One value per calendar month between the first and last observed
        month. Returns an empty float Series when ``col`` is not in ``df``,
        and an empty Series when no monthly values remain after dropping NaN.

    Notes
    -----
    Months with no rows are forward-filled from the previous month (then
    back-filled). NOTE(review): this also applies to ``agg="sum"``, so a
    month without data inherits the previous month's total rather than 0 —
    confirm that is intended for sales.
    """
    if col not in df.columns:
        return pd.Series(dtype=float)

    by_month = df.groupby(df['purchase_date'].dt.to_period('M'))[col]
    monthly = by_month.sum() if agg == "sum" else by_month.mean()
    monthly = monthly.to_timestamp().dropna()
    if monthly.empty:
        return monthly

    full_index = pd.date_range(start=monthly.index.min(), end=monthly.index.max(), freq='MS')
    # .ffill()/.bfill() replace fillna(method=...), which is deprecated in pandas 2.x.
    return monthly.reindex(full_index).ffill().bfill()

def normalize_by_max(ts):
    """Scale a series by its maximum.

    Returns a ``(scaled_series, max_value)`` pair. ``None`` or an empty
    series is handed back untouched with ``None`` as the max. When the max
    is 0 or NaN the series is multiplied by 0.0 instead of being divided.
    """
    if ts is None or ts.empty:
        return ts, None
    peak = ts.max()
    divisible = not (peak == 0 or np.isnan(peak))
    scaled = ts / peak if divisible else ts * 0.0
    return scaled, peak

In [8]:
# ------------------------------
# ARIMA forecast with CI (pmdarima)
# ------------------------------
def arima_forecast_with_ci(ts, periods=FORECAST_PERIODS, seasonal=True, m=SEASONAL_M):
    """Fit ``auto_arima`` on a monthly series and forecast ``periods`` months ahead.

    Parameters
    ----------
    ts : pandas Series with a datetime index (month-start dates expected).
    periods : number of future months to forecast.
    seasonal, m : passed straight to ``auto_arima``.

    Returns
    -------
    (forecast, conf_int)
        ``forecast`` is a Series and ``conf_int`` a DataFrame with columns
        ``lower``/``upper`` (95% interval, ``alpha=0.05``), both indexed by
        the month starts following the last date in ``ts``.
        ``(None, None)`` is returned when ``ts`` is ``None``, has fewer than
        ``MIN_MONTHS`` non-null points, or model fitting/prediction fails.
    """
    if ts is None or len(ts.dropna()) < MIN_MONTHS:
        return None, None
    try:
        model = auto_arima(ts, seasonal=seasonal, m=m, stepwise=True,
                           error_action='ignore', suppress_warnings=True, trace=False)
        fc_vals, conf_int = model.predict(n_periods=periods, return_conf_int=True, alpha=0.05)
        future_idx = pd.date_range(start=ts.index[-1] + pd.offsets.MonthBegin(1),
                                   periods=periods, freq='MS')
        # predict() may return a pandas Series carrying its own index. Passing that
        # to pd.Series(..., index=future_idx) would align by label and produce NaN
        # wherever the labels differ, so strip to plain arrays and assign by position.
        fc_series = pd.Series(np.asarray(fc_vals, dtype=float), index=future_idx)
        conf_df = pd.DataFrame(np.asarray(conf_int, dtype=float),
                               index=future_idx, columns=['lower','upper'])
        return fc_series, conf_df
    except Exception as e:
        # Best-effort: a series that cannot be modelled is reported and skipped.
        print("ARIMA failed for series:", e)
        return None, None

In [9]:
# ------------------------------
# Plot function: raw + standardized + extras
# ------------------------------
def _draw_forecast_panel(ax, hist, fc, ci, label_suffix=""):
    """Draw history, forecast and confidence band on ``ax`` over the shared x-range."""
    if hist is not None and not hist.empty:
        ax.plot(hist.index, hist.values, label=f'Historical{label_suffix}', linewidth=2)
    if fc is not None:
        ax.plot(fc.index, fc.values, '--', label=f'Forecast{label_suffix}', linewidth=2, color='orange')
        if ci is not None:
            ax.fill_between(ci.index, ci['lower'], ci['upper'], color='orange', alpha=0.2)
    ax.set_xlim(XMIN, XMAX)
    ax.grid(True, linestyle='--', alpha=0.4)


def _save_best_effort_plot(prefix, suffix, figsize, draw, hist_ts):
    """Render one optional diagnostic plot; on failure report it and free the figure."""
    fig, ax = plt.subplots(figsize=figsize)
    try:
        draw(fig, ax, hist_ts, prefix)
        save_fig(fig, f"{prefix}_{suffix}.png")
    except Exception as exc:
        plt.close(fig)
        print(f"[PLOT] {prefix}: skipped {suffix} plot ({exc})")


def _draw_month_heatmap(fig, ax, hist_ts, prefix):
    """Year x month heatmap of the mean value."""
    dfm = hist_ts.to_frame(name='val')
    dfm['year'] = dfm.index.year
    dfm['month'] = dfm.index.month
    pivot = dfm.pivot_table(index='year', columns='month', values='val', aggfunc='mean')
    im = ax.imshow(pivot.values, aspect='auto', origin='lower')
    # Tick count follows the months actually present; a fixed 12 ticks makes
    # set_xticklabels fail whenever a calendar month is missing.
    ax.set_xticks(np.arange(len(pivot.columns)))
    ax.set_xticklabels(pivot.columns)
    ax.set_yticks(np.arange(len(pivot.index)))
    ax.set_yticklabels(pivot.index)
    ax.set_title(f"{prefix} Month-Year Heatmap")
    fig.colorbar(im, ax=ax, label='Value')


def _draw_month_boxplot(fig, ax, hist_ts, prefix):
    """Distribution of values per calendar month."""
    dfm = hist_ts.to_frame(name='val')
    dfm['month'] = dfm.index.month
    months = sorted(dfm['month'].unique())
    data = [dfm.loc[dfm['month'] == month, 'val'].values for month in months]
    ax.boxplot(data, showfliers=False)
    # Label ticks separately: boxplot's `labels=` keyword is deprecated in Matplotlib 3.9.
    ax.set_xticklabels(months)
    ax.set_title(f"{prefix} Monthly Boxplot"); ax.set_xlabel("Month"); ax.set_ylabel("Value")


def _draw_yearly_bars(fig, ax, hist_ts, prefix):
    """Bar chart of the per-year sum."""
    yearly = hist_ts.groupby(hist_ts.index.year).sum()
    ax.bar(yearly.index.astype(str), yearly.values)
    ax.set_title(f"{prefix} Yearly Aggregated"); ax.set_xlabel("Year"); ax.set_ylabel("Sum")


def _draw_cumulative_area(fig, ax, hist_ts, prefix):
    """Filled area under the running total."""
    ax.fill_between(hist_ts.index, hist_ts.cumsum(), alpha=0.4)
    ax.set_xlim(XMIN, XMAX)
    ax.set_title(f"{prefix} Cumulative Area")


def plot_item_forecasts(name, kind, hist_ts, fc_ts, fc_ci=None, is_sales=False, do_standardize=True):
    """Save RAW and STANDARDIZED forecast plots plus diagnostic visuals.

    Parameters
    ----------
    name : item label; sanitized with ``safe_name`` for file names.
    kind : series family (e.g. "Fuel", "City"); prefixes every file name/title.
    hist_ts : historical Series (datetime index) or ``None``.
    fc_ts : forecast Series or ``None``.
    fc_ci : DataFrame with ``lower``/``upper`` columns, or ``None``.
    is_sales : when True the standardized plot divides by the historical max
        and uses the ``SALES_STD_Y`` y-range.
    do_standardize : when False the STANDARDIZED plot is skipped.

    Files written to ``OUT_DIR`` (all prefixed ``{kind}_{safe_name(name)}``):
    ``_RAW_Forecast.png``, ``_STANDARDIZED_Forecast.png`` (optional) and, when
    history is available, ``_Rolling_MA.png``, ``_Month_Heatmap.png``,
    ``_Month_Boxplot.png``, ``_Yearly_Bar.png``, ``_Cumulative_Area.png``.
    The last four are best-effort: a failure is printed and that plot skipped.

    Non-sales series are labelled "Sentiment" and pinned to ``SENT_STD_Y``
    only when the history lies inside that range. Anything else (prices, fuel
    consumption, ...) is labelled with ``kind`` and left auto-scaled, since a
    fixed -1..1 axis would push the data out of view.
    """
    prefix = f"{kind}_{safe_name(name)}"
    has_hist = hist_ts is not None and not hist_ts.empty

    # Decide how to describe the series in titles and axis labels.
    sent_lo, sent_hi = SENT_STD_Y
    fits_sentiment = (not has_hist) or bool(hist_ts.min() >= sent_lo and hist_ts.max() <= sent_hi)
    if is_sales:
        metric = 'Sales'
    elif fits_sentiment:
        metric = 'Sentiment'
    else:
        metric = str(kind)

    # RAW plot (auto y-limits) with CI if available
    fig, ax = plt.subplots(figsize=(12,5))
    _draw_forecast_panel(ax, hist_ts, fc_ts, fc_ci)
    ax.set_title(f"{prefix} RAW — {metric}")
    ax.set_xlabel("Year"); ax.set_ylabel("Value")
    ax.legend()
    save_fig(fig, f"{prefix}_RAW_Forecast.png")

    # STANDARDIZED plot
    if do_standardize:
        if is_sales:
            hist_std, hist_max = normalize_by_max(hist_ts)
            # Only rescale the forecast when the historical max is a usable divisor.
            can_scale = hist_max is not None and bool(np.isfinite(hist_max)) and hist_max != 0
            fc_std = (fc_ts / hist_max) if (fc_ts is not None and can_scale) else None
            fc_ci_std = (fc_ci / hist_max) if (fc_std is not None and fc_ci is not None) else None
        else:
            # non-sales: plotted on its natural scale
            hist_std, fc_std, fc_ci_std = hist_ts, fc_ts, fc_ci

        fig, ax = plt.subplots(figsize=(12,5))
        _draw_forecast_panel(ax, hist_std, fc_std, fc_ci_std, label_suffix=" (std)")
        if is_sales:
            ax.set_ylim(SALES_STD_Y)
            ax.set_ylabel("Sales (standardized)")
            ax.set_title(f"{prefix} STANDARDIZED — Sales (0..1)")
        elif fits_sentiment:
            ax.set_ylim(SENT_STD_Y)
            ax.set_ylabel("Sentiment (VADER)")
            ax.set_title(f"{prefix} STANDARDIZED — Sentiment (-1..1)")
        else:
            ax.set_ylabel(metric)
            ax.set_title(f"{prefix} STANDARDIZED — {metric} (natural scale)")
        ax.set_xlabel("Year"); ax.legend()
        save_fig(fig, f"{prefix}_STANDARDIZED_Forecast.png")

    # Additional visuals:
    if has_hist:
        # Rolling means
        fig, ax = plt.subplots(figsize=(12,5))
        ax.plot(hist_ts.index, hist_ts.values, alpha=0.5, label='Historical')
        for window in (3, 6, 12):
            ax.plot(hist_ts.index, hist_ts.rolling(window, min_periods=1).mean(), label=f'{window}-mo MA')
        ax.set_xlim(XMIN, XMAX); ax.set_title(f"{prefix} Rolling Means"); ax.legend(); ax.grid(True, linestyle='--', alpha=0.4)
        save_fig(fig, f"{prefix}_Rolling_MA.png")

        # Optional diagnostics: each one is independent and must not stop the others.
        _save_best_effort_plot(prefix, "Month_Heatmap", (10,6), _draw_month_heatmap, hist_ts)
        _save_best_effort_plot(prefix, "Month_Boxplot", (10,5), _draw_month_boxplot, hist_ts)
        _save_best_effort_plot(prefix, "Yearly_Bar", (10,5), _draw_yearly_bars, hist_ts)
        _save_best_effort_plot(prefix, "Cumulative_Area", (12,5), _draw_cumulative_area, hist_ts)

In [10]:
# ------------------------------
# Load dataset
# ------------------------------
# NOTE(review): DATA_PATH is not assigned in any cell visible above — presumably it
# comes from an earlier/removed cell; confirm it is defined before this cell runs.
print("Loading dataset:", DATA_PATH)
df = pd.read_csv(DATA_PATH)
print("Raw shape:", df.shape)

# Every downstream section groups on purchase_date, so fail fast without it.
has_date_column = 'purchase_date' in df.columns
if not has_date_column:
    raise ValueError("purchase_date column not found. Rename your date column to 'purchase_date'.")

# Parse dates (unparseable values become NaT), drop those rows, order chronologically.
df = (
    df.assign(purchase_date=pd.to_datetime(df['purchase_date'], errors='coerce'))
      .dropna(subset=['purchase_date'])
      .sort_values('purchase_date')
)
print("After date parsing shape:", df.shape)

Loading dataset: /kaggle/input/ba-dataset-with-sentiment-score/BA_dataset_with_sentiment-score.csv
Raw shape: (264292, 49)
After date parsing shape: (264292, 49)


In [11]:
# ------------------------------
# SECTION 1 — PRICE FORECAST
# ------------------------------
# For each price column present: monthly mean -> STL decomposition figure ->
# ARIMA forecast CSV -> forecast and diagnostic plots.
print("\nSECTION 1 — PRICE FORECAST")
price_cols = [c for c in ['ex-factory_price', 'official_guide_price'] if c in df.columns]
for col in price_cols:
    ts = monthly_series(df, col, agg='mean')
    name = f"Price_{col}"
    if ts.empty or len(ts) < MIN_MONTHS:
        print(f"[PRICE] skip {col}")
        continue
    # STL plot (save) — best-effort, but say why it was skipped instead of hiding it
    try:
        stl = STL(ts, period=SEASONAL_M, robust=True).fit()
        fig = stl.plot()
        fig.set_size_inches(10,8)
        save_fig(fig, f"{name}_STL.png")
    except Exception as exc:
        print(f"[PRICE] STL decomposition failed for {col}: {exc}")
    fc, ci = arima_forecast_with_ci(ts)
    if fc is not None:
        pd.DataFrame({'date': fc.index, 'forecast': fc.values}).to_csv(os.path.join(OUT_DIR, f"{name}_forecast.csv"), index=False)
    # Prices are not a -1..1 sentiment score, so the sentiment-scaled "standardized"
    # view is skipped; the RAW plot carries the information.
    plot_item_forecasts(col, "Price", ts, fc, ci, is_sales=False, do_standardize=False)


SECTION 1 — PRICE FORECAST


In [12]:
# ------------------------------
# SECTION 2 — FUEL SALES FORECAST (vehicle_energy_type)
# ------------------------------
# The TOP_N energy types are chosen by row count (value_counts), then each gets
# monthly summed sales -> STL figure -> ARIMA forecast CSV -> plots.
print("\nSECTION 2 — FUEL SALES FORECAST")
if 'vehicle_energy_type' in df.columns and 'sales' in df.columns:
    fuels = df['vehicle_energy_type'].value_counts().head(TOP_N).index.tolist()
    for fuel in fuels:
        sub = df[df['vehicle_energy_type'] == fuel]
        ts = monthly_series(sub, 'sales', agg='sum')
        name = f"Fuel_{safe_name(fuel)}_Sales"
        if ts.empty or len(ts) < MIN_MONTHS:
            print(f"[FUEL] skip {fuel}")
            continue
        # STL — best-effort, but report the reason rather than swallowing it
        try:
            stl = STL(ts, period=SEASONAL_M, robust=True).fit()
            fig = stl.plot(); fig.set_size_inches(10,8); save_fig(fig, f"{name}_STL.png")
        except Exception as exc:
            print(f"[FUEL] STL decomposition failed for {fuel}: {exc}")
        fc, ci = arima_forecast_with_ci(ts)
        if fc is not None:
            pd.DataFrame({'date': fc.index, 'forecast': fc.values, 'fuel_type': fuel}).to_csv(os.path.join(OUT_DIR, f"{name}_forecast.csv"), index=False)
        plot_item_forecasts(fuel, "Fuel", ts, fc, ci, is_sales=True, do_standardize=True)
else:
    print("[FUEL] Missing required columns; skipping.")


SECTION 2 — FUEL SALES FORECAST


In [13]:
# ------------------------------
# SECTION 3 — CITY SALES FORECAST
# ------------------------------
# The TOP_N cities are chosen by row count (value_counts), then each gets
# monthly summed sales -> STL figure -> ARIMA forecast CSV -> plots.
print("\nSECTION 3 — CITY SALES FORECAST")
if 'city_of_purchase' in df.columns and 'sales' in df.columns:
    cities = df['city_of_purchase'].value_counts().head(TOP_N).index.tolist()
    for city in cities:
        sub = df[df['city_of_purchase'] == city]
        ts = monthly_series(sub, 'sales', agg='sum')
        name = f"City_{safe_name(city)}_Sales"
        if ts.empty or len(ts) < MIN_MONTHS:
            print(f"[CITY] skip {city}")
            continue
        # STL — best-effort, but report the reason rather than swallowing it
        try:
            stl = STL(ts, period=SEASONAL_M, robust=True).fit()
            fig = stl.plot(); fig.set_size_inches(10,8); save_fig(fig, f"{name}_STL.png")
        except Exception as exc:
            print(f"[CITY] STL decomposition failed for {city}: {exc}")
        fc, ci = arima_forecast_with_ci(ts)
        if fc is not None:
            pd.DataFrame({'date': fc.index, 'forecast': fc.values, 'city': city}).to_csv(os.path.join(OUT_DIR, f"{name}_forecast.csv"), index=False)
        plot_item_forecasts(city, "City", ts, fc, ci, is_sales=True, do_standardize=True)
else:
    print("[CITY] Missing required columns; skipping.")


SECTION 3 — CITY SALES FORECAST


In [14]:
# ------------------------------
# SECTION 4 — OVERALL SEASONAL SALES + DIWALI/MONSOON
# ------------------------------
# Overall monthly sales -> STL figure -> ARIMA forecast CSV -> plots, followed by
# a JSON summary comparing Oct–Nov ("diwali") and Jun–Sep ("monsoon") averages.
print("\nSECTION 4 — SEASONAL SALES + DIWALI/MONSOON")
if 'sales' in df.columns:
    overall_ts = monthly_series(df, 'sales', agg='sum')
    # At least one full seasonal cycle is needed for the per-month averages below.
    if len(overall_ts) >= SEASONAL_M:
        # STL — best-effort, but report the reason rather than swallowing it
        try:
            stl = STL(overall_ts, period=SEASONAL_M, robust=True).fit()
            fig = stl.plot(); fig.set_size_inches(10,8); save_fig(fig, "Overall_Sales_STL.png")
        except Exception as exc:
            print(f"[SEASONAL] STL decomposition failed for overall sales: {exc}")
        # Returns (None, None) when the series is shorter than MIN_MONTHS; plots still run.
        fc, ci = arima_forecast_with_ci(overall_ts)
        if fc is not None:
            pd.DataFrame({'date': fc.index, 'forecast': fc.values, 'series': 'overall_sales'}).to_csv(os.path.join(OUT_DIR, "overall_sales_forecast.csv"), index=False)
        plot_item_forecasts("Overall", "OverallSales", overall_ts, fc, ci, is_sales=True, do_standardize=True)

        # Diwali/Monsoon summary
        monthly_avg = overall_ts.groupby(overall_ts.index.month).mean()
        # .loc[10:11] is a label slice on the month-number index, so it covers October and November.
        diwali_mean = float(monthly_avg.loc[10:11].mean()) if (10 in monthly_avg.index and 11 in monthly_avg.index) else None
        monsoon_mean = float(monthly_avg.loc[[6,7,8,9]].mean()) if set([6,7,8,9]).issubset(monthly_avg.index) else None
        summary = {'diwali_mean': diwali_mean, 'monsoon_mean': monsoon_mean, 'monthly_avg_by_month': monthly_avg.to_dict()}
        pd.DataFrame([summary]).to_json(os.path.join(OUT_DIR, "seasonal_report.json"), orient='records')
    else:
        print("[SEASONAL] Not enough overall monthly sales data.")
else:
    print("[SEASONAL] 'sales' column missing; skipping overall seasonal section.")


SECTION 4 — SEASONAL SALES + DIWALI/MONSOON


In [15]:
# ------------------------------
# SECTION 5 — FUEL CONSUMPTION FORECAST
# ------------------------------
# Monthly mean fuel consumption -> STL figure -> non-seasonal ARIMA forecast CSV -> plots.
print("\nSECTION 5 — FUEL CONSUMPTION FORECAST")
if 'average_fuel_consumption' in df.columns:
    ts_consumption = monthly_series(df, 'average_fuel_consumption', agg='mean')
    if not ts_consumption.empty and len(ts_consumption) >= MIN_MONTHS:
        # STL — best-effort, but report the reason rather than swallowing it
        try:
            stl = STL(ts_consumption, period=SEASONAL_M, robust=True).fit()
            fig = stl.plot(); fig.set_size_inches(10,8); save_fig(fig, "Average_Fuel_Consumption_STL.png")
        except Exception as exc:
            print(f"[FUEL_CONS] STL decomposition failed: {exc}")
        fc, ci = arima_forecast_with_ci(ts_consumption, seasonal=False, m=1)
        if fc is not None:
            pd.DataFrame({'date': fc.index, 'forecast': fc.values}).to_csv(os.path.join(OUT_DIR, "avg_fuel_consumption_forecast.csv"), index=False)
        # Fuel consumption is not a -1..1 sentiment score, so the sentiment-scaled
        # "standardized" view is skipped; the RAW plot carries the information.
        plot_item_forecasts("AverageFuelConsumption", "FuelConsumption", ts_consumption, fc, ci, is_sales=False, do_standardize=False)
    else:
        print("[FUEL_CONS] insufficient data; skipping.")
else:
    print("[FUEL_CONS] column missing; skipping.")


SECTION 5 — FUEL CONSUMPTION FORECAST


In [16]:
# ------------------------------
# ZIP the outputs folder
# ------------------------------
# Bundle everything under OUT_DIR into one archive; entry names are stored
# relative to OUT_DIR.
zip_name = "forecast_outputs.zip"
with zipfile.ZipFile(zip_name, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for folder, _subdirs, filenames in os.walk(OUT_DIR):
        rel_folder = os.path.relpath(folder, OUT_DIR)
        for filename in filenames:
            source_path = os.path.join(folder, filename)
            archive.write(source_path, arcname=os.path.join(rel_folder, filename))

print("\nAll outputs saved in:", OUT_DIR)
print("ZIP created:", zip_name)


All outputs saved in: forecast_outputs
ZIP created: forecast_outputs.zip
