In [3]:
!pip install -q pmdarima statsmodels matplotlib seaborn


In [4]:
# Final unified script — forecasts + standardized comparison plots
# Data path: defined as DATA_PATH in the cell that follows the imports.
# -----------------------------
# Installs (uncomment if needed)
# -----------------------------
# !pip install -q pmdarima statsmodels

# -----------------------------
# Imports & config
# -----------------------------
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pmdarima import auto_arima
from statsmodels.tsa.seasonal import STL
import zipfile
from datetime import datetime

In [5]:
# Input CSV; it is read with pd.read_csv in the "Load data" step and must
# contain a 'purchase_date' column (the load step raises otherwise).
DATA_PATH = "/kaggle/input/ba-dataset-with-sentiment-score2/BA_dataset_with_sentiment-score.csv"


In [6]:
# Output folder: save_fig() writes every figure here and the final cell zips it.
OUT_DIR = "forecast_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Parameters
# Minimum number of monthly points a series needs before a forecast is attempted.
MIN_MONTHS = 18
# Default forecast horizon, in months, for arima_forecast_with_ci().
FORECAST_PERIODS = 12
# Default seasonal period `m` handed to auto_arima (12 = yearly cycle on monthly data).
SEASONAL_M = 12
TOP_N = 10   # number of most frequent brands / car series to process

# Standardized axis settings
# Shared x-axis limits applied to the time-based plots so figures are comparable.
XMIN = pd.to_datetime("2016-01-01")
XMAX = pd.to_datetime("2026-12-31")
SALES_STD_RANGE = (0.0, 1.0)     # y-limits of the standardized sales plot (sales / historical max)
SENT_STD_RANGE = (-1.0, 1.0)     # y-limits of the sentiment plot (natural VADER range)

# Resolution passed to fig.savefig() by save_fig().
PLOT_DPI = 150

# -----------------------------
# Helpers
# -----------------------------
def safe_name(x):
    """Return a filename-friendly version of ``x``.

    ``x`` is converted with ``str()``; forward slashes, backslashes and spaces
    are each replaced by an underscore, and the result is cut to at most
    80 characters.
    """
    substitutions = str.maketrans({"/": "_", "\\": "_", " ": "_"})
    cleaned = str(x).translate(substitutions)
    return cleaned[:80]

def monthly_series(df, col, agg="sum"):
    """Aggregate ``df[col]`` into a gap-free monthly series.

    Rows are bucketed by the calendar month of ``df['purchase_date']`` (which
    must be a datetime column) and aggregated per month.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``'purchase_date'`` column.
    col : str
        Column to aggregate. If it is not present an empty float Series is
        returned.
    agg : str, default "sum"
        ``"sum"`` totals each month; any other value takes the monthly mean.

    Returns
    -------
    pandas.Series
        Indexed by month-start timestamps (``freq='MS'``) covering every month
        between the first and last observed month. Months without a value are
        filled from the previous month, then from the next one.
    """
    if col not in df.columns:
        return pd.Series(dtype=float)

    # Build the grouping once; only the reducer differs between the two modes.
    by_month = df.groupby(df['purchase_date'].dt.to_period('M'))[col]
    s = by_month.sum() if agg == "sum" else by_month.mean()
    s = s.to_timestamp().dropna()
    if s.empty:
        return s

    idx = pd.date_range(start=s.index.min(), end=s.index.max(), freq='MS')
    # .ffill()/.bfill() replace fillna(method=...), which is deprecated in pandas 2.x.
    # NOTE(review): for agg="sum" a month with no rows is carried forward from the
    # previous month rather than treated as zero — confirm this is intended.
    return s.reindex(idx).ffill().bfill()

def save_fig(fig, fname):
    """Write ``fig`` to ``OUT_DIR/fname``, close it, and return the file path.

    The image is saved at ``PLOT_DPI`` with a tight bounding box. The figure is
    closed after saving so figures created in loops do not accumulate.
    """
    destination = os.path.join(OUT_DIR, fname)
    save_options = {'dpi': PLOT_DPI, 'bbox_inches': 'tight'}
    fig.savefig(destination, **save_options)
    plt.close(fig)
    return destination

# ARIMA forecast with CI (works on raw scale)
def arima_forecast_with_ci(ts, periods=FORECAST_PERIODS, seasonal=True, m=SEASONAL_M):
    """Fit an auto-ARIMA model to ``ts`` and forecast ``periods`` months ahead.

    Parameters
    ----------
    ts : pandas.Series
        Historical values on a monthly DatetimeIndex, on their raw scale.
    periods : int
        Number of future months to forecast.
    seasonal : bool
        Whether auto_arima may fit a seasonal model.
    m : int
        Seasonal period passed to auto_arima.

    Returns
    -------
    (pandas.Series, pandas.DataFrame) or (None, None)
        The point forecast and a frame with ``'lower'``/``'upper'`` columns for
        the 95% interval (``alpha=0.05``), both indexed by the month starts
        following the last timestamp of ``ts``. ``(None, None)`` is returned if
        anything fails; the failure is printed rather than raised so one bad
        series does not stop a batch run.
    """
    try:
        model = auto_arima(ts, seasonal=seasonal, m=m, stepwise=True,
                           error_action='ignore', suppress_warnings=True, trace=False)
        # pmdarima supports return_conf_int
        fc, confint = model.predict(n_periods=periods, return_conf_int=True, alpha=0.05)

        last = ts.index[-1]
        future_idx = pd.date_range(start=last + pd.offsets.MonthBegin(1), periods=periods, freq='MS')

        # Take plain arrays before attaching the index: if predict() hands back an
        # indexed Series, pd.Series(fc, index=...) would align by label and could
        # produce NaNs instead of assigning the values positionally.
        fc_series = pd.Series(np.asarray(fc), index=future_idx)
        conf_df = pd.DataFrame(np.asarray(confint), index=future_idx, columns=['lower', 'upper'])
        return fc_series, conf_df
    except Exception as exc:
        # Best-effort by design, but say why the forecast is missing. print() is
        # used because the notebook globally silences the warnings module.
        print(f"ARIMA forecast failed ({type(exc).__name__}): {exc}")
        return None, None

# Standardize (normalize) sales series by its historical max (for plotting only)
def normalize_by_max(ts):
    """Scale ``ts`` by its own maximum and return ``(scaled, maximum)``.

    * Empty input is returned unchanged with ``None`` as the maximum.
    * If the maximum is zero or NaN the series is multiplied by 0.0 instead of
      divided, and that maximum is returned as-is.
    """
    if ts.empty:
        return ts, None

    peak = ts.max()
    unusable_peak = (peak == 0) or np.isnan(peak)
    scaled = ts * 0.0 if unusable_peak else ts / peak
    return scaled, peak

# -----------------------------
# Load data
# -----------------------------
print("Loading:", DATA_PATH)
df = pd.read_csv(DATA_PATH)

# Every later step buckets rows by purchase month, so this column is mandatory.
if 'purchase_date' not in df.columns:
    raise ValueError("purchase_date column missing. Rename your date column to 'purchase_date'.")

# Unparseable dates are coerced to NaT and those rows are dropped; the
# remaining rows are put in chronological order.
df['purchase_date'] = pd.to_datetime(df['purchase_date'], errors='coerce')
df = df.dropna(subset=['purchase_date'])
df = df.sort_values('purchase_date')
print("Data loaded. Shape:", df.shape)

# Most frequent brands / car series by row count; an empty list when the
# corresponding column does not exist.
if 'brand' in df.columns:
    brands = df['brand'].value_counts().head(TOP_N).index.tolist()
else:
    brands = []

if 'car_series' in df.columns:
    series_list = df['car_series'].value_counts().head(TOP_N).index.tolist()
else:
    series_list = []

Loading: /kaggle/input/ba-dataset-with-sentiment-score2/BA_dataset_with_sentiment-score.csv
Data loaded. Shape: (264292, 49)


In [7]:
# -----------------------------
# Plotting utility for one item
# -----------------------------
def _draw_history_and_forecast(hist, fc, ci, hist_label, fc_label):
    """Create a 12x5 figure with the history line, dashed forecast and interval band."""
    fig, ax = plt.subplots(figsize=(12, 5))
    if not hist.empty:
        ax.plot(hist.index, hist.values, label=hist_label, linewidth=2)
    if fc is not None:
        ax.plot(fc.index, fc.values, '--', label=fc_label, linewidth=2, color='orange')
        if ci is not None:
            ax.fill_between(ci.index, ci['lower'], ci['upper'], color='orange', alpha=0.2)
    ax.set_xlim(XMIN, XMAX)
    return fig, ax


def _save_optional_plot(label, fname, figsize, draw):
    """Render one best-effort diagnostic plot; report failures and never leak the figure."""
    fig, ax = plt.subplots(figsize=figsize)
    try:
        draw(ax)
        save_fig(fig, fname)
    except Exception as exc:
        # These plots are optional, so a failure must not abort the batch, but it
        # should be visible. print() is used because warnings are silenced globally.
        print(f"[plot skipped] {label}: {type(exc).__name__}: {exc}")
    finally:
        # save_fig() closes the figure on success; closing again is harmless and
        # guarantees the figure is released when drawing or saving failed.
        plt.close(fig)


def plot_item_forecasts(name, kind, hist_ts, fc_ts, fc_ci=None,
                        is_sales=False, do_standardize=True):
    """
    Write the full set of PNG plots for one brand or car series into OUT_DIR.

    name: string (brand or series)
    kind: 'Brand' or 'Series' (used as the filename/title prefix)
    hist_ts: historical pandas Series (monthly)
    fc_ts: forecast pandas Series (monthly future), or None when no forecast exists
    fc_ci: DataFrame with 'lower' and 'upper' index aligned to fc_ts.index (or None)
    is_sales: True for a sales series, False for a sentiment series
    do_standardize: whether to produce standardized plot (normalize sales by max)

    Files produced (each prefixed with "<kind>_<safe name>_"):
      RAW_Forecast, STANDARDIZED_Forecast (if do_standardize), and, when
      hist_ts is not empty, Rolling_MA, Month_Heatmap, Month_Boxplot,
      Yearly_Bar, Cumulative_Area and (sentiment only) KDE.

    The heatmap, boxplot, yearly, cumulative and KDE plots are best-effort: a
    failure in one of them is printed and the remaining plots are still made.
    """
    prefix = f"{kind}_{safe_name(name)}"

    # ---------- RAW PLOT (raw values; auto Y limits but standardized X) ----------
    fig, ax = _draw_history_and_forecast(hist_ts, fc_ts, fc_ci, 'Historical', 'Forecast')
    ax.set_title(f"{prefix} RAW — {'Sales' if is_sales else 'Sentiment'}", fontsize=14)
    ax.set_xlabel("Year")
    ax.set_ylabel("Value")
    ax.grid(True, linestyle='--', alpha=0.4)
    ax.legend()
    save_fig(fig, f"{prefix}_RAW_Forecast.png")

    # ---------- STANDARDIZED PLOT ----------
    # For sales: normalize by historical max (so all series between 0..1)
    # For sentiment: keep natural range (-1..1)
    if do_standardize:
        if is_sales:
            hist_std, hist_max = normalize_by_max(hist_ts)
            fc_std = None
            fc_ci_std = None
            # hist_max is None for an empty history and 0 for an all-zero one;
            # in both cases there is nothing meaningful to scale the forecast by.
            if fc_ts is not None and hist_max:
                fc_std = fc_ts / hist_max
                if fc_ci is not None:
                    fc_ci_std = fc_ci / hist_max
        else:
            # sentiment standardized to VADER range [-1,1] (no change)
            hist_std, fc_std, fc_ci_std = hist_ts, fc_ts, fc_ci

        fig, ax = _draw_history_and_forecast(hist_std, fc_std, fc_ci_std,
                                             'Historical (std)', 'Forecast (std)')
        if is_sales:
            ax.set_ylim(SALES_STD_RANGE[0], SALES_STD_RANGE[1])
            ax.set_ylabel("Sales (standardized, historical max = 1.0)")
            ax.set_title(f"{prefix} STANDARDIZED — Sales (0 to 1 scale)")
        else:
            ax.set_ylim(SENT_STD_RANGE[0], SENT_STD_RANGE[1])
            ax.set_ylabel("Sentiment (VADER compound)")
            ax.set_title(f"{prefix} STANDARDIZED — Sentiment (-1 to 1 scale)")
        ax.set_xlabel("Year")
        ax.grid(True, linestyle='--', alpha=0.4)
        ax.legend()
        save_fig(fig, f"{prefix}_STANDARDIZED_Forecast.png")

    # ---------- Extra plots (rolling, heatmap, box, yearly, area, kde) ----------
    # All of them describe the history only, so there is nothing to do without one.
    if hist_ts.empty:
        return

    # Rolling means
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(hist_ts.index, hist_ts.values, alpha=0.5, label='Historical')
    for window in (3, 6, 12):
        ax.plot(hist_ts.index, hist_ts.rolling(window, min_periods=1).mean(),
                label=f'{window}-month MA')
    ax.set_xlim(XMIN, XMAX)
    ax.set_title(f"{prefix} Rolling Means")
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.4)
    save_fig(fig, f"{prefix}_Rolling_MA.png")

    def draw_heatmap(ax):
        # Month-of-year heatmap (pivot table): one row per year, one column per month.
        dfm = hist_ts.copy().to_frame(name='val')
        dfm['year'] = dfm.index.year
        dfm['month'] = dfm.index.month
        pivot = dfm.pivot_table(index='year', columns='month', values='val', aggfunc='mean')
        sns.heatmap(pivot, ax=ax, cmap='viridis', cbar_kws={'label': 'Value'})
        ax.set_title(f"{prefix} Month-Year Heatmap")

    def draw_boxplot(ax):
        # Distribution of values per calendar month across all years.
        dfm = hist_ts.copy().to_frame(name='val')
        dfm['month'] = dfm.index.month
        sns.boxplot(x='month', y='val', data=dfm, ax=ax)
        ax.set_title(f"{prefix} Monthly Boxplot")

    def draw_yearly(ax):
        # Sales are totals, so a year is the sum of its months. Sentiment values
        # are monthly averages; summing them would only reflect how many months
        # a year contains, so a year is the mean of its months instead.
        by_year = hist_ts.groupby(hist_ts.index.year)
        yearly = by_year.sum() if is_sales else by_year.mean()
        ax.bar(yearly.index.astype(str), yearly.values)
        ax.set_title(f"{prefix} Yearly Aggregated")
        ax.set_xlabel("Year")
        ax.set_ylabel("Sum" if is_sales else "Mean")

    def draw_cumulative(ax):
        # NOTE(review): for sentiment this accumulates monthly averages, which
        # may not be a meaningful quantity — confirm the plot is wanted there.
        ax.fill_between(hist_ts.index, hist_ts.cumsum(), alpha=0.4)
        ax.set_xlim(XMIN, XMAX)
        ax.set_title(f"{prefix} Cumulative Area")

    def draw_kde(ax):
        sns.kdeplot(hist_ts.dropna(), ax=ax, fill=True)
        ax.set_title(f"{prefix} Sentiment KDE")

    _save_optional_plot(f"{prefix} heatmap", f"{prefix}_Month_Heatmap.png", (10, 6), draw_heatmap)
    _save_optional_plot(f"{prefix} boxplot", f"{prefix}_Month_Boxplot.png", (10, 5), draw_boxplot)
    _save_optional_plot(f"{prefix} yearly bars", f"{prefix}_Yearly_Bar.png", (10, 5), draw_yearly)
    _save_optional_plot(f"{prefix} cumulative area", f"{prefix}_Cumulative_Area.png", (12, 5),
                        draw_cumulative)
    # KDE (only for sentiment)
    if not is_sales:
        _save_optional_plot(f"{prefix} KDE", f"{prefix}_KDE.png", (8, 4), draw_kde)

In [8]:
# -----------------------------
# Run for Brands
# -----------------------------
print("Processing brands (top):", brands)
for b in brands:
    # All rows belonging to the current brand.
    sub = df.loc[df['brand'] == b]

    # Sales: monthly totals. A forecast is attempted only when the history has
    # at least MIN_MONTHS points; the plots are produced either way.
    if 'sales' in df.columns:
        ts_sales = monthly_series(sub, 'sales', agg='sum')
        fc_sales, ci_sales = (
            arima_forecast_with_ci(ts_sales) if len(ts_sales) >= MIN_MONTHS else (None, None)
        )
        plot_item_forecasts(name=b, kind="Brand", hist_ts=ts_sales, fc_ts=fc_sales,
                            fc_ci=ci_sales, is_sales=True, do_standardize=True)

    # Sentiment: monthly averages, handled the same way as sales.
    if 'sentiment_score' in df.columns:
        ts_sent = monthly_series(sub, 'sentiment_score', agg='mean')
        fc_sent, ci_sent = (
            arima_forecast_with_ci(ts_sent) if len(ts_sent) >= MIN_MONTHS else (None, None)
        )
        plot_item_forecasts(name=b, kind="Brand", hist_ts=ts_sent, fc_ts=fc_sent,
                            fc_ci=ci_sent, is_sales=False, do_standardize=True)

Processing brands (top): ['Toyota', 'public', 'Ford', 'BYD', 'Geely Auto', 'Honda', 'BMW', 'Buick', 'MG', 'Mazda']


In [9]:
# -----------------------------
# Run for Series
# -----------------------------
print("Processing series (top):", series_list)
for s in series_list:
    # All rows belonging to the current car series.
    sub = df.loc[df['car_series'] == s]

    # Sales: monthly totals. A forecast is attempted only when the history has
    # at least MIN_MONTHS points; the plots are produced either way.
    if 'sales' in df.columns:
        ts_sales = monthly_series(sub, 'sales', agg='sum')
        fc_sales, ci_sales = (
            arima_forecast_with_ci(ts_sales) if len(ts_sales) >= MIN_MONTHS else (None, None)
        )
        plot_item_forecasts(name=s, kind="Series", hist_ts=ts_sales, fc_ts=fc_sales,
                            fc_ci=ci_sales, is_sales=True, do_standardize=True)

    # Sentiment: monthly averages, handled the same way as sales.
    if 'sentiment_score' in df.columns:
        ts_sent = monthly_series(sub, 'sentiment_score', agg='mean')
        fc_sent, ci_sent = (
            arima_forecast_with_ci(ts_sent) if len(ts_sent) >= MIN_MONTHS else (None, None)
        )
        plot_item_forecasts(name=s, kind="Series", hist_ts=ts_sent, fc_ts=fc_sent,
                            fc_ci=ci_sent, is_sales=False, do_standardize=True)

Processing series (top): ['Asian Dragon', 'Corolla', 'RAV4 Rongfang', 'Crown Land Release', 'Qin PLUS', 'Boyue', 'Tang New Energy', 'Lingfang HARRIER', 'Verano', 'Song PLUS New Energy']


In [10]:
# -----------------------------
# ZIP outputs
# -----------------------------
zipname = "forecast_outputs.zip"
# Bundle everything under OUT_DIR into one deflate-compressed archive written
# to the working directory; entries are stored relative to OUT_DIR.
with zipfile.ZipFile(zipname, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
    for folder, _subdirs, filenames in os.walk(OUT_DIR):
        for filename in filenames:
            source_path = os.path.join(folder, filename)
            zf.write(source_path, arcname=os.path.relpath(source_path, OUT_DIR))

print("All outputs saved into:", OUT_DIR)
print("Zipped to:", zipname)

All outputs saved into: forecast_outputs
Zipped to: forecast_outputs.zip
