In [None]:
# ==============================
# SALES PREDICTION PIPELINES (with & without PCA) + HYPERPARAM TUNING
# ==============================
# Loads the dataset from CSV into `df_sales`; the file must contain the columns:
# car_series, brand, vehicle_size, car_model_type, purchased_car_model,
# vehicle_energy_type, sales, province_of_purchase, city_of_purchase,
# sentiment_score, purchase_date

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

# ------------- LOAD THE DATASET INTO df_sales -------------
df_sales = pd.read_csv("/kaggle/input/ba-dataset-3/BA_dataset_with_sentiment-score.csv")

# ------------- BASIC CLEANUP & FEATURE ENGINEERING -------------
# Parse purchase_date as datetime; entries that cannot be parsed are coerced to NaT
purchase_ts = pd.to_datetime(df_sales['purchase_date'], errors='coerce')
df_sales['purchase_date'] = purchase_ts

# Integer calendar features derived from the purchase date.
# Rows whose date is NaT receive 0 in each of them.
date_parts = (
    ('purchase_year', purchase_ts.dt.year),
    ('purchase_month', purchase_ts.dt.month),
    ('purchase_day', purchase_ts.dt.day),
    ('purchase_dow', purchase_ts.dt.weekday),  # 0=Mon
)
for part_name, part_values in date_parts:
    df_sales[part_name] = part_values.fillna(0).astype(int)

# A missing sentiment_score is replaced by 0.0 (swap in another strategy if needed)
df_sales['sentiment_score'] = df_sales['sentiment_score'].fillna(value=0.0)

# Feature groups, by how each group is encoded later on
cat_high_card = [  # likely many unique values -> frequency-encoded
    'purchased_car_model',
    'car_series',
    'city_of_purchase',
]
cat_low_card = [  # few unique values -> one-hot encoded
    'brand',
    'vehicle_size',
    'car_model_type',
    'vehicle_energy_type',
    'province_of_purchase',
]
num_cols = [
    'sentiment_score',
    'purchase_year',
    'purchase_month',
    'purchase_day',
    'purchase_dow',
]

# Regression target column
target = 'sales'

# ---------------------
# Frequency encoding for high-cardinality categorical columns
# ---------------------
def freq_encode(df, col, reference=None):
    """Map each value of ``df[col]`` to its relative frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``col`` values are encoded.
    col : str
        Name of the categorical column to encode.
    reference : pandas.DataFrame, optional
        Frame the frequencies are computed from. Defaults to ``df`` itself.
        Pass the training rows here so that other rows are encoded without
        their own category distribution influencing the feature.

    Returns
    -------
    pandas.Series
        Aligned with ``df``. Values that do not occur in the reference frame
        (NaN included, since ``value_counts`` skips it) are encoded as 0.0.
    """
    source = df if reference is None else reference
    freq = source[col].value_counts(normalize=True)
    return df[col].map(freq).fillna(0.0)

# Choose the train/test rows BEFORE encoding, so the category frequencies are
# learned from the training rows only. Computing them over every row would
# leak the test rows' category distribution into the features.
# The shuffle depends only on the number of rows and random_state, so splitting
# the index selects the same rows as splitting X and y would.
# Label-based selection below relies on the unique default index from read_csv.
train_idx, test_idx = train_test_split(
    df_sales.index, test_size=0.2, random_state=42
)
train_rows = df_sales.loc[train_idx]

for c in cat_high_card:
    df_sales[c + "_freq"] = freq_encode(df_sales, c, reference=train_rows)

# The *_freq columns replace the original high-cardinality columns as model inputs
high_card_freq_cols = [c + "_freq" for c in cat_high_card]

# ---------------------
# Prepare features and target
# ---------------------
feature_cols = high_card_freq_cols + cat_low_card + num_cols
X = df_sales[feature_cols].copy()
y = df_sales[target].astype(float)  # regression target

# Fill missing categorical values (if any)
X[cat_low_card] = X[cat_low_card].fillna("missing")

# Train-test split, using the rows selected above
X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

# ---------------------
# Preprocessing transformers
# ---------------------
# OHE for low-cardinality categorical columns, returning a dense array.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# spelling first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# One-hot encode the low-cardinality columns; the frequency-encoded and numeric
# columns pass through unchanged; every other column is dropped.
preprocessor = ColumnTransformer(transformers=[
    ("ohe_small_cat", ohe, cat_low_card),
    ("passthrough_highfreq", "passthrough", high_card_freq_cols),
    ("num", "passthrough", num_cols)
], remainder="drop")

# ---------------------
# Models and parameter grids
# ---------------------
# Maps a display name to (estimator, parameter grid). Grid keys carry the
# `model__` prefix because the estimator is the pipeline step named "model".
models_and_grids = {
    "LinearRegression": (
        LinearRegression(),
        {},  # nothing to tune; the empty grid still yields a cross-validated baseline
    ),
    "LinearSVR": (
        # random_state is fixed like the other seeded estimators below so that
        # repeated runs select the same hyperparameters
        LinearSVR(max_iter=5000, random_state=42),
        {
            "model__C": [0.01, 0.1, 1, 10],
            "model__epsilon": [0.001, 0.01, 0.1],
        },
    ),
    "GradientBoosting": (
        GradientBoostingRegressor(random_state=42),
        {
            "model__n_estimators": [100, 200],
            "model__learning_rate": [0.05, 0.1],
            "model__max_depth": [2, 3, 4],
        },
    ),
    "XGBoost": (
        XGBRegressor(tree_method="hist", objective="reg:squarederror", random_state=42),
        {
            "model__n_estimators": [100, 200],
            "model__learning_rate": [0.01, 0.1],
            "model__max_depth": [3, 5],
            "model__subsample": [0.7, 1.0],
            "model__colsample_bytree": [0.7, 1.0],
        },
    ),
}

# ---------------------
# Cross-validation setup
# ---------------------
# Shuffled 5-fold splitter with a fixed seed, shared by every grid search
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# ---------------------
# Helper to run GridSearch for one pipeline
# ---------------------
def run_gridsearch(pipeline, param_grid, X_tr, y_tr, scoring="neg_root_mean_squared_error"):
    """Fit a cross-validated grid search for ``pipeline`` and return it.

    Parameters
    ----------
    pipeline : estimator
        Pipeline (or any estimator) to tune.
    param_grid : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    X_tr, y_tr
        Training features and target the search is fitted on.
    scoring : str
        Scorer name used to rank the candidates.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search_options = {
        "param_grid": param_grid,
        "cv": cv,  # module-level splitter
        "scoring": scoring,
        "n_jobs": -1,
        "verbose": 0,
    }
    search = GridSearchCV(pipeline, **search_options)
    # fit() returns the search object itself
    return search.fit(X_tr, y_tr)

# ==========================
# A) WITHOUT PCA pipeline
# ==========================
# NOTE(review): the recorded run of this loop reports a CV RMSE of about 1e14
# for LinearRegression; presumably the redundant one-hot columns make the
# least-squares fit ill-conditioned. Consider a regularised linear model; confirm.
results_no_pca = []

for name, (model, grid_params) in models_and_grids.items():
    print(f"\n--- Running (no PCA): {name} ---")
    # Pipeline: preprocessor -> scaler -> model.
    # The scaler is applied for every model, not only the linear ones.
    pipe = Pipeline([
        ("preproc", preprocessor),
        ("scaler", StandardScaler()),   # scales the dense numeric output of the preprocessor
        ("model", model),
    ])

    # An empty grid (LinearRegression) is passed through as-is; the search then
    # cross-validates the single default configuration.
    grid = run_gridsearch(pipe, grid_params, X_train, y_train, scoring="neg_root_mean_squared_error")

    # The scorer is the negated RMSE, so flip the sign once and reuse the value
    # for both the printout and the stored result.
    cv_rmse = -grid.best_score_

    # Metrics on test set. The root is taken explicitly because the
    # `squared=False` argument of mean_squared_error is deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    preds = grid.predict(X_test)
    rmse_test = float(mean_squared_error(y_test, preds) ** 0.5)

    print("Best params:", grid.best_params_)
    print("CV RMSE (best):", cv_rmse)
    print("Test RMSE:", rmse_test)

    results_no_pca.append({
        "model": name,
        "best_params": grid.best_params_,
        "cv_rmse": cv_rmse,
        "test_rmse": rmse_test,
        "best_estimator": grid.best_estimator_
    })

results_no_pca = pd.DataFrame(results_no_pca)

# ==========================
# B) WITH PCA pipeline (apply PCA after scaling)
# ==========================
results_with_pca = []

# PCA keeps as many components as needed to explain 95% of the variance
pca = PCA(n_components=0.95, random_state=42)

for name, (model, grid_params) in models_and_grids.items():
    print(f"\n--- Running (with PCA): {name} ---")
    # Pipeline: preprocessor -> scaler -> PCA -> model
    pipe = Pipeline([
        ("preproc", preprocessor),
        ("scaler", StandardScaler()),
        ("pca", pca),
        ("model", model),
    ])

    # An empty grid (LinearRegression) is passed through as-is; the search then
    # cross-validates the single default configuration.
    grid = run_gridsearch(pipe, grid_params, X_train, y_train, scoring="neg_root_mean_squared_error")

    # The scorer is the negated RMSE, so flip the sign to report a plain RMSE
    cv_rmse = -grid.best_score_

    # Evaluate on test set. The root is taken explicitly because the
    # `squared=False` argument of mean_squared_error is deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    preds = grid.predict(X_test)
    rmse_test = float(mean_squared_error(y_test, preds) ** 0.5)

    print("Best params:", grid.best_params_)
    print("CV RMSE (best):", cv_rmse)
    print("Test RMSE:", rmse_test)

    results_with_pca.append({
        "model": name,
        "best_params": grid.best_params_,
        "cv_rmse": cv_rmse,
        "test_rmse": rmse_test,
        "best_estimator": grid.best_estimator_
    })

results_with_pca = pd.DataFrame(results_with_pca)

# ==========================
# SUMMARY
# ==========================
# Print the same four columns for both result tables, no-PCA first
summary_cols = ['model', 'cv_rmse', 'test_rmse', 'best_params']
summary_sections = (
    ("\n\n===== SUMMARY (No PCA) =====", results_no_pca),
    ("\n\n===== SUMMARY (With PCA) =====", results_with_pca),
)
for banner, table in summary_sections:
    print(banner)
    print(table[summary_cols])

# Keep the refitted best estimators for later use, keyed by model name
best_no_pca = dict(zip(results_no_pca['model'], results_no_pca['best_estimator']))
best_with_pca = dict(zip(results_with_pca['model'], results_with_pca['best_estimator']))

# Example: save the best XGBoost (no PCA) to disk (optional)
# import joblib
# joblib.dump(best_no_pca['XGBoost'], "best_xgb_no_pca.joblib")

# End of block



--- Running (no PCA): LinearRegression ---




Best params: {}
CV RMSE (best): 102936369835528.75
Test RMSE: 8481.665550653714

--- Running (no PCA): LinearSVR ---




Best params: {'model__C': 10, 'model__epsilon': 0.1}
CV RMSE (best): 8734.726081969724
Test RMSE: 8791.343774448527

--- Running (no PCA): GradientBoosting ---




Best params: {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 200}
CV RMSE (best): 6752.39317324686
Test RMSE: 6778.0178733522525

--- Running (no PCA): XGBoost ---




Best params: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200, 'model__subsample': 1.0}
CV RMSE (best): 6490.020282242143
Test RMSE: 6559.771340780464

--- Running (with PCA): LinearRegression ---




Best params: {}
CV RMSE (best): 8696.285568065909
Test RMSE: 8753.123830920611

--- Running (with PCA): LinearSVR ---




Best params: {'model__C': 1, 'model__epsilon': 0.01}
CV RMSE (best): 9157.435678604976
Test RMSE: 9174.656796568652

--- Running (with PCA): GradientBoosting ---


