# Pricing Model Analysis

This notebook generates model evaluation metrics and Matplotlib charts for the Results and Analysis chapter.

In [None]:
import json
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Directory the kernel was started in; every chart and the JSON summary are
# written relative to it.
ROOT = Path.cwd()

# Directory that contains `pricing_module`. The historical default is a
# machine-specific absolute path, so it can be overridden by setting the
# PRICING_PROJECT_PATH environment variable before the kernel starts. An unset
# or empty variable falls back to the original location.
PROJECT_PATH = Path(
    os.environ.get("PRICING_PROJECT_PATH") or r"d:/Maal Gaadi 1/furniture-mover"
)
if str(PROJECT_PATH) not in sys.path:
    sys.path.append(str(PROJECT_PATH))

# Imported after the sys.path adjustment so that PROJECT_PATH is searched.
from pricing_module import _generate_kathmandu_data

In [None]:
# --- Experiment configuration -------------------------------------------------
sample_size = 1200
test_size = 0.25
random_state = 42

# --- Dataset ------------------------------------------------------------------
# The project generator is expected to return a DataFrame that holds every
# column named in `features` plus the `target` column.
df = _generate_kathmandu_data(sample_size)
target = "accepted_price_npr"
features = ["distance_km", "truck_category", "traffic_level", "time_of_day", "is_peak_hour"]

# --- Encoding -----------------------------------------------------------------
# Each listed column gets its own freshly fitted LabelEncoder; the remaining
# feature columns are passed through untouched.
categorical_features = ("truck_category", "traffic_level", "time_of_day")
encoded = df[features].assign(
    **{name: LabelEncoder().fit_transform(df[name]) for name in categorical_features}
)

# --- Train / test split -------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    encoded,
    df[target],
    test_size=test_size,
    random_state=random_state,
)

# --- Model --------------------------------------------------------------------
forest_params = {
    "n_estimators": 250,
    "max_depth": 14,
    "min_samples_split": 4,
    "min_samples_leaf": 2,
    "random_state": random_state,
    "n_jobs": -1,
}
model = RandomForestRegressor(**forest_params).fit(X_train, y_train)

train_pred = model.predict(X_train)
test_pred = model.predict(X_test)


# --- Evaluation ---------------------------------------------------------------
def _rmse(actual, predicted):
    """Return the root mean squared error between two aligned sequences."""
    return mean_squared_error(actual, predicted) ** 0.5


# R² is rounded to 4 decimals, MAE and RMSE to 2; the split sizes are recorded
# alongside the requested sample size.
metrics = {
    "train_r2": round(r2_score(y_train, train_pred), 4),
    "test_r2": round(r2_score(y_test, test_pred), 4),
    "train_mae": round(mean_absolute_error(y_train, train_pred), 2),
    "test_mae": round(mean_absolute_error(y_test, test_pred), 2),
    "train_rmse": round(_rmse(y_train, train_pred), 2),
    "test_rmse": round(_rmse(y_test, test_pred), 2),
    "samples": int(sample_size),
    "train_samples": len(X_train),
    "test_samples": len(X_test),
}

metrics

In [None]:
# Pair each importance score of the fitted model with its feature name, then
# order the scores from highest to lowest.
importances = pd.Series(model.feature_importances_, index=features)
fi = importances.sort_values(ascending=False)

# Bar chart of the ranked importances, saved next to the notebook.
fig, ax = plt.subplots(figsize=(9, 5.4))
ax.bar(fi.index, fi.values, color="#4472C4")
ax.set_title("Feature Importance Ranking")
ax.set_ylabel("Importance Score")
ax.set_xlabel("Features")
plt.setp(ax.get_xticklabels(), rotation=20)  # tilt the x tick labels by 20 degrees
fig.tight_layout()
fig.savefig(ROOT / "feature_importance.png", dpi=220)
plt.show()

In [None]:
# Scatter of held-out actual prices against the model's predictions.
fig, ax = plt.subplots(figsize=(6.8, 6.2))
ax.scatter(y_test, test_pred, alpha=0.65, color="#2E8B57", edgecolor="white", linewidth=0.3)

# Dashed red y = x reference line spanning the combined range of both series;
# points on the line are predicted exactly.
min_val = min(float(y_test.min()), float(test_pred.min()))
max_val = max(float(y_test.max()), float(test_pred.max()))
diagonal = [min_val, max_val]
ax.plot(diagonal, diagonal, "r--", linewidth=1.5)

ax.set_title("Actual vs Predicted Prices")
ax.set_xlabel("Actual Price (NPR)")
ax.set_ylabel("Predicted Price (NPR)")
fig.tight_layout()
fig.savefig(ROOT / "actual_vs_predicted.png", dpi=220)
plt.show()

In [None]:
# Signed residuals on the test split: positive values mean the model predicted
# less than the actual value.
errors = y_test - test_pred

# Histogram of the residuals, saved next to the notebook.
fig, ax = plt.subplots(figsize=(9, 5.4))
ax.hist(errors, bins=28, color="#DAA520", edgecolor="black", alpha=0.85)
ax.set_title("Prediction Error Distribution")
ax.set_xlabel("Prediction Error (Actual - Predicted)")
ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig(ROOT / "error_distribution.png", dpi=220)
plt.show()

In [None]:
# Mean / median / min / max of the price column per truck category, with rows
# arranged in the fixed order SMALL, MEDIUM, LARGE.
category_order = ["SMALL", "MEDIUM", "LARGE"]
price_by_category = df.groupby("truck_category")["accepted_price_npr"]
category_chart = (
    price_by_category.agg(["mean", "median", "min", "max"])
    .reindex(category_order)
    .reset_index()
)

# Bar chart of the per-category mean, saved next to the notebook.
fig, ax = plt.subplots(figsize=(8.4, 5.4))
ax.bar(category_chart["truck_category"], category_chart["mean"], color="#5F9EA0")
ax.set_title("Average Price by Vehicle Category")
ax.set_xlabel("Vehicle Category")
ax.set_ylabel("Average Price (NPR)")
fig.tight_layout()
fig.savefig(ROOT / "price_by_category.png", dpi=220)
plt.show()

# Bundle everything computed so far into one JSON-serialisable summary:
# the metrics dict, the importances rounded to 4 decimals, and one record per
# category row.
rounded_importance = {name: round(float(score), 4) for name, score in fi.items()}
summary = {
    "metrics": metrics,
    "feature_importance": rounded_importance,
    "category_summary": category_chart.to_dict(orient="records"),
}

summary_path = ROOT / "analysis_metrics.json"
with summary_path.open("w", encoding="utf-8") as handle:
    json.dump(summary, handle, indent=2)

summary