# 02 — Modeling: Laptop Price Prediction 🧠📈

<p align="left">
  <img alt="scikit-learn" src="https://img.shields.io/badge/scikit--learn-Modeling-F7931E?logo=scikitlearn&logoColor=white">
  <img alt="Gradient Boosting" src="https://img.shields.io/badge/Gradient%20Boosting-⭐%20Best%20So%20Far-0b8457">
  <img alt="Status" src="https://img.shields.io/badge/Notebook-Modeling-1e90ff">
</p>

> <strong>Purpose</strong>: Train baselines, tune models, add categoricals, and save metrics/artifacts.  
> <strong>Author</strong>: <span style="color:#FF6B6B"><b>Noëlla Buti</b></span>

---

### 🛠️ Workflow
1. 🧪 Baselines (LR, RF, GB) with numeric features  
2. 🎛️ Hyperparameter tuning (GB)  
3. 🧩 Add categorical features (OHE) + tune  
4. 📊 Residuals & feature importance  
5. 💾 Save model (<code>best_gb_model.pkl</code>) & <code>reports/metrics.json</code>

<details>
  <summary><b>📁 Artifacts (click to expand)</b></summary>

- Model path (Drive): <code>/content/drive/MyDrive/artifacts/best_gb_model.pkl</code>  
- Metrics path (Drive): <code>/content/drive/MyDrive/reports/metrics.json</code>  
- Keep large files out of GitHub; commit only small metadata (e.g., metrics.json).
</details>

---

### 🚦 Results Snapshot (update from metrics)
- **GB + Categoricals** → <b>R² ≈ 0.882</b>, MSE ≈ <b>58,510</b>  
- GB (Numeric Only, tuned) → R² ≈ 0.837  
- RF → R² ≈ 0.825  
- LR → R² ≈ 0.653

> 💡 **Tip:** Use the same columns at train & predict time when using ColumnTransformer (avoid “missing columns” errors).

## 1. Setup and Data Loading

In [None]:
# === Setup ===
import os
import random
import warnings

import numpy as np
import pandas as pd

# Silence every warning for the rest of the session and seed both RNGs.
warnings.filterwarnings("ignore")
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Paths
DRIVE_DIR = "/content/drive/MyDrive"
CSV_NAME = "laptop_prices.csv"
CSV_PATH = f"{DRIVE_DIR}/{CSV_NAME}"

# Drive has to be mounted before the dataset path can be checked.
from google.colab import drive
drive.mount('/content/drive')
assert os.path.exists(CSV_PATH), f"CSV not found at {CSV_PATH}"

# Read CSV: default encoding first, ISO-8859-1 only if decoding fails.
try:
    laptop_data = pd.read_csv(CSV_PATH)
except UnicodeDecodeError:
    laptop_data = pd.read_csv(CSV_PATH, encoding="ISO-8859-1")

# Coerce numerics & cast cats (same as in EDA)
num_like = ["Inches","Ram","Weight","Price_euros","ScreenW","ScreenH","CPU_freq",
            "PrimaryStorage","SecondaryStorage"]
cat_cols = ["Company","Product","TypeName","OS","Screen","Touchscreen","IPSpanel",
            "RetinaDisplay","CPU_company","CPU_model","PrimaryStorageType",
            "SecondaryStorageType","GPU_company","GPU_model"]

# Columns that are absent from the loaded frame are skipped, not treated as errors.
for col in num_like:
    if col not in laptop_data.columns:
        continue
    # errors="coerce": values that cannot be parsed as numbers become NaN.
    laptop_data[col] = pd.to_numeric(laptop_data[col], errors="coerce")

for col in cat_cols:
    if col not in laptop_data.columns:
        continue
    laptop_data[col] = laptop_data[col].astype("category")

n_rows, n_cols = laptop_data.shape
print(f"Rows: {n_rows:,} | Cols: {n_cols}")

Mounted at /content/drive
Rows: 1,275 | Cols: 23


## 2. Baselines

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Target column and the numeric predictors shared by all baseline models.
TARGET = "Price_euros"
feature_numeric = ["Ram", "Weight", "CPU_freq", "PrimaryStorage"]

# Work on copies so nothing below writes back into laptop_data.
X, y = laptop_data[feature_numeric].copy(), laptop_data[TARGET].copy()

# 80/20 train/test split, seeded so the split is repeatable.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# Preprocessing: mean-impute missing values in the numeric feature columns.
num_pre = Pipeline(steps=[("impute", SimpleImputer(strategy="mean"))])
pre = ColumnTransformer(transformers=[("num", num_pre, feature_numeric)])

def fit_eval(model, name):
    """Fit `model` behind the numeric preprocessor and score it on the test split.

    Reads the notebook-level `pre`, `X_tr`, `y_tr`, `X_te` and `y_te`, and prints
    one aligned line containing the MSE and R² for the model.

    Args:
        model: Unfitted estimator used as the final step of the pipeline.
        name: Label printed in front of the metrics.

    Returns:
        Tuple `(pipe, pred, mse, r2)`: the fitted pipeline, its predictions for
        `X_te`, and the test-set mean squared error and R² score.
    """
    from sklearn.base import clone  # local import keeps this definition self-contained

    # Give each pipeline its own copy of the preprocessor. Reusing the single
    # `pre` instance would make every returned pipeline share one transformer
    # object that is refitted in place on each call.
    pipe = Pipeline([("pre", clone(pre)), ("model", model)])
    pipe.fit(X_tr, y_tr)
    pred = pipe.predict(X_te)
    mse = mean_squared_error(y_te, pred)
    r2  = r2_score(y_te, pred)
    print(f"{name:>22} | MSE: {mse:,.0f} | R²: {r2:.3f}")
    return pipe, pred, mse, r2

# Fit and score the three baselines with the same preprocessing and split.
lr_pipe, lr_pred, mse_lr, r2_lr = fit_eval(
    model=LinearRegression(),
    name="Linear Regression",
)
rf_pipe, rf_pred, mse_rf, r2_rf = fit_eval(
    model=RandomForestRegressor(n_estimators=100, random_state=SEED),
    name="Random Forest",
)
gb_pipe, gb_pred, mse_gb, r2_gb = fit_eval(
    model=GradientBoostingRegressor(n_estimators=100, random_state=SEED),
    name="Gradient Boosting",
)

     Linear Regression | MSE: 172,467 | R²: 0.653
         Random Forest | MSE: 87,042 | R²: 0.825
     Gradient Boosting | MSE: 92,784 | R²: 0.813


## 3. Tune Gradient Boosting (Numeric Only)

In [None]:
# Pipeline under tuning: numeric preprocessing followed by a seeded GB regressor.
gb_base = Pipeline(
    steps=[
        ("pre", pre),
        ("model", GradientBoostingRegressor(random_state=SEED)),
    ]
)

# 3 x 3 x 3 = 27 candidate settings; keys address the pipeline's "model" step.
param_grid = dict(
    model__n_estimators=[50, 100, 150],
    model__learning_rate=[0.01, 0.05, 0.1],
    model__max_depth=[3, 4, 5],
)

# 5-fold cross-validated search on the training split, scored by negative MSE.
gsearch = GridSearchCV(
    estimator=gb_base,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gsearch.fit(X_tr, y_tr)

# Score the best pipeline found by the search on the held-out test split.
best = gsearch.best_estimator_
best_pred = best.predict(X_te)
mse_best, r2_best = mean_squared_error(y_te, best_pred), r2_score(y_te, best_pred)
print("Best params:", gsearch.best_params_)
print(f"Tuned GB (num) | MSE: {mse_best:,.0f} | R²: {r2_best:.3f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best params: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 100}
Tuned GB (num) | MSE: 81,011 | R²: 0.837


## 4. Tune Gradient Boosting (Adding Categorical Features)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical candidates; only those present in the loaded data are used.
cat_candidates = ["Company","CPU_company","CPU_model","GPU_company","GPU_model","OS","TypeName","Screen",
                  "PrimaryStorageType","SecondaryStorageType","Touchscreen","IPSpanel","RetinaDisplay"]
feature_cats = [c for c in cat_candidates if c in laptop_data.columns]
all_features = feature_numeric + feature_cats

X_full = laptop_data[all_features].copy()
y = laptop_data[TARGET].copy()
for c in feature_cats:
    X_full[c] = X_full[c].astype("category")

# NOTE: this rebinds X_tr / X_te / y_tr / y_te to the split of the full feature
# set; every later cell that reads these names sees this split.
X_tr, X_te, y_tr, y_te = train_test_split(X_full, y, test_size=0.2, random_state=SEED)

num_pre = Pipeline([("impute", SimpleImputer(strategy="mean"))])

# Dense one-hot output is wanted. scikit-learn 1.2 renamed the `sparse`
# argument to `sparse_output`, so fall back to the old spelling (sparse=False)
# when the new keyword is rejected.
try:
    dense_ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

cat_pre = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", dense_ohe),
])
# Transformer order (numeric first, then categorical) fixes the column order of
# the matrix handed to the model.
pre_full = ColumnTransformer([("num", num_pre, feature_numeric),
                              ("cat", cat_pre, feature_cats)])

gb_full = Pipeline([("pre", pre_full), ("model", GradientBoostingRegressor(random_state=SEED))])
# 2 x 2 x 2 = 8 candidate settings for the "model" step.
param_grid_full = {
    "model__n_estimators": [100, 150],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [3, 5],
}
gsearch_full = GridSearchCV(gb_full, param_grid_full, cv=5, n_jobs=-1, scoring="neg_mean_squared_error", verbose=1)
gsearch_full.fit(X_tr, y_tr)

# Score the best pipeline found by the search on the held-out test split.
best_full = gsearch_full.best_estimator_
pred_full = best_full.predict(X_te)
mse_full = mean_squared_error(y_te, pred_full)
r2_full  = r2_score(y_te, pred_full)
print("Best (with cats):", gsearch_full.best_params_)
print(f"Tuned GB + Cats | MSE: {mse_full:,.0f} | R²: {r2_full:.3f}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best (with cats): {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 150}
Tuned GB + Cats | MSE: 58,510 | R²: 0.882


## 5. Residuals and Feature Importance

In [None]:
import plotly.express as px

# Predicted vs. actual prices on the test split, with an OLS trend line.
px.scatter(x=y_te, y=pred_full, labels={"x":"Actual (€)","y":"Predicted (€)"},
           title="Predicted vs Actual (Best Model)", trendline="ols").show()

gb_model = best_full.named_steps["model"]
if hasattr(gb_model, "feature_importances_"):
    # Look the fitted encoder up by its transformer name ("cat") rather than by
    # position, so reordering the ColumnTransformer entries cannot pick the
    # wrong step.
    ohe = best_full.named_steps["pre"].named_transformers_["cat"].named_steps["ohe"]
    cat_names = ohe.get_feature_names_out(feature_cats).tolist()
    # Same order as the preprocessor output: numeric columns, then one-hot columns.
    names = feature_numeric + cat_names
    importances = gb_model.feature_importances_
    # Fail loudly on a mismatch; truncating would attach importances to the
    # wrong feature names without any warning.
    if len(importances) != len(names):
        raise ValueError(
            f"Feature-name count ({len(names)}) does not match "
            f"importance count ({len(importances)})"
        )
    imp_df = pd.DataFrame({"feature": names, "importance": importances}).sort_values("importance")
    # Ascending sort + tail(25) keeps the 25 largest importances.
    px.bar(imp_df.tail(25), x="importance", y="feature", orientation="h",
           title="Top Feature Importances").show()

## 6. Save Models and Metrics (Drive)

In [None]:
import joblib, json, time

# Output folders on Drive; created if they do not exist yet.
ART_DIR = os.path.join(DRIVE_DIR, "artifacts")
REP_DIR = os.path.join(DRIVE_DIR, "reports")
for out_dir in (ART_DIR, REP_DIR):
    os.makedirs(out_dir, exist_ok=True)

# Persist the tuned pipeline (preprocessing + model) as a single file.
model_path = os.path.join(ART_DIR, "best_gb_model.pkl")
joblib.dump(best_full, model_path)
print("Saved model to:", model_path)

# (mse, r2) pairs per model; cast to plain floats below for JSON serialization.
scores = {
    "linear_regression": (mse_lr, r2_lr),
    "random_forest":     (mse_rf, r2_rf),
    "gb_numeric":        (mse_best, r2_best),
    "gb_full":           (mse_full, r2_full),
}
metrics = {
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    "split": {"test_size": 0.2, "random_state": SEED},
    "models": {
        label: {"mse": float(mse), "r2": float(r2)}
        for label, (mse, r2) in scores.items()
    },
    "best_params_gb_numeric": gsearch.best_params_,
    "best_params_gb_full":    gsearch_full.best_params_
}

metrics_path = os.path.join(REP_DIR, "metrics.json")
with open(metrics_path, "w") as fh:
    json.dump(metrics, fh, indent=2)
print("Saved metrics to:", metrics_path)

Saved model to: /content/drive/MyDrive/artifacts/best_gb_model.pkl
Saved metrics to: /content/drive/MyDrive/reports/metrics.json
