# 06-Deploy_Model.ipynb — Deployment preparation

**Purpose:** Build the user feature store (one row per user) and the item feature store (one row per coupon), bundle the trained pipeline and its metadata (plus an optional SHAP explainer) into a deployment bundle, and provide local test functions identical to the API core.

**Outputs produced by this notebook:**
- ../data/user_latest_features.csv
- ../data/item_features.csv
- ../models/deployment_bundle.pkl

In [1]:
# Imports & configuration
import json
import math
import os
import warnings
from datetime import datetime, timezone

import joblib
import numpy as np
import pandas as pd
# Blanket-suppress warnings so library notices do not clutter cell output.
warnings.filterwarnings('ignore')

# Both directories are siblings of the kernel's working directory;
# os.path.abspath resolves the relative form against os.getcwd().
DATA_DIR = os.path.abspath(os.path.join('..', 'data'))
MODEL_DIR = os.path.abspath(os.path.join('..', 'models'))

# Inputs read by this notebook.
MASTER_PATH = os.path.join(DATA_DIR, 'master_features_v2.csv')
PIPE_PATH = os.path.join(MODEL_DIR, 'ponpare_v2_pipeline.joblib')

# Artifacts written by this notebook.
USER_FEATURE_PATH = os.path.join(DATA_DIR, 'user_latest_features.csv')
ITEM_FEATURE_PATH = os.path.join(DATA_DIR, 'item_features.csv')
OUT_BUNDLE = os.path.join(MODEL_DIR, 'deployment_bundle.pkl')

# Echo every resolved location, one per line, so a wrong working directory
# is visible before any file is touched.
print(
    f"DATA_DIR = {DATA_DIR}",
    f"MODEL_DIR = {MODEL_DIR}",
    f"Expecting master: {MASTER_PATH}",
    f"Expecting pipeline: {PIPE_PATH}",
    f"Will write user store to: {USER_FEATURE_PATH}",
    f"Will write item store to: {ITEM_FEATURE_PATH}",
    f"Will write bundle to: {OUT_BUNDLE}",
    sep="\n",
)

DATA_DIR = C:\Users\017413903\Desktop\Graduation project\data
MODEL_DIR = C:\Users\017413903\Desktop\Graduation project\models
Expecting master: C:\Users\017413903\Desktop\Graduation project\data\master_features_v2.csv
Expecting pipeline: C:\Users\017413903\Desktop\Graduation project\models\ponpare_v2_pipeline.joblib
Will write user store to: C:\Users\017413903\Desktop\Graduation project\data\user_latest_features.csv
Will write item store to: C:\Users\017413903\Desktop\Graduation project\data\item_features.csv
Will write bundle to: C:\Users\017413903\Desktop\Graduation project\models\deployment_bundle.pkl


In [2]:
# Sanity checks: stop before any work is done when a required input is absent.
# The hint after each path is derived from the path itself, so it always names
# the file that was actually looked for.
missing = [
    f"{path}  <-- {os.path.basename(path)} not found"
    for path in (MASTER_PATH, PIPE_PATH)
    if not os.path.exists(path)
]

if missing:
    raise FileNotFoundError("Required files missing:\n" + "\n".join(missing))

print("All required files are present. Proceeding.")

All required files are present. Proceeding.


In [4]:
# Read the master feature table and lower-case its header so later cells can
# refer to columns by lower-case name.
print("Loading master features...")
df = pd.read_csv(MASTER_PATH, low_memory=False)
df.columns = [name.lower() for name in df.columns]

# Event timestamps become datetimes when the column exists; values that fail
# to parse are coerced to NaT instead of raising.
if 'i_date' in df:
    df['i_date'] = pd.to_datetime(df['i_date'], errors='coerce')

print(f"Master shape: {df.shape}")
display(df.iloc[:3])

Loading master features...
Master shape: (2833180, 52)


Unnamed: 0,i_date,page_serial,referrer_hash,view_coupon_id_hash,user_id_hash,session_id_hash,purchaseid_hash,reg_date,sex_id,age,...,discount_value,validity_duration_days,user_age_days,user_total_views,user_unique_views,user_total_purchases,user_conversion_rate,user_preferred_genre,user_max_views_same_coupon,avg_time_between_visits
0,2012-03-28 14:15:00,7,7d3892e54acb559ae36c459978489330,34c48f84026e08355dc3bd19b427f09a,d9dca3cb44bab12ba313eaa681f663eb,673af822615593249e7c6a9a1a6bbb1a,Unknown,2012-03-28 14:14:18,f,25,...,5625.0,0.0,0,13,6,5,0.384615,宅配,5,265.833333
1,2012-03-28 14:17:28,9,7d3892e54acb559ae36c459978489330,34c48f84026e08355dc3bd19b427f09a,d9dca3cb44bab12ba313eaa681f663eb,673af822615593249e7c6a9a1a6bbb1a,Unknown,2012-03-28 14:14:18,f,25,...,5625.0,0.0,0,13,6,5,0.384615,宅配,5,265.833333
2,2012-03-28 14:20:05,16,7d3892e54acb559ae36c459978489330,17c450c3b470c045d35ec22b02daa690,d9dca3cb44bab12ba313eaa681f663eb,673af822615593249e7c6a9a1a6bbb1a,Unknown,2012-03-28 14:14:18,f,25,...,3000.0,0.0,0,13,6,5,0.384615,宅配,5,265.833333


In [6]:
# Build the user and item feature stores from the master feature table.
# Rows whose catalog price is not strictly above this value are dropped from
# both stores.
threshold_price = 50.0

# 1. User Feature Store (one row per user - LATEST state)
print("Building User Feature Store...")
if 'i_date' in df.columns:
    # sort_values places NaT last by default, which would turn an undated row
    # into a user's "latest" row; na_position='first' keeps dated rows at the
    # tail. mergesort is stable, so events sharing a timestamp keep their
    # original order and the selected row is deterministic.
    latest_user = (
        df.sort_values('i_date', kind='mergesort', na_position='first')
        .groupby('user_id_hash', as_index=False)
        .tail(1)
    )
else:
    # Without a timestamp the last row in file order stands in for "latest".
    latest_user = df.groupby('user_id_hash', as_index=False).tail(1)

# Filter bad prices.
# NOTE(review): this runs after the latest row is chosen, so a user whose
# latest row fails the price check is dropped from the store entirely rather
# than falling back to an earlier row - confirm that is intended.
latest_user = latest_user[latest_user['catalog_price'] > threshold_price]

# Save User Store
os.makedirs(DATA_DIR, exist_ok=True)
latest_user.to_csv(USER_FEATURE_PATH, index=False)
print(f"Saved User Store ({len(latest_user)} rows) to: {USER_FEATURE_PATH}")

# 2. Item Feature Store (one row per coupon)
print("Building Item Feature Store...")

# Prefer the known identifier names, in order; otherwise fall back to the
# first column whose name contains both 'coupon' and 'id'.
item_id_col = next(
    (name for name in ('view_coupon_id_hash', 'coupon_id_hash') if name in df.columns),
    None,
)
if item_id_col is None:
    candidates = [c for c in df.columns if 'coupon' in c and 'id' in c]
    if not candidates:
        raise KeyError("Could not find a Coupon ID column (checked 'view_coupon_id_hash' and 'coupon_id_hash').")
    item_id_col = candidates[0]

print(f"Using '{item_id_col}' as the Item ID.")

# Attributes wanted for each item; only those present in the table are kept.
item_atts = ['catalog_price', 'genre_name', 'capsule_text',
             'large_area_name', 'ken_name', 'validperiod', 'dispperiod']
item_atts = [c for c in item_atts if c in df.columns]

# First value per coupon for each attribute (these are treated as static).
# Selecting the attribute columns before aggregating avoids computing first()
# over every column of the master table.
item_store = df.groupby(item_id_col, as_index=False)[item_atts].first()

# Rename to standard 'coupon_id_hash' for the API
item_store = item_store.rename(columns={item_id_col: 'coupon_id_hash'})

# Filter bad prices
item_store = item_store[item_store['catalog_price'] > threshold_price]

item_store.to_csv(ITEM_FEATURE_PATH, index=False)
print(f"Saved Item Store ({len(item_store)} items) to: {ITEM_FEATURE_PATH}")

Building User Feature Store...
Saved User Store (20068 rows) to: C:\Users\017413903\Desktop\Graduation project\data\user_latest_features.csv
Building Item Feature Store...
Using 'view_coupon_id_hash' as the Item ID.
Saved Item Store (19404 items) to: C:\Users\017413903\Desktop\Graduation project\data\item_features.csv


In [7]:
# Restore the fitted pipeline from disk.
# joblib.load unpickles the file, so it must only be pointed at trusted artifacts.
print(f"Loading pipeline: {PIPE_PATH}")
pipeline = joblib.load(PIPE_PATH)
print(f"Pipeline loaded. Type: {type(pipeline)}")

# Smoke test: score the first stored user row with the identifier column
# removed (a missing identifier column is tolerated).
sample_row = pd.read_csv(USER_FEATURE_PATH, nrows=1).drop(
    columns=['user_id_hash'], errors='ignore'
)

# Best-effort check: a failure is reported but does not stop the notebook.
try:
    probs = pipeline.predict_proba(sample_row)[:, 1]
    print(f"Sample predicted prob: {float(probs[0])}")
except Exception as e:
    print("Error testing pipeline:", e)

Loading pipeline: C:\Users\017413903\Desktop\Graduation project\models\ponpare_v2_pipeline.joblib
Pipeline loaded. Type: <class 'sklearn.pipeline.Pipeline'>
Sample predicted prob: 0.9202265739440918


In [8]:
# Optional SHAP explainer. Construction is switched off here, so `explainer`
# stays None and the bundle is produced without one.
build_shap, explainer = False, None
if not build_shap:
    print("Skipping SHAP explainer creation.")

Skipping SHAP explainer creation.


In [9]:
# Create deployment bundle
# feature_columns is the user store's header, in file order, minus the user
# identifier. Only the header is read (nrows=0).
# NOTE(review): this is every stored column, not necessarily the exact set the
# pipeline was trained on - confirm against the training notebook.
feature_columns = [
    col
    for col in pd.read_csv(USER_FEATURE_PATH, nrows=0).columns
    if col != 'user_id_hash'
]

bundle = {
    'pipeline': pipeline,
    'feature_columns': feature_columns,
    # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated since
    # Python 3.12 and returns a naive value that carries no offset.
    'created_at': datetime.now(timezone.utc).isoformat(),
    'notes': 'Deployment bundle created by 06-Deploy_Model notebook'
}

# Test for presence explicitly instead of relying on the object's truthiness.
if explainer is not None:
    bundle['shap_explainer'] = explainer

os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(bundle, OUT_BUNDLE, compress=3)
print("Saved deployment bundle to:", OUT_BUNDLE)

Saved deployment bundle to: C:\Users\017413903\Desktop\Graduation project\models\deployment_bundle.pkl


In [10]:
# Local recommendation helpers
# Assumed margin, as a fraction of catalog price; read by compute_financials_local.
ASSUMED_MARGIN_PCT = 0.50

def simulate_discount_response_local(pipeline, base_row, feature_cols, price_col='price_rate', price_vals=None):
    """Score one base row under a grid of discount percentages.

    For each discount a copy of ``base_row`` is adjusted, the copies are
    stacked into a DataFrame, and ``pipeline.predict_proba`` is called on the
    ``feature_cols`` columns of that frame.

    Parameters
    ----------
    pipeline : object
        Fitted estimator exposing ``predict_proba``; column 1 of its result is
        stored as the predicted probability.
    base_row : pandas.Series
        Feature values of the row to perturb. It is copied, never modified.
    feature_cols : list of str
        Columns handed to the pipeline, in this order. Columns absent from the
        simulated frame are added and filled with 0 so prediction can proceed.
    price_col : str, default 'price_rate'
        Column overwritten with ``100 - discount`` when present in ``base_row``.
    price_vals : iterable of numbers, optional
        Discount percentages to test; defaults to 0, 10, ..., 100. Any
        iterable is accepted, including one-shot generators.

    Returns
    -------
    pandas.DataFrame
        One row per discount, holding the adjusted features plus
        ``pred_proba`` and ``sim_price_rate``. Despite its name,
        ``sim_price_rate`` holds the discount percentage that was tested.
        Every row keeps ``base_row``'s label, so the index is not unique.

    Raises
    ------
    ValueError
        If ``price_vals`` yields no values.
    """
    # Materialise the grid once: it is iterated here and assigned as a column
    # below, which a generator (or an index-aligned Series) cannot survive.
    discounts = list(range(0, 101, 10)) if price_vals is None else list(price_vals)
    if not discounts:
        raise ValueError("price_vals must contain at least one discount percentage")

    rows = []
    for discount_pct in discounts:
        row = base_row.copy()

        # 1. Discount % -> price rate.
        # NOTE(review): this treats price_col as the percentage of the catalog
        # price still paid. If the training data stores price_rate as percent
        # off instead, this mapping is inverted - confirm against the training
        # features.
        if price_col in row.index:
            row[price_col] = 100 - discount_pct

        # 2. Keep the price-derived features consistent with the discount.
        if 'catalog_price' in row.index:
            cat_price = float(row['catalog_price'])
            row['discount_price'] = cat_price * (1 - discount_pct / 100.0)
            row['discount_value'] = cat_price - row['discount_price']

        rows.append(row)

    df_sim = pd.DataFrame(rows)

    # Zero-fill any expected column the base row did not carry.
    for col in feature_cols:
        if col not in df_sim.columns:
            df_sim[col] = 0

    # Predict
    df_sim['pred_proba'] = pipeline.predict_proba(df_sim[feature_cols])[:, 1]
    df_sim['sim_price_rate'] = discounts

    return df_sim

# Helper: Compute Financials
def compute_financials_local(df_sim, catalog_price, margin_pct=None):
    """Add revenue, profit and expected-profit columns to a simulation frame.

    Parameters
    ----------
    df_sim : pandas.DataFrame
        Simulation result with ``sim_price_rate`` (the discount percentage
        applied to each row) and ``pred_proba``. The frame is modified in
        place and also returned.
    catalog_price : float
        Undiscounted price of the item.
    margin_pct : float, optional
        Margin as a fraction of ``catalog_price``. Defaults to the module-level
        ``ASSUMED_MARGIN_PCT``.

    Returns
    -------
    pandas.DataFrame
        ``df_sim`` with three added columns: ``revenue_per_unit`` (catalog
        price after the discount), ``profit_per_unit`` (revenue minus cost of
        goods) and ``expected_profit`` (profit weighted by ``pred_proba``).
    """
    if margin_pct is None:
        margin_pct = ASSUMED_MARGIN_PCT

    df_sim['revenue_per_unit'] = catalog_price * (1 - df_sim['sim_price_rate'] / 100.0)

    # Cost is the part of the catalog price that is NOT margin. Multiplying by
    # the margin itself gives the same number only when the margin is 0.5.
    cost_of_goods = catalog_price * (1 - margin_pct)

    df_sim['profit_per_unit'] = df_sim['revenue_per_unit'] - cost_of_goods
    df_sim['expected_profit'] = df_sim['pred_proba'] * df_sim['profit_per_unit']
    return df_sim

In [12]:
# Smoke-test the local recommendation flow on the first stored user.
uf = pd.read_csv(USER_FEATURE_PATH, low_memory=False)
base_row = uf.iloc[0]
sample_user_id = uf['user_id_hash'].iat[0]
feature_cols = bundle['feature_columns']

print(f"Testing recommendation for User: {sample_user_id}")

# Simulate the discount grid, attach the financial columns, then renumber the
# rows: every simulated row carries the base row's label, and the styled
# display below presumably needs unique index labels.
sim_result = compute_financials_local(
    simulate_discount_response_local(pipeline, base_row, feature_cols),
    float(base_row['catalog_price']),
).reset_index(drop=True)

# The recommendation is the row with the highest expected profit.
best = sim_result.nlargest(1, 'expected_profit').iloc[0]

print("\n--- Recommendation ---")
print(f"Optimal Discount: {int(best['sim_price_rate'])}%")
print(f"Expected Profit: {best['expected_profit']:.2f}")
print(f"Probability: {best['pred_proba']:.4f}")

# Show the whole grid, shading the expected-profit column.
display(
    sim_result.loc[:, ['sim_price_rate', 'pred_proba', 'expected_profit']]
    .style.background_gradient(subset=['expected_profit'])
)

Testing recommendation for User: 425e53423715b1d6bd2e6fca0d965a8c

--- Recommendation ---
Optimal Discount: 0%
Expected Profit: 864.92
Probability: 0.8046


Unnamed: 0,sim_price_rate,pred_proba,expected_profit
0,0,0.804578,864.921549
1,10,0.796955,685.381547
2,20,0.884626,570.583675
3,30,0.921147,396.093359
4,40,0.91885,197.552687
5,50,0.917551,0.0
6,60,0.925078,-198.891726
7,70,0.91718,-394.387401
8,80,0.888711,-573.218426
9,90,0.766989,-659.610277
