In [2]:
import math
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import joblib
import warnings
# --- New imports for ONNX conversion ---
import onnxmltools
from onnxmltools import convert_lightgbm
from onnxmltools.convert.common.data_types import FloatTensorType
import onnxruntime as rt
# ------------------------------------

# Silence all library warnings for the rest of the run.
warnings.filterwarnings("ignore")

# ----------------- CONFIG -----------------
# Survey export to train on (CSV or Excel, chosen by extension when loading).
DATA_PATH = "../data/Kiosk_DOOH_Ads_300.csv"
# Fraction of rows held out by the train/test split, and the seed shared by
# the split and the model.
TEST_SIZE = 0.20
RANDOM_STATE = 42
# Target classes seen fewer than this many times are merged into 'other'.
MIN_CLASS_FREQ = 2
# Keyword arguments passed straight to lgb.LGBMClassifier.
LGBM_PARAMS = dict(
    objective="multiclass",
    n_estimators=400,
    num_leaves=64,
    random_state=RANDOM_STATE,
    verbosity=-1,
)
TOP_K = 5
# Destination of the joblib bundle (model + fitted preprocessors).
ARTIFACT_PATH = "recommender_artifacts_singular.joblib"
# ------------------------------------------

# --------- 1) Load dataset (CSV or Excel) ----------
# The reader is chosen from the file extension; anything that is not
# .xls/.xlsx is parsed as CSV.
is_excel = DATA_PATH.lower().endswith((".xls", ".xlsx"))
df = pd.read_excel(DATA_PATH) if is_excel else pd.read_csv(DATA_PATH)
print("Loaded dataset:", df.shape)

# ---------- 2) Defensive cleanup: drop obvious ID-like columns ----------
# Only the columns that actually exist are dropped, so a file without them
# passes through unchanged.
id_like_present = [name for name in ("Timestamp", "ID", "Index") if name in df.columns]
df = df.drop(columns=id_like_present, errors='ignore')
print("After dropping raw timestamp/IDs (if present):", df.shape)

# ---------- 3) Detect ad-ranking columns & build LikedTopic ----------
ad_prefix = "What kinds of advertisements do you find most engaging? (1-5 in order of decreasing preference)"

# Preferred match: columns whose name starts with the exact survey question.
ad_cols = [name for name in df.columns if name.startswith(ad_prefix)]

# Fallback: a looser, case-insensitive keyword match on the column name.
if len(ad_cols) == 0:
    ad_cols = [
        name for name in df.columns
        if any(marker in name.lower() for marker in ("most engaging", "1-5 in order"))
    ]

# Without any ranking columns there is no target to build, so stop here.
if len(ad_cols) == 0:
    raise RuntimeError("Could not detect ad-ranking columns. Please verify your file has the expected ad columns.")

def extract_label(col):
    """Return the text between the first '[' and the last ']' of ``col``.

    The extracted text has surrounding whitespace stripped. A name that is
    missing either bracket is returned unchanged.
    """
    if '[' not in col or ']' not in col:
        return col
    start = col.index('[') + 1
    stop = col.rindex(']')
    return col[start:stop].strip()

# One display label per detected ad column, in the same order as ad_cols.
ad_labels = list(map(extract_label, ad_cols))
print(f"Detected ad categories (count={len(ad_cols)}):")

def get_top_ad(row):
    """Return the label of the first ad column this row ranked 1, else None.

    Columns are checked in ``ad_cols`` order, paired with ``ad_labels``.
    Missing cells are ignored; any other cell is converted with ``float``
    before being compared with 1.0.
    """
    for column, topic in zip(ad_cols, ad_labels):
        rank = row.get(column)
        if not pd.isna(rank) and float(rank) == 1.0:
            return topic
    return None

# Label every respondent with their rank-1 ad category, then keep only the
# rows that received a label and renumber the index from zero.
top_ads = df.apply(get_top_ad, axis=1)
df['LikedTopic'] = top_ads
df = df.dropna(subset=['LikedTopic'])
df = df.reset_index(drop=True)

# ---------- 4) Detect and process apparel columns ----------
# A single pass over the columns; the three tests are independent, so one
# column may land in more than one list.
upper_cols, lower_cols, accessory_cols = [], [], []
for name in df.columns:
    if 'Upper' in name and '[' in name:
        upper_cols.append(name)
    if 'Lower' in name and '[' in name:
        lower_cols.append(name)
    if 'accessori' in name.lower():
        accessory_cols.append(name)

def pick_choice_onehot(row, cols):
    """Return the bracket label of the first column in ``cols`` whose cell is 1.

    Cells are read with ``row.get``; missing cells are skipped and every
    other cell is converted with ``float`` before the comparison. Returns
    None when no column qualifies.
    """
    for name in cols:
        cell = row.get(name)
        if not pd.isna(cell) and float(cell) == 1.0:
            return extract_label(name)
    return None

def extract_accessory(row, acc_cols):
    """Return the first accessory named in the row's accessory columns.

    Scans ``acc_cols`` in order. For the first column that is present in
    ``row.index`` and whose cell is neither missing nor blank, returns the
    text before the first ';' with surrounding whitespace removed. Returns
    None when no column holds usable text.

    NOTE(review): only the first ';'-separated entry of a cell is kept; any
    further accessories listed in the same cell are discarded.
    NOTE(review): numeric cells are stringified as-is (e.g. 0 becomes "0");
    confirm the accessory columns hold free text rather than 0/1 flags.
    """
    for name in acc_cols:
        if name not in row.index:
            continue
        cell = row[name]
        if pd.isna(cell):
            continue
        text = str(cell).strip()
        if text:
            return text.split(';')[0].strip()
    # Every accessory cell was missing or blank. The former one-hot fallback
    # could only yield None at this point, or raise ValueError from
    # float('') on a blank cell, so None is returned directly.
    return None

# One categorical choice per clothing slot. A slot with no matching survey
# columns is filled with None and receives its placeholder further down.
for slot, slot_cols in (('upper_choice', upper_cols), ('lower_choice', lower_cols)):
    if slot_cols:
        df[slot] = df.apply(pick_choice_onehot, axis=1, args=(slot_cols,))
    else:
        df[slot] = None
df['accessory_choice'] = df.apply(extract_accessory, axis=1, args=(accessory_cols,))

# Rows missing any of the core fields cannot be used for training.
df = df.dropna(subset=['Age','Gender','LikedTopic'])
df = df.reset_index(drop=True)

# Replace missing apparel choices with an explicit per-slot placeholder.
placeholders = {
    'upper_choice': 'unknown_upper',
    'lower_choice': 'unknown_lower',
    'accessory_choice': 'unknown_acc',
}
for slot, placeholder in placeholders.items():
    df[slot] = df[slot].fillna(placeholder)

# ---------- 5) Drop other high-cardinality columns ----------
# Object-typed columns other than the ones kept as model inputs/target.
protected = ['upper_choice','lower_choice','accessory_choice','LikedTopic','Gender']
obj_cols = []
for name in df.columns:
    if df[name].dtype == 'object' and name not in protected:
        obj_cols.append(name)

# Map each such column with more than 50 distinct values to its distinct count.
high_card = {}
for name in obj_cols:
    distinct = df[name].nunique()
    if distinct > 50:
        high_card[name] = distinct

if high_card:
    df = df.drop(columns=list(high_card))

# --- START of Altered Logic ---

# 6) Build feature matrix (X)
# Age is rescaled with a MinMaxScaler fitted on this dataset; the fitted
# scaler is kept so the same transform can be reapplied later.
age_scaler = MinMaxScaler()
df['age_norm'] = age_scaler.fit_transform(df[['Age']].astype(float))
# Gender is turned into integer codes by a LabelEncoder fitted on its string form.
# NOTE(review): the codes are fed to the model as one numeric column — confirm
# that an ordinal encoding is intended when Gender has more than two values.
gender_enc = LabelEncoder()
df['gender_enc'] = gender_enc.fit_transform(df['Gender'].astype(str))

# Create a single 'Apparel' column by combining all apparel choices.
# The three choices are joined with '|' and split on '|' again, giving a list
# per row.
# NOTE(review): a label that itself contains '|' would be split into several
# items by this round trip — verify no label does.
df['Apparel'] = df['upper_choice'].str.cat(df[['lower_choice', 'accessory_choice']], sep='|').str.split('|')

# Explode the DataFrame to have one row per apparel item
# (the exploded rows keep the index label of the row they came from).
df_apparel = df.explode('Apparel')
apparel_dummies = pd.get_dummies(df_apparel['Apparel'], prefix='apparel')

# Group back by the original index and sum to create the final encoded matrix:
# one row per respondent, one 'apparel_<item>' column per distinct item.
# Summing means an item that appears more than once for a respondent gets a
# count above 1 rather than a plain 0/1 flag.
apparel_features = apparel_dummies.groupby(level=0).sum()

# Reindex to ensure alignment with the main DataFrame; any index label absent
# from the grouped result becomes an all-zero row.
apparel_features = apparel_features.reindex(df.index, fill_value=0)

# Normalize the apparel features as a single block
def block_normalize(block_df):
    """Divide every entry of ``block_df`` by sqrt(number of columns).

    A frame with zero columns is returned as-is.
    """
    n_cols = block_df.shape[1]
    if n_cols == 0:
        return block_df
    scale = math.sqrt(n_cols)
    return block_df.divide(scale)

apparel_block = block_normalize(apparel_features)

# Put the two demographic columns and the apparel block side by side on a
# fresh 0..n-1 index; any gap left by the concat is filled with 0.
demographic_part = df[['age_norm','gender_enc']].reset_index(drop=True)
apparel_part = apparel_block.reset_index(drop=True)
X = pd.concat([demographic_part, apparel_part], axis=1).fillna(0)

# --- END of Altered Logic ---

# Build target (y)
y = df['LikedTopic'].astype(str).values
print("Final feature matrix shape:", X.shape)

# ---------- 7) Handle rare classes ----------
# Classes seen fewer than MIN_CLASS_FREQ times are relabelled 'other'.
vc = pd.Series(y).value_counts()
rare = vc[vc < MIN_CLASS_FREQ].index.tolist()
if rare:
    df['LikedTopic'] = df['LikedTopic'].map(
        lambda topic: 'other' if topic in rare else topic
    )
    y = df['LikedTopic'].astype(str).values

# Integer-encode the (possibly merged) class names; the fitted encoder is
# kept so predictions can be decoded back to names.
label_enc = LabelEncoder()
y_enc = label_enc.fit_transform(y)
print("Final number of classes:", len(label_enc.classes_))

# ---------- 8) Train/test split ----------
# Stratify only when every class has at least two rows; otherwise fall back
# to a plain random split.
smallest_class = min(Counter(y_enc).values())
stratify_param = None if smallest_class < 2 else y_enc
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=stratify_param,
)
print("Train/test shapes:", X_train.shape, X_test.shape)

# ---------- 9) Sample weights ----------
# Each training row is weighted by (training rows) / (rows of its class), so
# less frequent classes receive larger weights.
freq = Counter(y_train)
total = len(y_train)
class_sizes = np.array([freq[int(lbl)] for lbl in y_train])
sample_weight = total / class_sizes

# ---------- 10) Train LightGBM model ----------
# NOTE(review): early stopping is monitored on (X_test, y_test), the same
# split that is scored as "test accuracy" below — consider carving out a
# separate validation split so the reported accuracy stays unbiased.
model = lgb.LGBMClassifier(**LGBM_PARAMS)
cb = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=50),
]
fit_options = dict(
    sample_weight=sample_weight,
    eval_set=[(X_test, y_test)],
    eval_metric='multi_logloss',
    callbacks=cb,
)
model.fit(X_train, y_train, **fit_options)

# ---------- 11) Evaluation ----------
# Plain accuracy of the predicted class on the held-out rows.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nTest accuracy: {acc:.4f}")

# ---------- 12) Save artifacts ----------
# Bundle the model with every fitted preprocessor and the column layouts
# needed to rebuild a feature row at inference time.
artifacts = dict(
    model=model,
    age_scaler=age_scaler,
    gender_enc=gender_enc,
    label_enc=label_enc,
    apparel_cols=list(apparel_block.columns),
    feature_columns=list(X.columns),
)
joblib.dump(artifacts, ARTIFACT_PATH)
print(f"\nSaved artifacts to: {ARTIFACT_PATH}")
# ==================================================================
# Convert the trained model to ONNX
# ==================================================================
print("\n--- Starting ONNX Conversion ---")

# Input signature: one float tensor of shape [batch, n_features]; the batch
# dimension is left as None so any number of rows can be scored at once.
n_features = X.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

# Convert the fitted LightGBM classifier.
onnx_model = convert_lightgbm(model, initial_types=initial_type)

# Write the serialized ONNX graph to disk.
with open("../Models/ml_model/lightgbm_model_singular.onnx", "wb") as f:
    serialized = onnx_model.SerializeToString()
    f.write(serialized)

print("Model successfully converted and saved as 'lightgbm_model_singular.onnx'")

Loaded dataset: (300, 49)
After dropping raw timestamp/IDs (if present): (300, 48)
Detected ad categories (count=17):
Final feature matrix shape: (300, 32)
Final number of classes: 17
Train/test shapes: (240, 32) (60, 32)
Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_logloss: 3.92667
Early stopping, best iteration is:
[1]	valid_0's multi_logloss: 2.84517

Test accuracy: 0.0833

Saved artifacts to: recommender_artifacts_singular.joblib

--- Starting ONNX Conversion ---
Model successfully converted and saved as 'lightgbm_model_singular.onnx'


In [None]:
import math
import numpy as np
import pandas as pd
from collections import Counter
import warnings
import joblib


# --- Configuration for testing ---
ARTIFACT_PATH = "recommender_artifacts_singular.joblib"
# Location of the exported ONNX model. It used to be bound to the name
# `model`, which the artifact-loading step immediately rebinds to the
# estimator from the joblib bundle; a dedicated name keeps the path usable.
ONNX_MODEL_PATH = "../Models/ml_model/lightgbm_model_singular.onnx"
TOP_K = 5

# --- User Profile to Test ---
# NOTE: This is the user data you want to get a recommendation for.
user_age = 58
user_gender = "Female"
# List only the items actually worn and leave out placeholders such as
# "None" or "unknown_acc". Nothing is filtered here: an item is used only if
# a matching 'apparel_<item>' column exists among the trained features.
user_apparel = ["polo", "skirt", "spectacles", "hat"]

# --- 1. Load the saved artifacts ---
print("Loading recommender artifacts...")
try:
    # Only the load itself is guarded, so an unrelated error raised while
    # unpacking the bundle is not mistaken for a missing file.
    # NOTE: joblib.load unpickles the file — load artifacts only from a
    # trusted source.
    artifacts = joblib.load(ARTIFACT_PATH)
except FileNotFoundError:
    print(f"Error: The artifact file '{ARTIFACT_PATH}' was not found. Please ensure it has been created by the training script.")
    # Stop with a non-zero status. The bare exit() helper comes from the
    # site module (it is not a true builtin) and, called without an
    # argument, signals status 0.
    raise SystemExit(1) from None

model = artifacts["model"]
age_scaler = artifacts["age_scaler"]
gender_enc = artifacts["gender_enc"]
label_enc = artifacts["label_enc"]
apparel_cols = artifacts["apparel_cols"]
feature_columns = artifacts["feature_columns"]
print("Artifacts loaded successfully.")

# --- 2. Preprocess the user profile data ---
print("\nPreprocessing user profile...")

# Single-row frame holding the raw profile.
user_df = pd.DataFrame({'Age': [user_age], 'Gender': [user_gender]})

# Reuse the scaler and encoder fitted at training time; neither is re-fitted.
age_input = user_df[['Age']].astype(float)
user_df['age_norm'] = age_scaler.transform(age_input)

gender_input = user_df['Gender'].astype(str)
user_df['gender_enc'] = gender_enc.transform(gender_input)

# Build the one-hot apparel row: every trained apparel column is present,
# set to 1 when the user wears the matching item and 0 otherwise. Items with
# no matching trained column are ignored.
worn_columns = {f'apparel_{item}' for item in user_apparel}
apparel_features = {col: int(col in worn_columns) for col in apparel_cols}

# One-row DataFrame whose columns follow the order of apparel_cols.
apparel_df = pd.DataFrame([apparel_features])

# Apply the same block normalization as during training
def block_normalize(block_df):
    """Scale a feature block by 1/sqrt(column count).

    Returns ``block_df`` itself when it has no columns; otherwise returns
    the frame with every entry divided by the square root of its width.
    """
    width = block_df.shape[1]
    return block_df if width == 0 else block_df.divide(math.sqrt(width))

apparel_block = block_normalize(apparel_df)

# Join the demographic columns and the scaled apparel block into one row.
demographic_part = user_df[['age_norm', 'gender_enc']].reset_index(drop=True)
apparel_part = apparel_block.reset_index(drop=True)
user_features = pd.concat([demographic_part, apparel_part], axis=1)

# Force the exact column set and order recorded at training time; a column
# that is absent from the row is created and filled with 0.
user_features = user_features.reindex(columns=feature_columns, fill_value=0)

# --- 3. Make a prediction ---
print("\nMaking prediction...")
# Class-probability vector for the single user row.
probabilities = model.predict_proba(user_features)[0]

# --- 4. Get and display the top K recommendations ---
# Class indices ordered from most to least probable, cut to the first TOP_K.
top_k_indices = np.argsort(probabilities)[::-1][:TOP_K]

print(f"\nTop {TOP_K} Recommended Ad Topics:")
for rank, index in enumerate(top_k_indices, start=1):
    # Map the encoded class index back to its ad-topic name.
    ad_topic = label_enc.inverse_transform([index])[0]
    confidence = probabilities[index]
    print(f"  {rank}. {ad_topic}: {confidence:.4f} confidence")

Loading recommender artifacts...
Artifacts loaded successfully.

Preprocessing user profile...

Making prediction...


AttributeError: 'super' object has no attribute 'get_params'

 1. Health Products: 0.0798 confidence
  2. Finance/Investing: 0.0663 confidence
  3. Beauty & Skincare: 0.0662 confidence
  4. Jewelry: 0.0658 confidence
  5. Education & Careers: 0.0651 confidence

  (recommendations above were recorded for: age 69, female, polo, skirt, spectacles, hat)