# Notebook 03: Feature Engineering for Hybrid Model
This notebook merges the predictions of the collaborative-filtering (CF) and content-based-filtering (CBF) models and adds user and item metadata, producing the feature set used to train a hybrid stacked model.

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder

## Load Predictions from CF and CBF

In [None]:
# Read the CF and CBF prediction files.
cf_df = pd.read_csv("models/cf_predictions.csv")
cbf_df = pd.read_csv("models/cbf_predictions.csv")

# Inner join: keep only the (userID, itemID) pairs present in both files.
merged_df = cf_df.merge(cbf_df, how="inner", on=["userID", "itemID"])

## Load User and Item Metadata

In [None]:
# Load user info. The file is pipe-delimited and the column names are
# supplied explicitly through `names`.
user_df = pd.read_csv("u_user.csv", sep="|", names=["userID", "age", "gender", "occupation", "zip_code"])

# Load item info. Every column from position 5 onward is treated as a genre column.
item_df = pd.read_csv("u_item.csv", encoding="latin-1")
genre_cols = item_df.columns[5:]

# Keep only the id and genre columns. The rename is deliberately NOT in-place:
# it returns a fresh DataFrame, so the column assignment below no longer
# writes into a column-subset of the frame read from disk (which makes
# pandas emit SettingWithCopyWarning).
item_df = item_df[["movie_id"] + list(genre_cols)].rename(columns={"movie_id": "itemID"})

# Ensure both join keys (itemID and userID) are int for the merges
item_df["itemID"] = item_df["itemID"].astype(int)
user_df["userID"] = user_df["userID"].astype(int)

## Merge Metadata with Prediction Data

In [None]:
# Attach user attributes, then item attributes, to every prediction row.
# Left joins keep all rows of merged_df even when no metadata matches.
df = (
    merged_df
    .merge(user_df, how="left", on="userID")
    .merge(item_df, how="left", on="itemID")
)

## One-Hot Encode Categorical Features

In [None]:
# One-hot encode gender and occupation
categorical_cols = ["gender", "occupation"]
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept `sparse`.
    encoder = OneHotEncoder(sparse=False)
encoded = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols))

# Combine all features. encoded_df is built with a fresh 0..n-1 index, so both
# slices of df are reset to the same positional index before concatenating;
# that way rows line up by position regardless of df's own index.
features_df = pd.concat(
    [
        df[["cf_pred", "cbf_pred"]].reset_index(drop=True),
        encoded_df,
        df[genre_cols].reset_index(drop=True),
    ],
    axis=1,
)

## Add True Ratings (Labels)

In [None]:
from sklearn.model_selection import train_test_split

# Read the ratings file and keep just the two join keys and the label,
# each cast to int and renamed to match the userID/itemID convention.
ratings_df = pd.read_csv("u_data.csv", sep=",", header=0)
ratings_df = pd.DataFrame(
    {
        "userID": ratings_df["user_id"].astype(int),
        "itemID": ratings_df["item_id"].astype(int),
        "rating": ratings_df["rating"].astype(int),
    }
)

# Hold out 20% of the ratings; only the held-out part is used here.
# NOTE(review): presumably this has to reproduce the split used when the
# CF/CBF predictions were generated (same seed, same row order) - confirm.
test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)[1]

# Look up the true rating for every (userID, itemID) row of df; pairs that
# are not in the held-out set get NaN.
# NOTE(review): the assignment below aligns on the row index, so it assumes
# test_df has no duplicate (userID, itemID) pairs - verify.
labels_df = df[["userID", "itemID"]].merge(test_df, how="left", on=["userID", "itemID"])
features_df["rating"] = labels_df["rating"]

# Persist the assembled feature table (features plus label column).
features_df.to_csv("models/meta_features.csv", index=False)
print("Saved hybrid feature set to models/meta_features.csv")