# Testing Recommender Functions

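This notebook optionally installs the required dependencies (the install commands in the first cell are commented out) and tests the recommender functions defined in your modules (e.g. `utils.py` and `recommender.py`). It loads the models via the `get_models()` function and then exercises the bagging, boosting, stacking, and hybrid ensemble methods on a single sample.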

In [None]:
# Install required dependencies
#!pip install --upgrade pip
#!pip install numpy scikit-learn tensorflow keras fastapi torch transformers

# If your project has a requirements.txt file, you can also use:
# !pip install --no-cache-dir -r requirements.txt

In [1]:
# Third-party dependency
import numpy as np

# Project-local modules: `utils` and `recommender` must be importable
# (i.e. on PYTHONPATH or in the notebook's working directory).
from utils import get_models
from recommender import (
    ensemble_bagging,
    ensemble_boosting,
    ensemble_stacking,
    hybrid_ensemble,
    tokenize_input,
    train_stacking_meta_model,
)

# This notebook assumes get_models() returns a dictionary of models
# (one per cluster: 0, 1, 2, ...).
print("Modules imported successfully.")

2025-03-05 13:55:39.755834: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-05 13:55:39.821568: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741182939.854646    3586 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741182939.861986    3586 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-05 13:55:39.915407: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Modules imported successfully.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


History tensor shape: (1, 50, 30)
Candidate tensor shape: (1, 30)


2025-03-05 13:56:00.307815: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [3]:
import os
import pandas as pd
import numpy as np
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
import sys
import tensorflow as tf

# Alias the standalone `keras.*` module paths to their `tf.keras` counterparts.
# Presumably tokenizer.pkl was pickled against standalone Keras, so unpickling
# resolves these module names through sys.modules -- TODO confirm.
sys.modules["keras.preprocessing.text"] = tf.keras.preprocessing.text
sys.modules["keras.preprocessing.sequence"] = tf.keras.preprocessing.sequence
sys.modules["keras.utils"] = tf.keras.utils

news_file = "news.tsv"
behaviors_file = "behaviors.tsv"
data_dir = 'dataset/train/'  # Adjust path as necessary
valid_data_dir = 'dataset/valid/'  # Adjust path as necessary (not read in this cell)
news_path = os.path.join(data_dir, news_file)
behaviors_path = os.path.join(data_dir, behaviors_file)

# Set maximum lengths (should match your model settings)
max_history_length = 50
max_title_length = 30

# Load the pre-saved tokenizer (assumes you already created and saved it).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load a tokenizer.pkl that you produced yourself or otherwise trust.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Load the news and behaviors tables from `data_dir`. The files have no header
# row, so column names are supplied explicitly.
# news.tsv columns: NewsID, Category, SubCategory, Title, Abstract, URL,
# TitleEntities, AbstractEntities
news_df = pd.read_csv(
    news_path,
    sep='\t',
    names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL',
           'TitleEntities', 'AbstractEntities'],
)
# behaviors.tsv columns: ImpressionID, UserID, Time, HistoryText, Impressions
behaviors_df = pd.read_csv(
    behaviors_path,
    sep='\t',
    names=['ImpressionID', 'UserID', 'Time', 'HistoryText', 'Impressions'],
)

# Map NewsID -> Title. Missing titles are stored as "" rather than NaN so that
# every value handed to the tokenizer later is a string.
news_dict = dict(zip(news_df['NewsID'], news_df['Title'].fillna("")))

# Select one sample (the first row) from the loaded behaviors
sample = behaviors_df.iloc[0]

# Process history: HistoryText is a space-separated string of NewsIDs
# (NaN when the user has no recorded history).
history_text = sample['HistoryText']
history_ids = history_text.split() if pd.notna(history_text) else []

# Retrieve the title for each news ID in the history (empty string if unknown)
history_titles = [news_dict.get(nid, "") for nid in history_ids]

# Tokenize the titles and pad/truncate each one to max_title_length tokens
history_sequences = tokenizer.texts_to_sequences(history_titles)
history_padded = pad_sequences(history_sequences, maxlen=max_title_length,
                               padding='post', truncating='post', value=0)

# Force exactly max_history_length rows.
n_missing_rows = max_history_length - history_padded.shape[0]
if n_missing_rows > 0:
    # Too few items: prepend all-zero rows. Reuse the dtype of the padded
    # history so np.vstack does not silently upcast the array.
    pad_rows = np.zeros((n_missing_rows, max_title_length), dtype=history_padded.dtype)
    history_padded = np.vstack([pad_rows, history_padded])
else:
    # Enough (or too many) items: keep the last max_history_length of them
    history_padded = history_padded[-max_history_length:]

assert history_padded.shape == (max_history_length, max_title_length)

# Process candidate: "Impressions" is a space-separated list such as
# "newsID-label newsID-label ...". Fail early with a clear message when the
# sample has no impressions instead of an opaque AttributeError/IndexError.
impressions = sample['Impressions']
candidate_entries = impressions.split() if pd.notna(impressions) else []
if not candidate_entries:
    raise ValueError("Selected sample has no impressions; cannot pick a candidate article.")
first_candidate = candidate_entries[0]  # take the first candidate
candidate_news_id = first_candidate.split('-')[0]
candidate_title = news_dict.get(candidate_news_id, "")
candidate_sequence = tokenizer.texts_to_sequences([candidate_title])
candidate_padded = pad_sequences(candidate_sequence, maxlen=max_title_length,
                                 padding='post', truncating='post', value=0)[0]

# Convert to TensorFlow tensors, adding a leading batch dimension of 1
history_tensor = tf.convert_to_tensor([history_padded], dtype=tf.int32)  # shape: (1, max_history_length, max_title_length)
candidate_tensor = tf.convert_to_tensor([candidate_padded], dtype=tf.int32)  # shape: (1, max_title_length)

print("History tensor shape:", history_tensor.shape)
print("Candidate tensor shape:", candidate_tensor.shape)

# Fetch the ensemble's base models through get_models()
print("Loading models...")
models_dict = get_models()
print("Models loaded:", models_dict.keys())

# Every ensemble helper below receives the same leading positional arguments,
# so bundle them once and unpack at each call site.
_ensemble_inputs = (history_tensor, candidate_tensor, models_dict)

# --- Bagging ---
bagging_pred = ensemble_bagging(*_ensemble_inputs)
print("Ensemble Bagging Prediction:", bagging_pred)

# --- Boosting (placeholder error values) ---
# Three values are supplied; presumably one per base model -- TODO confirm
# against ensemble_boosting in recommender.py.
dummy_errors = np.array([0.2, 0.15, 0.25])
boosting_pred = ensemble_boosting(*_ensemble_inputs, dummy_errors)
print("Ensemble Boosting Prediction:", boosting_pred)

# --- Stacking (placeholder training data for the meta model) ---
# Four rows of three scores each, paired with four labels.
X_train_dummy = np.array([[0.80, 0.75, 0.85],
                          [0.55, 0.60, 0.50],
                          [0.30, 0.35, 0.25],
                          [0.20, 0.25, 0.15]])
y_train_dummy = np.array([1, 0, 1, 0])
meta_model = train_stacking_meta_model(X_train_dummy, y_train_dummy)
stacking_pred = ensemble_stacking(*_ensemble_inputs, meta_model)
print("Ensemble Stacking Prediction:", stacking_pred)

# --- Hybrid (takes both the error values and the trained meta model) ---
hybrid_pred = hybrid_ensemble(*_ensemble_inputs, dummy_errors, meta_model)
print("Hybrid Ensemble Prediction:", hybrid_pred)

Loading models...
Loaded news data:
   NewsID   Category               SubCategory  \
0  N88753  lifestyle           lifestyleroyals   
1  N45436       news  newsscienceandtechnology   
2  N23144     health                weightloss   
3  N86255     health                   medical   
4  N93187       news                 newsworld   

                                               Title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1    Walmart Slashes Prices on Last-Generation iPads   
2                      50 Worst Habits For Belly Fat   
3  Dispose of unwanted prescription drugs during ...   
4  The Cost of Trump's Aid Freeze in the Trenches...   

                                            Abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  Apple's new iPad releases bring big deals on l...   
2  These seemingly harmless habits are holding yo...   
3                                                NaN   
4  Lt. Ivan Molchanets peeked over a parapet o

  saveable.load_own_variables(weights_store.get(inner_path))



Loading model for Cluster 1 from fastformer_cluster_1_full_balanced_1_epoch.keras
.cache
.ipynb_checkpoints
backend copy 2.py
backend copy.py
backend-flask-unused.py
backend.py
data
dataset
Dockerfile
downloads
fastapi copy.py
fastapi2.py
fastformer.json
fastformer_clusters.ipynb
fastformer_cluster_0_full_balanced_1_epoch.h5
fastformer_cluster_0_full_balanced_1_epoch.hdf5
fastformer_cluster_0_full_balanced_1_epoch.json
fastformer_cluster_0_full_balanced_1_epoch.keras
fastformer_cluster_0_full_balanced_1_epoch.weights.h5
fastformer_cluster_1_full_balanced_1_epoch.keras
fastformer_cluster_2_full_balanced_1_epoch.keras
fastformer_model.py
gdrive.py
models
models.py
recommender.py
requirements.txt
test_recommender.ipynb
test_recommender.py
tokenizer.pkl
upload_to_hf.py
user_category_profiles.pkl
utils.py
__pycache__
2.18.0
3.8.0

Loading model for Cluster 2 from fastformer_cluster_2_full_balanced_1_epoch.keras
.cache
.ipynb_checkpoints
backend copy 2.py
backend copy.py
backend-flask-unuse

I0000 00:00:1741183042.339769    3726 service.cc:148] XLA service 0x7fbd2800a430 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1741183042.340942    3726 service.cc:156]   StreamExecutor device (0): Host, Default Version
2025-03-05 13:57:22.603298: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step


I0000 00:00:1741183045.569925    3726 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
Ensemble Bagging Prediction: [0.5459661]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Ensemble Boosting Prediction: [0.55218655]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Ensemble Stacking Prediction: [0.50922481]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m 

### Next Steps

You can now develop and test your recommendation functions independently of the FastAPI backend. 

For further debugging, you might want to add additional print statements or assertions within your recommender functions.