In [1]:
import pandas as pd
import numpy as np

from src.preprocessing import load_data, calculate_rank, filter_valid_stays, discretize_features, train_test_split_by_stay
from src.model import IVVModel
from src.evaluation import calculate_ndcg, calculate_map

In [2]:
# 1. Load data
# Source file, given relative to the directory the notebook is run from.
# The whole file is assumed to fit in memory.
filepath = "data/data_01.csv"

print("Loading data...")
df = load_data(filepath)
print(f"Data loaded: {len(df)} rows.")

Loading data...
Data loaded: 838 rows.


In [3]:
# 2. Preprocessing
# Runs the preprocessing helpers in sequence, rebinding `df` after each step.
# NOTE(review): because `df` is overwritten, re-running this cell on its own
# feeds the already-processed frame back in -- re-run from the load cell to
# start from the raw data.
print("Preprocessing...")
# Only compute ranks when the loaded data has no 'rank' column yet.
if 'rank' not in df.columns:
    print("Calculating rank...")
    df = calculate_rank(df)

# Keep only what filter_valid_stays returns (criteria are defined in
# src.preprocessing); the row count is printed as a quick check.
print("Filtering valid stays...")
df = filter_valid_stays(df)
print(f"Valid stays: {len(df)} rows.")

# Discretization is applied last, to the filtered frame.
print("Discretizing features...")
df = discretize_features(df)

Preprocessing...
Calculating rank...
Filtering valid stays...
Valid stays: 307 rows.
Discretizing features...


In [4]:
# 3. Train/test split
# Value handed to the splitter as `test_ratio`.
TEST_RATIO = 0.2

print("Splitting data...")
train_df, test_df = train_test_split_by_stay(df, test_ratio=TEST_RATIO)
print(f"Train set: {len(train_df)} rows, Test set: {len(test_df)} rows.")
# NOTE(review): no seed is passed to the splitter -- confirm whether the split
# is deterministic across kernel restarts.

Splitting data...
Train set: 262 rows, Test set: 45 rows.


In [5]:
# 4. Model training
# Tunable settings gathered in one place so they are easy to find and change.
LEARNING_RATE = 0.001
N_ITERATIONS = 100
NDCG_CUTOFFS = [1, 5, 10]

print("Training model...")
model = IVVModel()
model.train(
    train_df,
    learning_rate=LEARNING_RATE,
    iterations=N_ITERATIONS,
    verbose=True,
)
# NOTE(review): in the recorded run the logged log-likelihood was still rising
# at the final iteration -- confirm these settings are enough to converge.

# 5. Evaluation
print("\nEvaluating on Test Set...")
# Score the held-out rows first; both metrics are computed from that result.
test_df_pred = model.predict_proba(test_df)

ndcg = calculate_ndcg(test_df_pred, k_list=NDCG_CUTOFFS)
map_score = calculate_map(test_df_pred)

print("Evaluation Results:")
print(f"MAP: {map_score:.4f}")
for k, score in ndcg.items():
    print(f"NDCG@{k}: {score:.4f}")

# Repeat the MAP computation on the training rows as a sanity check.
print("\n(Sanity Check) Train Set Metrics:")
train_df_pred = model.predict_proba(train_df)
train_map = calculate_map(train_df_pred)
print(f"Train MAP: {train_map:.4f}")

Training model...
Iteration 0: LL = -42.3021
Iteration 10: LL = -41.9917
Iteration 20: LL = -41.6868
Iteration 30: LL = -41.3872
Iteration 40: LL = -41.0928
Iteration 50: LL = -40.8036
Iteration 60: LL = -40.5193
Iteration 70: LL = -40.2399
Iteration 80: LL = -39.9654
Iteration 90: LL = -39.6956
Iteration 99: LL = -39.4567

Evaluating on Test Set...
Evaluation Results:
MAP: 0.5655
NDCG@1: 0.3333
NDCG@5: 0.6154
NDCG@10: 0.6710

(Sanity Check) Train Set Metrics:
Train MAP: 0.8210
