# Book Recommendation Solution

**Final Score: 0.889 NDCG@20**

### Approach:
- CatBoost Ranker with PairLogitPairwise loss
- Up to 150 negatives per user, sampled from the 800 most-interacted-with books that the user has no interaction with
- Ensemble of 4 models (same settings, different random seeds) combined by averaging each model's per-user rank

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.metrics import ndcg_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load every input table from data/ in one pass; the names on the left are
# unpacked in the same order as the paths listed below.
train, targets, candidates, books, users, book_genres = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/targets.csv',
        'data/candidates.csv',
        'data/books.csv',
        'data/users.csv',
        'data/book_genres.csv',
    )
)
print(f"Train: {train.shape}")

In [None]:
# Stats
# Per-book aggregates over every training interaction.
book_stats = (
    train.groupby('book_id')
    .agg(
        times_read=('has_read', 'sum'),
        total_interactions=('has_read', 'count'),
        read_rate=('has_read', 'mean'),
        avg_user_rating=('rating', 'mean'),
        unique_users=('user_id', 'nunique'),
    )
    .reset_index()
)

# Per-user aggregates over the same interactions.
user_stats = (
    train.groupby('user_id')
    .agg(
        user_books_read=('has_read', 'sum'),
        user_read_rate=('has_read', 'mean'),
        user_avg_rating=('rating', 'mean'),
    )
    .reset_index()
)

# Genres
# Count, per user, how many read books carry each genre, then order each
# user's genres from most to least frequent.
read_interactions = train[train['has_read'] == 1]
user_genres = read_interactions.merge(book_genres, on='book_id', how='left')
user_top_genres = (
    user_genres.groupby(['user_id', 'genre_id'])
    .size()
    .reset_index(name='cnt')
)
user_top_genres = user_top_genres.sort_values(
    by=['user_id', 'cnt'],
    ascending=[True, False],
)

# Lookup tables: user -> genres (most frequent first), book -> genres.
user_top_genres_dict = (
    user_top_genres.groupby('user_id')['genre_id'].apply(list).to_dict()
)
book_genres_dict = (
    book_genres.groupby('book_id')['genre_id'].apply(list).to_dict()
)

def genre_overlap(uid, bid):
    """Return the share of a book's genres that are among the user's top genres.

    The user's genres come from ``user_top_genres_dict`` (only the first ten
    entries of the stored list are considered) and the book's genres from
    ``book_genres_dict``. Unknown users or books are treated as having no
    genres.

    Returns 0 when either side has no genres; otherwise the number of shared
    genres divided by the number of distinct genres of the book.
    """
    book_genre_set = set(book_genres_dict.get(bid, []))
    user_genre_set = set(user_top_genres_dict.get(uid, [])[:10])
    if not (user_genre_set and book_genre_set):
        return 0
    shared = user_genre_set.intersection(book_genre_set)
    return len(shared) / len(book_genre_set)

print("Stats ready")

In [None]:
def create_features(df, reference_year=2024):
    """Attach model features to a frame of (user_id, book_id) pairs.

    Left-joins the module-level tables ``book_stats``, ``user_stats``,
    ``books`` (``publication_year`` and ``avg_rating`` only) and ``users``
    onto ``df``, then derives:

    * ``genre_match`` - ``genre_overlap(user_id, book_id)`` for each row
    * ``book_age`` - ``reference_year - publication_year``
    * ``popularity`` - ``log1p(total_interactions)``
    * ``rating_compatibility`` - ``1 - |avg_rating - user_avg_rating| / 5``

    Finally every missing value in a numeric column is replaced by 0. This
    happens after the derived columns are computed, so a pair whose inputs
    are missing gets 0 for the derived feature as well.

    Args:
        df: DataFrame with at least ``user_id`` and ``book_id`` columns.
            The input frame is not modified.
        reference_year: Year that ``book_age`` is measured from. Defaults to
            2024, the value that was previously hard-coded.

    Returns:
        A new DataFrame with the joined and derived columns added.

    NOTE(review): ``users`` is merged in full; the ``age`` / ``gender``
    features used downstream presumably come from it - confirm against the
    users table.
    """
    df = df.merge(book_stats, on='book_id', how='left')
    df = df.merge(user_stats, on='user_id', how='left')
    df = df.merge(books[['book_id', 'publication_year', 'avg_rating']], on='book_id', how='left')
    df = df.merge(users, on='user_id', how='left')

    # Walk the two id columns directly instead of DataFrame.apply(axis=1):
    # row-wise apply builds a Series per row (slow on millions of pairs) and
    # returns a DataFrame rather than a Series when the frame is empty, which
    # makes the column assignment fail.
    df['genre_match'] = pd.Series(
        [genre_overlap(uid, bid) for uid, bid in zip(df['user_id'], df['book_id'])],
        index=df.index,
        dtype='float64',
    )
    df['book_age'] = reference_year - df['publication_year']
    df['popularity'] = np.log1p(df['total_interactions'])
    df['rating_compatibility'] = 1 - np.abs(df['avg_rating'] - df['user_avg_rating']) / 5.0

    # Fill gaps in all numeric columns in one vectorised call.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)

    return df

# Columns fed to the ranker, listed by origin. The concatenation order is the
# column order the models are trained and scored with.
feature_cols = (
    # per-book interaction statistics
    ['times_read', 'total_interactions', 'read_rate', 'avg_user_rating', 'unique_users']
    # per-user interaction statistics
    + ['user_books_read', 'user_read_rate', 'user_avg_rating']
    # book metadata
    + ['publication_year', 'avg_rating', 'book_age']
    # user attributes
    + ['age', 'gender']
    # derived pair / book features
    + ['genre_match', 'popularity', 'rating_compatibility']
)

# The 800 books with the most training interactions (negative-sampling pool).
most_interacted = book_stats.nlargest(800, 'total_interactions')
popular_books_800 = most_interacted['book_id'].values

# user_id -> set of every book that user has a training interaction with.
user_books_cache = {
    uid: set(book_ids)
    for uid, book_ids in train.groupby('user_id')['book_id']
}

print(f"{len(feature_cols)} features")

## Training Configuration

In [None]:
# CONFIG
# Maximum number of negative (never-interacted) books sampled per user.
N_NEG = 150
# One ensemble member is trained per seed.
SEEDS = [42, 123, 456, 789]
# Derived from SEEDS so the reported ensemble size can never disagree with
# the number of models that are actually trained.
N_MODELS = len(SEEDS)

print(f"Config: N_NEG={N_NEG}, Ensemble={N_MODELS} models")
print("Strategy: Rank-based averaging")

In [None]:
# Prepare training data
print("Preparing training data...")

# Graded relevance for observed interactions: 2 when has_read == 1,
# 1 for every other observed row. Sampled negatives below get 0.
full_train = train.copy()
full_train['target'] = full_train['has_read'].eq(1).map({True: 2, False: 1})

# For each user, draw up to N_NEG popular books the user has no interaction
# with. The global NumPy RNG is seeded once so the draw is reproducible.
np.random.seed(42)
full_negatives = []
for uid in full_train['user_id'].unique():
    seen_books = user_books_cache.get(uid, set())
    unseen_popular = [book for book in popular_books_800 if book not in seen_books]
    n_draw = min(N_NEG, len(unseen_popular))
    sampled = np.random.choice(unseen_popular, n_draw, replace=False)
    full_negatives.extend(
        {'user_id': uid, 'book_id': book, 'target': 0} for book in sampled
    )

negatives_df = pd.DataFrame(full_negatives)
full_all = pd.concat([full_train, negatives_df], ignore_index=True)

# Rows are ordered by user so that each user's rows are contiguous.
full_features = (
    create_features(full_all)
    .sort_values('user_id')
    .reset_index(drop=True)
)

print(f"Train size: {len(full_features)}")

In [None]:
# Train ensemble
print(f"Training {N_MODELS} models...\n")

# A single pool shared by every model; user_id is the ranking group.
full_pool = Pool(
    data=full_features[feature_cols],
    label=full_features['target'],
    group_id=full_features['user_id'],
)

# Settings common to all ensemble members - only the random seed varies.
ranker_params = {
    'iterations': 700,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 3,
    'loss_function': 'PairLogitPairwise',
    'verbose': 100,
    'task_type': 'GPU',
    'devices': '0',
}

final_models = []
for model_no, seed in enumerate(SEEDS, start=1):
    print(f"Model {model_no}/{N_MODELS} (seed={seed})")

    ranker = CatBoostRanker(random_seed=seed, **ranker_params)
    ranker.fit(full_pool)
    final_models.append(ranker)

print(f"\nTrained {len(final_models)} models!")

In [None]:
# Create test predictions with rank-based ensemble
print("Creating test predictions...")

# Expand each user's comma-separated candidate list into one
# (user_id, book_id) row per candidate book.
test_data = []
for _, candidate_row in candidates.iterrows():
    uid = candidate_row['user_id']
    book_ids = [int(token) for token in candidate_row['book_id_list'].split(',')]
    test_data.extend({'user_id': uid, 'book_id': bid} for bid in book_ids)

test_df = create_features(pd.DataFrame(test_data))

# Get predictions from all models: one score column per ensemble member.
all_preds = []
for model_idx, ranker in enumerate(final_models):
    pred_col = f'pred_{model_idx}'
    test_df[pred_col] = ranker.predict(test_df[feature_cols])
    all_preds.append(pred_col)

# Rank-based ensemble
print("Computing rank-based ensemble...")

def rank_ensemble(group, by=None):
    """Add per-model ranks and their mean to a frame of model scores.

    For every score column named in ``all_preds`` a ``rank_<col>`` column is
    added (rank 1 = highest score, ties share the average rank), plus an
    ``avg_rank`` column holding the row-wise mean of those ranks.

    Args:
        group: DataFrame containing the columns listed in ``all_preds``.
            It is modified in place and also returned.
        by: Optional column name (or list of names). When given, ranks are
            computed separately within each group of ``by``; when omitted,
            ranks are computed over the whole frame.

    Returns:
        ``group`` with the rank columns and ``avg_rank`` added.
    """
    # Rank all score columns in one vectorised call. Grouping here, rather
    # than via DataFrame.groupby(...).apply(rank_ensemble), keeps the key
    # column in the result regardless of how groupby.apply treats grouping
    # columns in the installed pandas version.
    scores = group[all_preds] if by is None else group.groupby(by)[all_preds]
    ranks = scores.rank(ascending=False)
    for col in all_preds:
        group[f'rank_{col}'] = ranks[col]
    group['avg_rank'] = ranks.mean(axis=1)
    return group

test_df = rank_ensemble(test_df, by='user_id')
# Negate so that a larger 'pred' means a better (smaller) average rank.
test_df['pred'] = -test_df['avg_rank']

print("Done!")

In [None]:
# Create submission
# Number of books written per user (the metric is evaluated at 20).
TOP_K = 20

# Row positions of each user's candidates, computed once so the loop below
# does not rescan the whole prediction table for every target user.
rows_by_user = test_df.groupby('user_id').indices

submission = []
for uid in targets['user_id']:
    # Users with no candidate rows get an empty frame and an empty list.
    user_preds = test_df.iloc[rows_by_user.get(uid, [])]
    user_preds = user_preds.sort_values('pred', ascending=False)
    # A book listed twice for a user keeps only its best-scored row.
    user_preds = user_preds.drop_duplicates('book_id', keep='first')
    top = user_preds.head(TOP_K)['book_id'].astype(int).tolist()
    submission.append({'user_id': uid, 'book_id_list': ','.join(map(str, top))})

# Explicit columns keep the header in the file even if there are no targets.
submission_df = pd.DataFrame(submission, columns=['user_id', 'book_id_list'])
submission_df.to_csv('submission.csv', index=False)
print("\nsubmission.csv created!")