# 01 - Data Preparation

**Goal**: Generate synthetic datasets for the ranking system

**Outputs**:
- Synthetic hotel search queries with intent labels
- Synthetic hotel inventory
- Query-hotel candidate pairs with synthetic click/book labels

**Why Synthetic?**: We're simulating realistic user behavior to demonstrate the ranking pipeline. In production, this would be real clickstream data.

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from data_utils import (
    load_config, 
    set_random_seeds,
    generate_synthetic_queries,
    generate_synthetic_hotels,
    generate_query_hotel_pairs,
    create_train_test_split,
    save_data
)

# Plot defaults applied to the figures drawn in the cells below
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✅ Imports complete")

In [None]:
# Read the project config, then pass its seed to set_random_seeds
config = load_config('../config/config.yaml')
random_seed = config['random_seed']
set_random_seeds(random_seed)

# Echo the settings this run uses
print(f"Random seed: {random_seed}")
print(f"Intent classes: {config['query_intent']['intent_classes']}")

## 1. Generate Synthetic Queries

In [None]:
# Generate queries; the count is read from the `synthetic.num_queries` config entry
num_queries = config['synthetic']['num_queries']
queries_df = generate_synthetic_queries(num_queries=num_queries)

print(f"Generated {len(queries_df)} queries")
# Plain string literal: nothing is interpolated, so no f-prefix is needed
print("\nSample queries:")
# Trailing expression so the notebook renders the first 10 rows
queries_df.head(10)

In [None]:
# Analyze intent distribution
from collections import Counter
from itertools import chain

# Each entry of `intent_labels` is an iterable of intents for one query;
# flatten all of them into a single list and tally each intent.
all_intents = list(chain.from_iterable(queries_df['intent_labels']))
intent_counts = Counter(all_intents)

plt.figure(figsize=(10, 5))
# Materialize the dict views so matplotlib receives plain sequences
plt.bar(list(intent_counts.keys()), list(intent_counts.values()))
plt.xlabel('Intent Class')
plt.ylabel('Frequency')
plt.title('Distribution of Query Intents')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("\nIntent distribution:")
# Percentages are relative to the number of queries, not the number of labels,
# so they can add up to more than 100% when a query carries several intents.
n_queries = len(queries_df)
for intent, count in intent_counts.most_common():
    print(f"{intent:15s}: {count:5d} ({count/n_queries*100:.1f}%)")

## 2. Generate Synthetic Hotels

In [None]:
# Generate hotels; the count is read from the `synthetic.num_hotels` config entry
num_hotels = config['synthetic']['num_hotels']
hotels_df = generate_synthetic_hotels(num_hotels=num_hotels)

print(f"Generated {len(hotels_df)} hotels")
# Plain string literal: nothing is interpolated, so no f-prefix is needed
print("\nSample hotels:")
# Trailing expression so the notebook renders the first 10 rows
hotels_df.head(10)

In [None]:
# Visualize hotel distributions on a 2x2 grid of axes
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Row-major unpacking: top-left, top-right, bottom-left, bottom-right
price_ax, rating_ax, distance_ax, scatter_ax = axes.ravel()

prices = hotels_df['hotel_price']
ratings = hotels_df['hotel_rating']

# Top-left: price histogram, with the mean shown in the title
price_ax.hist(prices, bins=50, edgecolor='black')
price_ax.set(
    xlabel='Price ($)',
    ylabel='Frequency',
    title=f'Hotel Price Distribution (mean=${prices.mean():.0f})',
)

# Top-right: rating histogram, with the mean shown in the title
rating_ax.hist(ratings, bins=30, edgecolor='black')
rating_ax.set(
    xlabel='Rating',
    ylabel='Frequency',
    title=f'Rating Distribution (mean={ratings.mean():.2f})',
)

# Bottom-left: histogram of the distance_km column
distance_ax.hist(hotels_df['distance_km'], bins=50, edgecolor='black')
distance_ax.set(
    xlabel='Distance from Center (km)',
    ylabel='Frequency',
    title='Distance Distribution',
)

# Bottom-right: price against rating; low alpha keeps overlapping points readable
scatter_ax.scatter(prices, ratings, alpha=0.3)
scatter_ax.set(xlabel='Price ($)', ylabel='Rating', title='Price vs Rating')

plt.tight_layout()
plt.show()

## 3. Generate Query-Hotel Pairs with Synthetic Labels

This uses a **behavior model** to simulate clicks and bookings:
- `P(click)` = f(price, rating, intent_match, distance)
- `P(book | click)` = f(cancellation, amenities, reviews)

In [None]:
# Build the query-hotel candidate pairs; announce it first because the
# helper may run for a minute or two
print("Generating query-hotel pairs... (this may take 1-2 min)")

candidates_per_query = config['synthetic']['avg_candidates_per_query']
pairs_df = generate_query_hotel_pairs(
    queries_df, hotels_df, avg_candidates=candidates_per_query
)

n_pairs = len(pairs_df)
print(f"\n✅ Generated {n_pairs:,} query-hotel pairs")
print(f"Average candidates per query: {n_pairs / len(queries_df):.1f}")
# Trailing expression so the notebook renders the first rows
pairs_df.head()

In [None]:
# Analyze synthetic labels
# Pairs that received a click: the population over which P(book | click) is defined
clicked_pairs = pairs_df[pairs_df['clicked'] == 1]

print("Label Statistics:")
print(f"Click-through rate: {pairs_df['clicked'].mean()*100:.2f}%")
print(f"Booking rate (overall): {pairs_df['booked'].mean()*100:.2f}%")
print(f"Booking rate (given click): {clicked_pairs['booked'].mean()*100:.2f}%")

# Computed once and reused by both the printout and the bar chart below
relevance_counts = pairs_df['relevance'].value_counts().sort_index()
print("\nRelevance distribution:")
print(relevance_counts)

# Visualize
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

axes[0].hist(pairs_df['click_prob'], bins=50, edgecolor='black')
axes[0].set_xlabel('P(click)')
axes[0].set_title('Click Probability Distribution')

# Condition on the click, not on the booking outcome: selecting booked rows
# would show only the probabilities of pairs that ended up booked, which is
# not the P(book | click) distribution the axis label promises.
# dropna() guards the histogram against missing probabilities.
axes[1].hist(clicked_pairs['book_prob'].dropna(), bins=30, edgecolor='black')
axes[1].set_xlabel('P(book | click)')
axes[1].set_title('Book Probability Distribution')

axes[2].bar(relevance_counts.index, relevance_counts.values)
axes[2].set_xlabel('Relevance Score')
axes[2].set_ylabel('Count')
axes[2].set_title('Relevance Label Distribution')

plt.tight_layout()
plt.show()

## 4. Train/Test Split (by Query)

In [None]:
# Split the pairs 80/20, grouping on query_id (intended to keep each query's
# pairs on one side of the split and so avoid leakage)
split_options = {'test_size': 0.2, 'group_col': 'query_id'}
train_df, test_df = create_train_test_split(pairs_df, **split_options)

# Size of each side, in pairs and in distinct queries
train_queries = train_df['query_id'].nunique()
print(f"Train set: {len(train_df):,} pairs from {train_queries:,} queries")
test_queries = test_df['query_id'].nunique()
print(f"Test set:  {len(test_df):,} pairs from {test_queries:,} queries")

# Click-through rate per side, as a quick check that the two sets look alike
train_ctr = train_df['clicked'].mean() * 100
print(f"\nTrain CTR: {train_ctr:.2f}%")
test_ctr = test_df['clicked'].mean() * 100
print(f"Test CTR:  {test_ctr:.2f}%")

## 5. Save Processed Data

In [None]:
# Write each dataset to the processed-data directory, in a fixed order
processed_dir = '../data/processed'
datasets = (
    (queries_df, 'queries.parquet'),
    (hotels_df, 'hotels.parquet'),
    (train_df, 'train_pairs.parquet'),
    (test_df, 'test_pairs.parquet'),
)
for frame, filename in datasets:
    save_data(frame, filename, data_dir=processed_dir)

print("\n✅ All data saved successfully!")

## Summary

✅ **Datasets Created**:
- Queries: Realistic search queries with multi-label intents
- Hotels: Synthetic inventory with realistic distributions
- Pairs: Query-hotel candidates with behavior-based labels

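✅ **Key Metrics** (approximate; the exact values are printed by the cells in Sections 3 and 4):
- CTR ~15% (realistic for hotel search)
- Booking rate ~3% overall, ~20% given click
- Average ~100 candidates per query (controlled by `avg_candidates_per_query` in the config)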

✅ **Next Step**: Build Query Intent NLP model (Notebook 02)