# Experiment 003: Remove Data Leakage

**Objective:** Remove the data leakage introduced by the 'relative_position' feature and measure the model's true performance.

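**Issue identified:** In exp_002, 'relative_position' = position / notebook_size, where 'position' is the target variable. Because the feature is computed directly from the target, the model can read the label off its own input; this is data leakage.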

**Changes:**
- Remove 'relative_position' feature from training
- Keep all other features (TF-IDF, headings, notebook-level stats)
- Use same 5-fold GroupKFold validation

**Expected outcome:** CV score will drop significantly, revealing true model performance.

In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from tqdm import tqdm
import re
from collections import Counter
from sklearn.model_selection import GroupKFold
from sklearn.metrics import make_scorer
from scipy.stats import kendalltau
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Set paths
# TRAIN_PATH / TEST_PATH are directories holding one "<notebook_id>.json" file
# per notebook; ORDERS_PATH is a CSV with an 'id' column and a space-separated
# 'cell_order' column giving the ground-truth cell sequence.
TRAIN_PATH = Path('/home/data/train')
TEST_PATH = Path('/home/data/test')
ORDERS_PATH = Path('/home/data/train_orders.csv')

print("Loading data...")
# The order table is read in full; the notebooks themselves are loaded later.
orders_df = pd.read_csv(ORDERS_PATH)
print(f"Orders shape: {orders_df.shape}")
print(f"Unique notebooks: {orders_df['id'].nunique()}")

In [None]:
# Load a subset of training data for faster iteration
# Using 5,000 notebooks like exp_002
# NOTE(review): the sample only matches exp_002 if that experiment used the
# same seed and the same order table - confirm before comparing scores.
np.random.seed(42)
all_notebooks = orders_df['id'].unique()

# Sampling without replacement raises ValueError when more items are requested
# than exist, so cap the sample at the number of available notebooks.
n_selected = min(5000, len(all_notebooks))
selected_notebooks = np.random.choice(all_notebooks, size=n_selected, replace=False)

print(f"Selected {len(selected_notebooks)} notebooks for training")

In [None]:
# Feature extraction functions
def extract_basic_features(df):
    """Add per-cell size statistics and simple 0/1 content flags.

    Reads the ``source`` column and writes the new columns onto ``df`` in
    place; the same frame is returned so callers can reassign it.
    """
    text = df['source'].str

    # Size statistics ('char_count' is computed exactly like 'source_length')
    df['source_length'] = text.len()
    df['line_count'] = text.count('\\n') + 1
    df['word_count'] = text.split().str.len()
    df['char_count'] = text.len()

    # Binary flags: 1 when the regex is found in the cell text, else 0.
    # Patterns anchored with '^' only match at the very start of the cell.
    flag_patterns = {
        'has_import': 'import\\s+\\w+',
        'has_comment': '#',
        'has_heading': '^#+\\s+',
        'has_code_block': '```',
        'has_link': '\\[.*\\]\\(.*\\)',
    }
    for column, pattern in flag_patterns.items():
        df[column] = text.contains(pattern, regex=True, na=False).astype(int)

    return df

def extract_heading_features(df):
    """Add the markdown heading depth and keyword-presence flags.

    ``heading_level`` is the number of leading '#' characters (1-6) when the
    cell starts with that run followed by whitespace, otherwise 0. Each
    ``heading_<keyword>`` column is 1 when the keyword occurs anywhere in the
    cell text (case-insensitive), not only inside a heading. ``df`` is
    modified in place and returned.
    """
    # Capture a leading run of 1-6 hashes that is followed by whitespace;
    # cells without such a prefix (including 7+ hashes) yield NaN -> level 0.
    leading_hashes = df['source'].str.extract('^(#{1,6})\\s+', expand=False)
    df['heading_level'] = leading_hashes.fillna('').str.len().astype('int64')

    # Keywords commonly seen in notebook section titles
    keywords = (
        'introduction', 'conclusion', 'summary', 'results', 'methods',
        'analysis', 'eda', 'exploratory', 'data', 'preprocessing',
        'model', 'training', 'evaluation', 'references', 'appendix',
        'setup', 'imports', 'installation', 'requirements',
        'visualization', 'plot', 'train', 'test', 'validation',
    )
    for keyword in keywords:
        found = df['source'].str.contains(keyword, case=False, na=False)
        df[f'heading_{keyword}'] = found.astype(int)

    return df

def extract_semantic_features(df):
    """Add 0/1 flags for code and content cues found in each cell's text.

    Every flag is a regex search over ``source``. ``df`` is modified in place
    and returned. Note that the final entry assigns ``has_import``, replacing
    any column of that name already present on ``df``.
    """
    # (column, regex, case_sensitive)
    flag_specs = [
        ('has_print', 'print\\s*\\(', True),
        ('has_kaggle', 'kaggle', False),
        ('has_input', 'input', False),
        ('has_data', '\\bdata\\b', False),
        ('has_function', 'def\\s+\\w+\\s*\\(', True),
        ('has_class', 'class\\s+\\w+', True),
        ('has_model', '\\bmodel\\b', False),
        ('has_train', '\\btrain\\b', False),
        ('has_test', '\\btest\\b', False),
        ('has_plot', '\\.plot\\s*\\(|\\.show\\s*\\(', True),
        # '^' is not multiline here, so only the start of the cell is tested
        ('has_import', '^\\s*import\\s+|^\\s*from\\s+', True),
    ]

    text = df['source'].str
    for column, pattern, case_sensitive in flag_specs:
        hits = text.contains(pattern, case=case_sensitive, regex=True, na=False)
        df[column] = hits.astype(int)

    return df

In [None]:
# Load and process training data
print("Loading training notebooks...")
train_data = []
notebook_sizes = {}

# Index the order table once so each notebook's order is a direct lookup
# instead of a boolean scan over every row of orders_df per notebook.
# drop_duplicates keeps the first row per id, matching the former .iloc[0].
cell_order_by_id = orders_df.drop_duplicates('id').set_index('id')['cell_order']

for notebook_id in tqdm(selected_notebooks, desc="Processing notebooks"):
    notebook_path = TRAIN_PATH / f"{notebook_id}.json"

    # Notebooks listed in the order table but missing on disk are skipped
    if not notebook_path.exists():
        continue

    # Explicit encoding so parsing does not depend on the platform default
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    # Ground-truth sequence of cell ids; the index in this list is the target
    cell_order = cell_order_by_id[notebook_id].split()
    notebook_sizes[notebook_id] = len(cell_order)

    # The notebook structure has cell_type and source as separate dictionaries
    cell_types = notebook['cell_type']
    sources = notebook['source']

    for position, cell_id in enumerate(cell_order):
        train_data.append({
            'notebook_id': notebook_id,
            'cell_id': cell_id,
            'cell_type': 1 if cell_types[cell_id] == 'code' else 0,  # 1 = code, 0 = anything else
            'source': sources[cell_id],
            'position': position
        })

train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Average cells per notebook: {train_df.groupby('notebook_id').size().mean():.1f}")

In [None]:
# Extract features
print("Extracting features...")
train_df = extract_basic_features(train_df)
train_df = extract_heading_features(train_df)
train_df = extract_semantic_features(train_df)

# Calculate notebook-level statistics (WITHOUT relative_position to avoid leakage).
# - The code ratio is the mean of the 0/1 'cell_type' column; no
#   'cell_type_code' column exists, so aggregating it raised a KeyError.
# - notebook_size is the number of cells, counted directly. It was previously
#   taken from position.max(), which is n - 1 and is read off the target,
#   while inference uses the cell count n.
# - Values are left unrounded so they match the statistics computed at
#   prediction time, which are not rounded either.
notebook_stats = (
    train_df.groupby('notebook_id')
    .agg(
        source_length_mean=('source_length', 'mean'),
        source_length_std=('source_length', 'std'),
        word_count_mean=('word_count', 'mean'),
        word_count_std=('word_count', 'std'),
        cell_type_code_mean=('cell_type', 'mean'),
        notebook_size=('cell_id', 'size'),
    )
    .reset_index()
)

# Merge notebook-level features (every cell receives its notebook's statistics)
train_df = train_df.merge(notebook_stats, on='notebook_id', how='left')

# NOTE: relative_position (position / notebook_size) is DELIBERATELY not added:
# 'position' is the target, so that feature would leak the label.

print(f"Final training shape: {train_df.shape}")
print(f"Notebook-level features added: {list(notebook_stats.columns)}")

In [None]:
# Prepare TF-IDF features
print("Preparing TF-IDF features...")

# The vocabulary is learned from markdown cells only (cell_type == 0)
is_markdown = train_df['cell_type'] == 0
markdown_cells = train_df.loc[is_markdown, 'source'].fillna('').tolist()

# NOTE(review): the vectorizer is fitted on every selected notebook before the
# GroupKFold split, so validation-fold text contributes to the vocabulary and
# IDF weights. No labels are involved, but confirm this is acceptable for an
# experiment whose purpose is a leakage-free CV score.
tfidf_params = dict(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
)
vectorizer = TfidfVectorizer(**tfidf_params).fit(markdown_cells)

# Columns are named positionally (tfidf_0, tfidf_1, ...) rather than by term
n_tfidf_terms = len(vectorizer.get_feature_names_out())
tfidf_feature_names = [f'tfidf_{i}' for i in range(n_tfidf_terms)]

print(f"TF-IDF features created: {len(tfidf_feature_names)}")

# Transform all cells (both code and markdown) with the markdown vocabulary.
# NOTE(review): toarray() materialises a dense rows x terms matrix - check
# memory headroom before increasing the notebook sample.
all_texts = train_df['source'].fillna('').tolist()
tfidf_matrix = vectorizer.transform(all_texts)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_feature_names)

# Concatenate with main dataframe; the index is reset so rows align by position
train_df = pd.concat([train_df.reset_index(drop=True), tfidf_df], axis=1)
print(f"Final shape with TF-IDF: {train_df.shape}")

In [None]:
# Define feature columns (WITHOUT relative_position)
basic_features = ['source_length', 'line_count', 'word_count', 'char_count',
                  'has_import', 'has_comment', 'has_heading', 'has_code_block', 'has_link']

heading_features = ['heading_level'] + [f'heading_{h}' for h in ['introduction', 'conclusion', 'summary', 'results', 'methods', 
                       'analysis', 'eda', 'exploratory', 'data', 'preprocessing',
                       'model', 'training', 'evaluation', 'references', 'appendix',
                       'setup', 'imports', 'installation', 'requirements',
                       'visualization', 'plot', 'train', 'test', 'validation']]

semantic_features = ['has_print', 'has_kaggle', 'has_input', 'has_data', 'has_function',
                     'has_class', 'has_model', 'has_train', 'has_test', 'has_plot', 'has_import']

notebook_features = ['source_length_mean', 'source_length_std', 'word_count_mean', 
                     'word_count_std', 'cell_type_code_mean', 'notebook_size']

tfidf_features = tfidf_feature_names

# NOTE: relative_position is DELIBERATELY excluded to avoid data leakage.
# 'has_import' appears in both the basic and the semantic group (the dataframe
# holds a single column of that name). Selecting it twice would produce
# duplicate columns, which LightGBM rejects, so the combined list is
# de-duplicated while preserving first-seen order.
feature_cols = list(dict.fromkeys(
    basic_features + heading_features + semantic_features +
    notebook_features + tfidf_features
))

print(f"Total features: {len(feature_cols)}")
print(f"Basic features: {len(basic_features)}")
print(f"Heading features: {len(heading_features)}")
print(f"Semantic features: {len(semantic_features)}")
print(f"Notebook features: {len(notebook_features)}")
print(f"TF-IDF features: {len(tfidf_features)}")
print(f"\nIMPORTANT: relative_position feature EXCLUDED to avoid data leakage")

In [None]:
# Cross-validation setup
def kendall_tau_score(y_true, y_pred):
    """Return the Kendall tau rank correlation between two sequences.

    Only the correlation statistic from ``scipy.stats.kendalltau`` is
    returned; the accompanying p-value is discarded.
    """
    tau, _p_value = kendalltau(y_true, y_pred)
    return tau

# Grouping by notebook keeps all cells of a notebook in the same fold, so a
# notebook is never split between training and validation.
gkf = GroupKFold(n_splits=5)
groups = train_df['notebook_id']

X = train_df[feature_cols]
y = train_df['position']

print("Starting cross-validation...")
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Train model
    model = lgb.LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred = model.predict(X_val)
    
    # Calculate per-notebook Kendall tau. Only the two columns needed for
    # scoring are copied, not the full wide feature frame.
    val_df = train_df[['notebook_id', 'position']].iloc[val_idx].copy()
    val_df['pred_position'] = y_pred
    
    fold_scores = []
    # One groupby pass instead of re-filtering the fold once per notebook
    for _, notebook_data in val_df.groupby('notebook_id', sort=False):
        if len(notebook_data) > 1:
            score = kendall_tau_score(
                notebook_data['position'].values,
                notebook_data['pred_position'].values
            )
            # kendalltau is NaN when every prediction in the notebook is
            # identical; count that as "no ordering information" (0.0) so a
            # single such notebook cannot turn the whole fold mean into NaN.
            fold_scores.append(0.0 if np.isnan(score) else score)
    
    fold_score = np.mean(fold_scores)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1}: {fold_score:.4f}")

print(f"\nCV Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in cv_scores]}")

In [None]:
# Train final model on all data
print("Training final model on all training data...")

# Hyper-parameters for the final fit (same values as the CV loop uses)
final_model_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}
final_model = lgb.LGBMRegressor(**final_model_params)
final_model.fit(X, y)

# Feature importance: pair each feature name with the fitted model's
# importance value, most important first
importance_table = pd.DataFrame({
    'feature': feature_cols,
    'importance': final_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 20 most important features:")
print(feature_importance.head(20).to_string(index=False))

In [None]:
# Generate predictions for test set
print("Generating predictions for test set...")
# glob() yields paths in filesystem order, which can differ between runs and
# machines; sorting makes the submission row order reproducible.
test_notebooks = sorted(TEST_PATH.glob('*.json'))
print(f"Total test notebooks: {len(test_notebooks)}")

def predict_notebook_order(notebook_path, model, vectorizer, feature_cols):
    """Predict cell order for a single notebook.

    Args:
        notebook_path: Path to a notebook JSON file containing 'cell_type'
            and 'source' dictionaries keyed by cell id.
        model: Fitted regressor exposing ``predict``; its output is used as
            a sort key for the cells.
        vectorizer: Fitted TF-IDF vectorizer used to transform cell text.
        feature_cols: Ordered list of feature column names the model expects.

    Returns:
        The notebook's cell ids as a space-separated string, ordered by
        ascending predicted position. An empty string for a notebook with
        no cells.
    """
    # Explicit encoding so parsing does not depend on the platform default
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    cell_types = notebook['cell_type']
    sources = notebook['source']
    cell_ids = list(cell_types.keys())

    # Nothing to order; avoids calling the model on a zero-row frame
    if not cell_ids:
        return ''

    # Create features dataframe (cell_type: 1 = code, 0 = anything else)
    features_df = pd.DataFrame({
        'cell_id': cell_ids,
        'cell_type': [1 if cell_types[cid] == 'code' else 0 for cid in cell_ids],
        'source': [sources[cid] for cid in cell_ids]
    })

    # Extract features
    features_df = extract_basic_features(features_df)
    features_df = extract_heading_features(features_df)
    features_df = extract_semantic_features(features_df)

    # Notebook-level statistics, broadcast to every cell. The std columns are
    # NaN for a single-cell notebook (sample std of one value).
    features_df['source_length_mean'] = features_df['source_length'].mean()
    features_df['source_length_std'] = features_df['source_length'].std()
    features_df['word_count_mean'] = features_df['word_count'].mean()
    features_df['word_count_std'] = features_df['word_count'].std()
    # Code ratio is the mean of the 0/1 'cell_type' column; there is no
    # 'cell_type_code' column, so reading it raised a KeyError for every notebook.
    features_df['cell_type_code_mean'] = features_df['cell_type'].mean()
    features_df['notebook_size'] = len(features_df)

    # NOTE: NO relative_position feature (avoiding leakage)

    # Transform TF-IDF features. Column names are derived from the matrix
    # width so the function relies only on its arguments, not on a global.
    all_texts = features_df['source'].fillna('').tolist()
    tfidf_matrix = vectorizer.transform(all_texts)
    tfidf_columns = [f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])]
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_columns)

    features_df = pd.concat([features_df.reset_index(drop=True), tfidf_df], axis=1)

    # Ensure all feature columns exist: any still missing is zero-filled
    for col in feature_cols:
        if col not in features_df.columns:
            features_df[col] = 0

    # Predict positions
    X_test = features_df[feature_cols]
    features_df['predicted_position'] = model.predict(X_test)

    # Stable sort so cells with equal predictions keep their file order
    # instead of an arbitrary one
    features_df = features_df.sort_values('predicted_position', kind='mergesort')

    # Return ordered cell IDs as space-separated string
    return ' '.join(features_df['cell_id'].tolist())

In [None]:
# Generate predictions for all test notebooks
submission_data = []
failed_notebooks = []  # ids whose prediction raised, reported after the loop

for notebook_path in tqdm(test_notebooks, desc="Predicting test notebooks"):
    notebook_id = notebook_path.stem
    try:
        ordered_cells = predict_notebook_order(notebook_path, final_model, vectorizer, feature_cols)
    except Exception as e:
        # Best effort: keep going so one bad notebook does not abort the run,
        # but remember the failure so it cannot go unnoticed.
        print(f"Error with {notebook_id}: {e}")
        failed_notebooks.append(notebook_id)
        continue

    submission_data.append({
        'id': notebook_id,
        'cell_order': ordered_cells
    })

if failed_notebooks:
    print(f"WARNING: {len(failed_notebooks)} of {len(test_notebooks)} test notebooks failed "
          f"and are missing from the submission")

# Explicit columns keep the expected schema even when no prediction succeeded
submission_df = pd.DataFrame(submission_data, columns=['id', 'cell_order'])
print(f"Submission shape: {submission_df.shape}")
print(submission_df.head())

In [None]:
# Save submission
submission_path = '/home/submission/submission_003.csv'
# to_csv does not create missing directories, so make sure the target exists
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

# Verify format against the sample submission (column names and row count)
sample_sub = pd.read_csv('/home/data/sample_submission.csv')
print(f"\nColumns match: {list(submission_df.columns) == list(sample_sub.columns)}")
print(f"Number of rows match: {len(submission_df) == len(sample_sub)}")