# Experiment 002: Enhanced Features with TF-IDF and Heading Analysis

This experiment adds TF-IDF features and heading-level features to improve upon the baseline.

**Strategy:** Priority 1 - Enhanced Feature Engineering

**Expected improvements:**
- TF-IDF features for semantic content patterns
- Heading level features to capture hierarchy
- Semantic position features for first/last cell patterns
- Notebook-level features for context

**Validation:** 5-fold GroupKFold with notebook_id groups, Kendall tau metric

In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from tqdm import tqdm
import re
from collections import Counter
from sklearn.model_selection import GroupKFold
from sklearn.metrics import make_scorer
from scipy.stats import kendalltau
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Filesystem locations of the training notebooks, test notebooks and order file
TRAIN_PATH = Path('/home/data/train')
TEST_PATH = Path('/home/data/test')
ORDERS_PATH = Path('/home/data/train_orders.csv')

print("Loading data...")
# The order table is read through its `id` column here and its `cell_order`
# column by the training loader.
orders_df = pd.read_csv(ORDERS_PATH)
print(
    f"Orders shape: {orders_df.shape}",
    f"Unique notebooks: {orders_df['id'].nunique()}",
    sep="\n",
)

Loading data...


Orders shape: (119256, 2)
Unique notebooks: 119256


In [None]:
# Load a subset of training data for faster iteration
# Use 5000 notebooks for this experiment
train_notebooks = orders_df['id'].unique()[:5000]
print(f"Using {len(train_notebooks)} notebooks for training")

# Build the id -> cell_order lookup once. Filtering orders_df with a boolean
# mask inside the loop would rescan the whole table for every notebook.
# drop_duplicates keeps the first row per id, matching the previous `.iloc[0]`.
cell_order_by_id = (
    orders_df.drop_duplicates('id').set_index('id')['cell_order'].to_dict()
)

# Create training data: one record per cell, labelled with its true position
all_cells = []
notebook_sizes = {}

for notebook_id in tqdm(train_notebooks, desc="Loading training notebooks"):
    notebook_path = TRAIN_PATH / f"{notebook_id}.json"

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    # `cell_order` is a whitespace-separated list of cell ids in true order.
    cell_order = cell_order_by_id[notebook_id].split()
    cell_positions = {cell_id: pos for pos, cell_id in enumerate(cell_order)}

    notebook_sizes[notebook_id] = len(cell_order)

    # Each notebook is read as a mapping cell_id -> {'cell_type', 'source'}.
    all_cells.extend(
        {
            'notebook_id': notebook_id,
            'cell_id': cell_id,
            'cell_type': cell_data['cell_type'],
            'source': cell_data['source'],
            'position': cell_positions[cell_id],
        }
        for cell_id, cell_data in notebook.items()
    )

train_df = pd.DataFrame(all_cells)
print(f"Training data shape: {train_df.shape}")
print(f"Average cells per notebook: {train_df.groupby('notebook_id').size().mean():.1f}")

In [None]:
# Enhanced feature extraction function
def extract_enhanced_features(df):
    """Add hand-crafted text, heading and keyword features to a cell table.

    Args:
        df: DataFrame with one row per cell and at least the columns
            ``source`` (cell text) and ``cell_type`` (compared to ``'code'``).

    Returns:
        A new DataFrame holding the input columns plus the feature columns.
        The frame passed in is not modified.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()
    source = df['source']

    # Basic text features
    df['source_length'] = source.str.len()
    df['line_count'] = source.str.count(r'\n') + 1
    df['word_count'] = source.str.split().str.len()
    df['char_count'] = source.str.replace(r'\s+', '', regex=True).str.len()

    # Binary flags
    df['has_import'] = source.str.contains('import ', case=False, na=False).astype(int)
    df['has_comment'] = source.str.contains('#', na=False).astype(int)
    # No MULTILINE flag: only a heading on the very first line counts.
    df['has_heading'] = source.str.contains(r'^#+\s', regex=True, na=False).astype(int)
    df['has_code_block'] = source.str.contains('```', na=False).astype(int)
    df['has_link'] = source.str.contains(r'\[.*\]\(http', regex=True, na=False).astype(int)

    # Cell type
    df['cell_type_code'] = (df['cell_type'] == 'code').astype(int)

    # Heading level analysis: number of leading '#' (1-6) followed by
    # whitespace, 0 when the text does not start that way. A single extract
    # replaces one regex pass per level.
    leading_hashes = source.str.extract(r'^(#{1,6})\s', expand=False)
    df['heading_level'] = leading_hashes.str.len().fillna(0).astype(int)

    # Common heading text features (case-insensitive substring anywhere in the cell)
    common_headings = ['introduction', 'conclusion', 'eda', 'exploratory data analysis',
                       'model', 'results', 'analysis', 'data', 'preprocessing',
                       'visualization', 'plot', 'train', 'test', 'validation']

    for heading in common_headings:
        df[f'heading_{heading}'] = source.str.contains(heading, case=False, na=False).astype(int)

    # Semantic position features
    df['has_print'] = source.str.contains(r'print\s*\(', na=False).astype(int)
    df['has_kaggle'] = source.str.contains('kaggle', case=False, na=False).astype(int)
    df['has_input'] = source.str.contains('input', case=False, na=False).astype(int)
    df['has_data'] = source.str.contains(r'\bdata\b', case=False, na=False).astype(int)
    df['has_function'] = source.str.contains(r'def\s+\w+\s*\(', regex=True, na=False).astype(int)
    df['has_class'] = source.str.contains(r'class\s+\w+\s*\(', regex=True, na=False).astype(int)
    df['has_plot'] = source.str.contains(r'\.plot\s*\(|\.show\s*\(|plt\.', regex=True, na=False).astype(int)

    # First/last cell indicators
    df['likely_first_cell'] = (
        (df['has_import'] == 1) | (df['has_kaggle'] == 1) | (df['has_input'] == 1)
    ).astype(int)
    df['likely_last_cell'] = ((df['has_print'] == 1) | (df['has_plot'] == 1)).astype(int)

    return df

print("Extracting enhanced features...")
# Route the cell table through the feature builder and rebind the result.
train_df = train_df.pipe(extract_enhanced_features)
print(f"Features extracted. Shape: {train_df.shape}")

In [None]:
# Add notebook-level features
print("Adding notebook-level features...")

# Calculate notebook-level statistics.
# `notebook_size` is the number of cells in the notebook. Using the maximum
# position would give n - 1, which disagrees with the cell count that is the
# only size available for unlabelled notebooks.
notebook_stats = train_df.groupby('notebook_id').agg({
    'source_length': ['mean', 'std'],
    'word_count': ['mean', 'std'],
    'cell_type_code': 'mean',  # code ratio
    'position': 'count'  # notebook size (number of cells)
}).round(2)

# Flatten the (column, aggregation) MultiIndex into names like 'word_count_std'.
notebook_stats.columns = ['_'.join(col).strip() for col in notebook_stats.columns]
notebook_stats = notebook_stats.reset_index()
notebook_stats.rename(columns={'position_count': 'notebook_size'}, inplace=True)

# Merge notebook-level features
train_df = train_df.merge(notebook_stats, on='notebook_id', how='left')

# Add relative position (0 = first cell, 1 = last cell within the notebook).
# NOTE: this is computed from the target `position`, so it exists only for
# labelled data. The denominator is clipped so single-cell notebooks do not
# divide by zero.
last_position = (train_df['notebook_size'] - 1).clip(lower=1)
train_df['relative_position'] = train_df['position'] / last_position

print(f"Final training shape: {train_df.shape}")
print(f"Notebook-level features added: {list(notebook_stats.columns)}")

In [None]:
# TF-IDF features for semantic content
print("Extracting TF-IDF features...")

# Get text from markdown cells for TF-IDF
markdown_texts = train_df[train_df['cell_type'] == 'markdown']['source'].fillna('').tolist()

# Use a subset of terms for efficiency
max_features = 1000
vectorizer = TfidfVectorizer(
    max_features=max_features,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95
)

# Fit TF-IDF on markdown texts
vectorizer.fit(markdown_texts)
print(f"TF-IDF vocabulary size: {len(vectorizer.vocabulary_)}")

# Transform all texts (both code and markdown)
all_texts = train_df['source'].fillna('').tolist()
tfidf_matrix = vectorizer.transform(all_texts)

# Name the columns after the actual matrix width. `max_features` is only an
# upper bound: after min_df/max_df pruning the vocabulary can be smaller, and
# a fixed range(max_features) would then fail with a shape mismatch.
tfidf_feature_names = [f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])]
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_feature_names,
    index=train_df.index
)

# Concatenate with original dataframe
train_df = pd.concat([train_df, tfidf_df], axis=1)

print(f"Final shape with TF-IDF: {train_df.shape}")
print(f"TF-IDF features added: {len(tfidf_feature_names)}")

In [None]:
# Prepare feature columns
basic_features = ['source_length', 'line_count', 'word_count', 'char_count',
                  'has_import', 'has_comment', 'has_heading', 'has_code_block',
                  'has_link', 'cell_type_code', 'heading_level']

heading_features = [f'heading_{h}' for h in ['introduction', 'conclusion', 'eda',
                    'exploratory data analysis', 'model', 'results', 'analysis',
                    'data', 'preprocessing', 'visualization', 'plot', 'train',
                    'test', 'validation']]

semantic_features = ['has_print', 'has_kaggle', 'has_input', 'has_data',
                     'has_function', 'has_class', 'has_plot', 'likely_first_cell',
                     'likely_last_cell']

notebook_features = ['source_length_mean', 'source_length_std', 'word_count_mean',
                     'word_count_std', 'cell_type_code_mean', 'notebook_size']

tfidf_features = tfidf_feature_names

# 'relative_position' is deliberately NOT a model input: it is computed as
# position / notebook_size, i.e. directly from the regression target. Using it
# leaks the label into training and inflates the CV score, while at
# prediction time the true position is unknown and only a constant
# placeholder can be supplied.
feature_cols = (basic_features + heading_features + semantic_features +
                notebook_features + tfidf_features)

print(f"Total features: {len(feature_cols)}")
print(f"Basic features: {len(basic_features)}")
print(f"Heading features: {len(heading_features)}")
print(f"Semantic features: {len(semantic_features)}")
print(f"Notebook features: {len(notebook_features)}")
print(f"TF-IDF features: {len(tfidf_features)}")

In [None]:
# Define evaluation metric
def kendall_tau_score(y_true, y_pred):
    """Return the Kendall tau rank correlation of two equal-length sequences.

    Thin wrapper around ``scipy.stats.kendalltau`` that keeps only the
    correlation statistic and discards the p-value.
    """
    tau, _p_value = kendalltau(y_true, y_pred)
    return tau

kendall_scorer = make_scorer(kendall_tau_score, greater_is_better=True)

# Cross-validation setup: folds are split by notebook so that cells of one
# notebook never appear in both the training and the validation part.
gkf = GroupKFold(n_splits=5)
groups = train_df['notebook_id']

X = train_df[feature_cols]
y = train_df['position']

print("Starting cross-validation...")
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train model
    # NOTE: LightGBM applies `subsample` only when `subsample_freq` > 0; with
    # the default frequency of 0 the 0.8 below has no effect.
    model = lgb.LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(X_train, y_train)

    # Predict on validation set
    y_pred = model.predict(X_val)

    # Only the id and target columns are needed for scoring; copying the full
    # feature frame for every fold wastes memory.
    val_df = train_df.iloc[val_idx][['notebook_id', 'position']].copy()
    val_df['pred_position'] = y_pred

    # Per-notebook Kendall tau. One groupby pass replaces a boolean filter
    # over the whole fold for every notebook. Single-cell notebooks are
    # skipped because a rank correlation needs at least two items.
    fold_scores = [
        kendall_tau_score(
            notebook_data['position'].values,
            notebook_data['pred_position'].values
        )
        for _, notebook_data in val_df.groupby('notebook_id', sort=False)
        if len(notebook_data) > 1
    ]

    # Tau is NaN when every prediction in a notebook is identical; nanmean
    # keeps one such notebook from turning the whole fold score into NaN.
    fold_score = np.nanmean(fold_scores)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1}: {fold_score:.4f}")

print(f"\nCV Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

In [None]:
# Fit the production model on every labelled cell (no hold-out)
print("Training final model on all data...")

# NOTE: LightGBM applies `subsample` only when `subsample_freq` > 0; with the
# default frequency of 0 the 0.8 below has no effect.
final_model_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
final_model = lgb.LGBMRegressor(**final_model_params)
final_model.fit(X, y)

# Rank the input columns by the importance the fitted model reports
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': final_model.feature_importances_
})
importance_df = importance_df.sort_values(by='importance', ascending=False)

print("\nTop 20 most important features:")
print(importance_df.head(20))

In [None]:
# Prediction function for test data
def predict_notebook_order(notebook_id, path, model, feature_cols, vectorizer):
    """Predict the cell order of one notebook.

    Args:
        notebook_id: File stem of the notebook; ``<notebook_id>.json`` is read.
        path: ``pathlib.Path`` of the directory containing the notebook file.
        model: Fitted regressor exposing ``predict``; lower predictions are
            placed earlier in the notebook.
        feature_cols: Names of the columns passed to ``model.predict``.
        vectorizer: Fitted TF-IDF vectorizer exposing ``transform``.

    Returns:
        Space-separated cell ids sorted by predicted position, or an empty
        string when the notebook contains no cells.
    """
    notebook_path = path / f"{notebook_id}.json"

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    # An empty notebook would yield a frame without a 'source' column and
    # fail with an opaque KeyError during feature extraction.
    if not notebook:
        return ''

    features_df = pd.DataFrame([
        {
            'cell_id': cell_id,
            'cell_type': cell_data['cell_type'],
            'source': cell_data['source'],
        }
        for cell_id, cell_data in notebook.items()
    ])

    # Extract enhanced features
    features_df = extract_enhanced_features(features_df)

    # Notebook-level statistics, broadcast to every cell. They are rounded to
    # two decimals because the training statistics are rounded the same way.
    for column in ('source_length', 'word_count'):
        features_df[f'{column}_mean'] = np.round(features_df[column].mean(), 2)
        features_df[f'{column}_std'] = np.round(features_df[column].std(), 2)
    features_df['cell_type_code_mean'] = np.round(features_df['cell_type_code'].mean(), 2)
    features_df['notebook_size'] = len(features_df)
    # The true relative position is unknown at prediction time; the constant
    # only guarantees the column exists if `feature_cols` asks for it.
    features_df['relative_position'] = 0.5

    # TF-IDF features. Column names come from the matrix width, so the
    # function does not depend on a module-level list of names.
    tfidf_matrix = vectorizer.transform(features_df['source'].fillna('').tolist())
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
        index=features_df.index
    )
    features_df = pd.concat([features_df, tfidf_df], axis=1)

    # Predict
    predicted_positions = model.predict(features_df[feature_cols])

    order_df = pd.DataFrame({
        'cell_id': features_df['cell_id'].to_numpy(),
        'pred_position': predicted_positions
    })

    # A stable sort keeps tied predictions in file order, making the output
    # deterministic.
    ordered_cells = order_df.sort_values('pred_position', kind='stable')['cell_id'].tolist()
    return ' '.join(ordered_cells)

# Test on a few notebooks
print("Testing prediction on sample notebooks...")
# Path.glob returns a generator, which cannot be sliced directly. Sorting
# materialises it and makes the five-notebook sample reproducible.
test_notebooks = [f.stem for f in sorted(TEST_PATH.glob('*.json'))[:5]]

for notebook_id in test_notebooks:
    # Smoke test only: report a failing notebook and keep going.
    try:
        predicted_order = predict_notebook_order(notebook_id, TEST_PATH, final_model, feature_cols, vectorizer)
        print(f"{notebook_id}: {predicted_order[:100]}...")
    except Exception as e:
        print(f"Error with {notebook_id}: {e}")

In [None]:
# Generate submission for all test notebooks
print("Generating submission for all test notebooks...")

# Sorted so the submission row order is reproducible across runs.
test_notebooks = sorted(f.stem for f in TEST_PATH.glob('*.json'))
submission_data = []
failed_notebooks = []

for notebook_id in tqdm(test_notebooks, desc="Predicting notebooks"):
    # Best effort: one broken notebook must not abort the whole run, but it
    # is recorded so the gap in the submission is visible.
    try:
        predicted_order = predict_notebook_order(notebook_id, TEST_PATH, final_model, feature_cols, vectorizer)
    except Exception as e:
        print(f"Error with {notebook_id}: {e}")
        failed_notebooks.append(notebook_id)
    else:
        submission_data.append({
            'id': notebook_id,
            'cell_order': predicted_order
        })

if failed_notebooks:
    print(f"WARNING: {len(failed_notebooks)} notebooks failed and are missing from the submission")

# Explicit columns keep the expected schema even when no prediction succeeded.
submission_df = pd.DataFrame(submission_data, columns=['id', 'cell_order'])
print(f"Submission shape: {submission_df.shape}")
print(submission_df.head())

In [None]:
# Save submission
submission_path = '/home/submission/submission_002.csv'
# to_csv does not create missing directories, so make sure the target exists.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

# Verify format
sample_sub = pd.read_csv('/home/data/sample_submission.csv')
print(f"\nColumns match: {list(submission_df.columns) == list(sample_sub.columns)}")
print(f"Number of rows match: {len(submission_df) == len(sample_sub)}")
# Equal row counts do not prove the same notebooks are covered, so compare
# the id sets as well when both frames expose an 'id' column.
if 'id' in submission_df.columns and 'id' in sample_sub.columns:
    ids_match = set(submission_df['id'].astype(str)) == set(sample_sub['id'].astype(str))
    print(f"Notebook ids match: {ids_match}")