# AI4Code Baseline Model

This notebook implements a baseline solution for the AI4Code competition.

## Approach
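1. Extract basic per-cell features (source length, line/word counts, cell type, simple content flags)
2. Train a LightGBM regressor to predict each cell's position, then order the cells by predicted position
3. Use Kendall tau correlation as the evaluation metric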

In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import re
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Dataset locations: every input lives under one root directory
DATA_PATH = Path('/home/data')
TRAIN_PATH, TEST_PATH = DATA_PATH / 'train', DATA_PATH / 'test'
TRAIN_ORDERS_PATH = DATA_PATH / 'train_orders.csv'

# Echo the resolved locations so a wrong root is obvious in the cell output
print("Loading data paths...")
print(
    f"Train path: {TRAIN_PATH}",
    f"Test path: {TEST_PATH}",
    f"Train orders: {TRAIN_ORDERS_PATH}",
    sep="\n",
)

In [None]:
# Read the ground-truth orderings table and show its size plus a short preview
train_orders = pd.read_csv(TRAIN_ORDERS_PATH)
print(f"Train orders shape: {train_orders.shape}")
print("First few rows:", train_orders.head(), sep="\n")

In [None]:
# Load a sample notebook to understand structure
def load_notebook(notebook_id, path):
    """Load one notebook's JSON file and return the parsed object.

    Args:
        notebook_id: Notebook identifier; the file read is
            ``<notebook_id>.json``.
        path: Directory containing the notebook file. Must support the ``/``
            operator (e.g. a ``pathlib.Path``).

    Returns:
        Whatever ``json.load`` decodes from the file.

    Raises:
        FileNotFoundError: If the notebook file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default (notebook sources may contain non-ASCII text).
    with open(path / f"{notebook_id}.json", encoding="utf-8") as f:
        return json.load(f)

# Load one known training notebook and peek at its layout
sample_nb = load_notebook('0002115f48f982', TRAIN_PATH)
print("Notebook keys:", list(sample_nb.keys()))

sample_cell_types = list(sample_nb['cell_type'].items())
print("\nCell types:", sample_cell_types[:5])

# First and last stored (cell_id, source) pairs. The labels assume the first
# stored cell is code and the last is markdown -- TODO confirm for this notebook.
sample_sources = list(sample_nb['source'].items())
print("\nSample code cell:", sample_sources[0])
print("\nSample markdown cell:", sample_sources[-1])

In [None]:
# Create features for a notebook
def extract_features(notebook_id, path):
    """Build one row of simple text features per cell of a notebook.

    The notebook is read with ``load_notebook`` and its ``cell_type`` mapping
    is walked in stored order; each cell's text comes from the ``source``
    mapping under the same cell id.

    Args:
        notebook_id: Identifier of the notebook to load.
        path: Directory that holds the notebook JSON file.

    Returns:
        A ``(features, cell_ids)`` tuple. ``features`` is a DataFrame with the
        columns ``notebook_id``, ``cell_id``, ``cell_type``, four size
        measures and five 0/1 content flags; ``cell_ids`` lists the cell ids
        in the same order as the rows.
    """
    notebook = load_notebook(notebook_id, path)
    heading_markers = ['# ', '## ', '### ']

    rows = []
    for cell_id, cell_type in notebook['cell_type'].items():
        source = notebook['source'][cell_id]
        is_code = cell_type == 'code'
        is_markdown = cell_type == 'markdown'

        # Identification columns first, so the column order stays stable.
        row = {
            'notebook_id': notebook_id,
            'cell_id': cell_id,
            'cell_type': cell_type,
        }

        # Size measures, computed for every cell regardless of its type.
        row['source_length'] = len(source)
        row['line_count'] = source.count('\n') + 1
        row['word_count'] = len(source.split())
        row['char_count'] = len(source.replace('\n', '').replace(' ', ''))

        # Flags that only apply to code cells (0 for anything else).
        row['has_import'] = int(is_code and ('import ' in source or 'from ' in source))
        row['has_comment'] = int(is_code and '#' in source)

        # Flags that only apply to markdown cells (0 for anything else).
        row['has_heading'] = int(
            is_markdown and any(marker in source for marker in heading_markers)
        )
        row['has_code_block'] = int(is_markdown and '```' in source)
        row['has_link'] = int(is_markdown and ('http' in source or 'www.' in source))

        rows.append(row)

    cell_ids = [row['cell_id'] for row in rows]
    return pd.DataFrame(rows), cell_ids

# Smoke-test the feature extractor on the sample training notebook
sample_extraction = extract_features('0002115f48f982', TRAIN_PATH)
sample_features, sample_cell_ids = sample_extraction
print("Sample features shape:", sample_features.shape)
print(sample_features.head())

In [None]:
# Create training data
def create_training_data(notebook_ids, path, orders_df):
    """Create the training table: per-cell features plus the true position.

    Args:
        notebook_ids: Iterable of notebook ids to process.
        path: Directory that holds the notebook JSON files.
        orders_df: DataFrame with an ``id`` column and a ``cell_order`` column
            holding the whitespace-separated cell ids in their correct order.

    Returns:
        A single DataFrame with the rows of every notebook concatenated and a
        ``position`` column giving each cell's 0-based index in the correct
        order (NaN for a cell id that does not appear in ``cell_order``).

    Raises:
        IndexError: If a notebook id has no row in ``orders_df``.
        ValueError: If ``notebook_ids`` is empty (raised by ``pd.concat``).
    """
    # Build the id -> cell_order lookup once instead of scanning orders_df
    # with a boolean mask for every notebook. Keeping the first occurrence
    # matches the previous ``.iloc[0]`` behaviour for duplicated ids.
    first_orders = orders_df.drop_duplicates(subset='id', keep='first')
    order_lookup = dict(zip(first_orders['id'], first_orders['cell_order']))

    all_features = []

    for notebook_id in tqdm(notebook_ids, desc="Processing notebooks"):
        # Extract features (the second return value is not needed here)
        features, _ = extract_features(notebook_id, path)

        # Get correct order
        try:
            correct_order = order_lookup[notebook_id].split()
        except KeyError:
            # IndexError is kept for backward compatibility with the former
            # positional ``.iloc[0]`` lookup.
            raise IndexError(
                f"No cell order found for notebook {notebook_id!r}"
            ) from None

        # Map each cell id to its index in the correct order
        position_map = {cell_id: pos for pos, cell_id in enumerate(correct_order)}

        # Add target position
        features['position'] = features['cell_id'].map(position_map)

        all_features.append(features)

    return pd.concat(all_features, ignore_index=True)

# Build the training table from the first 1000 notebooks listed in train_orders
sample_notebooks = train_orders['id'].iloc[:1000].tolist()
train_df = create_training_data(sample_notebooks, TRAIN_PATH, train_orders)
print(f"Training data shape: {train_df.shape}", train_df.head(), sep="\n")

In [None]:
# Prepare features for modeling
# Identifier, raw-type and target columns are never model inputs.
# 'cell_type_code' is excluded here as well because it is appended explicitly
# below: without this, re-running the cell would pick it up from
# train_df.columns and then append it a second time, duplicating the feature.
non_feature_cols = {'notebook_id', 'cell_id', 'position', 'cell_type', 'cell_type_code'}
feature_cols = [col for col in train_df.columns if col not in non_feature_cols]
print(f"Feature columns: {feature_cols}")

# Encode the cell type as a binary feature: 1 for code cells, 0 otherwise
train_df['cell_type_code'] = (train_df['cell_type'] == 'code').astype(int)
feature_cols.append('cell_type_code')

print(f"Final feature columns: {feature_cols}")
print(f"Training data shape: {train_df.shape}")

In [None]:
# Define Kendall tau metric for evaluation
def kendall_tau_score(y_true, y_pred):
    """Return SciPy's Kendall tau correlation between ``y_true`` and ``y_pred``."""
    from scipy.stats import kendalltau

    # The result unpacks as (correlation, p-value); only the first is needed.
    tau, _p_value = kendalltau(y_true, y_pred)
    return tau

# Wrap the metric as a scikit-learn scorer (higher is better)
kendall_scorer = make_scorer(kendall_tau_score, greater_is_better=True)

# Train model
print("Training LightGBM model...")

# Fit on a reproducible 30% sample of the rows to keep training fast
sample_df = train_df.sample(frac=0.3, random_state=42)
X, y = sample_df[feature_cols], sample_df['position']

# Regressor hyper-parameters, gathered in one place
lgbm_params = dict(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    n_jobs=-1,
)
model = lgb.LGBMRegressor(**lgbm_params)
model.fit(X, y)

print("Model training completed!")
importance_by_feature = dict(zip(feature_cols, model.feature_importances_))
print(f"Feature importances: {importance_by_feature}")

In [None]:
# Predict on test data
def predict_notebook_order(notebook_id, path, model, feature_cols):
    """Predict the cell order of one notebook.

    Args:
        notebook_id: Identifier of the notebook to order.
        path: Directory that holds the notebook JSON file.
        model: Fitted estimator exposing ``predict``; it is called with the
            ``feature_cols`` columns of the notebook's feature table.
        feature_cols: Names of the feature columns, in the order used when
            the model was fitted.

    Returns:
        The notebook's cell ids joined by single spaces, sorted by ascending
        predicted position. Cells with equal predictions keep the order in
        which they are stored in the notebook. An empty string is returned
        for a notebook without cells.
    """
    features, cell_ids = extract_features(notebook_id, path)

    # A notebook without cells yields a frame without columns, which would
    # fail below on the 'cell_type' lookup; there is nothing to order anyway.
    if not cell_ids:
        return ''

    # Add cell type code (1 for code cells, 0 otherwise), as done for training
    features['cell_type_code'] = (features['cell_type'] == 'code').astype(int)

    # Predict positions
    predicted_positions = model.predict(features[feature_cols])

    order_df = pd.DataFrame({
        'cell_id': cell_ids,
        'predicted_position': predicted_positions,
    })

    # Use a stable sort: with only a handful of coarse features many cells get
    # identical predictions, and the default quicksort would order such ties
    # arbitrarily instead of keeping their stored order.
    ordered_cells = order_df.sort_values(
        'predicted_position', kind='mergesort'
    )['cell_id'].tolist()

    return ' '.join(ordered_cells)

# Dry run: predict the order for up to five test notebooks and preview it
test_notebooks = [nb_file.stem for nb_file in list(TEST_PATH.glob('*.json'))[:5]]
print("Testing prediction on sample notebooks:")
for nb_id in test_notebooks:
    try:
        predicted_order = predict_notebook_order(nb_id, TEST_PATH, model, feature_cols)
        order_preview = predicted_order[:100]
        print(f"{nb_id}: {order_preview}...")
    except Exception as e:
        # Best-effort preview: report the failure and move on to the next notebook
        print(f"Error with {nb_id}: {e}")

In [None]:
# Generate submission for all test notebooks
print("Generating submission for all test notebooks...")

test_notebooks = [f.stem for f in TEST_PATH.glob('*.json')]
submission_data = []

for notebook_id in tqdm(test_notebooks, desc="Predicting notebooks"):
    try:
        predicted_order = predict_notebook_order(notebook_id, TEST_PATH, model, feature_cols)
    except Exception as e:
        print(f"Error with {notebook_id}: {e}")
        # Fall back to the order in which the cells are stored in the notebook
        # file, so the row still lists every cell instead of being empty.
        try:
            stored_cells = load_notebook(notebook_id, TEST_PATH)['cell_type']
            predicted_order = ' '.join(stored_cells)
        except Exception as fallback_error:
            # The notebook itself could not be read; keep the row with an
            # empty order so every test notebook still appears in the output.
            print(f"Fallback order unavailable for {notebook_id}: {fallback_error}")
            predicted_order = ''

    submission_data.append({
        'id': notebook_id,
        'cell_order': predicted_order,
    })

# Name the columns explicitly so the frame has the submission schema even
# when no test notebooks were found.
submission_df = pd.DataFrame(submission_data, columns=['id', 'cell_order'])
print(f"Submission shape: {submission_df.shape}")
print(submission_df.head())

In [None]:
# Save submission
submission_path = '/home/submission/submission.csv'
# Create the output directory first: to_csv does not create missing parents.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

# Verify format against the sample submission shipped with the data.
# The path is derived from DATA_PATH, like every other input location.
sample_sub = pd.read_csv(DATA_PATH / 'sample_submission.csv')
print("\nSample submission format:")
print(sample_sub.head())
print("\nOur submission format:")
print(submission_df.head())

# Cheap sanity checks: same column names/order and same number of rows
print(f"\nColumns match: {list(submission_df.columns) == list(sample_sub.columns)}")
print(f"Number of rows match: {len(submission_df) == len(sample_sub)}")