# AI4Code Baseline Model

This notebook implements a baseline solution for the AI4Code competition.

## Approach
1. Extract basic features from cells (length, type, etc.)
2. Train a model to predict cell ordering
3. Use Kendall tau correlation as evaluation metric

In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import re
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Locations of the competition data; every other path hangs off DATA_PATH.
DATA_PATH = Path('/home/data')
TRAIN_PATH = DATA_PATH.joinpath('train')
TEST_PATH = DATA_PATH.joinpath('test')
TRAIN_ORDERS_PATH = DATA_PATH.joinpath('train_orders.csv')

# Echo the configured locations so a wrong data directory is obvious
# at the very top of the run.
print(
    "Loading data paths...",
    f"Train path: {TRAIN_PATH}",
    f"Test path: {TEST_PATH}",
    f"Train orders: {TRAIN_ORDERS_PATH}",
    sep="\n",
)

Loading data paths...
Train path: /home/data/train
Test path: /home/data/test
Train orders: /home/data/train_orders.csv


In [2]:
# Read the ground-truth table: one row per training notebook, holding the
# notebook id and its cell ids as a single space-separated string.
train_orders = pd.read_csv(TRAIN_ORDERS_PATH)
print(
    f"Train orders shape: {train_orders.shape}",
    "First few rows:",
    train_orders.head(),
    sep="\n",
)

Train orders shape: (119256, 2)
First few rows:
               id                                         cell_order
0  00001756c60be8  1862f0a6 448eb224 2a9e43d6 7e2f170a 038b763d 7...
1  0001daf4c2c76d  97266564 a898e555 86605076 76cc2642 ef279279 d...
2  0002115f48f982  9ec225f0 18281c6c e3b6b115 4a044c54 365fe576 a...
3  00035108e64677  3496fbfe 2fa1f27b 719854c4 f3c2de19 d75feb42 5...
4  00038c2941faa0  3e551fb7 45049ad8 8bb41691 123b4f4c 0b92cb59 5...


In [3]:
# Load a sample notebook to understand structure
def load_notebook(notebook_id, path):
    """Load one notebook's JSON file and return the parsed object.

    Args:
        notebook_id: File stem of the notebook; the file that is read is
            ``<path>/<notebook_id>.json``.
        path: Directory that contains the notebook files. Either a
            ``pathlib.Path`` or a plain string is accepted.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        FileNotFoundError: If the notebook file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    notebook_file = Path(path) / f"{notebook_id}.json"
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, which can mis-decode or reject non-ASCII
    # cell text on some systems.
    with open(notebook_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Inspect one training notebook to see how the JSON is laid out.
sample_nb = load_notebook('0002115f48f982', TRAIN_PATH)
sample_sources = list(sample_nb['source'].items())
print("Notebook keys:", list(sample_nb.keys()))
print("\nCell types:", list(sample_nb['cell_type'].items())[:5])
# The first stored entry is shown as the code example and the last stored
# entry as the markdown example.
print("\nSample code cell:", sample_sources[0])
print("\nSample markdown cell:", sample_sources[-1])

Notebook keys: ['cell_type', 'source']

Cell types: [('18281c6c', 'code'), ('e3b6b115', 'code'), ('4a044c54', 'code'), ('365fe576', 'code'), ('a3188e54', 'code')]

Sample code cell: ('18281c6c', 'import numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n%matplotlib inline\nimport os\nprint(os.listdir("../input"))\n')

Sample markdown cell: ('9ec225f0', 'Hi there,\n\nIs it ok that the same measurement have different target labels between signals?\nAccording to data description it should be the same (or not really?) . There are 38 cases of measurements with not consistent labels between signals.\n\nQuick and dirty code to show the problem below:')


In [4]:
# Create features for a notebook
def extract_features(notebook_id, path):
    """Build one row of hand-crafted features per cell of a notebook.

    The notebook is read with ``load_notebook``. For every cell the row holds
    the notebook id, the cell id, the cell type, four size measures of the
    source text, and binary substring flags. The import/comment flags are only
    evaluated for ``'code'`` cells and the heading/code-block/link flags only
    for ``'markdown'`` cells; for the other cell type they are 0.

    Returns:
        A ``(features, cell_ids)`` tuple: a DataFrame with one row per cell,
        in the order the cells are stored in the notebook, and the list of
        cell ids in that same order.
    """
    nb = load_notebook(notebook_id, path)
    heading_markers = ('# ', '## ', '### ')

    rows = []
    for cid, kind in nb['cell_type'].items():
        text = nb['source'][cid]
        is_code = kind == 'code'
        is_markdown = kind == 'markdown'

        rows.append({
            'notebook_id': notebook_id,
            'cell_id': cid,
            'cell_type': kind,
            # Size measures of the raw source text.
            'source_length': len(text),
            'line_count': text.count('\n') + 1,
            'word_count': len(text.split()),
            'char_count': len(text.replace('\n', '').replace(' ', '')),
            # Code-only flags (plain substring checks).
            'has_import': int(is_code and ('import ' in text or 'from ' in text)),
            'has_comment': int(is_code and '#' in text),
            # Markdown-only flags (plain substring checks).
            'has_heading': int(is_markdown and any(marker in text for marker in heading_markers)),
            'has_code_block': int(is_markdown and '```' in text),
            'has_link': int(is_markdown and ('http' in text or 'www.' in text)),
        })

    ordered_ids = [row['cell_id'] for row in rows]
    return pd.DataFrame(rows), ordered_ids

# Smoke-test the extractor on the notebook inspected earlier.
sample_features, sample_cell_ids = extract_features(
    notebook_id='0002115f48f982',
    path=TRAIN_PATH,
)
print(
    f"Sample features shape: {sample_features.shape}",
    sample_features.head(),
    sep="\n",
)

Sample features shape: (9, 12)
      notebook_id   cell_id cell_type  source_length  line_count  word_count  \
0  0002115f48f982  18281c6c      code            220           8          32   
1  0002115f48f982  e3b6b115      code             57           2           4   
2  0002115f48f982  4a044c54      code              9           1           1   
3  0002115f48f982  365fe576      code            164           3          15   
4  0002115f48f982  a3188e54      code            119           2          18   

   char_count  has_import  has_comment  has_heading  has_code_block  has_link  
0         188           1            1            0               0         0  
1          54           0            0            0               0         0  
2           9           0            0            0               0         0  
3         150           0            1            0               0         0  
4         101           0            1            0               0         0  


In [5]:
# Create training data
def create_training_data(notebook_ids, path, orders_df):
    """Create the training frame: per-cell features plus the true position.

    Args:
        notebook_ids: Iterable of notebook ids to process.
        path: Directory containing the notebook JSON files (forwarded to
            ``extract_features``).
        orders_df: DataFrame with an ``'id'`` column and a ``'cell_order'``
            column holding the cell ids as one whitespace-separated string.

    Returns:
        A single DataFrame with the rows of every processed notebook and an
        added ``'position'`` column: the 0-based index of the cell within the
        notebook's ``cell_order``. Cells that do not appear in ``cell_order``
        get NaN.

    Raises:
        KeyError: If a notebook id has no row in ``orders_df``.
        ValueError: If ``notebook_ids`` is empty (raised by ``pd.concat``).
    """
    # Build the id -> cell_order lookup once. Filtering `orders_df` inside the
    # loop scans the whole frame for every notebook. When an id occurs more
    # than once the first row wins.
    order_lookup = (
        orders_df
        .drop_duplicates(subset='id', keep='first')
        .set_index('id')['cell_order']
        .to_dict()
    )

    all_features = []
    for notebook_id in tqdm(notebook_ids, desc="Processing notebooks"):
        # Extract features (the id list is not needed here)
        features, _ = extract_features(notebook_id, path)

        # Get correct order
        if notebook_id not in order_lookup:
            raise KeyError(f"No cell_order found for notebook {notebook_id!r}")
        correct_order = order_lookup[notebook_id].split()

        # Create position mapping: cell id -> 0-based index in the true order
        position_map = {cell_id: pos for pos, cell_id in enumerate(correct_order)}

        # Add target position
        features['position'] = features['cell_id'].map(position_map)

        all_features.append(features)

    return pd.concat(all_features, ignore_index=True)

# Only the first 1000 notebooks of the orders table are used for this
# initial training run.
sample_notebooks = train_orders['id'].iloc[:1000].tolist()
train_df = create_training_data(
    notebook_ids=sample_notebooks,
    path=TRAIN_PATH,
    orders_df=train_orders,
)
print(
    f"Training data shape: {train_df.shape}",
    train_df.head(),
    sep="\n",
)

Processing notebooks:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing notebooks:   1%|          | 8/1000 [00:00<00:13, 74.18it/s]

Processing notebooks:   2%|▏         | 16/1000 [00:00<00:13, 73.08it/s]

Processing notebooks:   2%|▏         | 24/1000 [00:00<00:13, 73.15it/s]

Processing notebooks:   3%|▎         | 32/1000 [00:00<00:12, 75.14it/s]

Processing notebooks:   4%|▍         | 41/1000 [00:00<00:12, 78.68it/s]

Processing notebooks:   5%|▌         | 50/1000 [00:00<00:11, 80.53it/s]

Processing notebooks:   6%|▌         | 59/1000 [00:00<00:11, 82.49it/s]

Processing notebooks:   7%|▋         | 68/1000 [00:00<00:11, 83.20it/s]

Processing notebooks:   8%|▊         | 77/1000 [00:00<00:10, 84.63it/s]

Processing notebooks:   9%|▊         | 86/1000 [00:01<00:10, 85.24it/s]

Processing notebooks:  10%|▉         | 95/1000 [00:01<00:10, 83.34it/s]

Processing notebooks:  10%|█         | 104/1000 [00:01<00:10, 82.99it/s]

Processing notebooks:  11%|█▏        | 113/1000 [00:01<00:10, 83.41it/s]

Processing notebooks:  12%|█▏        | 122/1000 [00:01<00:10, 83.43it/s]

Processing notebooks:  13%|█▎        | 131/1000 [00:01<00:10, 82.69it/s]

Processing notebooks:  14%|█▍        | 140/1000 [00:01<00:10, 81.76it/s]

Processing notebooks:  15%|█▍        | 149/1000 [00:01<00:10, 81.41it/s]

Processing notebooks:  16%|█▌        | 158/1000 [00:01<00:10, 81.74it/s]

Processing notebooks:  17%|█▋        | 167/1000 [00:02<00:10, 82.07it/s]

Processing notebooks:  18%|█▊        | 176/1000 [00:02<00:10, 81.17it/s]

Processing notebooks:  18%|█▊        | 185/1000 [00:02<00:10, 79.54it/s]

Processing notebooks:  19%|█▉        | 193/1000 [00:02<00:10, 78.29it/s]

Processing notebooks:  20%|██        | 201/1000 [00:02<00:10, 78.34it/s]

Processing notebooks:  21%|██        | 210/1000 [00:02<00:09, 80.40it/s]

Processing notebooks:  22%|██▏       | 219/1000 [00:02<00:09, 81.28it/s]

Processing notebooks:  23%|██▎       | 228/1000 [00:02<00:09, 82.95it/s]

Processing notebooks:  24%|██▎       | 237/1000 [00:02<00:09, 84.37it/s]

Processing notebooks:  25%|██▍       | 246/1000 [00:03<00:08, 84.73it/s]

Processing notebooks:  26%|██▌       | 255/1000 [00:03<00:08, 84.42it/s]

Processing notebooks:  26%|██▋       | 264/1000 [00:03<00:08, 84.67it/s]

Processing notebooks:  27%|██▋       | 273/1000 [00:03<00:08, 85.02it/s]

Processing notebooks:  28%|██▊       | 282/1000 [00:03<00:08, 83.04it/s]

Processing notebooks:  29%|██▉       | 291/1000 [00:03<00:08, 81.61it/s]

Processing notebooks:  30%|███       | 300/1000 [00:03<00:08, 79.85it/s]

Processing notebooks:  31%|███       | 309/1000 [00:03<00:08, 80.62it/s]

Processing notebooks:  32%|███▏      | 318/1000 [00:03<00:08, 81.89it/s]

Processing notebooks:  33%|███▎      | 327/1000 [00:04<00:08, 82.87it/s]

Processing notebooks:  34%|███▎      | 336/1000 [00:04<00:07, 83.68it/s]

Processing notebooks:  34%|███▍      | 345/1000 [00:04<00:07, 85.05it/s]

Processing notebooks:  35%|███▌      | 354/1000 [00:04<00:07, 83.35it/s]

Processing notebooks:  36%|███▋      | 363/1000 [00:04<00:07, 83.23it/s]

Processing notebooks:  37%|███▋      | 373/1000 [00:04<00:07, 85.95it/s]

Processing notebooks:  38%|███▊      | 383/1000 [00:04<00:07, 87.45it/s]

Processing notebooks:  39%|███▉      | 392/1000 [00:04<00:06, 87.50it/s]

Processing notebooks:  40%|████      | 401/1000 [00:04<00:06, 87.93it/s]

Processing notebooks:  41%|████      | 411/1000 [00:04<00:06, 89.48it/s]

Processing notebooks:  42%|████▏     | 420/1000 [00:05<00:06, 87.94it/s]

Processing notebooks:  43%|████▎     | 429/1000 [00:05<00:06, 87.16it/s]

Processing notebooks:  44%|████▍     | 438/1000 [00:05<00:06, 86.19it/s]

Processing notebooks:  45%|████▍     | 447/1000 [00:05<00:06, 85.92it/s]

Processing notebooks:  46%|████▌     | 456/1000 [00:05<00:06, 86.18it/s]

Processing notebooks:  46%|████▋     | 465/1000 [00:05<00:06, 84.81it/s]

Processing notebooks:  47%|████▋     | 474/1000 [00:05<00:06, 83.62it/s]

Processing notebooks:  48%|████▊     | 483/1000 [00:05<00:06, 83.32it/s]

Processing notebooks:  49%|████▉     | 492/1000 [00:05<00:06, 81.11it/s]

Processing notebooks:  50%|█████     | 501/1000 [00:06<00:06, 80.32it/s]

Processing notebooks:  51%|█████     | 510/1000 [00:06<00:06, 79.35it/s]

Processing notebooks:  52%|█████▏    | 518/1000 [00:06<00:06, 79.29it/s]

Processing notebooks:  53%|█████▎    | 526/1000 [00:06<00:06, 78.60it/s]

Processing notebooks:  53%|█████▎    | 534/1000 [00:06<00:05, 78.79it/s]

Processing notebooks:  54%|█████▍    | 542/1000 [00:06<00:07, 62.21it/s]

Processing notebooks:  55%|█████▌    | 552/1000 [00:06<00:06, 70.01it/s]

Processing notebooks:  56%|█████▌    | 562/1000 [00:06<00:05, 76.10it/s]

Processing notebooks:  57%|█████▋    | 572/1000 [00:06<00:05, 80.21it/s]

Processing notebooks:  58%|█████▊    | 582/1000 [00:07<00:05, 83.11it/s]

Processing notebooks:  59%|█████▉    | 591/1000 [00:07<00:04, 84.46it/s]

Processing notebooks:  60%|██████    | 600/1000 [00:07<00:04, 85.40it/s]

Processing notebooks:  61%|██████    | 610/1000 [00:07<00:04, 87.66it/s]

Processing notebooks:  62%|██████▏   | 620/1000 [00:07<00:04, 88.81it/s]

Processing notebooks:  63%|██████▎   | 629/1000 [00:07<00:04, 88.30it/s]

Processing notebooks:  64%|██████▍   | 638/1000 [00:07<00:04, 87.80it/s]

Processing notebooks:  65%|██████▍   | 647/1000 [00:07<00:04, 87.71it/s]

Processing notebooks:  66%|██████▌   | 656/1000 [00:07<00:03, 87.30it/s]

Processing notebooks:  66%|██████▋   | 665/1000 [00:08<00:03, 87.99it/s]

Processing notebooks:  67%|██████▋   | 674/1000 [00:08<00:03, 87.70it/s]

Processing notebooks:  68%|██████▊   | 683/1000 [00:08<00:03, 85.14it/s]

Processing notebooks:  69%|██████▉   | 692/1000 [00:08<00:03, 83.70it/s]

Processing notebooks:  70%|███████   | 701/1000 [00:08<00:03, 83.90it/s]

Processing notebooks:  71%|███████   | 711/1000 [00:08<00:03, 87.16it/s]

Processing notebooks:  72%|███████▏  | 721/1000 [00:08<00:03, 88.86it/s]

Processing notebooks:  73%|███████▎  | 731/1000 [00:08<00:02, 90.05it/s]

Processing notebooks:  74%|███████▍  | 741/1000 [00:08<00:02, 91.13it/s]

Processing notebooks:  75%|███████▌  | 751/1000 [00:09<00:02, 92.65it/s]

Processing notebooks:  76%|███████▌  | 761/1000 [00:09<00:02, 93.37it/s]

Processing notebooks:  77%|███████▋  | 771/1000 [00:09<00:02, 94.53it/s]

Processing notebooks:  78%|███████▊  | 781/1000 [00:09<00:02, 91.86it/s]

Processing notebooks:  79%|███████▉  | 791/1000 [00:09<00:02, 92.44it/s]

Processing notebooks:  80%|████████  | 801/1000 [00:09<00:02, 92.71it/s]

Processing notebooks:  81%|████████  | 811/1000 [00:09<00:02, 92.44it/s]

Processing notebooks:  82%|████████▏ | 821/1000 [00:09<00:02, 89.36it/s]

Processing notebooks:  83%|████████▎ | 830/1000 [00:09<00:01, 88.36it/s]

Processing notebooks:  84%|████████▍ | 839/1000 [00:09<00:01, 88.02it/s]

Processing notebooks:  85%|████████▍ | 848/1000 [00:10<00:01, 86.59it/s]

Processing notebooks:  86%|████████▌ | 857/1000 [00:10<00:01, 86.51it/s]

Processing notebooks:  87%|████████▋ | 866/1000 [00:10<00:01, 84.65it/s]

Processing notebooks:  88%|████████▊ | 875/1000 [00:10<00:01, 84.03it/s]

Processing notebooks:  88%|████████▊ | 884/1000 [00:10<00:01, 80.69it/s]

Processing notebooks:  89%|████████▉ | 893/1000 [00:10<00:01, 79.23it/s]

Processing notebooks:  90%|█████████ | 902/1000 [00:10<00:01, 79.62it/s]

Processing notebooks:  91%|█████████ | 911/1000 [00:10<00:01, 80.84it/s]

Processing notebooks:  92%|█████████▏| 920/1000 [00:10<00:00, 81.94it/s]

Processing notebooks:  93%|█████████▎| 929/1000 [00:11<00:00, 82.20it/s]

Processing notebooks:  94%|█████████▍| 938/1000 [00:11<00:00, 82.45it/s]

Processing notebooks:  95%|█████████▍| 948/1000 [00:11<00:00, 85.22it/s]

Processing notebooks:  96%|█████████▌| 958/1000 [00:11<00:00, 87.04it/s]

Processing notebooks:  97%|█████████▋| 968/1000 [00:11<00:00, 88.40it/s]

Processing notebooks:  98%|█████████▊| 978/1000 [00:11<00:00, 90.16it/s]

Processing notebooks:  99%|█████████▉| 988/1000 [00:11<00:00, 90.77it/s]

Processing notebooks: 100%|█████████▉| 998/1000 [00:11<00:00, 89.84it/s]

Processing notebooks: 100%|██████████| 1000/1000 [00:11<00:00, 84.16it/s]

Training data shape: (46232, 13)
      notebook_id   cell_id cell_type  source_length  line_count  word_count  \
0  00001756c60be8  1862f0a6      code            930          17         140   
1  00001756c60be8  2a9e43d6      code            498          17          55   
2  00001756c60be8  038b763d      code             49           2           3   
3  00001756c60be8  2eefe0ef      code             45           1           2   
4  00001756c60be8  0beab1cd      code            694          19          39   

   char_count  has_import  has_comment  has_heading  has_code_block  has_link  \
0         774           1            1            0               0         0   
1         441           1            0            0               0         0   
2          47           1            0            0               0         0   
3          44           0            0            0               0         0   
4         584           0            0            0               0         0   





In [None]:
# Prepare features for modeling
# Identifier and target columns are never model inputs. The derived
# 'cell_type_code' column is excluded here as well because it is appended
# explicitly below: without that, re-running this cell (when train_df already
# carries the column) would list 'cell_type_code' twice in feature_cols.
excluded_cols = ['notebook_id', 'cell_id', 'position', 'cell_type', 'cell_type_code']
feature_cols = [col for col in train_df.columns if col not in excluded_cols]
print(f"Feature columns: {feature_cols}")

# Add cell type as a binary feature (1 for code cells, 0 otherwise)
train_df['cell_type_code'] = (train_df['cell_type'] == 'code').astype(int)
feature_cols.append('cell_type_code')

print(f"Final feature columns: {feature_cols}")
print(f"Training data shape: {train_df.shape}")

In [None]:
# Define Kendall tau metric for evaluation
def kendall_tau_score(y_true, y_pred):
    """Return the Kendall tau correlation between two equally long sequences.

    Delegates to ``scipy.stats.kendalltau`` and returns only the correlation
    value; the accompanying p-value is discarded.
    """
    from scipy.stats import kendalltau

    tau, _p_value = kendalltau(y_true, y_pred)
    return tau

# Wrap the Kendall tau helper as a scikit-learn scorer (higher is better).
kendall_scorer = make_scorer(kendall_tau_score, greater_is_better=True)

print("Training LightGBM model...")

# Fit on a 30% random sample of the cell rows to keep the run short;
# the fixed seed makes the sample repeatable.
sample_df = train_df.sample(frac=0.3, random_state=42)
X, y = sample_df[feature_cols], sample_df['position']

# Regress the position of a cell from its features.
model = lgb.LGBMRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    n_jobs=-1,
)
model.fit(X, y)

print("Model training completed!")
importance_by_feature = dict(zip(feature_cols, model.feature_importances_))
print(f"Feature importances: {importance_by_feature}")

In [None]:
# Predict on test data
def predict_notebook_order(notebook_id, path, model, feature_cols):
    """Predict the cell order of one notebook.

    Args:
        notebook_id: Id (file stem) of the notebook to order.
        path: Directory containing the notebook JSON file.
        model: Fitted estimator exposing ``predict``; it is called on the
            ``feature_cols`` columns of the extracted features.
        feature_cols: Names of the feature columns, in the order the model
            was trained on.

    Returns:
        The cell ids joined by single spaces, sorted by ascending predicted
        position. Cells with equal predictions keep the order in which they
        are stored in the notebook file. A notebook without cells yields
        ``''``.
    """
    features, cell_ids = extract_features(notebook_id, path)

    # A notebook without cells produces a column-less frame; there is nothing
    # to predict, so return an empty ordering instead of failing on the
    # missing 'cell_type' column.
    if not cell_ids:
        return ''

    # Add cell type code (same derivation as used for the training frame)
    features['cell_type_code'] = (features['cell_type'] == 'code').astype(int)

    # Predict positions
    predicted_positions = model.predict(features[feature_cols])

    order_df = pd.DataFrame({
        'cell_id': cell_ids,
        'predicted_position': predicted_positions
    })

    # Cells with identical features receive identical predictions. A stable
    # sort keeps such ties in their stored order; the default quicksort gives
    # no such guarantee.
    ordered_cells = order_df.sort_values(
        'predicted_position', kind='mergesort'
    )['cell_id'].tolist()

    return ' '.join(ordered_cells)

# Try the predictor on up to five test notebooks and show the first
# 100 characters of each predicted ordering.
test_notebooks = [json_file.stem for json_file in TEST_PATH.glob('*.json')][:5]
print("Testing prediction on sample notebooks:")
for nb_id in test_notebooks:
    try:
        predicted_order = predict_notebook_order(nb_id, TEST_PATH, model, feature_cols)
    except Exception as e:
        # Report the failure and carry on with the next notebook.
        print(f"Error with {nb_id}: {e}")
    else:
        print(f"{nb_id}: {predicted_order[:100]}...")

In [None]:
# Generate submission for all test notebooks
print("Generating submission for all test notebooks...")

# Path.glob yields files in arbitrary order; sort so that the row order of
# the submission is the same on every run.
test_notebooks = sorted(f.stem for f in TEST_PATH.glob('*.json'))
submission_data = []

for notebook_id in tqdm(test_notebooks, desc="Predicting notebooks"):
    try:
        predicted_order = predict_notebook_order(notebook_id, TEST_PATH, model, feature_cols)
    except Exception as e:
        print(f"Error with {notebook_id}: {e}")
        # Fall back to the order in which the cells are stored in the
        # notebook file, so the row still lists every cell id rather than an
        # empty string.
        try:
            fallback_notebook = load_notebook(notebook_id, TEST_PATH)
            predicted_order = ' '.join(fallback_notebook['cell_type'])
        except Exception as fallback_error:
            # The notebook itself cannot be read; the row is still emitted so
            # the submission has one row per test notebook.
            print(f"Fallback failed for {notebook_id}: {fallback_error}")
            predicted_order = ''

    submission_data.append({
        'id': notebook_id,
        'cell_order': predicted_order
    })

submission_df = pd.DataFrame(submission_data)
print(f"Submission shape: {submission_df.shape}")
print(submission_df.head())

In [None]:
# Save submission
submission_path = '/home/submission/submission.csv'
# to_csv does not create missing directories, so make sure the target
# folder exists before writing.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

# Verify format against the sample submission shipped with the data
# (located through DATA_PATH, like every other input of this notebook)
sample_sub = pd.read_csv(DATA_PATH / 'sample_submission.csv')
print("\nSample submission format:")
print(sample_sub.head())
print("\nOur submission format:")
print(submission_df.head())

print(f"\nColumns match: {list(submission_df.columns) == list(sample_sub.columns)}")
print(f"Number of rows match: {len(submission_df) == len(sample_sub)}")