# Experiment 003: Remove Data Leakage

**Objective:** Remove the data leakage introduced by the 'relative_position' feature and measure the model's true performance.

**Issue identified:** In exp_002, 'relative_position' = position / notebook_size, where 'position' is the target variable. This is data leakage: the feature is computed directly from the value the model is asked to predict.

**Changes:**
- Remove 'relative_position' feature from training
- Keep all other features (TF-IDF, headings, notebook-level stats)
- Use same 5-fold GroupKFold validation

**Expected outcome:** CV score will drop significantly, revealing true model performance.

In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from tqdm import tqdm
import re
from collections import Counter
from sklearn.model_selection import GroupKFold
from sklearn.metrics import make_scorer
from scipy.stats import kendalltau
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Set paths
# Absolute data locations for the machine this notebook was run on; edit
# these three constants to point at your own copy of the data.
# TRAIN_PATH: directory holding one "<notebook_id>.json" file per notebook
# (that is how it is read in the loading cell below).
TRAIN_PATH = Path('/home/data/train')
# TEST_PATH: presumably the matching directory for test notebooks; it is not
# used in the cells visible here - confirm before relying on it.
TEST_PATH = Path('/home/data/test')
# ORDERS_PATH: CSV with an 'id' column and a space-separated 'cell_order'
# column (both are read further down).
ORDERS_PATH = Path('/home/data/train_orders.csv')

print("Loading data...")
# One row per notebook id; the prints below report the frame's shape and the
# number of distinct ids as a quick sanity check.
orders_df = pd.read_csv(ORDERS_PATH)
print(f"Orders shape: {orders_df.shape}")
print(f"Unique notebooks: {orders_df['id'].nunique()}")

Loading data...


Orders shape: (119256, 2)
Unique notebooks: 119256


In [2]:
# Load a subset of training data for faster iteration
# Using 5,000 notebooks like exp_002
SEED = 42           # unchanged seed, so the sampled subset is the same as before
N_NOTEBOOKS = 5000  # subset size; increase for a fuller (slower) run

np.random.seed(SEED)
all_notebooks = orders_df['id'].unique()

# np.random.choice(..., replace=False) raises ValueError when asked for more
# items than exist, so cap the request at the number of available notebooks.
sample_size = min(N_NOTEBOOKS, len(all_notebooks))
selected_notebooks = np.random.choice(all_notebooks, size=sample_size, replace=False)

print(f"Selected {len(selected_notebooks)} notebooks for training")

Selected 5000 notebooks for training


In [3]:
# Feature extraction functions
def extract_basic_features(df):
    """Add per-cell size statistics and simple pattern flags to ``df``.

    Reads the ``source`` column and writes the new columns onto ``df`` itself;
    the same (mutated) frame is returned.

    Columns written, in order: ``source_length``, ``line_count``,
    ``word_count``, ``char_count``, then the 0/1 flags ``has_import``,
    ``has_comment``, ``has_heading``, ``has_code_block`` and ``has_link``.
    """
    text = df['source'].str

    # Size statistics. ``source_length`` and ``char_count`` hold the same
    # quantity (character count); both are kept so downstream column lists
    # keep working.
    df['source_length'] = text.len()
    df['line_count'] = 1 + text.count('\\n')
    df['word_count'] = text.split().str.len()
    df['char_count'] = text.len()

    # Binary flags: 1 when the regex is found in the cell text, otherwise 0.
    # Missing text counts as "not found". Note that '^' in the heading
    # pattern anchors to the start of the whole cell text, not to each line.
    flag_patterns = {
        'has_import': 'import\\s+\\w+',
        'has_comment': '#',
        'has_heading': '^#+\\s+',
        'has_code_block': '```',
        'has_link': '\\[.*\\]\\(.*\\)',
    }
    for column, pattern in flag_patterns.items():
        df[column] = text.contains(pattern, regex=True, na=False).astype(int)

    return df

def extract_heading_features(df):
    """Add a heading-depth column and keyword flags to ``df``.

    Reads the ``source`` column and writes onto ``df`` in place; the same
    frame is returned.

    Columns written:
      * ``heading_level`` - 1-6 when the cell text starts with exactly that
        many ``#`` characters followed by whitespace, otherwise 0.
      * ``heading_<keyword>`` - 1 when ``<keyword>`` occurs anywhere in the
        cell text (case-insensitive substring match), otherwise 0. The match
        is not restricted to heading lines.
    """
    text = df['source'].str

    # One boolean mask per depth. '#' cannot satisfy the trailing '\s+', so at
    # most one depth matches any given cell.
    level_masks = {
        level: text.match(f'^#{{{level}}}\\s+', na=False)
        for level in range(1, 7)
    }
    df['heading_level'] = 0
    for level, is_level in level_masks.items():
        df.loc[is_level, 'heading_level'] = level

    # Keywords that commonly appear in section titles.
    common_headings = (
        'introduction', 'conclusion', 'summary', 'results', 'methods',
        'analysis', 'eda', 'exploratory', 'data', 'preprocessing',
        'model', 'training', 'evaluation', 'references', 'appendix',
        'setup', 'imports', 'installation', 'requirements',
        'visualization', 'plot', 'train', 'test', 'validation',
    )
    for heading in common_headings:
        found = text.contains(heading, case=False, na=False)
        df[f'heading_{heading}'] = found.astype(int)

    return df

def extract_semantic_features(df):
    """Add 0/1 flags describing what a cell's text mentions or contains.

    Reads the ``source`` column and writes the flag columns onto ``df`` in
    place; the same frame is returned. Missing text yields 0 for every flag.

    Columns written, in order: ``has_print``, ``has_kaggle``, ``has_input``,
    ``has_data``, ``has_function``, ``has_class``, ``has_model``,
    ``has_train``, ``has_test``, ``has_plot``, ``has_import``.

    Note: ``extract_basic_features`` also writes a ``has_import`` column using
    a looser pattern; whichever function runs last determines the final value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``source`` column of cell text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the flag columns added or overwritten.
    """
    text = df['source'].str

    # (column, regex, case_sensitive)
    flag_specs = [
        ('has_print', 'print\\s*\\(', True),
        ('has_kaggle', 'kaggle', False),
        ('has_input', 'input', False),
        ('has_data', '\\bdata\\b', False),
        ('has_function', 'def\\s+\\w+\\s*\\(', True),
        ('has_class', 'class\\s+\\w+', True),
        ('has_model', '\\bmodel\\b', False),
        ('has_train', '\\btrain\\b', False),
        ('has_test', '\\btest\\b', False),
        ('has_plot', '\\.plot\\s*\\(|\\.show\\s*\\(', True),
    ]
    for column, pattern, case_sensitive in flag_specs:
        df[column] = text.contains(
            pattern, case=case_sensitive, regex=True, na=False
        ).astype(int)

    # An import statement starts a *line*, not necessarily the cell. Without
    # re.MULTILINE, '^' anchors only at the start of the whole string, so a
    # cell such as "# setup\nimport numpy as np" was flagged 0.
    df['has_import'] = text.contains(
        '^\\s*import\\s+|^\\s*from\\s+', flags=re.MULTILINE, regex=True, na=False
    ).astype(int)

    return df

In [4]:
# Load and process training data
print("Loading training notebooks...")
train_data = []
notebook_sizes = {}

# Build the id -> cell_order lookup once. Filtering orders_df inside the loop
# rescanned every row of the orders table for each notebook. Dropping
# duplicate ids first keeps the first row per id, which matches the previous
# `.iloc[0]` selection.
cell_order_by_id = (
    orders_df.drop_duplicates('id').set_index('id')['cell_order'].to_dict()
)

for notebook_id in tqdm(selected_notebooks, desc="Processing notebooks"):
    notebook_path = TRAIN_PATH / f"{notebook_id}.json"

    # Notebooks listed in the orders file but absent on disk are skipped.
    if not notebook_path.exists():
        continue

    # Decode as UTF-8 explicitly: open() otherwise falls back to the locale's
    # default encoding, which can mis-decode non-ASCII cell text.
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    # Ground-truth ordering: space-separated cell ids, first cell first.
    cell_order = cell_order_by_id[notebook_id].split()
    notebook_sizes[notebook_id] = len(cell_order)

    # The notebook structure has cell_type and source as separate dictionaries
    cell_types = notebook['cell_type']
    sources = notebook['source']

    # 'position' (index of the cell in the true order) is the target variable.
    for position, cell_id in enumerate(cell_order):
        train_data.append({
            'notebook_id': notebook_id,
            'cell_id': cell_id,
            'cell_type': 1 if cell_types[cell_id] == 'code' else 0,  # 1 = code, 0 = anything else
            'source': sources[cell_id],
            'position': position
        })

train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Average cells per notebook: {train_df.groupby('notebook_id').size().mean():.1f}")

Loading training notebooks...


Processing notebooks:   0%|          | 0/5000 [00:00<?, ?it/s]

Processing notebooks:   0%|          | 11/5000 [00:00<00:49, 100.99it/s]

Processing notebooks:   0%|          | 22/5000 [00:00<00:48, 101.79it/s]

Processing notebooks:   1%|          | 33/5000 [00:00<00:48, 101.86it/s]

Processing notebooks:   1%|          | 44/5000 [00:00<00:48, 101.52it/s]

Processing notebooks:   1%|          | 55/5000 [00:00<00:48, 101.87it/s]

Processing notebooks:   1%|▏         | 66/5000 [00:00<00:48, 102.71it/s]

Processing notebooks:   2%|▏         | 77/5000 [00:00<00:48, 102.29it/s]

Processing notebooks:   2%|▏         | 88/5000 [00:00<00:48, 101.34it/s]

Processing notebooks:   2%|▏         | 99/5000 [00:00<00:48, 101.57it/s]

Processing notebooks:   2%|▏         | 110/5000 [00:01<00:48, 101.67it/s]

Processing notebooks:   2%|▏         | 121/5000 [00:01<00:48, 101.58it/s]

Processing notebooks:   3%|▎         | 132/5000 [00:01<00:48, 99.72it/s] 

Processing notebooks:   3%|▎         | 142/5000 [00:01<00:48, 99.62it/s]

Processing notebooks:   3%|▎         | 153/5000 [00:01<00:48, 99.77it/s]

Processing notebooks:   3%|▎         | 163/5000 [00:01<00:48, 99.68it/s]

Processing notebooks:   3%|▎         | 174/5000 [00:01<00:48, 100.19it/s]

Processing notebooks:   4%|▎         | 185/5000 [00:01<00:48, 100.20it/s]

Processing notebooks:   4%|▍         | 196/5000 [00:01<00:48, 98.85it/s] 

Processing notebooks:   4%|▍         | 206/5000 [00:02<00:49, 97.11it/s]

Processing notebooks:   4%|▍         | 216/5000 [00:02<00:49, 96.39it/s]

Processing notebooks:   5%|▍         | 227/5000 [00:02<00:48, 98.24it/s]

Processing notebooks:   5%|▍         | 238/5000 [00:02<00:48, 99.09it/s]

Processing notebooks:   5%|▍         | 249/5000 [00:02<00:47, 99.83it/s]

Processing notebooks:   5%|▌         | 260/5000 [00:02<00:47, 100.15it/s]

Processing notebooks:   5%|▌         | 271/5000 [00:02<00:47, 100.61it/s]

Processing notebooks:   6%|▌         | 282/5000 [00:02<00:46, 101.43it/s]

Processing notebooks:   6%|▌         | 293/5000 [00:02<00:46, 101.52it/s]

Processing notebooks:   6%|▌         | 304/5000 [00:03<00:46, 101.72it/s]

Processing notebooks:   6%|▋         | 315/5000 [00:03<00:46, 101.72it/s]

Processing notebooks:   7%|▋         | 326/5000 [00:03<00:45, 101.80it/s]

Processing notebooks:   7%|▋         | 337/5000 [00:03<00:46, 101.15it/s]

Processing notebooks:   7%|▋         | 348/5000 [00:03<00:46, 99.69it/s] 

Processing notebooks:   7%|▋         | 359/5000 [00:03<00:46, 99.80it/s]

Processing notebooks:   7%|▋         | 369/5000 [00:03<00:46, 99.43it/s]

Processing notebooks:   8%|▊         | 379/5000 [00:03<00:46, 99.31it/s]

Processing notebooks:   8%|▊         | 389/5000 [00:03<00:46, 99.41it/s]

Processing notebooks:   8%|▊         | 399/5000 [00:03<00:46, 98.64it/s]

Processing notebooks:   8%|▊         | 409/5000 [00:04<00:46, 98.05it/s]

Processing notebooks:   8%|▊         | 419/5000 [00:04<00:47, 97.20it/s]

Processing notebooks:   9%|▊         | 429/5000 [00:04<00:46, 97.29it/s]

Processing notebooks:   9%|▉         | 440/5000 [00:04<00:46, 98.95it/s]

Processing notebooks:   9%|▉         | 451/5000 [00:04<00:45, 100.15it/s]

Processing notebooks:   9%|▉         | 462/5000 [00:04<00:45, 100.56it/s]

Processing notebooks:   9%|▉         | 473/5000 [00:04<00:44, 100.61it/s]

Processing notebooks:  10%|▉         | 484/5000 [00:04<00:44, 100.63it/s]

Processing notebooks:  10%|▉         | 495/5000 [00:04<00:44, 100.96it/s]

Processing notebooks:  10%|█         | 506/5000 [00:05<00:44, 100.77it/s]

Processing notebooks:  10%|█         | 517/5000 [00:05<00:44, 100.89it/s]

Processing notebooks:  11%|█         | 528/5000 [00:05<00:44, 101.07it/s]

Processing notebooks:  11%|█         | 539/5000 [00:05<00:44, 100.29it/s]

Processing notebooks:  11%|█         | 550/5000 [00:05<00:44, 99.83it/s] 

Processing notebooks:  11%|█         | 560/5000 [00:05<00:44, 99.69it/s]

Processing notebooks:  11%|█▏        | 570/5000 [00:05<00:44, 99.63it/s]

Processing notebooks:  12%|█▏        | 580/5000 [00:05<00:44, 99.73it/s]

Processing notebooks:  12%|█▏        | 590/5000 [00:05<00:56, 78.02it/s]

Processing notebooks:  12%|█▏        | 600/5000 [00:06<00:53, 82.29it/s]

Processing notebooks:  12%|█▏        | 610/5000 [00:06<00:51, 85.73it/s]

Processing notebooks:  12%|█▏        | 620/5000 [00:06<00:49, 88.79it/s]

Processing notebooks:  13%|█▎        | 631/5000 [00:06<00:47, 92.71it/s]

Processing notebooks:  13%|█▎        | 642/5000 [00:06<00:45, 95.18it/s]

Processing notebooks:  13%|█▎        | 653/5000 [00:06<00:44, 97.28it/s]

Processing notebooks:  13%|█▎        | 664/5000 [00:06<00:43, 99.27it/s]

Processing notebooks:  14%|█▎        | 675/5000 [00:06<00:43, 100.26it/s]

Processing notebooks:  14%|█▎        | 686/5000 [00:06<00:42, 101.02it/s]

Processing notebooks:  14%|█▍        | 697/5000 [00:07<00:42, 101.48it/s]

Processing notebooks:  14%|█▍        | 708/5000 [00:07<00:42, 101.61it/s]

Processing notebooks:  14%|█▍        | 719/5000 [00:07<00:41, 102.04it/s]

Processing notebooks:  15%|█▍        | 730/5000 [00:07<00:41, 102.70it/s]

Processing notebooks:  15%|█▍        | 741/5000 [00:07<00:41, 102.66it/s]

Processing notebooks:  15%|█▌        | 752/5000 [00:07<00:41, 102.12it/s]

Processing notebooks:  15%|█▌        | 763/5000 [00:07<00:41, 101.80it/s]

Processing notebooks:  15%|█▌        | 774/5000 [00:07<00:41, 101.41it/s]

Processing notebooks:  16%|█▌        | 785/5000 [00:07<00:41, 101.05it/s]

Processing notebooks:  16%|█▌        | 796/5000 [00:08<00:41, 101.35it/s]

Processing notebooks:  16%|█▌        | 807/5000 [00:08<00:41, 100.79it/s]

Processing notebooks:  16%|█▋        | 818/5000 [00:08<00:42, 99.46it/s] 

Processing notebooks:  17%|█▋        | 828/5000 [00:08<00:42, 99.18it/s]

Processing notebooks:  17%|█▋        | 839/5000 [00:08<00:41, 100.94it/s]

Processing notebooks:  17%|█▋        | 850/5000 [00:08<00:40, 101.49it/s]

Processing notebooks:  17%|█▋        | 861/5000 [00:08<00:40, 101.85it/s]

Processing notebooks:  17%|█▋        | 872/5000 [00:08<00:40, 102.53it/s]

Processing notebooks:  18%|█▊        | 883/5000 [00:08<00:39, 103.43it/s]

Processing notebooks:  18%|█▊        | 894/5000 [00:08<00:39, 103.71it/s]

Processing notebooks:  18%|█▊        | 905/5000 [00:09<00:39, 103.59it/s]

Processing notebooks:  18%|█▊        | 916/5000 [00:09<00:39, 103.21it/s]

Processing notebooks:  19%|█▊        | 927/5000 [00:09<00:39, 102.82it/s]

Processing notebooks:  19%|█▉        | 938/5000 [00:09<00:39, 102.88it/s]

Processing notebooks:  19%|█▉        | 949/5000 [00:09<00:39, 103.37it/s]

Processing notebooks:  19%|█▉        | 960/5000 [00:09<00:39, 102.52it/s]

Processing notebooks:  19%|█▉        | 971/5000 [00:09<00:39, 102.13it/s]

Processing notebooks:  20%|█▉        | 982/5000 [00:09<00:39, 101.43it/s]

Processing notebooks:  20%|█▉        | 993/5000 [00:09<00:39, 101.21it/s]

Processing notebooks:  20%|██        | 1004/5000 [00:10<00:39, 100.91it/s]

Processing notebooks:  20%|██        | 1015/5000 [00:10<00:39, 100.91it/s]

Processing notebooks:  21%|██        | 1026/5000 [00:10<00:39, 99.52it/s] 

Processing notebooks:  21%|██        | 1037/5000 [00:10<00:39, 100.05it/s]

Processing notebooks:  21%|██        | 1048/5000 [00:10<00:39, 100.88it/s]

Processing notebooks:  21%|██        | 1059/5000 [00:10<00:38, 101.26it/s]

Processing notebooks:  21%|██▏       | 1070/5000 [00:10<00:38, 102.33it/s]

Processing notebooks:  22%|██▏       | 1081/5000 [00:10<00:38, 102.22it/s]

Processing notebooks:  22%|██▏       | 1092/5000 [00:10<00:38, 102.04it/s]

Processing notebooks:  22%|██▏       | 1103/5000 [00:11<00:37, 103.00it/s]

Processing notebooks:  22%|██▏       | 1114/5000 [00:11<00:37, 103.71it/s]

Processing notebooks:  22%|██▎       | 1125/5000 [00:11<00:37, 103.83it/s]

Processing notebooks:  23%|██▎       | 1136/5000 [00:11<00:37, 103.21it/s]

Processing notebooks:  23%|██▎       | 1147/5000 [00:11<00:37, 102.87it/s]

Processing notebooks:  23%|██▎       | 1158/5000 [00:11<00:37, 102.86it/s]

Processing notebooks:  23%|██▎       | 1169/5000 [00:11<00:37, 102.89it/s]

Processing notebooks:  24%|██▎       | 1180/5000 [00:11<00:37, 101.75it/s]

Processing notebooks:  24%|██▍       | 1191/5000 [00:11<00:37, 101.49it/s]

Processing notebooks:  24%|██▍       | 1202/5000 [00:12<00:37, 101.38it/s]

Processing notebooks:  24%|██▍       | 1213/5000 [00:12<00:37, 100.95it/s]

Processing notebooks:  24%|██▍       | 1224/5000 [00:12<00:37, 100.96it/s]

Processing notebooks:  25%|██▍       | 1235/5000 [00:12<00:37, 100.83it/s]

Processing notebooks:  25%|██▍       | 1246/5000 [00:12<00:37, 99.57it/s] 

Processing notebooks:  25%|██▌       | 1256/5000 [00:12<00:37, 98.92it/s]

Processing notebooks:  25%|██▌       | 1267/5000 [00:12<00:37, 99.50it/s]

Processing notebooks:  26%|██▌       | 1278/5000 [00:12<00:36, 101.18it/s]

Processing notebooks:  26%|██▌       | 1289/5000 [00:12<00:36, 101.32it/s]

Processing notebooks:  26%|██▌       | 1300/5000 [00:12<00:36, 102.05it/s]

Processing notebooks:  26%|██▌       | 1311/5000 [00:13<00:35, 102.82it/s]

Processing notebooks:  26%|██▋       | 1322/5000 [00:13<00:35, 103.39it/s]

Processing notebooks:  27%|██▋       | 1333/5000 [00:13<00:35, 103.32it/s]

Processing notebooks:  27%|██▋       | 1344/5000 [00:13<00:35, 102.73it/s]

Processing notebooks:  27%|██▋       | 1355/5000 [00:13<00:35, 102.69it/s]

Processing notebooks:  27%|██▋       | 1366/5000 [00:13<00:35, 102.06it/s]

Processing notebooks:  28%|██▊       | 1377/5000 [00:13<00:35, 102.11it/s]

Processing notebooks:  28%|██▊       | 1388/5000 [00:13<00:35, 101.69it/s]

Processing notebooks:  28%|██▊       | 1399/5000 [00:13<00:35, 101.13it/s]

Processing notebooks:  28%|██▊       | 1410/5000 [00:14<00:35, 100.94it/s]

Processing notebooks:  28%|██▊       | 1421/5000 [00:14<00:35, 100.58it/s]

Processing notebooks:  29%|██▊       | 1432/5000 [00:14<00:35, 100.89it/s]

Processing notebooks:  29%|██▉       | 1443/5000 [00:14<00:35, 100.68it/s]

Processing notebooks:  29%|██▉       | 1454/5000 [00:14<00:35, 99.35it/s] 

Processing notebooks:  29%|██▉       | 1465/5000 [00:14<00:35, 100.01it/s]

Processing notebooks:  30%|██▉       | 1476/5000 [00:14<00:35, 100.28it/s]

Processing notebooks:  30%|██▉       | 1487/5000 [00:14<00:34, 101.12it/s]

Processing notebooks:  30%|██▉       | 1498/5000 [00:14<00:34, 101.90it/s]

Processing notebooks:  30%|███       | 1509/5000 [00:15<00:34, 102.19it/s]

Processing notebooks:  30%|███       | 1520/5000 [00:15<00:33, 102.49it/s]

Processing notebooks:  31%|███       | 1531/5000 [00:15<00:33, 103.09it/s]

Processing notebooks:  31%|███       | 1542/5000 [00:15<00:33, 103.68it/s]

Processing notebooks:  31%|███       | 1553/5000 [00:15<00:33, 104.19it/s]

Processing notebooks:  31%|███▏      | 1564/5000 [00:15<00:33, 103.55it/s]

Processing notebooks:  32%|███▏      | 1575/5000 [00:15<00:33, 103.23it/s]

Processing notebooks:  32%|███▏      | 1586/5000 [00:15<00:33, 103.02it/s]

Processing notebooks:  32%|███▏      | 1597/5000 [00:15<00:33, 103.08it/s]

Processing notebooks:  32%|███▏      | 1608/5000 [00:15<00:32, 103.81it/s]

Processing notebooks:  32%|███▏      | 1619/5000 [00:16<00:32, 103.78it/s]

Processing notebooks:  33%|███▎      | 1630/5000 [00:16<00:32, 102.75it/s]

Processing notebooks:  33%|███▎      | 1641/5000 [00:16<00:32, 102.10it/s]

Processing notebooks:  33%|███▎      | 1652/5000 [00:16<00:32, 101.81it/s]

Processing notebooks:  33%|███▎      | 1663/5000 [00:16<00:32, 101.48it/s]

Processing notebooks:  33%|███▎      | 1674/5000 [00:16<00:32, 101.43it/s]

Processing notebooks:  34%|███▎      | 1685/5000 [00:16<00:33, 100.27it/s]

Processing notebooks:  34%|███▍      | 1696/5000 [00:16<00:33, 99.70it/s] 

Processing notebooks:  34%|███▍      | 1706/5000 [00:16<00:33, 99.13it/s]

Processing notebooks:  34%|███▍      | 1717/5000 [00:17<00:32, 100.95it/s]

Processing notebooks:  35%|███▍      | 1728/5000 [00:17<00:32, 101.70it/s]

Processing notebooks:  35%|███▍      | 1739/5000 [00:17<00:31, 102.07it/s]

Processing notebooks:  35%|███▌      | 1750/5000 [00:17<00:31, 102.57it/s]

Processing notebooks:  35%|███▌      | 1761/5000 [00:17<00:31, 103.24it/s]

Processing notebooks:  35%|███▌      | 1772/5000 [00:17<00:31, 103.51it/s]

Processing notebooks:  36%|███▌      | 1783/5000 [00:17<00:31, 103.49it/s]

Processing notebooks:  36%|███▌      | 1794/5000 [00:17<00:30, 103.57it/s]

Processing notebooks:  36%|███▌      | 1805/5000 [00:17<00:30, 103.72it/s]

Processing notebooks:  36%|███▋      | 1816/5000 [00:18<00:30, 103.89it/s]

Processing notebooks:  37%|███▋      | 1827/5000 [00:18<00:30, 103.52it/s]

Processing notebooks:  37%|███▋      | 1838/5000 [00:18<00:30, 102.48it/s]

Processing notebooks:  37%|███▋      | 1849/5000 [00:18<00:30, 101.95it/s]

Processing notebooks:  37%|███▋      | 1860/5000 [00:18<00:30, 101.51it/s]

Processing notebooks:  37%|███▋      | 1871/5000 [00:18<00:30, 101.05it/s]

Processing notebooks:  38%|███▊      | 1882/5000 [00:18<00:30, 101.18it/s]

Processing notebooks:  38%|███▊      | 1893/5000 [00:18<00:30, 100.66it/s]

Processing notebooks:  38%|███▊      | 1904/5000 [00:18<00:31, 99.54it/s] 

Processing notebooks:  38%|███▊      | 1914/5000 [00:19<00:31, 98.53it/s]

Processing notebooks:  38%|███▊      | 1925/5000 [00:19<00:30, 100.43it/s]

Processing notebooks:  39%|███▊      | 1936/5000 [00:19<00:30, 101.69it/s]

Processing notebooks:  39%|███▉      | 1947/5000 [00:19<00:29, 102.00it/s]

Processing notebooks:  39%|███▉      | 1958/5000 [00:19<00:29, 102.70it/s]

Processing notebooks:  39%|███▉      | 1969/5000 [00:19<00:29, 104.23it/s]

Processing notebooks:  40%|███▉      | 1980/5000 [00:19<00:28, 105.13it/s]

Processing notebooks:  40%|███▉      | 1991/5000 [00:19<00:28, 104.74it/s]

Processing notebooks:  40%|████      | 2002/5000 [00:19<00:28, 104.26it/s]

Processing notebooks:  40%|████      | 2013/5000 [00:19<00:28, 104.37it/s]

Processing notebooks:  40%|████      | 2024/5000 [00:20<00:28, 103.75it/s]

Processing notebooks:  41%|████      | 2035/5000 [00:20<00:28, 103.76it/s]

Processing notebooks:  41%|████      | 2046/5000 [00:20<00:28, 104.26it/s]

Processing notebooks:  41%|████      | 2057/5000 [00:20<00:28, 104.89it/s]

Processing notebooks:  41%|████▏     | 2068/5000 [00:20<00:28, 104.56it/s]

Processing notebooks:  42%|████▏     | 2079/5000 [00:20<00:28, 104.22it/s]

Processing notebooks:  42%|████▏     | 2090/5000 [00:20<00:28, 103.27it/s]

Processing notebooks:  42%|████▏     | 2101/5000 [00:20<00:28, 102.50it/s]

Processing notebooks:  42%|████▏     | 2112/5000 [00:20<00:28, 101.75it/s]

Processing notebooks:  42%|████▏     | 2123/5000 [00:21<00:28, 101.28it/s]

Processing notebooks:  43%|████▎     | 2134/5000 [00:21<00:28, 101.41it/s]

Processing notebooks:  43%|████▎     | 2145/5000 [00:21<00:28, 100.19it/s]

Processing notebooks:  43%|████▎     | 2156/5000 [00:21<00:28, 99.59it/s] 

Processing notebooks:  43%|████▎     | 2166/5000 [00:21<00:28, 99.11it/s]

Processing notebooks:  44%|████▎     | 2177/5000 [00:21<00:28, 100.49it/s]

Processing notebooks:  44%|████▍     | 2188/5000 [00:21<00:27, 102.15it/s]

Processing notebooks:  44%|████▍     | 2199/5000 [00:21<00:27, 102.55it/s]

Processing notebooks:  44%|████▍     | 2210/5000 [00:21<00:27, 103.27it/s]

Processing notebooks:  44%|████▍     | 2221/5000 [00:21<00:26, 104.35it/s]

Processing notebooks:  45%|████▍     | 2232/5000 [00:22<00:26, 104.63it/s]

Processing notebooks:  45%|████▍     | 2243/5000 [00:22<00:26, 104.26it/s]

Processing notebooks:  45%|████▌     | 2254/5000 [00:22<00:26, 104.12it/s]

Processing notebooks:  45%|████▌     | 2265/5000 [00:22<00:26, 104.04it/s]

Processing notebooks:  46%|████▌     | 2276/5000 [00:22<00:26, 104.11it/s]

Processing notebooks:  46%|████▌     | 2287/5000 [00:22<00:26, 102.78it/s]

Processing notebooks:  46%|████▌     | 2298/5000 [00:22<00:26, 101.98it/s]

Processing notebooks:  46%|████▌     | 2309/5000 [00:22<00:26, 101.46it/s]

Processing notebooks:  46%|████▋     | 2320/5000 [00:22<00:26, 101.17it/s]

Processing notebooks:  47%|████▋     | 2331/5000 [00:23<00:26, 101.31it/s]

Processing notebooks:  47%|████▋     | 2342/5000 [00:23<00:26, 100.79it/s]

Processing notebooks:  47%|████▋     | 2353/5000 [00:23<00:27, 97.46it/s] 

Processing notebooks:  47%|████▋     | 2364/5000 [00:23<00:26, 98.44it/s]

Processing notebooks:  47%|████▋     | 2374/5000 [00:23<00:26, 98.37it/s]

Processing notebooks:  48%|████▊     | 2384/5000 [00:23<00:26, 98.81it/s]

Processing notebooks:  48%|████▊     | 2395/5000 [00:23<00:26, 99.74it/s]

Processing notebooks:  48%|████▊     | 2406/5000 [00:23<00:25, 100.03it/s]

Processing notebooks:  48%|████▊     | 2417/5000 [00:23<00:25, 100.61it/s]

Processing notebooks:  49%|████▊     | 2428/5000 [00:24<00:25, 100.59it/s]

Processing notebooks:  49%|████▉     | 2439/5000 [00:24<00:25, 100.90it/s]

Processing notebooks:  49%|████▉     | 2450/5000 [00:24<00:25, 99.95it/s] 

Processing notebooks:  49%|████▉     | 2460/5000 [00:24<00:25, 99.22it/s]

Processing notebooks:  49%|████▉     | 2471/5000 [00:24<00:25, 99.68it/s]

Processing notebooks:  50%|████▉     | 2482/5000 [00:24<00:25, 99.91it/s]

Processing notebooks:  50%|████▉     | 2492/5000 [00:24<00:25, 99.57it/s]

Processing notebooks:  50%|█████     | 2502/5000 [00:24<00:25, 99.15it/s]

Processing notebooks:  50%|█████     | 2512/5000 [00:24<00:25, 97.20it/s]

Processing notebooks:  50%|█████     | 2522/5000 [00:25<00:26, 94.84it/s]

Processing notebooks:  51%|█████     | 2532/5000 [00:25<00:26, 94.49it/s]

Processing notebooks:  51%|█████     | 2542/5000 [00:25<00:26, 94.37it/s]

Processing notebooks:  51%|█████     | 2552/5000 [00:25<00:26, 93.98it/s]

Processing notebooks:  51%|█████     | 2562/5000 [00:25<00:25, 93.85it/s]

Processing notebooks:  51%|█████▏    | 2572/5000 [00:25<00:26, 92.26it/s]

Processing notebooks:  52%|█████▏    | 2582/5000 [00:25<00:26, 91.88it/s]

Processing notebooks:  52%|█████▏    | 2592/5000 [00:25<00:25, 93.54it/s]

Processing notebooks:  52%|█████▏    | 2603/5000 [00:25<00:25, 95.72it/s]

Processing notebooks:  52%|█████▏    | 2613/5000 [00:25<00:24, 96.93it/s]

Processing notebooks:  52%|█████▏    | 2623/5000 [00:26<00:24, 96.19it/s]

Processing notebooks:  53%|█████▎    | 2634/5000 [00:26<00:24, 97.32it/s]

Processing notebooks:  53%|█████▎    | 2645/5000 [00:26<00:23, 98.50it/s]

Processing notebooks:  53%|█████▎    | 2655/5000 [00:26<00:23, 98.57it/s]

Processing notebooks:  53%|█████▎    | 2665/5000 [00:26<00:23, 98.02it/s]

Processing notebooks:  54%|█████▎    | 2675/5000 [00:26<00:23, 97.09it/s]

Processing notebooks:  54%|█████▎    | 2685/5000 [00:26<00:23, 97.91it/s]

Processing notebooks:  54%|█████▍    | 2695/5000 [00:26<00:23, 98.10it/s]

Processing notebooks:  54%|█████▍    | 2706/5000 [00:26<00:23, 98.53it/s]

Processing notebooks:  54%|█████▍    | 2716/5000 [00:27<00:23, 98.07it/s]

Processing notebooks:  55%|█████▍    | 2726/5000 [00:27<00:23, 98.16it/s]

Processing notebooks:  55%|█████▍    | 2737/5000 [00:27<00:22, 98.81it/s]

Processing notebooks:  55%|█████▍    | 2747/5000 [00:27<00:23, 97.57it/s]

Processing notebooks:  55%|█████▌    | 2757/5000 [00:27<00:23, 96.04it/s]

Processing notebooks:  55%|█████▌    | 2767/5000 [00:27<00:23, 96.42it/s]

Processing notebooks:  56%|█████▌    | 2777/5000 [00:27<00:23, 94.27it/s]

Processing notebooks:  56%|█████▌    | 2787/5000 [00:27<00:24, 91.85it/s]

Processing notebooks:  56%|█████▌    | 2797/5000 [00:27<00:24, 90.16it/s]

Processing notebooks:  56%|█████▌    | 2807/5000 [00:27<00:24, 91.00it/s]

Processing notebooks:  56%|█████▋    | 2818/5000 [00:28<00:23, 94.02it/s]

Processing notebooks:  57%|█████▋    | 2828/5000 [00:28<00:23, 93.91it/s]

Processing notebooks:  57%|█████▋    | 2838/5000 [00:28<00:22, 95.35it/s]

Processing notebooks:  57%|█████▋    | 2849/5000 [00:28<00:22, 96.86it/s]

Processing notebooks:  57%|█████▋    | 2860/5000 [00:28<00:21, 98.05it/s]

Processing notebooks:  57%|█████▋    | 2870/5000 [00:28<00:21, 98.15it/s]

Processing notebooks:  58%|█████▊    | 2880/5000 [00:28<00:21, 97.69it/s]

Processing notebooks:  58%|█████▊    | 2890/5000 [00:28<00:21, 98.32it/s]

Processing notebooks:  58%|█████▊    | 2900/5000 [00:28<00:21, 98.55it/s]

Processing notebooks:  58%|█████▊    | 2910/5000 [00:29<00:21, 97.93it/s]

Processing notebooks:  58%|█████▊    | 2920/5000 [00:29<00:21, 97.24it/s]

Processing notebooks:  59%|█████▊    | 2930/5000 [00:29<00:21, 96.98it/s]

Processing notebooks:  59%|█████▉    | 2940/5000 [00:29<00:21, 95.93it/s]

Processing notebooks:  59%|█████▉    | 2950/5000 [00:29<00:21, 95.82it/s]

Processing notebooks:  59%|█████▉    | 2960/5000 [00:29<00:21, 96.21it/s]

Processing notebooks:  59%|█████▉    | 2970/5000 [00:29<00:21, 96.56it/s]

Processing notebooks:  60%|█████▉    | 2980/5000 [00:29<00:20, 97.24it/s]

Processing notebooks:  60%|█████▉    | 2990/5000 [00:29<00:20, 95.88it/s]

Processing notebooks:  60%|██████    | 3000/5000 [00:29<00:21, 92.95it/s]

Processing notebooks:  60%|██████    | 3010/5000 [00:30<00:21, 93.88it/s]

Processing notebooks:  60%|██████    | 3020/5000 [00:30<00:21, 91.80it/s]

Processing notebooks:  61%|██████    | 3030/5000 [00:30<00:21, 93.15it/s]

Processing notebooks:  61%|██████    | 3040/5000 [00:30<00:20, 93.65it/s]

Processing notebooks:  61%|██████    | 3050/5000 [00:30<00:21, 92.76it/s]

Processing notebooks:  61%|██████    | 3060/5000 [00:30<00:20, 94.50it/s]

Processing notebooks:  61%|██████▏   | 3070/5000 [00:30<00:20, 95.63it/s]

Processing notebooks:  62%|██████▏   | 3080/5000 [00:30<00:19, 96.80it/s]

Processing notebooks:  62%|██████▏   | 3091/5000 [00:30<00:19, 97.92it/s]

Processing notebooks:  62%|██████▏   | 3101/5000 [00:31<00:19, 98.13it/s]

Processing notebooks:  62%|██████▏   | 3111/5000 [00:31<00:19, 98.11it/s]

Processing notebooks:  62%|██████▏   | 3121/5000 [00:31<00:19, 96.53it/s]

Processing notebooks:  63%|██████▎   | 3131/5000 [00:31<00:19, 96.28it/s]

Processing notebooks:  63%|██████▎   | 3141/5000 [00:31<00:19, 95.41it/s]

Processing notebooks:  63%|██████▎   | 3151/5000 [00:31<00:19, 93.54it/s]

Processing notebooks:  63%|██████▎   | 3161/5000 [00:31<00:20, 91.42it/s]

Processing notebooks:  63%|██████▎   | 3171/5000 [00:31<00:20, 90.90it/s]

Processing notebooks:  64%|██████▎   | 3181/5000 [00:31<00:19, 91.68it/s]

Processing notebooks:  64%|██████▍   | 3191/5000 [00:32<00:19, 92.55it/s]

Processing notebooks:  64%|██████▍   | 3201/5000 [00:32<00:19, 91.29it/s]

Processing notebooks:  64%|██████▍   | 3211/5000 [00:32<00:19, 90.06it/s]

Processing notebooks:  64%|██████▍   | 3221/5000 [00:32<00:19, 90.00it/s]

Processing notebooks:  65%|██████▍   | 3231/5000 [00:32<00:19, 90.72it/s]

Processing notebooks:  65%|██████▍   | 3241/5000 [00:32<00:18, 92.64it/s]

Processing notebooks:  65%|██████▌   | 3251/5000 [00:32<00:18, 93.85it/s]

Processing notebooks:  65%|██████▌   | 3261/5000 [00:32<00:18, 94.52it/s]

Processing notebooks:  65%|██████▌   | 3271/5000 [00:32<00:18, 95.15it/s]

Processing notebooks:  66%|██████▌   | 3281/5000 [00:32<00:17, 96.02it/s]

Processing notebooks:  66%|██████▌   | 3291/5000 [00:33<00:17, 96.40it/s]

Processing notebooks:  66%|██████▌   | 3301/5000 [00:33<00:17, 96.27it/s]

Processing notebooks:  66%|██████▌   | 3311/5000 [00:33<00:17, 96.42it/s]

Processing notebooks:  66%|██████▋   | 3321/5000 [00:33<00:17, 97.25it/s]

Processing notebooks:  67%|██████▋   | 3331/5000 [00:33<00:17, 98.04it/s]

Processing notebooks:  67%|██████▋   | 3341/5000 [00:33<00:17, 97.57it/s]

Processing notebooks:  67%|██████▋   | 3351/5000 [00:33<00:17, 96.51it/s]

Processing notebooks:  67%|██████▋   | 3361/5000 [00:33<00:16, 96.52it/s]

Processing notebooks:  67%|██████▋   | 3371/5000 [00:33<00:17, 94.04it/s]

Processing notebooks:  68%|██████▊   | 3381/5000 [00:34<00:17, 92.65it/s]

Processing notebooks:  68%|██████▊   | 3391/5000 [00:34<00:17, 92.16it/s]

Processing notebooks:  68%|██████▊   | 3401/5000 [00:34<00:17, 92.57it/s]

Processing notebooks:  68%|██████▊   | 3411/5000 [00:34<00:17, 92.74it/s]

Processing notebooks:  68%|██████▊   | 3421/5000 [00:34<00:17, 89.36it/s]

Processing notebooks:  69%|██████▊   | 3430/5000 [00:34<00:17, 88.60it/s]

Processing notebooks:  69%|██████▉   | 3440/5000 [00:34<00:17, 89.63it/s]

Processing notebooks:  69%|██████▉   | 3450/5000 [00:34<00:16, 92.38it/s]

Processing notebooks:  69%|██████▉   | 3461/5000 [00:34<00:16, 94.51it/s]

Processing notebooks:  69%|██████▉   | 3472/5000 [00:35<00:15, 96.29it/s]

Processing notebooks:  70%|██████▉   | 3482/5000 [00:35<00:15, 97.22it/s]

Processing notebooks:  70%|██████▉   | 3493/5000 [00:35<00:15, 98.44it/s]

Processing notebooks:  70%|███████   | 3504/5000 [00:35<00:15, 99.21it/s]

Processing notebooks:  70%|███████   | 3514/5000 [00:35<00:15, 98.04it/s]

Processing notebooks:  70%|███████   | 3524/5000 [00:35<00:15, 96.73it/s]

Processing notebooks:  71%|███████   | 3534/5000 [00:35<00:15, 96.71it/s]

Processing notebooks:  71%|███████   | 3544/5000 [00:35<00:15, 96.74it/s]

Processing notebooks:  71%|███████   | 3554/5000 [00:35<00:14, 96.78it/s]

Processing notebooks:  71%|███████▏  | 3564/5000 [00:35<00:15, 91.53it/s]

Processing notebooks:  71%|███████▏  | 3574/5000 [00:36<00:15, 93.34it/s]

Processing notebooks:  72%|███████▏  | 3584/5000 [00:36<00:15, 94.36it/s]

Processing notebooks:  72%|███████▏  | 3594/5000 [00:36<00:14, 94.42it/s]

Processing notebooks:  72%|███████▏  | 3604/5000 [00:36<00:14, 95.43it/s]

Processing notebooks:  72%|███████▏  | 3614/5000 [00:36<00:14, 93.77it/s]

Processing notebooks:  72%|███████▏  | 3624/5000 [00:36<00:14, 92.26it/s]

Processing notebooks:  73%|███████▎  | 3634/5000 [00:36<00:14, 92.09it/s]

Processing notebooks:  73%|███████▎  | 3644/5000 [00:36<00:14, 92.46it/s]

Processing notebooks:  73%|███████▎  | 3654/5000 [00:36<00:14, 94.28it/s]

Processing notebooks:  73%|███████▎  | 3664/5000 [00:37<00:14, 94.58it/s]

Processing notebooks:  73%|███████▎  | 3674/5000 [00:37<00:13, 94.89it/s]

Processing notebooks:  74%|███████▎  | 3684/5000 [00:37<00:13, 96.18it/s]

Processing notebooks:  74%|███████▍  | 3694/5000 [00:37<00:13, 97.00it/s]

Processing notebooks:  74%|███████▍  | 3704/5000 [00:37<00:13, 97.75it/s]

Processing notebooks:  74%|███████▍  | 3714/5000 [00:37<00:13, 98.36it/s]

Processing notebooks:  74%|███████▍  | 3724/5000 [00:37<00:12, 98.50it/s]

Processing notebooks:  75%|███████▍  | 3735/5000 [00:37<00:12, 98.98it/s]

Processing notebooks:  75%|███████▍  | 3746/5000 [00:37<00:12, 99.64it/s]

Processing notebooks:  75%|███████▌  | 3756/5000 [00:37<00:12, 99.48it/s]

Processing notebooks:  75%|███████▌  | 3766/5000 [00:38<00:12, 97.06it/s]

Processing notebooks:  76%|███████▌  | 3776/5000 [00:38<00:12, 95.75it/s]

Processing notebooks:  76%|███████▌  | 3786/5000 [00:38<00:12, 94.76it/s]

Processing notebooks:  76%|███████▌  | 3796/5000 [00:38<00:12, 94.01it/s]

Processing notebooks:  76%|███████▌  | 3806/5000 [00:38<00:12, 94.85it/s]

Processing notebooks:  76%|███████▋  | 3816/5000 [00:38<00:12, 95.02it/s]

Processing notebooks:  77%|███████▋  | 3826/5000 [00:38<00:12, 95.21it/s]

Processing notebooks:  77%|███████▋  | 3836/5000 [00:38<00:12, 90.63it/s]

Processing notebooks:  77%|███████▋  | 3846/5000 [00:38<00:12, 88.87it/s]

Processing notebooks:  77%|███████▋  | 3855/5000 [00:39<00:12, 88.99it/s]

Processing notebooks:  77%|███████▋  | 3865/5000 [00:39<00:12, 91.77it/s]

Processing notebooks:  78%|███████▊  | 3875/5000 [00:39<00:12, 92.89it/s]

Processing notebooks:  78%|███████▊  | 3885/5000 [00:39<00:11, 94.56it/s]

Processing notebooks:  78%|███████▊  | 3895/5000 [00:39<00:11, 95.01it/s]

Processing notebooks:  78%|███████▊  | 3906/5000 [00:39<00:11, 96.72it/s]

Processing notebooks:  78%|███████▊  | 3917/5000 [00:39<00:10, 98.57it/s]

Processing notebooks:  79%|███████▊  | 3928/5000 [00:39<00:10, 99.65it/s]

Processing notebooks:  79%|███████▉  | 3939/5000 [00:39<00:10, 100.22it/s]

Processing notebooks:  79%|███████▉  | 3950/5000 [00:39<00:10, 100.50it/s]

Processing notebooks:  79%|███████▉  | 3961/5000 [00:40<00:10, 98.78it/s] 

Processing notebooks:  79%|███████▉  | 3971/5000 [00:40<00:10, 97.79it/s]

Processing notebooks:  80%|███████▉  | 3981/5000 [00:40<00:10, 97.58it/s]

Processing notebooks:  80%|███████▉  | 3991/5000 [00:40<00:10, 95.78it/s]

Processing notebooks:  80%|████████  | 4001/5000 [00:40<00:10, 94.67it/s]

Processing notebooks:  80%|████████  | 4011/5000 [00:40<00:10, 93.36it/s]

Processing notebooks:  80%|████████  | 4021/5000 [00:40<00:10, 93.60it/s]

Processing notebooks:  81%|████████  | 4031/5000 [00:40<00:10, 93.93it/s]

Processing notebooks:  81%|████████  | 4041/5000 [00:40<00:10, 94.27it/s]

Processing notebooks:  81%|████████  | 4051/5000 [00:41<00:10, 94.87it/s]

Processing notebooks:  81%|████████  | 4061/5000 [00:41<00:10, 92.85it/s]

Processing notebooks:  81%|████████▏ | 4071/5000 [00:41<00:10, 91.65it/s]

Processing notebooks:  82%|████████▏ | 4081/5000 [00:41<00:09, 92.93it/s]

Processing notebooks:  82%|████████▏ | 4091/5000 [00:41<00:09, 93.99it/s]

Processing notebooks:  82%|████████▏ | 4101/5000 [00:41<00:09, 94.41it/s]

Processing notebooks:  82%|████████▏ | 4111/5000 [00:41<00:09, 95.97it/s]

Processing notebooks:  82%|████████▏ | 4121/5000 [00:41<00:09, 95.15it/s]

Processing notebooks:  83%|████████▎ | 4131/5000 [00:41<00:09, 95.50it/s]

Processing notebooks:  83%|████████▎ | 4142/5000 [00:42<00:08, 96.98it/s]

Processing notebooks:  83%|████████▎ | 4153/5000 [00:42<00:08, 98.40it/s]

Processing notebooks:  83%|████████▎ | 4164/5000 [00:42<00:08, 99.51it/s]

Processing notebooks:  83%|████████▎ | 4174/5000 [00:42<00:08, 98.61it/s]

Processing notebooks:  84%|████████▎ | 4184/5000 [00:42<00:08, 97.81it/s]

Processing notebooks:  84%|████████▍ | 4194/5000 [00:42<00:08, 96.80it/s]

Processing notebooks:  84%|████████▍ | 4204/5000 [00:42<00:08, 97.50it/s]

Processing notebooks:  84%|████████▍ | 4215/5000 [00:42<00:07, 98.40it/s]

Processing notebooks:  84%|████████▍ | 4225/5000 [00:42<00:07, 98.73it/s]

Processing notebooks:  85%|████████▍ | 4235/5000 [00:42<00:07, 98.71it/s]

Processing notebooks:  85%|████████▍ | 4246/5000 [00:43<00:07, 99.68it/s]

Processing notebooks:  85%|████████▌ | 4256/5000 [00:43<00:07, 98.77it/s]

Processing notebooks:  85%|████████▌ | 4266/5000 [00:43<00:07, 99.00it/s]

Processing notebooks:  86%|████████▌ | 4276/5000 [00:43<00:07, 98.30it/s]

Processing notebooks:  86%|████████▌ | 4286/5000 [00:43<00:07, 97.67it/s]

Processing notebooks:  86%|████████▌ | 4296/5000 [00:43<00:07, 98.35it/s]

Processing notebooks:  86%|████████▌ | 4306/5000 [00:43<00:07, 98.38it/s]

Processing notebooks:  86%|████████▋ | 4317/5000 [00:43<00:06, 99.08it/s]

Processing notebooks:  87%|████████▋ | 4327/5000 [00:43<00:06, 97.59it/s]

Processing notebooks:  87%|████████▋ | 4337/5000 [00:44<00:07, 94.54it/s]

Processing notebooks:  87%|████████▋ | 4347/5000 [00:44<00:07, 93.01it/s]

Processing notebooks:  87%|████████▋ | 4357/5000 [00:44<00:07, 91.65it/s]

Processing notebooks:  87%|████████▋ | 4367/5000 [00:44<00:06, 93.42it/s]

Processing notebooks:  88%|████████▊ | 4377/5000 [00:44<00:06, 94.43it/s]

Processing notebooks:  88%|████████▊ | 4387/5000 [00:44<00:06, 95.09it/s]

Processing notebooks:  88%|████████▊ | 4397/5000 [00:44<00:06, 96.48it/s]

Processing notebooks:  88%|████████▊ | 4407/5000 [00:44<00:06, 97.29it/s]

Processing notebooks:  88%|████████▊ | 4418/5000 [00:44<00:05, 98.55it/s]

Processing notebooks:  89%|████████▊ | 4429/5000 [00:44<00:05, 99.23it/s]

Processing notebooks:  89%|████████▉ | 4439/5000 [00:45<00:05, 99.08it/s]

Processing notebooks:  89%|████████▉ | 4449/5000 [00:45<00:05, 99.10it/s]

Processing notebooks:  89%|████████▉ | 4459/5000 [00:45<00:05, 98.09it/s]

Processing notebooks:  89%|████████▉ | 4469/5000 [00:45<00:05, 97.61it/s]

Processing notebooks:  90%|████████▉ | 4479/5000 [00:45<00:05, 96.89it/s]

Processing notebooks:  90%|████████▉ | 4489/5000 [00:45<00:05, 95.89it/s]

Processing notebooks:  90%|████████▉ | 4499/5000 [00:45<00:05, 93.98it/s]

Processing notebooks:  90%|█████████ | 4509/5000 [00:45<00:05, 92.33it/s]

Processing notebooks:  90%|█████████ | 4519/5000 [00:45<00:05, 93.10it/s]

Processing notebooks:  91%|█████████ | 4529/5000 [00:46<00:05, 93.27it/s]

Processing notebooks:  91%|█████████ | 4539/5000 [00:46<00:04, 93.81it/s]

Processing notebooks:  91%|█████████ | 4549/5000 [00:46<00:04, 93.92it/s]

Processing notebooks:  91%|█████████ | 4559/5000 [00:46<00:04, 92.10it/s]

Processing notebooks:  91%|█████████▏| 4569/5000 [00:46<00:04, 91.28it/s]

Processing notebooks:  92%|█████████▏| 4579/5000 [00:46<00:04, 91.02it/s]

Processing notebooks:  92%|█████████▏| 4590/5000 [00:46<00:04, 93.81it/s]

Processing notebooks:  92%|█████████▏| 4600/5000 [00:46<00:04, 94.86it/s]

Processing notebooks:  92%|█████████▏| 4610/5000 [00:46<00:04, 95.60it/s]

Processing notebooks:  92%|█████████▏| 4620/5000 [00:46<00:03, 96.30it/s]

Processing notebooks:  93%|█████████▎| 4630/5000 [00:47<00:03, 96.83it/s]

Processing notebooks:  93%|█████████▎| 4641/5000 [00:47<00:03, 97.88it/s]

Processing notebooks:  93%|█████████▎| 4651/5000 [00:47<00:03, 97.64it/s]

Processing notebooks:  93%|█████████▎| 4661/5000 [00:47<00:03, 97.51it/s]

Processing notebooks:  93%|█████████▎| 4671/5000 [00:47<00:03, 98.14it/s]

Processing notebooks:  94%|█████████▎| 4682/5000 [00:47<00:03, 98.73it/s]

Processing notebooks:  94%|█████████▍| 4693/5000 [00:47<00:03, 99.32it/s]

Processing notebooks:  94%|█████████▍| 4703/5000 [00:47<00:02, 99.20it/s]

Processing notebooks:  94%|█████████▍| 4714/5000 [00:47<00:02, 99.11it/s]

Processing notebooks:  94%|█████████▍| 4724/5000 [00:48<00:02, 97.29it/s]

Processing notebooks:  95%|█████████▍| 4734/5000 [00:48<00:02, 95.04it/s]

Processing notebooks:  95%|█████████▍| 4744/5000 [00:48<00:02, 93.78it/s]

Processing notebooks:  95%|█████████▌| 4754/5000 [00:48<00:02, 92.38it/s]

Processing notebooks:  95%|█████████▌| 4764/5000 [00:48<00:02, 92.80it/s]

Processing notebooks:  95%|█████████▌| 4774/5000 [00:48<00:02, 92.66it/s]

Processing notebooks:  96%|█████████▌| 4784/5000 [00:48<00:02, 91.98it/s]

Processing notebooks:  96%|█████████▌| 4794/5000 [00:48<00:02, 91.94it/s]

Processing notebooks:  96%|█████████▌| 4804/5000 [00:48<00:02, 91.10it/s]

Processing notebooks:  96%|█████████▋| 4814/5000 [00:49<00:02, 92.81it/s]

Processing notebooks:  96%|█████████▋| 4824/5000 [00:49<00:01, 94.18it/s]

Processing notebooks:  97%|█████████▋| 4834/5000 [00:49<00:01, 95.11it/s]

Processing notebooks:  97%|█████████▋| 4844/5000 [00:49<00:01, 96.06it/s]

Processing notebooks:  97%|█████████▋| 4854/5000 [00:49<00:01, 96.66it/s]

Processing notebooks:  97%|█████████▋| 4864/5000 [00:49<00:01, 97.40it/s]

Processing notebooks:  97%|█████████▋| 4874/5000 [00:49<00:01, 97.04it/s]

Processing notebooks:  98%|█████████▊| 4884/5000 [00:49<00:01, 96.95it/s]

Processing notebooks:  98%|█████████▊| 4894/5000 [00:49<00:01, 97.63it/s]

Processing notebooks:  98%|█████████▊| 4904/5000 [00:49<00:00, 97.05it/s]

Processing notebooks:  98%|█████████▊| 4914/5000 [00:50<00:00, 97.63it/s]

Processing notebooks:  98%|█████████▊| 4924/5000 [00:50<00:00, 97.33it/s]

Processing notebooks:  99%|█████████▊| 4934/5000 [00:50<00:00, 95.78it/s]

Processing notebooks:  99%|█████████▉| 4944/5000 [00:50<00:00, 94.78it/s]

Processing notebooks:  99%|█████████▉| 4954/5000 [00:50<00:00, 92.93it/s]

Processing notebooks:  99%|█████████▉| 4964/5000 [00:50<00:00, 93.75it/s]

Processing notebooks:  99%|█████████▉| 4974/5000 [00:50<00:00, 93.59it/s]

Processing notebooks: 100%|█████████▉| 4984/5000 [00:50<00:00, 94.11it/s]

Processing notebooks: 100%|█████████▉| 4994/5000 [00:50<00:00, 92.32it/s]

Processing notebooks: 100%|██████████| 5000/5000 [00:50<00:00, 98.07it/s]




Training data shape: (232291, 5)
Average cells per notebook: 46.5


In [5]:
# Extract per-cell features, then attach notebook-level aggregates to every cell.
print("Extracting features...")
train_df = extract_basic_features(train_df)
train_df = extract_heading_features(train_df)
train_df = extract_semantic_features(train_df)

# Calculate notebook-level statistics (WITHOUT relative_position to avoid leakage).
# notebook_size is the number of cells ('count'), not max(position): inference
# computes it as len(features_df), and max(position) would disagree by one
# whenever positions are 0-indexed.
notebook_stats = train_df.groupby('notebook_id').agg({
    'source_length': ['mean', 'std'],
    'word_count': ['mean', 'std'],
    'cell_type': 'mean',  # share of code cells (was incorrectly named cell_type_code)
    'position': 'count'  # notebook size = number of cells
}).round(2)

# Flatten the (column, aggregation) MultiIndex into 'column_aggregation' names.
notebook_stats.columns = ['_'.join(col).strip() for col in notebook_stats.columns]
notebook_stats = notebook_stats.reset_index().rename(
    columns={'position_count': 'notebook_size', 'cell_type_mean': 'cell_type_code_mean'}
)

# Merge notebook-level features. Columns left over from a previous run of this
# cell are dropped first so that re-running does not create *_x / *_y duplicates.
notebook_stat_cols = [col for col in notebook_stats.columns if col != 'notebook_id']
train_df = (
    train_df
    .drop(columns=notebook_stat_cols, errors='ignore')
    .merge(notebook_stats, on='notebook_id', how='left')
)

# NOTE: DELIBERATELY NOT adding relative_position (position / notebook_size):
# it is derived from the target and would leak it into the features.

print(f"Final training shape: {train_df.shape}")
print(f"Notebook-level features added: {list(notebook_stats.columns)}")

Extracting features...


Final training shape: (232291, 55)
Notebook-level features added: ['notebook_id', 'source_length_mean', 'source_length_std', 'word_count_mean', 'word_count_std', 'cell_type_code_mean', 'notebook_size']


In [6]:
# Prepare TF-IDF features
print("Preparing TF-IDF features...")
# The vocabulary is learned from markdown cells only (cell_type == 0).
markdown_cells = train_df.loc[train_df['cell_type'] == 0, 'source'].fillna('').tolist()

# NOTE(review): the vectorizer is fitted on every selected notebook before the
# CV split, so validation folds contribute to the vocabulary/IDF weights. It
# never sees the target, but confirm this is acceptable for the reported CV score.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9
)

vectorizer.fit(markdown_cells)
# Positional names (tfidf_0, tfidf_1, ...) instead of raw n-grams.
tfidf_feature_names = [f'tfidf_{i}' for i in range(len(vectorizer.get_feature_names_out()))]

print(f"TF-IDF features created: {len(tfidf_feature_names)}")

# Transform all cells (both code and markdown)
all_texts = train_df['source'].fillna('').tolist()
tfidf_matrix = vectorizer.transform(all_texts)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_feature_names)

# Concatenate with the main dataframe. TF-IDF columns left over from an earlier
# run of this cell are removed first; otherwise a re-run would append a second
# set of identically named columns and train_df[feature_cols] would return
# duplicated features.
stale_tfidf_cols = [col for col in train_df.columns if str(col).startswith('tfidf_')]
train_df = pd.concat(
    [train_df.drop(columns=stale_tfidf_cols).reset_index(drop=True), tfidf_df],
    axis=1
)
print(f"Final shape with TF-IDF: {train_df.shape}")

Preparing TF-IDF features...


TF-IDF features created: 1000


Final shape with TF-IDF: (232291, 1055)


In [None]:
# Define feature columns (WITHOUT relative_position)
basic_features = ['source_length', 'line_count', 'word_count', 'char_count',
                  'has_import', 'has_comment', 'has_heading', 'has_code_block', 'has_link']

heading_features = ['heading_level'] + [f'heading_{h}' for h in ['introduction', 'conclusion', 'summary', 'results', 'methods', 
                       'analysis', 'eda', 'exploratory', 'data', 'preprocessing',
                       'model', 'training', 'evaluation', 'references', 'appendix',
                       'setup', 'imports', 'installation', 'requirements',
                       'visualization', 'plot', 'train', 'test', 'validation']]

# 'has_import' is intentionally absent here: it is already part of basic_features.
semantic_features = ['has_print', 'has_kaggle', 'has_input', 'has_data', 'has_function',
                     'has_class', 'has_model', 'has_train', 'has_test', 'has_plot']

notebook_features = ['source_length_mean', 'source_length_std', 'word_count_mean', 
                     'word_count_std', 'cell_type_code_mean', 'notebook_size']

# TF-IDF features: use the names produced when the vectorizer was fitted rather
# than assuming exactly 1000 of them. The vocabulary can be smaller than
# max_features, in which case a hard-coded range would reference missing columns.
tfidf_features = list(tfidf_feature_names)

# Combine all features
feature_cols = basic_features + heading_features + semantic_features + notebook_features + tfidf_features

# Fail early on the two mistakes that otherwise only surface inside model.fit():
# a feature listed twice, or a feature that was never added to train_df.
duplicate_feature_cols = [col for col, n in Counter(feature_cols).items() if n > 1]
assert not duplicate_feature_cols, f"Duplicate features in feature_cols: {duplicate_feature_cols}"
missing_feature_cols = [col for col in feature_cols if col not in train_df.columns]
assert not missing_feature_cols, f"Features missing from train_df: {missing_feature_cols[:10]}"

print(f"Total features: {len(feature_cols)}")
print(f"Basic features: {len(basic_features)}")
print(f"Heading features: {len(heading_features)}")
print(f"Semantic features: {len(semantic_features)}")
print(f"Notebook features: {len(notebook_features)}")
print(f"TF-IDF features: {len(tfidf_features)}")

In [None]:
# Cross-validation setup
def kendall_tau_score(y_true, y_pred):
    """Return the Kendall tau statistic between `y_true` and `y_pred`.

    Thin wrapper around `scipy.stats.kendalltau` that keeps only the first
    element of its result (the correlation) and discards the second.
    """
    tau, _ = kendalltau(y_true, y_pred)
    return tau

# 5-fold CV grouped by notebook, so all cells of a notebook land in the same fold.
gkf = GroupKFold(n_splits=5)
groups = train_df['notebook_id']

X = train_df[feature_cols]
y = train_df['position']

print("Starting cross-validation...")
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train = y.iloc[train_idx]
    
    # Train model
    model = lgb.LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    # Only the two columns needed for scoring are copied, not the full
    # feature frame with all TF-IDF columns.
    val_df = train_df[['notebook_id', 'position']].iloc[val_idx].copy()
    val_df['pred_position'] = model.predict(X_val)
    
    # Per-notebook Kendall tau. A single groupby pass replaces one boolean-mask
    # scan of val_df per notebook; sort=False keeps first-appearance order.
    fold_scores = []
    for _, notebook_data in val_df.groupby('notebook_id', sort=False):
        if len(notebook_data) <= 1:
            continue  # tau needs at least two cells
        score = kendall_tau_score(
            notebook_data['position'].values,
            notebook_data['pred_position'].values
        )
        # kendalltau returns NaN when one side is constant (e.g. identical
        # predictions for every cell). Such a notebook carries no ranking
        # signal, so it is scored 0.0 instead of turning the fold mean into NaN.
        fold_scores.append(0.0 if np.isnan(score) else score)
    
    fold_score = np.mean(fold_scores)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1}: {fold_score:.4f}")

print(f"\nCV Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in cv_scores]}")

In [None]:
# Fit the final model on every training row (no hold-out).
print("Training final model on all training data...")
final_model_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
final_model = lgb.LGBMRegressor(**final_model_params)
final_model.fit(X, y)

# Pair each feature name with the fitted model's importance, highest first.
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': final_model.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

top_features = feature_importance.head(20)
print("\nTop 20 most important features:")
print(top_features.to_string(index=False))

In [None]:
# Generate predictions for test set
print("Generating predictions for test set...")
# Path.glob yields files in arbitrary filesystem order; sorting keeps the
# submission row order reproducible between runs.
test_notebooks = sorted(TEST_PATH.glob('*.json'))
print(f"Total test notebooks: {len(test_notebooks)}")

def predict_notebook_order(notebook_path, model, vectorizer, feature_cols):
    """Predict the cell order for a single notebook.

    Args:
        notebook_path: Path to a notebook JSON file containing 'cell_type' and
            'source' mappings keyed by cell id.
        model: Fitted regressor exposing `predict`; its output is used as the
            predicted position of each cell.
        vectorizer: Fitted TF-IDF vectorizer used to transform the cell sources.
        feature_cols: Feature column names passed to the model, in training order.

    Returns:
        Cell ids joined by single spaces, sorted by ascending predicted
        position. An empty string is returned for a notebook without cells.
    """
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)
    
    cell_types = notebook['cell_type']
    sources = notebook['source']
    cell_ids = list(cell_types.keys())
    if not cell_ids:
        return ''
    
    # Create features dataframe (cell_type: 1 = code, 0 = anything else)
    features_df = pd.DataFrame({
        'cell_id': cell_ids,
        'cell_type': [1 if cell_types[cid] == 'code' else 0 for cid in cell_ids],
        'source': [sources[cid] for cid in cell_ids]
    })
    
    # Extract features
    features_df = extract_basic_features(features_df)
    features_df = extract_heading_features(features_df)
    features_df = extract_semantic_features(features_df)
    
    # Calculate notebook-level statistics (same as training)
    features_df['source_length_mean'] = features_df['source_length'].mean()
    features_df['source_length_std'] = features_df['source_length'].std()
    features_df['word_count_mean'] = features_df['word_count'].mean()
    features_df['word_count_std'] = features_df['word_count'].std()
    # The code ratio is the mean of the 0/1 'cell_type' column. There is no
    # 'cell_type_code' column in this frame; indexing it raised KeyError for
    # every notebook.
    features_df['cell_type_code_mean'] = features_df['cell_type'].mean()
    features_df['notebook_size'] = len(features_df)
    
    # NOTE: NO relative_position feature (avoiding leakage)
    
    # Transform TF-IDF features. Column names are derived from the matrix width
    # (tfidf_0, tfidf_1, ...) so the function does not depend on a notebook
    # global being defined.
    all_texts = features_df['source'].fillna('').tolist()
    tfidf_matrix = vectorizer.transform(all_texts)
    tfidf_columns = [f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])]
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_columns)
    
    features_df = pd.concat([features_df.reset_index(drop=True), tfidf_df], axis=1)
    
    # Ensure all feature columns exist (absent ones are filled with 0)
    for col in feature_cols:
        if col not in features_df.columns:
            features_df[col] = 0
    
    # Predict positions
    X_test = features_df[feature_cols]
    predictions = model.predict(X_test)
    
    # Sort by predicted position; a stable sort keeps the input order for tied
    # predictions so the output is deterministic.
    features_df['predicted_position'] = predictions
    features_df = features_df.sort_values('predicted_position', kind='mergesort')
    
    # Return ordered cell IDs as space-separated string
    ordered_cells = features_df['cell_id'].tolist()
    return ' '.join(ordered_cells)

In [None]:
# Generate predictions for all test notebooks
submission_data = []
failed_notebooks = []  # ids whose prediction raised; they get no submission row

for notebook_path in tqdm(test_notebooks, desc="Predicting test notebooks"):
    # Resolved outside the try block so the error message always names the
    # notebook that actually failed, never the previous one.
    notebook_id = notebook_path.stem
    try:
        ordered_cells = predict_notebook_order(notebook_path, final_model, vectorizer, feature_cols)
    except Exception as e:
        print(f"Error with {notebook_id}: {e}")
        failed_notebooks.append(notebook_id)
        continue

    submission_data.append({
        'id': notebook_id,
        'cell_order': ordered_cells
    })

# Make dropped notebooks impossible to overlook: a submission with missing
# rows is incomplete.
if failed_notebooks:
    print(f"WARNING: {len(failed_notebooks)} of {len(test_notebooks)} notebooks failed "
          f"and are missing from the submission")

# Explicit columns keep the frame well-formed even when no prediction succeeded.
submission_df = pd.DataFrame(submission_data, columns=['id', 'cell_order'])
print(f"Submission shape: {submission_df.shape}")
print(submission_df.head())

In [None]:
# Save submission
submission_path = '/home/submission/submission_003.csv'
# to_csv does not create missing directories; make sure the target folder exists.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

# Verify format against the sample submission (column names and row count)
sample_sub = pd.read_csv('/home/data/sample_submission.csv')
print(f"\nColumns match: {list(submission_df.columns) == list(sample_sub.columns)}")
print(f"Number of rows match: {len(submission_df) == len(sample_sub)}")

In [11]:
# Debug: look for repeated column labels in train_df
column_labels = train_df.columns

print("Duplicate columns in train_df:")
duplicates = column_labels[column_labels.duplicated()]
print(duplicates.tolist())

print("\nColumns containing 'has_import':")
has_import_cols = [label for label in column_labels if 'has_import' in label]
print(has_import_cols)

print("\nAll columns (first 20):")
print(column_labels[:20].tolist())

print("\nShape of train_df:", train_df.shape)

Duplicate columns in train_df:
[]

Columns containing 'has_import':
['has_import']

All columns (first 20):
['notebook_id', 'cell_id', 'cell_type', 'source', 'position', 'source_length', 'line_count', 'word_count', 'char_count', 'has_import', 'has_comment', 'has_heading', 'has_code_block', 'has_link', 'heading_level', 'heading_introduction', 'heading_conclusion', 'heading_summary', 'heading_results', 'heading_methods']

Shape of train_df: (232291, 1055)


In [12]:
# Debug: Check feature_cols list
print("Checking feature_cols for duplicates...")
print(f"Length of feature_cols: {len(feature_cols)}")
print(f"Unique features: {len(set(feature_cols))}")

# Find names listed more than once in feature_cols. Counter comes from the
# import cell at the top of the notebook, so it is not re-imported here. The
# result gets its own name so it does not rebind `duplicates` from the
# train_df column check, which holds a pandas Index rather than a list.
feature_counts = Counter(feature_cols)
duplicate_features = [name for name, count in feature_counts.items() if count > 1]
print(f"\nDuplicate features in feature_cols: {duplicate_features}")

# Check if 'has_import' is in multiple feature categories
in_basic = 'has_import' in basic_features
in_semantic = 'has_import' in semantic_features
print(f"\nhas_import in basic_features: {in_basic}")
print(f"has_import in semantic_features: {in_semantic}")

# Show the problematic categories
if in_basic:
    print(f"basic_features: {basic_features}")
if in_semantic:
    print(f"semantic_features: {semantic_features}")

Checking feature_cols for duplicates...
Length of feature_cols: 1051
Unique features: 1050

Duplicate features in feature_cols: ['has_import']

has_import in basic_features: True
has_import in semantic_features: True
basic_features: ['source_length', 'line_count', 'word_count', 'char_count', 'has_import', 'has_comment', 'has_heading', 'has_code_block', 'has_link']
semantic_features: ['has_print', 'has_kaggle', 'has_input', 'has_data', 'has_function', 'has_class', 'has_model', 'has_train', 'has_test', 'has_plot', 'has_import']
