# Experiment 002: Enhanced Features with TF-IDF and Heading Analysis

This experiment adds TF-IDF features and heading-level features to improve upon the baseline.

**Strategy:** Priority 1 - Enhanced Feature Engineering

**Expected improvements:**
- TF-IDF features for semantic content patterns
- Heading level features to capture hierarchy
- Semantic position features for first/last cell patterns
- Notebook-level features for context

**Validation:** 5-fold GroupKFold with notebook_id groups, Kendall tau metric

In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from tqdm import tqdm
import re
from collections import Counter
from sklearn.model_selection import GroupKFold
from sklearn.metrics import make_scorer
from scipy.stats import kendalltau
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Set paths
TRAIN_PATH = Path('/home/data/train')
TEST_PATH = Path('/home/data/test')
ORDERS_PATH = Path('/home/data/train_orders.csv')

print("Loading data...")
orders_df = pd.read_csv(ORDERS_PATH)
print(f"Orders shape: {orders_df.shape}")
print(f"Unique notebooks: {orders_df['id'].nunique()}")

Loading data...


Orders shape: (119256, 2)
Unique notebooks: 119256


In [3]:
# Load a subset of training data for faster iteration
# Use 5000 notebooks for this experiment
train_notebooks = orders_df['id'].unique()[:5000]
print(f"Using {len(train_notebooks)} notebooks for training")

# Create training data
all_cells = []
notebook_sizes = {}

for notebook_id in tqdm(train_notebooks, desc="Loading training notebooks"):
    notebook_path = TRAIN_PATH / f"{notebook_id}.json"
    
    with open(notebook_path, 'r') as f:
        notebook = json.load(f)
    
    cell_order = orders_df[orders_df['id'] == notebook_id]['cell_order'].iloc[0].split()
    cell_positions = {cell_id: pos for pos, cell_id in enumerate(cell_order)}
    
    notebook_sizes[notebook_id] = len(cell_order)
    
    # The notebook structure has cell_type and source as separate dictionaries
    cell_types = notebook['cell_type']
    sources = notebook['source']
    
    for cell_id in cell_types.keys():
        cell_type = cell_types[cell_id]
        source = sources[cell_id]
        
        all_cells.append({
            'notebook_id': notebook_id,
            'cell_id': cell_id,
            'cell_type': cell_type,
            'source': source,
            'position': cell_positions[cell_id]
        })

train_df = pd.DataFrame(all_cells)
print(f"Training data shape: {train_df.shape}")
print(f"Average cells per notebook: {train_df.groupby('notebook_id').size().mean():.1f}")

Using 5000 notebooks for training


Loading training notebooks:   0%|          | 0/5000 [00:00<?, ?it/s]

Loading training notebooks:   0%|          | 10/5000 [00:00<00:51, 97.21it/s]

Loading training notebooks:   0%|          | 20/5000 [00:00<00:50, 98.26it/s]

Loading training notebooks:   1%|          | 31/5000 [00:00<00:49, 99.45it/s]

Loading training notebooks:   1%|          | 41/5000 [00:00<00:50, 98.49it/s]

Loading training notebooks:   1%|          | 51/5000 [00:00<00:52, 94.66it/s]

Loading training notebooks:   1%|          | 61/5000 [00:00<00:53, 92.11it/s]

Loading training notebooks:   1%|▏         | 71/5000 [00:00<00:54, 90.50it/s]

Loading training notebooks:   2%|▏         | 82/5000 [00:00<00:52, 93.94it/s]

Loading training notebooks:   2%|▏         | 92/5000 [00:00<00:52, 93.34it/s]

Loading training notebooks:   2%|▏         | 102/5000 [00:01<00:52, 93.45it/s]

Loading training notebooks:   2%|▏         | 113/5000 [00:01<00:51, 95.68it/s]

Loading training notebooks:   2%|▏         | 123/5000 [00:01<00:51, 94.37it/s]

Loading training notebooks:   3%|▎         | 133/5000 [00:01<00:52, 92.96it/s]

Loading training notebooks:   3%|▎         | 143/5000 [00:01<00:52, 93.10it/s]

Loading training notebooks:   3%|▎         | 153/5000 [00:01<00:53, 90.41it/s]

Loading training notebooks:   3%|▎         | 163/5000 [00:01<00:52, 91.49it/s]

Loading training notebooks:   3%|▎         | 173/5000 [00:01<00:51, 93.75it/s]

Loading training notebooks:   4%|▎         | 183/5000 [00:01<00:50, 95.40it/s]

Loading training notebooks:   4%|▍         | 193/5000 [00:02<00:50, 94.41it/s]

Loading training notebooks:   4%|▍         | 203/5000 [00:02<00:51, 92.70it/s]

Loading training notebooks:   4%|▍         | 213/5000 [00:02<00:51, 92.70it/s]

Loading training notebooks:   4%|▍         | 223/5000 [00:02<00:51, 93.24it/s]

Loading training notebooks:   5%|▍         | 233/5000 [00:02<00:50, 94.18it/s]

Loading training notebooks:   5%|▍         | 243/5000 [00:02<00:50, 94.35it/s]

Loading training notebooks:   5%|▌         | 253/5000 [00:02<00:50, 93.09it/s]

Loading training notebooks:   5%|▌         | 263/5000 [00:02<00:51, 91.33it/s]

Loading training notebooks:   5%|▌         | 274/5000 [00:02<00:50, 94.21it/s]

Loading training notebooks:   6%|▌         | 284/5000 [00:03<00:49, 94.89it/s]

Loading training notebooks:   6%|▌         | 294/5000 [00:03<00:49, 94.67it/s]

Loading training notebooks:   6%|▌         | 304/5000 [00:03<00:49, 95.26it/s]

Loading training notebooks:   6%|▋         | 314/5000 [00:03<00:48, 95.96it/s]

Loading training notebooks:   6%|▋         | 324/5000 [00:03<00:48, 96.31it/s]

Loading training notebooks:   7%|▋         | 334/5000 [00:03<00:48, 96.96it/s]

Loading training notebooks:   7%|▋         | 344/5000 [00:03<00:47, 97.07it/s]

Loading training notebooks:   7%|▋         | 354/5000 [00:03<00:49, 94.18it/s]

Loading training notebooks:   7%|▋         | 364/5000 [00:03<00:50, 92.19it/s]

Loading training notebooks:   7%|▋         | 374/5000 [00:03<00:51, 90.49it/s]

Loading training notebooks:   8%|▊         | 385/5000 [00:04<00:49, 93.45it/s]

Loading training notebooks:   8%|▊         | 395/5000 [00:04<00:48, 94.97it/s]

Loading training notebooks:   8%|▊         | 405/5000 [00:04<00:48, 95.37it/s]

Loading training notebooks:   8%|▊         | 415/5000 [00:04<00:47, 96.63it/s]

Loading training notebooks:   8%|▊         | 425/5000 [00:04<00:47, 96.93it/s]

Loading training notebooks:   9%|▊         | 435/5000 [00:04<00:47, 96.81it/s]

Loading training notebooks:   9%|▉         | 445/5000 [00:04<00:47, 96.51it/s]

Loading training notebooks:   9%|▉         | 455/5000 [00:04<00:47, 95.21it/s]

Loading training notebooks:   9%|▉         | 465/5000 [00:04<00:48, 94.34it/s]

Loading training notebooks:  10%|▉         | 475/5000 [00:05<00:48, 93.63it/s]

Loading training notebooks:  10%|▉         | 485/5000 [00:05<00:48, 92.92it/s]

Loading training notebooks:  10%|▉         | 495/5000 [00:05<00:48, 92.15it/s]

Loading training notebooks:  10%|█         | 505/5000 [00:05<00:50, 88.62it/s]

Loading training notebooks:  10%|█         | 514/5000 [00:05<00:52, 86.21it/s]

Loading training notebooks:  10%|█         | 523/5000 [00:05<00:51, 86.98it/s]

Loading training notebooks:  11%|█         | 532/5000 [00:05<00:50, 87.73it/s]

Loading training notebooks:  11%|█         | 541/5000 [00:05<00:50, 87.78it/s]

Loading training notebooks:  11%|█         | 552/5000 [00:05<00:48, 92.30it/s]

Loading training notebooks:  11%|█         | 562/5000 [00:06<00:47, 92.86it/s]

Loading training notebooks:  11%|█▏        | 572/5000 [00:06<00:47, 93.21it/s]

Loading training notebooks:  12%|█▏        | 583/5000 [00:06<00:45, 97.03it/s]

Loading training notebooks:  12%|█▏        | 594/5000 [00:06<00:44, 98.59it/s]

Loading training notebooks:  12%|█▏        | 605/5000 [00:06<00:43, 100.12it/s]

Loading training notebooks:  12%|█▏        | 616/5000 [00:06<00:43, 101.02it/s]

Loading training notebooks:  13%|█▎        | 627/5000 [00:06<00:43, 101.33it/s]

Loading training notebooks:  13%|█▎        | 638/5000 [00:06<00:42, 101.73it/s]

Loading training notebooks:  13%|█▎        | 649/5000 [00:06<00:42, 101.70it/s]

Loading training notebooks:  13%|█▎        | 660/5000 [00:06<00:42, 101.26it/s]

Loading training notebooks:  13%|█▎        | 671/5000 [00:07<00:43, 100.37it/s]

Loading training notebooks:  14%|█▎        | 682/5000 [00:07<00:44, 96.76it/s] 

Loading training notebooks:  14%|█▍        | 692/5000 [00:07<00:45, 95.06it/s]

Loading training notebooks:  14%|█▍        | 702/5000 [00:07<00:44, 95.58it/s]

Loading training notebooks:  14%|█▍        | 712/5000 [00:07<00:46, 93.15it/s]

Loading training notebooks:  14%|█▍        | 722/5000 [00:07<00:47, 89.43it/s]

Loading training notebooks:  15%|█▍        | 731/5000 [00:07<00:48, 87.93it/s]

Loading training notebooks:  15%|█▍        | 741/5000 [00:07<00:46, 90.78it/s]

Loading training notebooks:  15%|█▌        | 751/5000 [00:07<00:45, 93.21it/s]

Loading training notebooks:  15%|█▌        | 761/5000 [00:08<00:45, 92.19it/s]

Loading training notebooks:  15%|█▌        | 771/5000 [00:08<00:45, 92.19it/s]

Loading training notebooks:  16%|█▌        | 781/5000 [00:08<00:46, 89.92it/s]

Loading training notebooks:  16%|█▌        | 791/5000 [00:08<00:47, 89.24it/s]

Loading training notebooks:  16%|█▌        | 801/5000 [00:08<00:45, 91.92it/s]

Loading training notebooks:  16%|█▌        | 812/5000 [00:08<00:43, 95.29it/s]

Loading training notebooks:  16%|█▋        | 823/5000 [00:08<00:43, 97.11it/s]

Loading training notebooks:  17%|█▋        | 834/5000 [00:08<00:42, 98.66it/s]

Loading training notebooks:  17%|█▋        | 845/5000 [00:08<00:41, 99.85it/s]

Loading training notebooks:  17%|█▋        | 856/5000 [00:09<00:41, 100.14it/s]

Loading training notebooks:  17%|█▋        | 867/5000 [00:09<00:41, 100.32it/s]

Loading training notebooks:  18%|█▊        | 878/5000 [00:09<00:41, 99.98it/s] 

Loading training notebooks:  18%|█▊        | 889/5000 [00:09<00:41, 98.77it/s]

Loading training notebooks:  18%|█▊        | 899/5000 [00:09<00:41, 98.37it/s]

Loading training notebooks:  18%|█▊        | 909/5000 [00:09<00:41, 97.69it/s]

Loading training notebooks:  18%|█▊        | 919/5000 [00:09<00:42, 96.64it/s]

Loading training notebooks:  19%|█▊        | 930/5000 [00:09<00:41, 98.24it/s]

Loading training notebooks:  19%|█▉        | 940/5000 [00:09<00:41, 96.85it/s]

Loading training notebooks:  19%|█▉        | 951/5000 [00:10<00:41, 97.81it/s]

Loading training notebooks:  19%|█▉        | 961/5000 [00:10<00:42, 96.10it/s]

Loading training notebooks:  19%|█▉        | 971/5000 [00:10<00:41, 96.34it/s]

Loading training notebooks:  20%|█▉        | 981/5000 [00:10<00:42, 95.66it/s]

Loading training notebooks:  20%|█▉        | 991/5000 [00:10<00:42, 93.76it/s]

Loading training notebooks:  20%|██        | 1001/5000 [00:10<00:43, 92.02it/s]

Loading training notebooks:  20%|██        | 1011/5000 [00:10<00:43, 92.53it/s]

Loading training notebooks:  20%|██        | 1021/5000 [00:10<00:42, 94.62it/s]

Loading training notebooks:  21%|██        | 1031/5000 [00:10<00:42, 93.86it/s]

Loading training notebooks:  21%|██        | 1041/5000 [00:11<00:42, 92.60it/s]

Loading training notebooks:  21%|██        | 1051/5000 [00:11<00:43, 91.76it/s]

Loading training notebooks:  21%|██        | 1061/5000 [00:11<00:42, 92.65it/s]

Loading training notebooks:  21%|██▏       | 1072/5000 [00:11<00:41, 95.40it/s]

Loading training notebooks:  22%|██▏       | 1083/5000 [00:11<00:40, 96.99it/s]

Loading training notebooks:  22%|██▏       | 1093/5000 [00:11<00:40, 97.18it/s]

Loading training notebooks:  22%|██▏       | 1103/5000 [00:11<00:40, 96.06it/s]

Loading training notebooks:  22%|██▏       | 1113/5000 [00:11<00:40, 96.06it/s]

Loading training notebooks:  22%|██▏       | 1123/5000 [00:11<00:40, 96.42it/s]

Loading training notebooks:  23%|██▎       | 1134/5000 [00:11<00:39, 98.75it/s]

Loading training notebooks:  23%|██▎       | 1144/5000 [00:12<00:38, 98.94it/s]

Loading training notebooks:  23%|██▎       | 1155/5000 [00:12<00:38, 99.29it/s]

Loading training notebooks:  23%|██▎       | 1166/5000 [00:12<00:38, 99.88it/s]

Loading training notebooks:  24%|██▎       | 1177/5000 [00:12<00:38, 100.33it/s]

Loading training notebooks:  24%|██▍       | 1188/5000 [00:12<00:37, 100.33it/s]

Loading training notebooks:  24%|██▍       | 1199/5000 [00:12<00:39, 97.04it/s] 

Loading training notebooks:  24%|██▍       | 1209/5000 [00:12<00:39, 95.24it/s]

Loading training notebooks:  24%|██▍       | 1219/5000 [00:12<00:40, 94.20it/s]

Loading training notebooks:  25%|██▍       | 1229/5000 [00:12<00:39, 95.74it/s]

Loading training notebooks:  25%|██▍       | 1239/5000 [00:13<00:39, 94.94it/s]

Loading training notebooks:  25%|██▍       | 1249/5000 [00:13<00:40, 92.64it/s]

Loading training notebooks:  25%|██▌       | 1259/5000 [00:13<00:40, 92.51it/s]

Loading training notebooks:  25%|██▌       | 1269/5000 [00:13<00:39, 93.83it/s]

Loading training notebooks:  26%|██▌       | 1279/5000 [00:13<00:39, 94.63it/s]

Loading training notebooks:  26%|██▌       | 1289/5000 [00:13<00:39, 92.84it/s]

Loading training notebooks:  26%|██▌       | 1299/5000 [00:13<00:40, 90.80it/s]

Loading training notebooks:  26%|██▌       | 1309/5000 [00:13<00:41, 88.17it/s]

Loading training notebooks:  26%|██▋       | 1318/5000 [00:13<00:42, 86.56it/s]

Loading training notebooks:  27%|██▋       | 1328/5000 [00:14<00:41, 88.04it/s]

Loading training notebooks:  27%|██▋       | 1339/5000 [00:14<00:39, 91.94it/s]

Loading training notebooks:  27%|██▋       | 1350/5000 [00:14<00:38, 94.73it/s]

Loading training notebooks:  27%|██▋       | 1361/5000 [00:14<00:37, 97.19it/s]

Loading training notebooks:  27%|██▋       | 1372/5000 [00:14<00:36, 98.71it/s]

Loading training notebooks:  28%|██▊       | 1383/5000 [00:14<00:36, 100.26it/s]

Loading training notebooks:  28%|██▊       | 1394/5000 [00:14<00:36, 99.38it/s] 

Loading training notebooks:  28%|██▊       | 1404/5000 [00:14<00:36, 98.00it/s]

Loading training notebooks:  28%|██▊       | 1414/5000 [00:14<00:36, 97.76it/s]

Loading training notebooks:  28%|██▊       | 1425/5000 [00:15<00:36, 99.17it/s]

Loading training notebooks:  29%|██▊       | 1436/5000 [00:15<00:35, 99.95it/s]

Loading training notebooks:  29%|██▉       | 1447/5000 [00:15<00:35, 100.56it/s]

Loading training notebooks:  29%|██▉       | 1458/5000 [00:15<00:35, 98.77it/s] 

Loading training notebooks:  29%|██▉       | 1468/5000 [00:15<00:36, 97.66it/s]

Loading training notebooks:  30%|██▉       | 1478/5000 [00:15<00:36, 96.61it/s]

Loading training notebooks:  30%|██▉       | 1488/5000 [00:15<00:36, 96.14it/s]

Loading training notebooks:  30%|██▉       | 1498/5000 [00:15<00:36, 94.76it/s]

Loading training notebooks:  30%|███       | 1508/5000 [00:15<00:37, 93.85it/s]

Loading training notebooks:  30%|███       | 1518/5000 [00:15<00:37, 93.98it/s]

Loading training notebooks:  31%|███       | 1528/5000 [00:16<00:37, 92.64it/s]

Loading training notebooks:  31%|███       | 1538/5000 [00:16<00:38, 89.13it/s]

Loading training notebooks:  31%|███       | 1547/5000 [00:16<00:39, 87.38it/s]

Loading training notebooks:  31%|███       | 1557/5000 [00:16<00:38, 90.10it/s]

Loading training notebooks:  31%|███▏      | 1567/5000 [00:16<00:38, 89.81it/s]

Loading training notebooks:  32%|███▏      | 1577/5000 [00:16<00:37, 91.59it/s]

Loading training notebooks:  32%|███▏      | 1588/5000 [00:16<00:36, 94.68it/s]

Loading training notebooks:  32%|███▏      | 1598/5000 [00:16<00:36, 94.46it/s]

Loading training notebooks:  32%|███▏      | 1608/5000 [00:16<00:35, 95.15it/s]

Loading training notebooks:  32%|███▏      | 1619/5000 [00:17<00:34, 97.69it/s]

Loading training notebooks:  33%|███▎      | 1630/5000 [00:17<00:33, 100.46it/s]

Loading training notebooks:  33%|███▎      | 1641/5000 [00:17<00:33, 100.85it/s]

Loading training notebooks:  33%|███▎      | 1652/5000 [00:17<00:33, 101.30it/s]

Loading training notebooks:  33%|███▎      | 1663/5000 [00:17<00:32, 101.30it/s]

Loading training notebooks:  33%|███▎      | 1674/5000 [00:17<00:32, 101.94it/s]

Loading training notebooks:  34%|███▎      | 1685/5000 [00:17<00:32, 101.26it/s]

Loading training notebooks:  34%|███▍      | 1696/5000 [00:17<00:33, 100.09it/s]

Loading training notebooks:  34%|███▍      | 1707/5000 [00:17<00:33, 98.08it/s] 

Loading training notebooks:  34%|███▍      | 1717/5000 [00:18<00:34, 95.91it/s]

Loading training notebooks:  35%|███▍      | 1727/5000 [00:18<00:34, 94.62it/s]

Loading training notebooks:  35%|███▍      | 1737/5000 [00:18<00:35, 91.58it/s]

Loading training notebooks:  35%|███▍      | 1747/5000 [00:18<00:36, 89.06it/s]

Loading training notebooks:  35%|███▌      | 1757/5000 [00:18<00:36, 89.71it/s]

Loading training notebooks:  35%|███▌      | 1767/5000 [00:18<00:35, 91.99it/s]

Loading training notebooks:  36%|███▌      | 1777/5000 [00:18<00:34, 92.32it/s]

Loading training notebooks:  36%|███▌      | 1787/5000 [00:18<00:34, 93.68it/s]

Loading training notebooks:  36%|███▌      | 1797/5000 [00:18<00:35, 90.04it/s]

Loading training notebooks:  36%|███▌      | 1807/5000 [00:19<00:36, 87.75it/s]

Loading training notebooks:  36%|███▋      | 1816/5000 [00:19<00:36, 87.03it/s]

Loading training notebooks:  37%|███▋      | 1827/5000 [00:19<00:34, 91.14it/s]

Loading training notebooks:  37%|███▋      | 1838/5000 [00:19<00:33, 95.35it/s]

Loading training notebooks:  37%|███▋      | 1849/5000 [00:19<00:32, 97.07it/s]

Loading training notebooks:  37%|███▋      | 1860/5000 [00:19<00:31, 98.39it/s]

Loading training notebooks:  37%|███▋      | 1871/5000 [00:19<00:31, 99.69it/s]

Loading training notebooks:  38%|███▊      | 1882/5000 [00:19<00:30, 100.61it/s]

Loading training notebooks:  38%|███▊      | 1893/5000 [00:19<00:30, 100.95it/s]

Loading training notebooks:  38%|███▊      | 1904/5000 [00:20<00:31, 99.85it/s] 

Loading training notebooks:  38%|███▊      | 1914/5000 [00:20<00:31, 99.16it/s]

Loading training notebooks:  38%|███▊      | 1924/5000 [00:20<00:31, 98.62it/s]

Loading training notebooks:  39%|███▊      | 1934/5000 [00:20<00:31, 98.79it/s]

Loading training notebooks:  39%|███▉      | 1944/5000 [00:20<00:31, 98.39it/s]

Loading training notebooks:  39%|███▉      | 1954/5000 [00:20<00:31, 97.43it/s]

Loading training notebooks:  39%|███▉      | 1965/5000 [00:20<00:30, 99.91it/s]

Loading training notebooks:  40%|███▉      | 1976/5000 [00:20<00:29, 101.04it/s]

Loading training notebooks:  40%|███▉      | 1987/5000 [00:20<00:30, 99.74it/s] 

Loading training notebooks:  40%|███▉      | 1997/5000 [00:20<00:30, 98.99it/s]

Loading training notebooks:  40%|████      | 2007/5000 [00:21<00:30, 99.21it/s]

Loading training notebooks:  40%|████      | 2017/5000 [00:21<00:30, 99.27it/s]

Loading training notebooks:  41%|████      | 2027/5000 [00:21<00:30, 98.36it/s]

Loading training notebooks:  41%|████      | 2037/5000 [00:21<00:30, 97.54it/s]

Loading training notebooks:  41%|████      | 2047/5000 [00:21<00:31, 93.76it/s]

Loading training notebooks:  41%|████      | 2057/5000 [00:21<00:31, 92.90it/s]

Loading training notebooks:  41%|████▏     | 2067/5000 [00:21<00:31, 92.19it/s]

Loading training notebooks:  42%|████▏     | 2077/5000 [00:21<00:30, 94.30it/s]

Loading training notebooks:  42%|████▏     | 2087/5000 [00:21<00:30, 94.38it/s]

Loading training notebooks:  42%|████▏     | 2097/5000 [00:22<00:31, 91.73it/s]

Loading training notebooks:  42%|████▏     | 2107/5000 [00:22<00:31, 90.69it/s]

Loading training notebooks:  42%|████▏     | 2117/5000 [00:22<00:31, 91.01it/s]

Loading training notebooks:  43%|████▎     | 2128/5000 [00:22<00:30, 94.08it/s]

Loading training notebooks:  43%|████▎     | 2139/5000 [00:22<00:29, 95.97it/s]

Loading training notebooks:  43%|████▎     | 2149/5000 [00:22<00:29, 96.40it/s]

Loading training notebooks:  43%|████▎     | 2159/5000 [00:22<00:29, 95.35it/s]

Loading training notebooks:  43%|████▎     | 2169/5000 [00:22<00:29, 96.04it/s]

Loading training notebooks:  44%|████▎     | 2180/5000 [00:22<00:28, 98.06it/s]

Loading training notebooks:  44%|████▍     | 2191/5000 [00:23<00:28, 100.02it/s]

Loading training notebooks:  44%|████▍     | 2202/5000 [00:23<00:27, 100.58it/s]

Loading training notebooks:  44%|████▍     | 2213/5000 [00:23<00:27, 100.44it/s]

Loading training notebooks:  44%|████▍     | 2224/5000 [00:23<00:27, 99.74it/s] 

Loading training notebooks:  45%|████▍     | 2235/5000 [00:23<00:27, 100.53it/s]

Loading training notebooks:  45%|████▍     | 2246/5000 [00:23<00:27, 100.94it/s]

Loading training notebooks:  45%|████▌     | 2257/5000 [00:23<00:27, 101.35it/s]

Loading training notebooks:  45%|████▌     | 2268/5000 [00:23<00:27, 97.85it/s] 

Loading training notebooks:  46%|████▌     | 2278/5000 [00:23<00:28, 95.18it/s]

Loading training notebooks:  46%|████▌     | 2288/5000 [00:24<00:29, 93.24it/s]

Loading training notebooks:  46%|████▌     | 2298/5000 [00:24<00:28, 94.06it/s]

Loading training notebooks:  46%|████▌     | 2308/5000 [00:24<00:28, 94.43it/s]

Loading training notebooks:  46%|████▋     | 2318/5000 [00:24<00:28, 93.16it/s]

Loading training notebooks:  47%|████▋     | 2328/5000 [00:24<00:28, 93.93it/s]

Loading training notebooks:  47%|████▋     | 2338/5000 [00:24<00:28, 94.28it/s]

Loading training notebooks:  47%|████▋     | 2348/5000 [00:24<00:28, 92.93it/s]

Loading training notebooks:  47%|████▋     | 2358/5000 [00:24<00:28, 93.16it/s]

Loading training notebooks:  47%|████▋     | 2368/5000 [00:24<00:28, 92.07it/s]

Loading training notebooks:  48%|████▊     | 2378/5000 [00:24<00:29, 89.97it/s]

Loading training notebooks:  48%|████▊     | 2388/5000 [00:25<00:30, 86.89it/s]

Loading training notebooks:  48%|████▊     | 2398/5000 [00:25<00:28, 90.00it/s]

Loading training notebooks:  48%|████▊     | 2409/5000 [00:25<00:27, 94.06it/s]

Loading training notebooks:  48%|████▊     | 2419/5000 [00:25<00:27, 94.82it/s]

Loading training notebooks:  49%|████▊     | 2430/5000 [00:25<00:26, 96.84it/s]

Loading training notebooks:  49%|████▉     | 2441/5000 [00:25<00:25, 99.30it/s]

Loading training notebooks:  49%|████▉     | 2452/5000 [00:25<00:25, 100.38it/s]

Loading training notebooks:  49%|████▉     | 2463/5000 [00:25<00:25, 100.88it/s]

Loading training notebooks:  49%|████▉     | 2474/5000 [00:25<00:25, 99.41it/s] 

Loading training notebooks:  50%|████▉     | 2484/5000 [00:26<00:26, 96.48it/s]

Loading training notebooks:  50%|████▉     | 2495/5000 [00:26<00:25, 97.67it/s]

Loading training notebooks:  50%|█████     | 2506/5000 [00:26<00:24, 99.80it/s]

Loading training notebooks:  50%|█████     | 2516/5000 [00:26<00:24, 99.75it/s]

Loading training notebooks:  51%|█████     | 2527/5000 [00:26<00:24, 102.21it/s]

Loading training notebooks:  51%|█████     | 2538/5000 [00:26<00:24, 101.52it/s]

Loading training notebooks:  51%|█████     | 2549/5000 [00:26<00:25, 97.83it/s] 

Loading training notebooks:  51%|█████     | 2559/5000 [00:26<00:25, 96.16it/s]

Loading training notebooks:  51%|█████▏    | 2569/5000 [00:26<00:25, 96.06it/s]

Loading training notebooks:  52%|█████▏    | 2579/5000 [00:27<00:25, 95.87it/s]

Loading training notebooks:  52%|█████▏    | 2589/5000 [00:27<00:25, 95.31it/s]

Loading training notebooks:  52%|█████▏    | 2599/5000 [00:27<00:25, 94.16it/s]

Loading training notebooks:  52%|█████▏    | 2609/5000 [00:27<00:25, 92.75it/s]

Loading training notebooks:  52%|█████▏    | 2619/5000 [00:27<00:26, 90.80it/s]

Loading training notebooks:  53%|█████▎    | 2629/5000 [00:27<00:26, 91.00it/s]

Loading training notebooks:  53%|█████▎    | 2639/5000 [00:27<00:25, 91.09it/s]

Loading training notebooks:  53%|█████▎    | 2649/5000 [00:27<00:25, 90.62it/s]

Loading training notebooks:  53%|█████▎    | 2659/5000 [00:27<00:25, 93.14it/s]

Loading training notebooks:  53%|█████▎    | 2670/5000 [00:28<00:24, 95.64it/s]

Loading training notebooks:  54%|█████▎    | 2681/5000 [00:28<00:23, 97.54it/s]

Loading training notebooks:  54%|█████▍    | 2691/5000 [00:28<00:23, 96.74it/s]

Loading training notebooks:  54%|█████▍    | 2701/5000 [00:28<00:23, 96.07it/s]

Loading training notebooks:  54%|█████▍    | 2711/5000 [00:28<00:23, 96.78it/s]

Loading training notebooks:  54%|█████▍    | 2722/5000 [00:28<00:23, 98.28it/s]

Loading training notebooks:  55%|█████▍    | 2732/5000 [00:28<00:22, 98.64it/s]

Loading training notebooks:  55%|█████▍    | 2742/5000 [00:28<00:23, 97.02it/s]

Loading training notebooks:  55%|█████▌    | 2752/5000 [00:28<00:23, 97.31it/s]

Loading training notebooks:  55%|█████▌    | 2762/5000 [00:28<00:22, 97.68it/s]

Loading training notebooks:  55%|█████▌    | 2772/5000 [00:29<00:22, 98.35it/s]

Loading training notebooks:  56%|█████▌    | 2782/5000 [00:29<00:22, 98.06it/s]

Loading training notebooks:  56%|█████▌    | 2792/5000 [00:29<00:23, 95.29it/s]

Loading training notebooks:  56%|█████▌    | 2802/5000 [00:29<00:23, 92.36it/s]

Loading training notebooks:  56%|█████▌    | 2812/5000 [00:29<00:23, 92.22it/s]

Loading training notebooks:  56%|█████▋    | 2822/5000 [00:29<00:23, 93.05it/s]

Loading training notebooks:  57%|█████▋    | 2833/5000 [00:29<00:22, 95.32it/s]

Loading training notebooks:  57%|█████▋    | 2843/5000 [00:29<00:22, 96.09it/s]

Loading training notebooks:  57%|█████▋    | 2854/5000 [00:29<00:21, 98.08it/s]

Loading training notebooks:  57%|█████▋    | 2864/5000 [00:30<00:22, 96.16it/s]

Loading training notebooks:  57%|█████▋    | 2874/5000 [00:30<00:22, 94.26it/s]

Loading training notebooks:  58%|█████▊    | 2884/5000 [00:30<00:22, 94.47it/s]

Loading training notebooks:  58%|█████▊    | 2894/5000 [00:30<00:22, 93.58it/s]

Loading training notebooks:  58%|█████▊    | 2905/5000 [00:30<00:21, 95.82it/s]

Loading training notebooks:  58%|█████▊    | 2916/5000 [00:30<00:21, 97.44it/s]

Loading training notebooks:  59%|█████▊    | 2926/5000 [00:30<00:22, 93.75it/s]

Loading training notebooks:  59%|█████▊    | 2936/5000 [00:30<00:22, 91.82it/s]

Loading training notebooks:  59%|█████▉    | 2946/5000 [00:30<00:22, 93.02it/s]

Loading training notebooks:  59%|█████▉    | 2956/5000 [00:31<00:21, 94.51it/s]

Loading training notebooks:  59%|█████▉    | 2966/5000 [00:31<00:21, 95.07it/s]

Loading training notebooks:  60%|█████▉    | 2977/5000 [00:31<00:20, 96.39it/s]

Loading training notebooks:  60%|█████▉    | 2987/5000 [00:31<00:21, 94.68it/s]

Loading training notebooks:  60%|█████▉    | 2997/5000 [00:31<00:21, 94.21it/s]

Loading training notebooks:  60%|██████    | 3007/5000 [00:31<00:20, 95.72it/s]

Loading training notebooks:  60%|██████    | 3018/5000 [00:31<00:20, 98.82it/s]

Loading training notebooks:  61%|██████    | 3029/5000 [00:31<00:19, 99.48it/s]

Loading training notebooks:  61%|██████    | 3039/5000 [00:31<00:19, 99.01it/s]

Loading training notebooks:  61%|██████    | 3049/5000 [00:31<00:19, 98.97it/s]

Loading training notebooks:  61%|██████    | 3060/5000 [00:32<00:19, 99.39it/s]

Loading training notebooks:  61%|██████▏   | 3070/5000 [00:32<00:19, 99.23it/s]

Loading training notebooks:  62%|██████▏   | 3081/5000 [00:32<00:19, 99.52it/s]

Loading training notebooks:  62%|██████▏   | 3091/5000 [00:32<00:19, 99.22it/s]

Loading training notebooks:  62%|██████▏   | 3101/5000 [00:32<00:19, 95.42it/s]

Loading training notebooks:  62%|██████▏   | 3111/5000 [00:32<00:20, 92.79it/s]

Loading training notebooks:  62%|██████▏   | 3121/5000 [00:32<00:20, 91.06it/s]

Loading training notebooks:  63%|██████▎   | 3131/5000 [00:32<00:21, 88.78it/s]

Loading training notebooks:  63%|██████▎   | 3141/5000 [00:32<00:20, 90.10it/s]

Loading training notebooks:  63%|██████▎   | 3151/5000 [00:33<00:20, 91.82it/s]

Loading training notebooks:  63%|██████▎   | 3161/5000 [00:33<00:20, 91.36it/s]

Loading training notebooks:  63%|██████▎   | 3171/5000 [00:33<00:20, 89.91it/s]

Loading training notebooks:  64%|██████▎   | 3181/5000 [00:33<00:20, 88.41it/s]

Loading training notebooks:  64%|██████▍   | 3190/5000 [00:33<00:20, 88.30it/s]

Loading training notebooks:  64%|██████▍   | 3201/5000 [00:33<00:19, 92.88it/s]

Loading training notebooks:  64%|██████▍   | 3212/5000 [00:33<00:18, 96.00it/s]

Loading training notebooks:  64%|██████▍   | 3223/5000 [00:33<00:18, 98.27it/s]

Loading training notebooks:  65%|██████▍   | 3234/5000 [00:33<00:17, 100.01it/s]

Loading training notebooks:  65%|██████▍   | 3245/5000 [00:34<00:17, 101.52it/s]

Loading training notebooks:  65%|██████▌   | 3256/5000 [00:34<00:16, 102.69it/s]

Loading training notebooks:  65%|██████▌   | 3267/5000 [00:34<00:16, 102.14it/s]

Loading training notebooks:  66%|██████▌   | 3278/5000 [00:34<00:16, 102.04it/s]

Loading training notebooks:  66%|██████▌   | 3289/5000 [00:34<00:16, 102.21it/s]

Loading training notebooks:  66%|██████▌   | 3300/5000 [00:34<00:16, 101.18it/s]

Loading training notebooks:  66%|██████▌   | 3311/5000 [00:34<00:17, 95.40it/s] 

Loading training notebooks:  66%|██████▋   | 3321/5000 [00:34<00:17, 93.33it/s]

Loading training notebooks:  67%|██████▋   | 3331/5000 [00:34<00:17, 93.44it/s]

Loading training notebooks:  67%|██████▋   | 3341/5000 [00:35<00:18, 91.66it/s]

Loading training notebooks:  67%|██████▋   | 3351/5000 [00:35<00:18, 91.35it/s]

Loading training notebooks:  67%|██████▋   | 3361/5000 [00:35<00:18, 90.84it/s]

Loading training notebooks:  67%|██████▋   | 3371/5000 [00:35<00:18, 88.68it/s]

Loading training notebooks:  68%|██████▊   | 3380/5000 [00:35<00:18, 86.89it/s]

Loading training notebooks:  68%|██████▊   | 3389/5000 [00:35<00:19, 84.61it/s]

Loading training notebooks:  68%|██████▊   | 3398/5000 [00:35<00:19, 81.68it/s]

Loading training notebooks:  68%|██████▊   | 3407/5000 [00:35<00:19, 82.95it/s]

Loading training notebooks:  68%|██████▊   | 3417/5000 [00:35<00:18, 87.61it/s]

Loading training notebooks:  69%|██████▊   | 3427/5000 [00:36<00:17, 90.66it/s]

Loading training notebooks:  69%|██████▉   | 3438/5000 [00:36<00:16, 94.29it/s]

Loading training notebooks:  69%|██████▉   | 3449/5000 [00:36<00:16, 96.68it/s]

Loading training notebooks:  69%|██████▉   | 3460/5000 [00:36<00:15, 98.68it/s]

Loading training notebooks:  69%|██████▉   | 3471/5000 [00:36<00:15, 99.79it/s]

Loading training notebooks:  70%|██████▉   | 3482/5000 [00:36<00:14, 101.80it/s]

Loading training notebooks:  70%|██████▉   | 3493/5000 [00:36<00:14, 102.17it/s]

Loading training notebooks:  70%|███████   | 3504/5000 [00:36<00:14, 102.36it/s]

Loading training notebooks:  70%|███████   | 3515/5000 [00:36<00:14, 100.45it/s]

Loading training notebooks:  71%|███████   | 3526/5000 [00:37<00:15, 97.92it/s] 

Loading training notebooks:  71%|███████   | 3536/5000 [00:37<00:15, 97.04it/s]

Loading training notebooks:  71%|███████   | 3546/5000 [00:37<00:14, 97.21it/s]

Loading training notebooks:  71%|███████   | 3556/5000 [00:37<00:14, 97.38it/s]

Loading training notebooks:  71%|███████▏  | 3566/5000 [00:37<00:15, 94.14it/s]

Loading training notebooks:  72%|███████▏  | 3576/5000 [00:37<00:15, 92.03it/s]

Loading training notebooks:  72%|███████▏  | 3586/5000 [00:37<00:15, 88.52it/s]

Loading training notebooks:  72%|███████▏  | 3595/5000 [00:37<00:16, 85.90it/s]

Loading training notebooks:  72%|███████▏  | 3604/5000 [00:37<00:16, 85.94it/s]

Loading training notebooks:  72%|███████▏  | 3614/5000 [00:37<00:15, 87.67it/s]

Loading training notebooks:  72%|███████▏  | 3624/5000 [00:38<00:15, 90.68it/s]

Loading training notebooks:  73%|███████▎  | 3635/5000 [00:38<00:14, 93.75it/s]

Loading training notebooks:  73%|███████▎  | 3645/5000 [00:38<00:14, 92.13it/s]

Loading training notebooks:  73%|███████▎  | 3655/5000 [00:38<00:14, 92.93it/s]

Loading training notebooks:  73%|███████▎  | 3665/5000 [00:38<00:14, 94.24it/s]

Loading training notebooks:  74%|███████▎  | 3675/5000 [00:38<00:13, 95.63it/s]

Loading training notebooks:  74%|███████▎  | 3686/5000 [00:38<00:13, 98.87it/s]

Loading training notebooks:  74%|███████▍  | 3697/5000 [00:38<00:13, 100.16it/s]

Loading training notebooks:  74%|███████▍  | 3708/5000 [00:38<00:12, 99.89it/s] 

Loading training notebooks:  74%|███████▍  | 3718/5000 [00:39<00:12, 99.74it/s]

Loading training notebooks:  75%|███████▍  | 3729/5000 [00:39<00:12, 99.80it/s]

Loading training notebooks:  75%|███████▍  | 3739/5000 [00:39<00:12, 99.84it/s]

Loading training notebooks:  75%|███████▍  | 3749/5000 [00:39<00:12, 97.39it/s]

Loading training notebooks:  75%|███████▌  | 3759/5000 [00:39<00:12, 96.63it/s]

Loading training notebooks:  75%|███████▌  | 3769/5000 [00:39<00:12, 95.93it/s]

Loading training notebooks:  76%|███████▌  | 3779/5000 [00:39<00:13, 93.70it/s]

Loading training notebooks:  76%|███████▌  | 3789/5000 [00:39<00:12, 93.26it/s]

Loading training notebooks:  76%|███████▌  | 3799/5000 [00:39<00:12, 94.86it/s]

Loading training notebooks:  76%|███████▌  | 3809/5000 [00:39<00:12, 95.67it/s]

Loading training notebooks:  76%|███████▋  | 3819/5000 [00:40<00:12, 96.39it/s]

Loading training notebooks:  77%|███████▋  | 3829/5000 [00:40<00:12, 96.23it/s]

Loading training notebooks:  77%|███████▋  | 3840/5000 [00:40<00:11, 97.88it/s]

Loading training notebooks:  77%|███████▋  | 3850/5000 [00:40<00:11, 98.10it/s]

Loading training notebooks:  77%|███████▋  | 3860/5000 [00:40<00:11, 98.02it/s]

Loading training notebooks:  77%|███████▋  | 3870/5000 [00:40<00:11, 98.54it/s]

Loading training notebooks:  78%|███████▊  | 3881/5000 [00:40<00:11, 99.39it/s]

Loading training notebooks:  78%|███████▊  | 3891/5000 [00:40<00:11, 99.17it/s]

Loading training notebooks:  78%|███████▊  | 3901/5000 [00:40<00:11, 95.69it/s]

Loading training notebooks:  78%|███████▊  | 3911/5000 [00:41<00:11, 94.12it/s]

Loading training notebooks:  78%|███████▊  | 3921/5000 [00:41<00:11, 93.89it/s]

Loading training notebooks:  79%|███████▊  | 3931/5000 [00:41<00:11, 92.94it/s]

Loading training notebooks:  79%|███████▉  | 3941/5000 [00:41<00:11, 94.14it/s]

Loading training notebooks:  79%|███████▉  | 3951/5000 [00:41<00:11, 94.68it/s]

Loading training notebooks:  79%|███████▉  | 3961/5000 [00:41<00:10, 95.10it/s]

Loading training notebooks:  79%|███████▉  | 3971/5000 [00:41<00:10, 95.01it/s]

Loading training notebooks:  80%|███████▉  | 3982/5000 [00:41<00:10, 98.17it/s]

Loading training notebooks:  80%|███████▉  | 3993/5000 [00:41<00:10, 99.30it/s]

Loading training notebooks:  80%|████████  | 4004/5000 [00:42<00:09, 100.04it/s]

Loading training notebooks:  80%|████████  | 4015/5000 [00:42<00:09, 98.98it/s] 

Loading training notebooks:  80%|████████  | 4025/5000 [00:42<00:09, 98.67it/s]

Loading training notebooks:  81%|████████  | 4035/5000 [00:42<00:09, 98.30it/s]

Loading training notebooks:  81%|████████  | 4045/5000 [00:42<00:09, 98.09it/s]

Loading training notebooks:  81%|████████  | 4056/5000 [00:42<00:09, 99.83it/s]

Loading training notebooks:  81%|████████▏ | 4067/5000 [00:42<00:09, 100.79it/s]

Loading training notebooks:  82%|████████▏ | 4078/5000 [00:42<00:09, 101.47it/s]

Loading training notebooks:  82%|████████▏ | 4089/5000 [00:42<00:08, 101.29it/s]

Loading training notebooks:  82%|████████▏ | 4100/5000 [00:42<00:09, 98.60it/s] 

Loading training notebooks:  82%|████████▏ | 4110/5000 [00:43<00:09, 97.67it/s]

Loading training notebooks:  82%|████████▏ | 4120/5000 [00:43<00:09, 97.13it/s]

Loading training notebooks:  83%|████████▎ | 4130/5000 [00:43<00:09, 95.63it/s]

Loading training notebooks:  83%|████████▎ | 4140/5000 [00:43<00:09, 94.30it/s]

Loading training notebooks:  83%|████████▎ | 4150/5000 [00:43<00:09, 93.25it/s]

Loading training notebooks:  83%|████████▎ | 4160/5000 [00:43<00:09, 92.36it/s]

Loading training notebooks:  83%|████████▎ | 4170/5000 [00:43<00:08, 92.96it/s]

Loading training notebooks:  84%|████████▎ | 4180/5000 [00:43<00:08, 92.41it/s]

Loading training notebooks:  84%|████████▍ | 4190/5000 [00:43<00:08, 93.72it/s]

Loading training notebooks:  84%|████████▍ | 4201/5000 [00:44<00:08, 95.72it/s]

Loading training notebooks:  84%|████████▍ | 4212/5000 [00:44<00:07, 98.80it/s]

Loading training notebooks:  84%|████████▍ | 4223/5000 [00:44<00:07, 101.56it/s]

Loading training notebooks:  85%|████████▍ | 4234/5000 [00:44<00:07, 101.86it/s]

Loading training notebooks:  85%|████████▍ | 4245/5000 [00:44<00:07, 102.84it/s]

Loading training notebooks:  85%|████████▌ | 4256/5000 [00:44<00:07, 104.28it/s]

Loading training notebooks:  85%|████████▌ | 4267/5000 [00:44<00:06, 105.74it/s]

Loading training notebooks:  86%|████████▌ | 4278/5000 [00:44<00:06, 105.72it/s]

Loading training notebooks:  86%|████████▌ | 4289/5000 [00:44<00:06, 105.69it/s]

Loading training notebooks:  86%|████████▌ | 4300/5000 [00:44<00:06, 104.58it/s]

Loading training notebooks:  86%|████████▌ | 4311/5000 [00:45<00:06, 104.52it/s]

Loading training notebooks:  86%|████████▋ | 4322/5000 [00:45<00:06, 102.72it/s]

Loading training notebooks:  87%|████████▋ | 4333/5000 [00:45<00:06, 102.01it/s]

Loading training notebooks:  87%|████████▋ | 4344/5000 [00:45<00:06, 100.17it/s]

Loading training notebooks:  87%|████████▋ | 4355/5000 [00:45<00:06, 98.95it/s] 

Loading training notebooks:  87%|████████▋ | 4365/5000 [00:45<00:06, 98.84it/s]

Loading training notebooks:  88%|████████▊ | 4375/5000 [00:45<00:06, 96.73it/s]

Loading training notebooks:  88%|████████▊ | 4385/5000 [00:45<00:06, 95.08it/s]

Loading training notebooks:  88%|████████▊ | 4396/5000 [00:45<00:06, 96.62it/s]

Loading training notebooks:  88%|████████▊ | 4406/5000 [00:46<00:06, 97.52it/s]

Loading training notebooks:  88%|████████▊ | 4417/5000 [00:46<00:05, 98.80it/s]

Loading training notebooks:  89%|████████▊ | 4427/5000 [00:46<00:05, 96.37it/s]

Loading training notebooks:  89%|████████▊ | 4437/5000 [00:46<00:05, 97.29it/s]

Loading training notebooks:  89%|████████▉ | 4447/5000 [00:46<00:05, 97.44it/s]

Loading training notebooks:  89%|████████▉ | 4458/5000 [00:46<00:05, 99.93it/s]

Loading training notebooks:  89%|████████▉ | 4469/5000 [00:46<00:05, 102.43it/s]

Loading training notebooks:  90%|████████▉ | 4480/5000 [00:46<00:05, 102.89it/s]

Loading training notebooks:  90%|████████▉ | 4491/5000 [00:46<00:04, 103.03it/s]

Loading training notebooks:  90%|█████████ | 4502/5000 [00:47<00:04, 103.62it/s]

Loading training notebooks:  90%|█████████ | 4513/5000 [00:47<00:04, 104.39it/s]

Loading training notebooks:  90%|█████████ | 4524/5000 [00:47<00:04, 105.16it/s]

Loading training notebooks:  91%|█████████ | 4535/5000 [00:47<00:04, 106.12it/s]

Loading training notebooks:  91%|█████████ | 4546/5000 [00:47<00:04, 105.36it/s]

Loading training notebooks:  91%|█████████ | 4557/5000 [00:47<00:04, 105.65it/s]

Loading training notebooks:  91%|█████████▏| 4568/5000 [00:47<00:04, 105.63it/s]

Loading training notebooks:  92%|█████████▏| 4579/5000 [00:47<00:04, 103.90it/s]

Loading training notebooks:  92%|█████████▏| 4590/5000 [00:47<00:04, 102.39it/s]

Loading training notebooks:  92%|█████████▏| 4601/5000 [00:47<00:03, 101.80it/s]

Loading training notebooks:  92%|█████████▏| 4612/5000 [00:48<00:03, 101.32it/s]

Loading training notebooks:  92%|█████████▏| 4623/5000 [00:48<00:03, 101.35it/s]

Loading training notebooks:  93%|█████████▎| 4634/5000 [00:48<00:03, 100.50it/s]

Loading training notebooks:  93%|█████████▎| 4645/5000 [00:48<00:03, 99.66it/s] 

Loading training notebooks:  93%|█████████▎| 4656/5000 [00:48<00:03, 101.04it/s]

Loading training notebooks:  93%|█████████▎| 4667/5000 [00:48<00:03, 103.23it/s]

Loading training notebooks:  94%|█████████▎| 4678/5000 [00:48<00:03, 104.31it/s]

Loading training notebooks:  94%|█████████▍| 4689/5000 [00:48<00:02, 105.00it/s]

Loading training notebooks:  94%|█████████▍| 4700/5000 [00:48<00:02, 105.72it/s]

Loading training notebooks:  94%|█████████▍| 4711/5000 [00:49<00:02, 106.60it/s]

Loading training notebooks:  94%|█████████▍| 4722/5000 [00:49<00:02, 107.34it/s]

Loading training notebooks:  95%|█████████▍| 4733/5000 [00:49<00:02, 107.18it/s]

Loading training notebooks:  95%|█████████▍| 4744/5000 [00:49<00:02, 106.45it/s]

Loading training notebooks:  95%|█████████▌| 4755/5000 [00:49<00:02, 106.44it/s]

Loading training notebooks:  95%|█████████▌| 4766/5000 [00:49<00:02, 106.62it/s]

Loading training notebooks:  96%|█████████▌| 4777/5000 [00:49<00:02, 105.20it/s]

Loading training notebooks:  96%|█████████▌| 4788/5000 [00:49<00:02, 104.09it/s]

Loading training notebooks:  96%|█████████▌| 4799/5000 [00:49<00:01, 103.22it/s]

Loading training notebooks:  96%|█████████▌| 4810/5000 [00:49<00:01, 102.55it/s]

Loading training notebooks:  96%|█████████▋| 4821/5000 [00:50<00:01, 102.30it/s]

Loading training notebooks:  97%|█████████▋| 4832/5000 [00:50<00:01, 100.65it/s]

Loading training notebooks:  97%|█████████▋| 4843/5000 [00:50<00:01, 100.46it/s]

Loading training notebooks:  97%|█████████▋| 4854/5000 [00:50<00:01, 102.60it/s]

Loading training notebooks:  97%|█████████▋| 4865/5000 [00:50<00:01, 103.87it/s]

Loading training notebooks:  98%|█████████▊| 4876/5000 [00:50<00:01, 104.82it/s]

Loading training notebooks:  98%|█████████▊| 4887/5000 [00:50<00:01, 105.68it/s]

Loading training notebooks:  98%|█████████▊| 4898/5000 [00:50<00:00, 106.81it/s]

Loading training notebooks:  98%|█████████▊| 4909/5000 [00:50<00:00, 107.05it/s]

Loading training notebooks:  98%|█████████▊| 4920/5000 [00:51<00:00, 106.36it/s]

Loading training notebooks:  99%|█████████▊| 4931/5000 [00:51<00:00, 106.77it/s]

Loading training notebooks:  99%|█████████▉| 4942/5000 [00:51<00:00, 106.96it/s]

Loading training notebooks:  99%|█████████▉| 4953/5000 [00:51<00:00, 105.75it/s]

Loading training notebooks:  99%|█████████▉| 4964/5000 [00:51<00:00, 104.65it/s]

Loading training notebooks: 100%|█████████▉| 4975/5000 [00:51<00:00, 103.38it/s]

Loading training notebooks: 100%|█████████▉| 4986/5000 [00:51<00:00, 102.74it/s]

Loading training notebooks: 100%|█████████▉| 4997/5000 [00:51<00:00, 102.49it/s]

Loading training notebooks: 100%|██████████| 5000/5000 [00:51<00:00, 96.53it/s] 




Training data shape: (233581, 5)
Average cells per notebook: 46.7


In [4]:
# Enhanced feature extraction function
def extract_enhanced_features(df):
    """Extract enhanced features including TF-IDF and heading analysis"""
    
    # Basic text features
    df['source_length'] = df['source'].str.len()
    df['line_count'] = df['source'].str.count('\\n') + 1
    df['word_count'] = df['source'].str.split().str.len()
    df['char_count'] = df['source'].str.replace('\\s+', '', regex=True).str.len()
    
    # Binary flags
    df['has_import'] = df['source'].str.contains('import ', case=False, na=False).astype(int)
    df['has_comment'] = df['source'].str.contains('#', na=False).astype(int)
    df['has_heading'] = df['source'].str.contains('^#+\\s', regex=True, na=False).astype(int)
    df['has_code_block'] = df['source'].str.contains('```', na=False).astype(int)
    df['has_link'] = df['source'].str.contains('\\[.*\\]\\(http', regex=True, na=False).astype(int)
    
    # Cell type
    df['cell_type_code'] = (df['cell_type'] == 'code').astype(int)
    
    # Heading level analysis
    df['heading_level'] = 0
    for level in range(1, 7):
        mask = df['source'].str.match(f'^#{{{level}}}\\s', na=False)
        df.loc[mask, 'heading_level'] = level
    
    # Common heading text features
    common_headings = ['introduction', 'conclusion', 'eda', 'exploratory data analysis', 
                       'model', 'results', 'analysis', 'data', 'preprocessing', 
                       'visualization', 'plot', 'train', 'test', 'validation']
    
    for heading in common_headings:
        df[f'heading_{heading}'] = df['source'].str.contains(heading, case=False, na=False).astype(int)
    
    # Semantic position features
    df['has_print'] = df['source'].str.contains('print\\s*\\(', na=False).astype(int)
    df['has_kaggle'] = df['source'].str.contains('kaggle', case=False, na=False).astype(int)
    df['has_input'] = df['source'].str.contains('input', case=False, na=False).astype(int)
    df['has_data'] = df['source'].str.contains('\\bdata\\b', case=False, na=False).astype(int)
    df['has_function'] = df['source'].str.contains('def\\s+\\w+\\s*\\(', regex=True, na=False).astype(int)
    df['has_class'] = df['source'].str.contains('class\\s+\\w+\\s*\\(', regex=True, na=False).astype(int)
    df['has_plot'] = df['source'].str.contains('\\.plot\\s*\\(|\\.show\\s*\\(|plt\\.', regex=True, na=False).astype(int)
    
    # First/last cell indicators
    df['likely_first_cell'] = ((df['has_import'] == 1) | (df['has_kaggle'] == 1) | (df['has_input'] == 1)).astype(int)
    df['likely_last_cell'] = ((df['has_print'] == 1) | (df['has_plot'] == 1)).astype(int)
    
    return df

print("Extracting enhanced features...")
train_df = extract_enhanced_features(train_df)
print(f"Features extracted. Shape: {train_df.shape}")

Extracting enhanced features...


Features extracted. Shape: (233581, 39)


In [5]:
# Add notebook-level features
print("Adding notebook-level features...")

# Calculate notebook-level statistics
notebook_stats = train_df.groupby('notebook_id').agg({
    'source_length': ['mean', 'std'],
    'word_count': ['mean', 'std'],
    'cell_type_code': 'mean',  # code ratio
    'position': 'max'  # notebook size
}).round(2)

notebook_stats.columns = ['_'.join(col).strip() for col in notebook_stats.columns]
notebook_stats = notebook_stats.reset_index()
notebook_stats.rename(columns={'position_max': 'notebook_size'}, inplace=True)

# Merge notebook-level features
train_df = train_df.merge(notebook_stats, on='notebook_id', how='left')

# Add relative position feature (percentile within notebook)
train_df['relative_position'] = train_df['position'] / train_df['notebook_size']

print(f"Final training shape: {train_df.shape}")
print(f"Notebook-level features added: {list(notebook_stats.columns)}")

Adding notebook-level features...


Final training shape: (233581, 46)
Notebook-level features added: ['notebook_id', 'source_length_mean', 'source_length_std', 'word_count_mean', 'word_count_std', 'cell_type_code_mean', 'notebook_size']


In [6]:
# TF-IDF features for semantic content
print("Extracting TF-IDF features...")

# Get text from markdown cells for TF-IDF
markdown_texts = train_df[train_df['cell_type'] == 'markdown']['source'].fillna('').tolist()

# Use a subset of terms for efficiency
max_features = 1000
vectorizer = TfidfVectorizer(
    max_features=max_features,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95
)

# Fit TF-IDF on markdown texts
vectorizer.fit(markdown_texts)
print(f"TF-IDF vocabulary size: {len(vectorizer.vocabulary_)}")

# Transform all texts (both code and markdown)
all_texts = train_df['source'].fillna('').tolist()
tfidf_matrix = vectorizer.transform(all_texts)

# Add TF-IDF features to dataframe
tfidf_feature_names = [f'tfidf_{i}' for i in range(max_features)]
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_feature_names,
    index=train_df.index
)

# Concatenate with original dataframe
train_df = pd.concat([train_df, tfidf_df], axis=1)

print(f"Final shape with TF-IDF: {train_df.shape}")
print(f"TF-IDF features added: {max_features}")

Extracting TF-IDF features...


TF-IDF vocabulary size: 1000


Final shape with TF-IDF: (233581, 1046)
TF-IDF features added: 1000


In [7]:
# Prepare feature columns
basic_features = ['source_length', 'line_count', 'word_count', 'char_count', 
                  'has_import', 'has_comment', 'has_heading', 'has_code_block', 
                  'has_link', 'cell_type_code', 'heading_level']

heading_features = [f'heading_{h}' for h in ['introduction', 'conclusion', 'eda', 
                    'exploratory data analysis', 'model', 'results', 'analysis', 
                    'data', 'preprocessing', 'visualization', 'plot', 'train', 
                    'test', 'validation']]

semantic_features = ['has_print', 'has_kaggle', 'has_input', 'has_data', 
                     'has_function', 'has_class', 'has_plot', 'likely_first_cell', 
                     'likely_last_cell']

notebook_features = ['source_length_mean', 'source_length_std', 'word_count_mean', 
                     'word_count_std', 'cell_type_code_mean', 'notebook_size']

tfidf_features = tfidf_feature_names

feature_cols = (basic_features + heading_features + semantic_features + 
                notebook_features + tfidf_features + ['relative_position'])

print(f"Total features: {len(feature_cols)}")
print(f"Basic features: {len(basic_features)}")
print(f"Heading features: {len(heading_features)}")
print(f"Semantic features: {len(semantic_features)}")
print(f"Notebook features: {len(notebook_features)}")
print(f"TF-IDF features: {len(tfidf_features)}")

Total features: 1041
Basic features: 11
Heading features: 14
Semantic features: 9
Notebook features: 6
TF-IDF features: 1000


In [8]:
# Define evaluation metric
def kendall_tau_score(y_true, y_pred):
    """Calculate Kendall tau correlation"""
    return kendalltau(y_true, y_pred).correlation

kendall_scorer = make_scorer(kendall_tau_score, greater_is_better=True)

# Cross-validation setup
gkf = GroupKFold(n_splits=5)
groups = train_df['notebook_id']

X = train_df[feature_cols]
y = train_df['position']

print("Starting cross-validation...")
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Train model
    model = lgb.LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred = model.predict(X_val)
    
    # Calculate per-notebook Kendall tau
    val_df = train_df.iloc[val_idx].copy()
    val_df['pred_position'] = y_pred
    
    fold_scores = []
    for notebook_id in val_df['notebook_id'].unique():
        notebook_data = val_df[val_df['notebook_id'] == notebook_id]
        if len(notebook_data) > 1:
            score = kendall_tau_score(
                notebook_data['position'].values,
                notebook_data['pred_position'].values
            )
            fold_scores.append(score)
    
    fold_score = np.mean(fold_scores)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1}: {fold_score:.4f}")

print(f"\nCV Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

Starting cross-validation...




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.271855 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 184263
[LightGBM] [Info] Number of data points in the train set: 186865, number of used features: 1041
[LightGBM] [Info] Start training from score 40.286688


Fold 1: 0.9951




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.263395 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 183210
[LightGBM] [Info] Number of data points in the train set: 186865, number of used features: 1041
[LightGBM] [Info] Start training from score 41.092607


Fold 2: 0.9918




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.296738 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 184081
[LightGBM] [Info] Number of data points in the train set: 186865, number of used features: 1041
[LightGBM] [Info] Start training from score 41.116710


Fold 3: 0.9943




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.351872 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 184229
[LightGBM] [Info] Number of data points in the train set: 186865, number of used features: 1041
[LightGBM] [Info] Start training from score 41.117052


Fold 4: 0.9950




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.276111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 185604
[LightGBM] [Info] Number of data points in the train set: 186864, number of used features: 1041
[LightGBM] [Info] Start training from score 41.118685


Fold 5: 0.9919

CV Score: 0.9936 ± 0.0015


In [None]:
# Train final model on all data
print("Training final model on all data...")

final_model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

final_model.fit(X, y)

# Feature importance
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 20 most important features:")
print(importance_df.head(20))

In [None]:
# Prediction function for test data
def predict_notebook_order(notebook_id, path, model, feature_cols, vectorizer):
    """Predict cell order for a notebook"""
    notebook_path = path / f"{notebook_id}.json"
    
    with open(notebook_path, 'r') as f:
        notebook = json.load(f)
    
    # Extract features
    cells = []
    cell_ids = []
    
    for cell_id, cell_data in notebook.items():
        cells.append({
            'cell_id': cell_id,
            'cell_type': cell_data['cell_type'],
            'source': cell_data['source']
        })
        cell_ids.append(cell_id)
    
    features_df = pd.DataFrame(cells)
    
    # Extract enhanced features
    features_df = extract_enhanced_features(features_df)
    
    # Add placeholder for notebook-level features (will be filled)
    features_df['source_length_mean'] = features_df['source_length'].mean()
    features_df['source_length_std'] = features_df['source_length'].std()
    features_df['word_count_mean'] = features_df['word_count'].mean()
    features_df['word_count_std'] = features_df['word_count'].std()
    features_df['cell_type_code_mean'] = features_df['cell_type_code'].mean()
    features_df['notebook_size'] = len(features_df)
    features_df['relative_position'] = 0.5  # placeholder
    
    # TF-IDF features
    texts = features_df['source'].fillna('').tolist()
    tfidf_matrix = vectorizer.transform(texts)
    
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=tfidf_feature_names,
        index=features_df.index
    )
    
    features_df = pd.concat([features_df, tfidf_df], axis=1)
    
    # Predict
    X_test = features_df[feature_cols]
    predicted_positions = model.predict(X_test)
    
    # Sort by predicted position
    order_df = pd.DataFrame({
        'cell_id': cell_ids,
        'pred_position': predicted_positions
    })
    
    ordered_cells = order_df.sort_values('pred_position')['cell_id'].tolist()
    return ' '.join(ordered_cells)

# Test on a few notebooks
print("Testing prediction on sample notebooks...")
test_notebooks = [f.stem for f in TEST_PATH.glob('*.json')[:5]]

for notebook_id in test_notebooks:
    try:
        predicted_order = predict_notebook_order(notebook_id, TEST_PATH, final_model, feature_cols, vectorizer)
        print(f"{notebook_id}: {predicted_order[:100]}...")
    except Exception as e:
        print(f"Error with {notebook_id}: {e}")

In [None]:
# Generate submission for all test notebooks
print("Generating submission for all test notebooks...")

test_notebooks = [f.stem for f in TEST_PATH.glob('*.json')]
submission_data = []

for notebook_id in tqdm(test_notebooks, desc="Predicting notebooks"):
    try:
        predicted_order = predict_notebook_order(notebook_id, TEST_PATH, final_model, feature_cols, vectorizer)
        submission_data.append({
            'id': notebook_id,
            'cell_order': predicted_order
        })
    except Exception as e:
        print(f"Error with {notebook_id}: {e}")

submission_df = pd.DataFrame(submission_data)
print(f"Submission shape: {submission_df.shape}")
print(submission_df.head())

In [None]:
# Save submission
submission_path = '/home/submission/submission_002.csv'
submission_df.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

# Verify format
sample_sub = pd.read_csv('/home/data/sample_submission.csv')
print(f"\nColumns match: {list(submission_df.columns) == list(sample_sub.columns)}")
print(f"Number of rows match: {len(submission_df) == len(sample_sub)}")