# Ingest CodeNet Data

This notebook streams a small sample of IBM Project CodeNet (Python submissions only) from the Hugging Face Hub, extracts a few fields from each record, and writes the result to a CSV file.

In [None]:
import os
import pandas as pd
from datasets import load_dataset

# Configuration: destination of the CSV and the number of samples to keep.
OUTPUT_PATH = "../data/codenet_sample.csv"
NUM_SAMPLES = 1000

print("Loading dataset...")
# iidai/codenet on Hugging Face stands in for the full CodeNet dataset.
# Both the primary and the fallback dataset are opened the same way:
# the "train" split with streaming=True.
_stream_options = {"split": "train", "streaming": True}
try:
    dataset = load_dataset("iidai/codenet", **_stream_options)
except Exception as load_error:
    # Report why the primary source failed, then fall back to another dataset
    # so the rest of the notebook can still run.
    print(f"Failed to load iidai/codenet: {load_error}")
    dataset = load_dataset("codeparrot/codeparrot-clean-valid", **_stream_options)

print("Filtering for Python...")
# Walk the dataset stream and keep the first NUM_SAMPLES records whose
# language is Python. Each kept record is reduced to four fields:
# id, language, problem_id and code.
data = []
count = 0  # number of Python records collected so far

# Check the limit before touching the stream so that NUM_SAMPLES <= 0 yields
# an empty result instead of one stray record.
if NUM_SAMPLES > 0:
    for sample in dataset:
        # Records without a 'language' field are treated as Python
        # (presumably so a dataset lacking that field still works -- TODO confirm).
        language = sample.get('language', 'Python')
        if language != 'Python':
            continue

        data.append({
            # Synthesize an id from the running count when the record has none.
            'id': sample.get('id', f'sample_{count}'),
            'language': language,
            'problem_id': sample.get('problem_id', 'unknown'),
            # Prefer 'source_code'; fall back to 'content', then to an empty string.
            'code': sample.get('source_code', sample.get('content', '')),
        })
        count += 1

        # Stop as soon as the quota is met so no further records are pulled.
        if count >= NUM_SAMPLES:
            break

print(f"Collected {len(data)} samples.")
# Pin the column order explicitly so that an empty result still produces a
# CSV with a header row (a column-less frame would write a file that
# pd.read_csv cannot parse).
df = pd.DataFrame(data, columns=['id', 'language', 'problem_id', 'code'])

# Ensure output directory exists. os.path.dirname returns '' when OUTPUT_PATH
# is a bare filename, and os.makedirs('') raises FileNotFoundError, so only
# create the directory when there is one to create.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(OUTPUT_PATH, index=False)
print(f"Saved to {OUTPUT_PATH}")
# Last expression in the cell: displays a preview of the first rows.
df.head()