In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/openwebtext-2gb/openwebtext_2GB.txt


In [2]:
!git clone https://github.com/Playmaker3334/gpt-oss-from-scratch.git
%cd gpt-oss-from-scratch/gpt_oss_20b
!pip install -r requirements.txt

Cloning into 'gpt-oss-from-scratch'...
remote: Enumerating objects: 287, done.[K
remote: Counting objects: 100% (287/287), done.[K
remote: Compressing objects: 100% (187/187), done.[K
remote: Total 287 (delta 147), reused 231 (delta 91), pack-reused 0 (from 0)[K
Receiving objects: 100% (287/287), 229.14 KiB | 7.90 MiB/s, done.
Resolving deltas: 100% (147/147), done.
/kaggle/working/gpt-oss-from-scratch/gpt_oss_20b
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->-r requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->-r requirements.txt (line 1))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->-r requirements.txt (line 1))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6

In [3]:
import os
import random
import time
import subprocess
from io import StringIO

# Configuration
SOURCE_FILE = "/kaggle/input/openwebtext-2gb/openwebtext_2GB.txt"
INTERMEDIATE_FILE = "/kaggle/working/small_data.txt"
TRAIN_OUT = "/kaggle/working/train.txt"
EVAL_OUT = "/kaggle/working/eval.txt"
MAX_MB = 150
INTERMEDIATE_MB = 500
EVAL_RATIO = 0.10
MIN_CHARS = 100
TARGET_CHARS = 4000
HARD_MAX_CHARS = 6000
SPLIT_LOOKBACK = 1200
SEED = 42

def create_intermediate_file():
    """Create intermediate file using head command for faster repeated processing."""
    print(f"=== Step 1: Creating intermediate file ({INTERMEDIATE_MB}MB) ===")
    step1_start = time.time()
    
    # Create intermediate file
    cmd = f"head -c {INTERMEDIATE_MB}M {SOURCE_FILE} > {INTERMEDIATE_FILE}"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    
    if result.returncode != 0:
        print(f"Error creating intermediate file: {result.stderr}")
        return False
    
    step1_time = time.time() - step1_start
    
    # Check file size
    file_info = subprocess.run(["ls", "-lh", INTERMEDIATE_FILE], 
                              capture_output=True, text=True)
    print(f"Intermediate file created: {file_info.stdout.strip()}")
    
    # Count lines
    line_count = subprocess.run(["wc", "-l", INTERMEDIATE_FILE], 
                               capture_output=True, text=True)
    print(f"Line count: {line_count.stdout.strip()}")
    
    # Preview first few lines to understand structure
    preview = subprocess.run(["head", "-5", INTERMEDIATE_FILE], 
                            capture_output=True, text=True)
    print("Sample lines:")
    for i, line in enumerate(preview.stdout.strip().split('\n')[:3]):
        print(f"  Line {i+1} ({len(line)} chars): {line[:100]}...")
    
    print(f"Step 1 completed in {step1_time:.2f}s")
    print()
    
    return True

def cut_chunk(text):
    """Optimized chunk cutting with reduced string operations."""
    if len(text) < HARD_MAX_CHARS:
        return None, text
    
    # Find sentence boundary more efficiently
    tail_start = max(0, len(text) - SPLIT_LOOKBACK)
    tail = text[tail_start:]
    
    # Use single pass to find last sentence boundary
    last_pos = -1
    for i in range(len(tail) - 1, -1, -1):
        if tail[i] in '.!?。！？':
            last_pos = tail_start + i + 1
            break
    
    cut_at = last_pos if last_pos > 0 else TARGET_CHARS
    return text[:cut_at].strip(), text[cut_at:].lstrip()

def process_dataset():
    """Process the intermediate file into train and eval datasets - each line is a document."""
    print(f"=== Step 2: Processing dataset (target: {MAX_MB}MB) ===")
    step2_start = time.time()
    
    random.seed(SEED)
    os.makedirs("/kaggle/working", exist_ok=True)
    
    max_bytes = MAX_MB * 1024 * 1024
    total_bytes_written = 0
    total_samples = 0
    train_samples_count = 0
    eval_samples_count = 0
    train_bytes_total = 0
    eval_bytes_total = 0
    
    print(f"Processing with {EVAL_RATIO:.0%} eval ratio, min_chars={MIN_CHARS}")
    print("Processing each line as a separate document...")
    
    with open(TRAIN_OUT, "w", encoding="utf-8", buffering=8192) as f_train, \
         open(EVAL_OUT, "w", encoding="utf-8", buffering=8192) as f_eval, \
         open(INTERMEDIATE_FILE, "r", encoding="utf-8", buffering=8192) as f_in:
        
        lines_processed = 0
        short_lines_skipped = 0
        
        for line in f_in:
            lines_processed += 1
            
            if lines_processed % 10000 == 0:
                mb_used = total_bytes_written / (1024 * 1024)
                print(f"  Lines: {lines_processed}, Samples: {total_samples}, MB: {mb_used:.1f}")
            
            # Check if we've reached the limit
            if total_bytes_written >= max_bytes:
                print(f"  Reached {MAX_MB}MB limit")
                break
            
            line = line.strip()
            
            # Skip empty lines
            if not line:
                continue
                
            # Check if line meets minimum length requirement
            if len(line) < MIN_CHARS:
                short_lines_skipped += 1
                continue
                
            # Process this line as a document
            current_text = line
            
            # Handle very long lines by chunking
            while len(current_text) >= HARD_MAX_CHARS:
                chunk, current_text = cut_chunk(current_text)
                if chunk and len(chunk) >= MIN_CHARS:
                    # Process this chunk
                    chunk_with_newline = chunk + "\n"
                    chunk_bytes = len(chunk_with_newline.encode('utf-8'))
                    
                    if total_bytes_written + chunk_bytes > max_bytes:
                        break
                    
                    # Assign to train or eval
                    if random.random() < EVAL_RATIO:
                        f_eval.write(chunk_with_newline)
                        eval_samples_count += 1
                        eval_bytes_total += chunk_bytes
                    else:
                        f_train.write(chunk_with_newline)
                        train_samples_count += 1
                        train_bytes_total += chunk_bytes
                    
                    total_bytes_written += chunk_bytes
                    total_samples += 1
            
            # Process remaining text if long enough
            if len(current_text) >= MIN_CHARS:
                sample_with_newline = current_text + "\n"
                sample_bytes = len(sample_with_newline.encode('utf-8'))
                
                if total_bytes_written + sample_bytes <= max_bytes:
                    # Assign to train or eval
                    if random.random() < EVAL_RATIO:
                        f_eval.write(sample_with_newline)
                        eval_samples_count += 1
                        eval_bytes_total += sample_bytes
                    else:
                        f_train.write(sample_with_newline)
                        train_samples_count += 1
                        train_bytes_total += sample_bytes
                    
                    total_bytes_written += sample_bytes
                    total_samples += 1
            else:
                short_lines_skipped += 1
    
    step2_time = time.time() - step2_start
    final_mb = total_bytes_written / (1024 * 1024)
    train_mb = train_bytes_total / (1024 * 1024)
    eval_mb = eval_bytes_total / (1024 * 1024)
    
    print(f"\nStep 2 Debug Info:")
    print(f"  Lines processed: {lines_processed}")
    print(f"  Short lines skipped: {short_lines_skipped}")
    print(f"  Total samples created: {total_samples}")
    print(f"  Average sample size: {final_mb*1024/total_samples:.1f}KB" if total_samples > 0 else "  No samples created")
    
    print(f"\nStep 2 completed in {step2_time:.2f}s")
    if step2_time > 0 and final_mb > 0:
        print(f"Processing speed: {final_mb/step2_time:.1f}MB/s")
    
    return {
        'total_samples': total_samples,
        'train_samples': train_samples_count,
        'eval_samples': eval_samples_count,
        'total_mb': final_mb,
        'train_mb': train_mb,
        'eval_mb': eval_mb,
        'processing_time': step2_time,
        'lines_processed': lines_processed,
        'short_lines_skipped': short_lines_skipped
    }

def main():
    """Main function that orchestrates the entire process."""
    print("=" * 50)
    print("DATASET PROCESSOR - TWO-STEP APPROACH")
    print("=" * 50)
    
    total_start_time = time.time()
    
    # Step 1: Create intermediate file
    if not create_intermediate_file():
        print("Failed to create intermediate file. Exiting.")
        return
    
    # Step 2: Process dataset
    results = process_dataset()
    
    # Final summary
    total_time = time.time() - total_start_time
    
    print("\n" + "=" * 50)
    print("FINAL SUMMARY")
    print("=" * 50)
    print(f"Lines processed: {results['lines_processed']:,}")
    print(f"Short lines skipped: {results['short_lines_skipped']:,}")
    print(f"Total samples processed: {results['total_samples']:,}")
    print(f"Train samples: {results['train_samples']:,} ({results['train_mb']:.1f}MB)")
    print(f"Eval samples: {results['eval_samples']:,} ({results['eval_mb']:.1f}MB)")
    print(f"Total output size: {results['total_mb']:.1f}MB")
    print(f"Processing time: {results['processing_time']:.2f}s")
    print(f"TOTAL SCRIPT TIME: {total_time:.2f}s")
    if total_time > 0 and results['total_mb'] > 0:
        print(f"Overall speed: {results['total_mb']/total_time:.1f}MB/s")
    
    # Show final file sizes
    print("\nOutput files:")
    subprocess.run(["ls", "-lh", TRAIN_OUT, EVAL_OUT])
    
    # Show sample from train file
    print("\nSample from train.txt:")
    try:
        sample = subprocess.run(["head", "-3", TRAIN_OUT], 
                               capture_output=True, text=True)
        for i, line in enumerate(sample.stdout.split('\n')[:2]):
            if line.strip():
                print(f"  Sample {i+1}: {line[:100]}...")
    except:
        pass
    
    # Cleanup intermediate file (optional)
    try:
        cleanup_response = input("\nDelete intermediate file? (y/N): ").strip().lower()
        if cleanup_response == 'y':
            os.remove(INTERMEDIATE_FILE)
            print(f"Intermediate file {INTERMEDIATE_FILE} deleted.")
        else:
            print(f"Intermediate file kept: {INTERMEDIATE_FILE}")
    except:
        print(f"Intermediate file kept: {INTERMEDIATE_FILE}")

if __name__ == "__main__":
    main()

DATASET PROCESSOR - TWO-STEP APPROACH
=== Step 1: Creating intermediate file (500MB) ===
Intermediate file created: -rw-r--r-- 1 root root 500M Sep 13 23:50 /kaggle/working/small_data.txt
Line count: 105298 /kaggle/working/small_data.txt
Sample lines:
  Line 1 (5516 chars): Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched do...
  Line 2 (10286 chars): Former secretary of state Hillary Clinton meets voters at a campaign rally in St. Louis on Saturday....
  Line 3 (6021 chars): The opinions expressed by columnists are their own and do not represent the views of Townhall.com.  ...
Step 1 completed in 5.46s

=== Step 2: Processing dataset (target: 150MB) ===
Processing with 10% eval ratio, min_chars=100
Processing each line as a separate document...
  Lines: 10000, Samples: 10154, MB: 46.7
  Lines: 20000, Samples: 20359, MB: 94.5
  Lines: 30000, Samples: 30531, MB: 142.8
  Lines: 40000, Samples: 32123, MB: 150.0
  Lines: 50000, Samples: 3212


Delete intermediate file? (y/N):  y


Intermediate file /kaggle/working/small_data.txt deleted.


In [4]:
!python /kaggle/working/gpt-oss-from-scratch/gpt_oss_20b/main.py \
    --train_data /kaggle/working/train.txt \
    --eval_data /kaggle/working/eval.txt \
    --batch_size 4 \
    --learning_rate 3e-4 \
    --warmup_steps 200 \
    --use_epochs \
    --num_epochs 3 \
    --grad_accum_steps 16 \
    --log_every 50 \
    --save_every_epochs 1 \
    --mixed_precision \
    --gradient_checkpointing \
    --seq_len 256 \
    --generation_prompt "The future of AI is" \
    --output_dir /kaggle/working/output \
    --tokenizer_dir /kaggle/working/output/tokenizer

Device: cuda
Model parameters: 73,380,352

Starting training for 3 epochs

===== Epoch 1/3 =====
  with torch.cuda.amp.autocast(enabled=self.mixed_precision):
[Epoch 1 Step 50] loss=11.7525 ppl=127124.53 lr=0.000116 tok/s=5398
[Epoch 1 Step 100] loss=11.7289 ppl=124198.77 lr=0.000107 tok/s=5532
[Epoch 1 Step 150] loss=11.6637 ppl=116467.82 lr=0.000134 tok/s=5579
[Epoch 1 Step 200] loss=11.4924 ppl=99399.75 lr=0.000206 tok/s=5602
[Epoch 1 Step 250] loss=10.9683 ppl=64214.36 lr=0.000270 tok/s=5614
[Epoch 1 Step 300] loss=10.2353 ppl=30785.95 lr=0.000298 tok/s=5622
[Epoch 1 Step 350] loss=9.6609 ppl=16399.66 lr=0.000300 tok/s=5625
[Epoch 1 Step 400] loss=9.2356 ppl=10575.96 lr=0.000300 tok/s=5626
[Epoch 1 Step 450] loss=8.8714 ppl=7351.38 lr=0.000300 tok/s=5626
[Epoch 1 Step 500] loss=8.5453 ppl=5286.56 lr=0.000300 tok/s=5627
[Epoch 1 Step 550] loss=8.2983 ppl=4144.14 lr=0.000300 tok/s=5626
[Epoch 1 Step 600] loss=8.0846 ppl=3373.55 lr=0.000300 tok/s=5626
[Epoch 1 Step 650] loss=7.9071 pp