In [1]:
pip install pandas numpy torch scikit-learn transformers datasets

Collecting torch
  Downloading torch-2.9.1-cp311-cp311-win_amd64.whl.metadata (30 kB)
Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/44.0 kB ? eta -:--:--
     ---------------------------------------- 44.0/44.0 kB 1.1 MB/s eta 0:00:00
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from torch)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Downloading networkx-3.6-py3-none-any.whl.metadata (6.8 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=0.8.5 (from torch)
  Downloading fsspec-2025.10.0-py3-none-any.whl.metadata (10 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install pyarrow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from collections import defaultdict
import time

# 1. Setup Device
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"--> Using device: {device}")

# 2. Load Data
print("--> Loading data...")
try:
    df_pd = pd.read_parquet('aita_train.parquet')

    # --- SMOKE TEST CHANGE: SAMPLE AT MOST 200 ROWS ---
    print(f"    Original dataset size: {len(df_pd)}")
    # DataFrame.sample (without replacement) raises ValueError when n exceeds the
    # number of rows, so cap n at what is actually available.
    df_pd = df_pd.sample(n=min(200, len(df_pd)), random_state=42)
    print(f"    Reduced dataset size for testing: {len(df_pd)}")
    # -----------------------------------------------

except FileNotFoundError:
    # Keep the smoke test runnable without the real file: fabricate 200 rows that
    # cycle through all four verdicts.
    print("    Error: File not found. Generating dummy data.")
    df_pd = pd.DataFrame({
        'title': ['Test Title'] * 200,
        'text': ['Test content text.'] * 200,
        'verdict': ['nta', 'yta', 'esh', 'nah'] * 50
    })

# 3. Data Preprocessing
print("--> Preprocessing data...")
# Treat a missing title or body as an empty string. Concatenating a string with NaN
# would make the whole full_text NaN, which is not valid text input for the tokenizer.
df_pd['full_text'] = df_pd['title'].fillna('') + " " + df_pd['text'].fillna('')
# Collapse the four verdicts into a binary target: 0 = nta/nah, 1 = yta/esh.
label_map = {'nta': 0, 'nah': 0, 'yta': 1, 'esh': 1}
df_pd['label'] = df_pd['verdict'].map(label_map)
# Verdicts absent from label_map become NaN and are dropped. The explicit copy makes
# the filtered frame independent of its parent before a column is assigned on it.
df_pd = df_pd.dropna(subset=['label']).copy()
df_pd['label'] = df_pd['label'].astype(int)

# Split Data
# 80/20 train/test, then 10% of the training part is held out for validation;
# both splits are stratified on the binary label.
train_df, test_df = train_test_split(df_pd, test_size=0.2, stratify=df_pd['label'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=42)

print(f"    Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

# Compute Class Weights
# 'balanced' weights are derived from the training labels only and later passed to
# the loss function.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['label']),
    y=train_df['label']
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

# 4. Tokenization & Dataset Creation
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Encode the 'full_text' entries of ``examples`` with the BERT tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens; the
    length is kept short so the smoke test runs quickly.
    """
    texts = examples['full_text']
    return tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Wrap the train/validation frames as Hugging Face Datasets and tokenize them in batches.
# NOTE(review): test_df is not converted or tokenized in this smoke-test cell.
train_ds = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
val_ds = Dataset.from_pandas(val_df).map(tokenize_function, batched=True)

# Restrict the datasets to the columns the training loop reads, returned as torch tensors.
columns_to_return = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns_to_return)
val_ds.set_format(type='torch', columns=columns_to_return)

# DataLoaders
# Only the training loader shuffles.
# NOTE(review): val_loader is built here but the smoke run in this cell only calls train_epoch.
batch_size = 8 # Smaller batch size for test
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, batch_size=batch_size)

# 5. Model Initialization
print("--> Initializing BERT Model...")
# Seed torch's global RNG before any random draw so the newly initialized classifier
# head and the shuffled batch order are repeatable from run to run. GPU kernels may
# still introduce small nondeterminism.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# 6. Optimizer & Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 1  # --- SMOKE TEST CHANGE: ONLY 1 EPOCH ---
# The scheduler is stepped once per batch, so its horizon is batches-per-epoch * epochs.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# Class-weighted cross entropy; the weights come from the training label distribution.
loss_fn = nn.CrossEntropyLoss(weight=weights_tensor)

# 7. Training Loop
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes a ``.logits`` tensor.
        data_loader: Iterable of batches indexable by "input_ids",
            "attention_mask" and "label"; must support ``len()``.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor equal to
        correct predictions divided by the number of samples actually processed.
        ``mean_loss`` is the mean of the per-batch losses, or ``nan`` when the
        loader yielded no batch.
    """
    model = model.train()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below is valid even when
    # the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Count the samples actually seen: len(data_loader.dataset) over-counts when the
    # loader uses drop_last or a sampler that visits only part of the dataset.
    n_seen = 0
    n_batches = len(data_loader)

    # Discard gradients left over from a previous (e.g. interrupted) run so they
    # cannot leak into the first optimizer step.
    optimizer.zero_grad()

    for step, batch in enumerate(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, targets)
        batch_loss = loss.item()
        losses.append(batch_loss)

        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == targets)
        n_seen += targets.size(0)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Print every few batches
        if step % 5 == 0:
            print(f"    Batch {step}/{n_batches} | Loss: {batch_loss:.4f}")

    accuracy = correct_predictions.double() / max(n_seen, 1)
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

# Announce the smoke run and start the wall-clock timer.
print("\n--> Starting SMOKE TEST training loop (1 Epoch, 200 rows)...")
start_time = time.time()

# One pass over the training loader; arguments are spelled out by keyword.
train_acc, train_loss = train_epoch(
    model=model,
    data_loader=train_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=device,
    scheduler=scheduler,
)

# Report the epoch metrics and the elapsed time since start_time.
print(f'  -- Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
print(f'  -- Time elapsed: {(time.time() - start_time):.2f} seconds')
print("\n--> Test Complete! Pipeline is working.")

--> Using device: cuda
--> Loading data...
    Original dataset size: 60709
    Reduced dataset size for testing: 200
--> Preprocessing data...
    Train size: 144, Val size: 16, Test size: 40


Map: 100%|██████████| 144/144 [00:00<00:00, 196.89 examples/s]
Map: 100%|██████████| 16/16 [00:00<00:00, 212.44 examples/s]


--> Initializing BERT Model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--> Starting SMOKE TEST training loop (1 Epoch, 200 rows)...
    Batch 0/18 | Loss: 0.9346
    Batch 5/18 | Loss: 0.8585
    Batch 10/18 | Loss: 0.4706
    Batch 15/18 | Loss: 0.7197
  -- Train loss 0.6885 accuracy 0.7500
  -- Time elapsed: 2.80 seconds

--> Test Complete! Pipeline is working.


In [4]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from collections import defaultdict
import time # Added for timing

# 1. Setup Device
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"--> Using device: {device}")

# 2. Load Data
print("--> Loading data...")
try:
    df_pd = pd.read_parquet('aita_train.parquet')
except FileNotFoundError:
    # Without the parquet file, fall back to 100 synthetic rows cycling through
    # the four verdicts so the rest of the cell can still execute.
    print("    Error: 'aita_train.parquet' not found. Creating DUMMY data for testing.")
    df_pd = pd.DataFrame({
        'title': ['Example Title'] * 100,
        'text': ['Example text content.'] * 100,
        'verdict': ['nta', 'yta', 'esh', 'nah'] * 25
    })
else:
    print(f"    Data loaded successfully. Total rows: {len(df_pd)}")

# 3. Data Preprocessing
print("--> Preprocessing data...")
# Combine title and text
# A missing title or body is treated as an empty string. Concatenating a string with
# NaN would make the whole full_text NaN, which is not valid text input for the tokenizer.
df_pd['full_text'] = df_pd['title'].fillna('') + " " + df_pd['text'].fillna('')

# Map verdicts
# Binary target: 0 = nta/nah, 1 = yta/esh.
label_map = {'nta': 0, 'nah': 0, 'yta': 1, 'esh': 1}
df_pd['label'] = df_pd['verdict'].map(label_map)

# Drop rows with unmapped labels
# The explicit copy makes the filtered frame independent of its parent before a
# column is assigned on it.
df_pd = df_pd.dropna(subset=['label']).copy()
df_pd['label'] = df_pd['label'].astype(int)

print(f"    Label distribution: \n{df_pd['label'].value_counts()}")

# Split Data
# 80/20 train/test, then 10% of the training part is held out for validation;
# both splits are stratified on the binary label.
train_df, test_df = train_test_split(df_pd, test_size=0.2, stratify=df_pd['label'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=42)

print(f"    Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

# Compute Class Weights
# 'balanced' weights are derived from the training labels only and later passed to
# the loss function.
print("--> Computing Class Weights...")
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['label']),
    y=train_df['label']
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
print(f"    Class Weights: {class_weights}")

# 4. Tokenization & Dataset Creation
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Encode the 'full_text' entries of ``examples`` with the BERT tokenizer.

    Sequences are truncated and padded to a fixed length of 256 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(examples['full_text'], **encode_options)

# Convert to Hugging Face Datasets
# One Dataset per split, built directly from the pandas frames.
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

# Apply Tokenization
# batched=True hands tokenize_function a batch of rows per call rather than one row.
print("--> Tokenizing data (this might take a moment)...")
train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)
print("    Tokenization complete.")

# Set PyTorch Format
# Restrict each dataset to the columns the training/eval loops read, as torch tensors.
columns_to_return = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns_to_return)
val_ds.set_format(type='torch', columns=columns_to_return)
test_ds.set_format(type='torch', columns=columns_to_return)

# Create DataLoaders
# Only the training loader shuffles; validation and test keep dataset order.
# NOTE(review): test_loader is created here, but no cell visible in this notebook
# evaluates on it — confirm a later step consumes it.
batch_size = 16
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, batch_size=batch_size)
test_loader = DataLoader(test_ds, batch_size=batch_size)

# 5. Model Initialization
print("--> Initializing BERT Model...")
# Seed torch's global RNG before any random draw so the newly initialized classifier
# head and the shuffled batch order are repeatable from run to run. GPU kernels may
# still introduce small nondeterminism.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)
model.to(device)
print("    Model loaded to device.")

# 6. Optimizer & Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3
# The scheduler is stepped once per batch, so its horizon is batches-per-epoch * epochs.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# Class-weighted cross entropy; the weights come from the training label distribution.
loss_fn = nn.CrossEntropyLoss(weight=weights_tensor)

# 7. Define Training & Eval Functions
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, epoch_index):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes a ``.logits`` tensor.
        data_loader: Iterable of batches indexable by "input_ids",
            "attention_mask" and "label"; must support ``len()``.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        epoch_index: Zero-based epoch number, used only in the progress messages.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor equal to
        correct predictions divided by the number of samples actually processed.
        ``mean_loss`` is the mean of the per-batch losses, or ``nan`` when the
        loader yielded no batch.
    """
    model = model.train()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below is valid even when
    # the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Count the samples actually seen: len(data_loader.dataset) over-counts when the
    # loader uses drop_last or a sampler that visits only part of the dataset.
    n_seen = 0

    n_batches = len(data_loader)

    # Discard gradients left over from a previous (e.g. interrupted) run so they
    # cannot leak into the first optimizer step.
    optimizer.zero_grad()

    for step, batch in enumerate(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["label"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        loss = loss_fn(logits, targets)
        batch_loss = loss.item()
        losses.append(batch_loss)

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == targets)
        n_seen += targets.size(0)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Print status every 50 batches
        if step % 50 == 0:
            print(f"    [Epoch {epoch_index + 1}] Batch {step}/{n_batches} | Loss: {batch_loss:.4f}")

    accuracy = correct_predictions.double() / max(n_seen, 1)
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes a ``.logits`` tensor. It is switched to eval mode and
            left in that mode.
        data_loader: Iterable of batches indexable by "input_ids",
            "attention_mask" and "label".
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss tensor.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor equal to
        correct predictions divided by the number of samples actually processed.
        ``mean_loss`` is the mean of the per-batch losses, or ``nan`` when the
        loader yielded no batch.
    """
    model = model.eval()
    losses = []
    # Tensor accumulator (not a Python int) so `.double()` below is valid even when
    # the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Count the samples actually seen: len(data_loader.dataset) over-counts when the
    # loader uses drop_last or a sampler that visits only part of the dataset.
    n_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits
            loss = loss_fn(logits, targets)
            losses.append(loss.item())

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == targets)
            n_seen += targets.size(0)

    accuracy = correct_predictions.double() / max(n_seen, 1)
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

# 8. Main Training Loop
# Per-epoch metrics under the keys 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)

print("\n--> Starting training loop...")
for epoch in range(epochs):
    print(f'\nEpoch {epoch + 1}/{epochs}')
    print('-' * 10)

    # Times the training pass only; validation is not included in the figure printed below.
    start_time = time.time()

    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        epoch # Pass epoch number for printing
    )

    print(f'  -- Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
    print(f'  -- Time elapsed for training epoch: {(time.time() - start_time):.2f} seconds')
    print("  -- Validating...")

    val_acc, val_loss = eval_model(
        model,
        val_loader,
        loss_fn,
        device
    )

    print(f'  -- Val   loss {val_loss:.4f} accuracy {val_acc:.4f}')

    # Store plain Python floats. The accuracies come back as tensors on `device`;
    # keeping them would hold device memory and break plotting or serialising
    # `history` when the device is a GPU.
    history['train_acc'].append(float(train_acc))
    history['train_loss'].append(float(train_loss))
    history['val_acc'].append(float(val_acc))
    history['val_loss'].append(float(val_loss))

print("\n--> Training complete!")

--> Using device: cuda
--> Loading data...
    Data loaded successfully. Total rows: 60709
--> Preprocessing data...
    Label distribution: 
label
0    51492
1     9217
Name: count, dtype: int64
    Train size: 43710, Val size: 4857, Test size: 12142
--> Computing Class Weights...
    Class Weights: [0.58951258 3.29290342]
--> Tokenizing data (this might take a moment)...


Map: 100%|██████████| 43710/43710 [03:04<00:00, 237.13 examples/s]
Map: 100%|██████████| 4857/4857 [00:20<00:00, 235.00 examples/s]
Map: 100%|██████████| 12142/12142 [00:51<00:00, 236.84 examples/s]


    Tokenization complete.
--> Initializing BERT Model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


    Model loaded to device.

--> Starting training loop...

Epoch 1/3
----------
    [Epoch 1] Batch 0/2732 | Loss: 0.7604
    [Epoch 1] Batch 50/2732 | Loss: 0.6923
    [Epoch 1] Batch 100/2732 | Loss: 0.7951
    [Epoch 1] Batch 150/2732 | Loss: 0.6636
    [Epoch 1] Batch 200/2732 | Loss: 0.5866
    [Epoch 1] Batch 250/2732 | Loss: 0.6440
    [Epoch 1] Batch 300/2732 | Loss: 0.7351
    [Epoch 1] Batch 350/2732 | Loss: 0.6519
    [Epoch 1] Batch 400/2732 | Loss: 0.6502
    [Epoch 1] Batch 450/2732 | Loss: 0.5843
    [Epoch 1] Batch 500/2732 | Loss: 0.7247
    [Epoch 1] Batch 550/2732 | Loss: 0.7668
    [Epoch 1] Batch 600/2732 | Loss: 0.5812
    [Epoch 1] Batch 650/2732 | Loss: 0.6388
    [Epoch 1] Batch 700/2732 | Loss: 0.7513
    [Epoch 1] Batch 750/2732 | Loss: 0.6360
    [Epoch 1] Batch 800/2732 | Loss: 0.6402
    [Epoch 1] Batch 850/2732 | Loss: 0.6178
    [Epoch 1] Batch 900/2732 | Loss: 0.6750
    [Epoch 1] Batch 950/2732 | Loss: 0.6418
    [Epoch 1] Batch 1000/2732 | Loss: 0.65

In [2]:
pip uninstall torch torchvision torchaudio -y

Found existing installation: torch 2.9.1
Uninstalling torch-2.9.1:
  Successfully uninstalled torch-2.9.1
Note: you may need to restart the kernel to use updated packages.


You can safely remove it manually.


In [3]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-win_amd64.whl (2449.4 MB)
     ---------------------------------------- 0.0/2.4 GB ? eta -:--:--
     ---------------------------------------- 0.0/2.4 GB ? eta -:--:--
     ---------------------------------------- 0.0/2.4 GB ? eta -:--:--
     ---------------------------------------- 0.0/2.4 GB ? eta -:--:--
     ---------------------------------------- 0.0/2.4 GB ? eta -:--:--
     ---------------------------------------- 0.0/2.4 GB 233.8 kB/s eta 2:54:37
     ---------------------------------------- 0.0/2.4 GB 233.8 kB/s eta 2:54:37
     ---------------------------------------- 0.0/2.4 GB 233.8 kB/s eta 2:54:37
     ---------------------------------------- 0.0/2.4 GB 233.8 kB/s eta 2:54:37
     ---------------------------------------- 0.0/2.4 GB 233.8 kB/s eta 2:54:37
     ---------------------------------------- 0.0/2.4 GB 3


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
import sys

# Report the interpreter and the PyTorch build, including the CUDA version it was built with.
print(f"Python Version: {sys.version}")
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Version in PyTorch: {torch.version.cuda}")

# Try to see if system sees the GPU
try:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
except Exception:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate while any CUDA lookup failure is reported.
    print("GPU Name: None found")

Python Version: 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
PyTorch Version: 2.5.1+cu121
CUDA Available: True
CUDA Version in PyTorch: 12.1
GPU Name: NVIDIA GeForce RTX 3060 Laptop GPU
