# Pipeline-DeBERTa: Subtask 2 Submission Generation
## DimABSA 2026 - Track A

Trains the restaurant and laptop models, runs inference on the **test** data, and produces submission-ready JSONL files.

## Cell 1: Setup and Clone Repository

In [None]:
# Start from a clean checkout: remove any previous clone before cloning again.
%cd /kaggle/working
!rm -rf dimabsa-2026
!git clone https://github.com/VishalRepos/dimabsa-2026.git
# Work from the Pipeline-DeBERTa folder; the relative paths used in later cells start here.
%cd dimabsa-2026/Pipeline-DeBERTa

# Check that the train and test files used by the later cells are present.
import os

# Folder (relative to Pipeline-DeBERTa) that holds the English subtask-2 files.
test_dir = '../DimABSA2026/task-dataset/track_a/subtask_2/eng/'
required_files = (
    'eng_restaurant_test_task2.jsonl',
    'eng_laptop_test_task2.jsonl',
    'eng_restaurant_train_alltasks_filtered.jsonl',
    'eng_laptop_train_alltasks_filtered.jsonl',
)
for f in required_files:
    path = os.path.join(test_dir, f)
    # Print a check mark for files that exist and a cross for missing ones.
    if os.path.exists(path):
        status = '✓'
    else:
        status = '✗'
    print(f'{status} {f}')

## Cell 2: Install Dependencies

In [None]:
# Install the pinned transformers and torch versions (-q keeps pip's output quiet).
!pip install -q transformers==4.36.0 torch==2.1.0

import torch

# Report whether CUDA is usable and how many GPUs are visible.
has_cuda = torch.cuda.is_available()
gpu_count = torch.cuda.device_count()
print(f"CUDA: {has_cuda}, GPUs: {gpu_count}")
if has_cuda:
    # Name of the first device only.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Cell 3: Train Restaurant Model + Inference on Test Data

In [None]:
# Train on filtered restaurant data, inference on TEST file (not dev).
# Settings: task 2, domain "res", language "eng", microsoft/deberta-v3-base,
# 3 epochs, batch size 8.
# The script name is quoted because it contains '&', which the shell would
# otherwise treat as a control operator.
# Cell 5 looks for this run's predictions at
# tasks/subtask_2/pred_eng_restaurant.jsonl (presumably written by the script — confirm).
!python 'run_task2&3_trainer_multilingual.py' \
  --task 2 \
  --domain res \
  --language eng \
  --train_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_restaurant_train_alltasks_filtered.jsonl \
  --infer_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_restaurant_test_task2.jsonl \
  --bert_model_type microsoft/deberta-v3-base \
  --mode train \
  --epoch_num 3 \
  --batch_size 8

## Cell 4: Train Laptop Model + Inference on Test Data

In [None]:
# Train on filtered laptop data, inference on TEST file.
# Settings: task 2, domain "lap", language "eng", microsoft/deberta-v3-base,
# 3 epochs, batch size 8.
# The script name is quoted because it contains '&', which the shell would
# otherwise treat as a control operator.
# Cell 5 looks for this run's predictions at
# tasks/subtask_2/pred_eng_laptop.jsonl (presumably written by the script — confirm).
!python 'run_task2&3_trainer_multilingual.py' \
  --task 2 \
  --domain lap \
  --language eng \
  --train_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_laptop_train_alltasks_filtered.jsonl \
  --infer_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_laptop_test_task2.jsonl \
  --bert_model_type microsoft/deberta-v3-base \
  --mode train \
  --epoch_num 3 \
  --batch_size 8

## Cell 5: Verify Submission Files

In [None]:
import json
import os  # imported here as well so this cell does not depend on Cell 1 having run

# Prediction files this cell validates, keyed by a display name for the domain.
submission_files = {
    'Restaurant': 'tasks/subtask_2/pred_eng_restaurant.jsonl',
    'Laptop': 'tasks/subtask_2/pred_eng_laptop.jsonl',
}


def load_jsonl(path):
    """Return the JSON objects stored one per line in *path*.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of being handed to ``json.loads``. The file is read as
    UTF-8 so non-ASCII text decodes the same way on every platform.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        return [json.loads(line) for line in fh if line.strip()]


def count_va_errors(records):
    """Count triplets whose 'VA' value is malformed or out of range.

    A valid value is a string of the form ``'<v>#<a>'`` where both parts parse
    as floats and lie in the closed interval [1.0, 9.0].

    Args:
        records: iterable of dicts, each with a 'Triplet' list of dicts.

    Returns:
        The number of triplets that fail the check.
    """
    errors = 0
    for record in records:
        for triplet in record['Triplet']:
            try:
                v, a = map(float, triplet['VA'].split('#'))
            except (AttributeError, KeyError, TypeError, ValueError):
                # Missing key, non-string value, wrong number of parts, or
                # non-numeric text. Anything else (e.g. KeyboardInterrupt)
                # is deliberately allowed to propagate.
                errors += 1
                continue
            if not (1.0 <= v <= 9.0 and 1.0 <= a <= 9.0):
                errors += 1
    return errors


for domain, path in submission_files.items():
    if not os.path.exists(path):
        print(f'✗ {domain}: {path} NOT FOUND')
        continue

    data = load_jsonl(path)
    if not data:
        # Without this guard the average and data[0] below would raise.
        print(f'✗ {domain}: {path} is EMPTY')
        continue

    total_triplets = sum(len(d['Triplet']) for d in data)
    with_triplets = sum(1 for d in data if d['Triplet'])

    # Validate VA format
    va_errors = count_va_errors(data)

    print(f'\n✓ {domain}: {path}')
    print(f'  Samples: {len(data)}')
    print(f'  Total triplets: {total_triplets}')
    print(f'  Samples with triplets: {with_triplets}/{len(data)}')
    print(f'  Avg triplets/sample: {total_triplets/len(data):.2f}')
    print(f'  VA format errors: {va_errors}')
    print(f'  First prediction: {json.dumps(data[0], ensure_ascii=False)[:200]}')

## Cell 6: Cross-check IDs Against Test Files

In [None]:
from collections import Counter

# Official test inputs whose IDs the predictions must cover exactly.
test_files = {
    'Restaurant': '../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_restaurant_test_task2.jsonl',
    'Laptop': '../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_laptop_test_task2.jsonl',
}


def read_ids(path):
    """Return the 'ID' of every non-blank JSONL line in *path*, in file order.

    A list (not a set) is returned so that callers can detect repeated IDs.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        return [json.loads(line)['ID'] for line in fh if line.strip()]


all_ok = True
for domain, test_path in test_files.items():
    pred_path = submission_files[domain]
    if not os.path.exists(pred_path) or not os.path.exists(test_path):
        print(f'✗ {domain}: files missing')
        all_ok = False
        continue

    test_ids = set(read_ids(test_path))
    pred_id_list = read_ids(pred_path)
    pred_ids = set(pred_id_list)

    missing = test_ids - pred_ids
    extra = pred_ids - test_ids
    # A set comparison alone cannot see an ID that was predicted twice.
    duplicated = [i for i, n in Counter(pred_id_list).items() if n > 1]

    if not missing and not extra and not duplicated:
        print(f'✓ {domain}: All {len(test_ids)} IDs match')
    else:
        all_ok = False
        # Samples are sorted (by string form) so reruns print the same IDs.
        if missing:
            print(f'✗ {domain}: {len(missing)} missing IDs: {sorted(missing, key=str)[:5]}')
        if extra:
            print(f'✗ {domain}: {len(extra)} extra IDs: {sorted(extra, key=str)[:5]}')
        if duplicated:
            print(f'✗ {domain}: {len(duplicated)} duplicated IDs: {sorted(duplicated, key=str)[:5]}')

print(f'\n{"✓ SUBMISSION READY" if all_ok else "✗ FIX ISSUES ABOVE"}')

## Cell 7: Copy Submission Files to Output

In [None]:
import os
import shutil

# Copy the prediction files into a dedicated folder under /kaggle/working.
out_dir = '/kaggle/working/submission_subtask2'
os.makedirs(out_dir, exist_ok=True)

for domain, path in submission_files.items():
    if not os.path.exists(path):
        # Say so explicitly; otherwise a missing submission goes unnoticed here.
        print(f'✗ {domain}: {path} NOT FOUND, nothing copied')
        continue
    dest = os.path.join(out_dir, os.path.basename(path))
    shutil.copy2(path, dest)
    print(f'✓ Copied {path} → {dest}')

# Also save models
model_dir = '/kaggle/working/trained_models'
os.makedirs(model_dir, exist_ok=True)
for f in ['model/task2_res_eng.pth', 'model/task2_lap_eng.pth']:
    if not os.path.exists(f):
        print(f'✗ {f} NOT FOUND, skipped')
        continue
    shutil.copy2(f, os.path.join(model_dir, os.path.basename(f)))
    print(f'✓ Copied {f}')

print('\nDownload from Output panel →')
# List both output folders with human-readable sizes to confirm what was copied.
!ls -lh /kaggle/working/submission_subtask2/
!ls -lh /kaggle/working/trained_models/

---
## (Optional) Inference-Only Mode
Use the cell below instead of the training cells (Cells 3 and 4) if the models are already trained and uploaded as a Kaggle dataset.

In [None]:
# Template: uncomment every line below (remove one leading "# ") to use it.
# The imports are included because this cell uses os and shutil and may be
# run without the cells that normally import them.
# import os
# import shutil
#
# # Uncomment and set your model dataset path
# MODEL_DATASET = '/kaggle/input/your-model-dataset'
#
# # Copy pre-trained models into expected location
# # (a checkpoint missing from MODEL_DATASET is skipped without any message)
# os.makedirs('model', exist_ok=True)
# for f in ['task2_res_eng.pth', 'task2_lap_eng.pth']:
#     src = os.path.join(MODEL_DATASET, f)
#     if os.path.exists(src):
#         shutil.copy2(src, f'model/{f}')
#         print(f'✓ Loaded {f}')
#
# # NOTE(review): --train_data is still passed with --mode inference;
# # presumably the script requires the argument — confirm.
# # Run inference only - Restaurant
# !python 'run_task2&3_trainer_multilingual.py' \
#   --task 2 --domain res --language eng \
#   --train_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_restaurant_train_alltasks_filtered.jsonl \
#   --infer_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_restaurant_test_task2.jsonl \
#   --bert_model_type microsoft/deberta-v3-base \
#   --mode inference
#
# # Run inference only - Laptop
# !python 'run_task2&3_trainer_multilingual.py' \
#   --task 2 --domain lap --language eng \
#   --train_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_laptop_train_alltasks_filtered.jsonl \
#   --infer_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_laptop_test_task2.jsonl \
#   --bert_model_type microsoft/deberta-v3-base \
#   --mode inference