# Pipeline-DeBERTa Training with Vector Extraction
## DimABSA 2026 - Subtask 2

## Cell 1: Setup and Clone Repository

In [None]:
# Start from the Kaggle working directory so the clone lands in a known place.
%cd /kaggle/working
# Remove any previous checkout first; git clone refuses to write into an
# existing non-empty directory, so this keeps the cell re-runnable.
!rm -rf dimabsa-2026
!git clone https://github.com/VishalRepos/dimabsa-2026.git
# All later cells use paths relative to this directory
# (e.g. ../DimABSA2026/... and model/...).
%cd dimabsa-2026/Pipeline-DeBERTa

# Verify data: list the English subtask 2 dataset directory that the
# training and extraction cells read from.
!ls -lh ../DimABSA2026/task-dataset/track_a/subtask_2/eng/

## Cell 2: Install Dependencies

In [None]:
!pip install -q transformers==4.36.0 torch==2.1.0

import torch
import transformers
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Cell 3: Training

In [None]:
!python run_task2&3_trainer_multilingual.py \
  --task 2 \
  --domain res \
  --language eng \
  --train_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_restaurant_train_alltasks.jsonl \
  --infer_data ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_restaurant_dev_task2.jsonl \
  --bert_model_type microsoft/deberta-v3-base \
  --mode train \
  --epoch_num 3 \
  --batch_size 8

## Cell 4: Extract Vectors from Training Data

In [None]:
# Create the output directory for the extracted vectors.
# -p makes this a no-op when the directory already exists, so the cell can be re-run.
!mkdir -p /kaggle/working/vectors

# Extract vectors from the training data and write them to train_vectors.jsonl.
# NOTE(review): model/task2_eng_res_best.pth is presumably the checkpoint written
# by the training cell - confirm the trainer's output path.
# --device cuda requires a GPU-enabled session.
!python extract_vectors.py \
  --model_path model/task2_eng_res_best.pth \
  --data_file ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_restaurant_train_alltasks.jsonl \
  --output_file /kaggle/working/vectors/train_vectors.jsonl \
  --device cuda

## Cell 5: Extract Vectors from Test Data (Dev Split)

In [None]:
# Extract vectors from the held-out data. The input is the dev split
# (eng_restaurant_dev_task2.jsonl); the output is named test_vectors.jsonl.
# Uses the same checkpoint and device as the training-data extraction and
# writes into /kaggle/working/vectors/, which must already exist.
!python extract_vectors.py \
  --model_path model/task2_eng_res_best.pth \
  --data_file ../DimABSA2026/task-dataset/track_a/subtask_2/eng/eng_restaurant_dev_task2.jsonl \
  --output_file /kaggle/working/vectors/test_vectors.jsonl \
  --device cuda

## Cell 6: Verify Vectors

In [None]:
import json
import numpy as np

# Load and check train vectors
with open('/kaggle/working/vectors/train_vectors.jsonl', 'r') as f:
    train_vectors = [json.loads(line) for line in f]

print(f"Total training vectors: {len(train_vectors)}")
print(f"Vector dimension: {len(train_vectors[0]['vector'])}")
print(f"\nFirst sample:")
print(f"  ID: {train_vectors[0]['id']}")
print(f"  Text: {train_vectors[0]['text'][:50]}...")
print(f"  Vector shape: {len(train_vectors[0]['vector'])}")
if 'aspect' in train_vectors[0]:
    print(f"  Aspect: {train_vectors[0]['aspect']}")
    print(f"  Opinion: {train_vectors[0]['opinion']}")
    print(f"  VA: {train_vectors[0]['valence']:.2f}#{train_vectors[0]['arousal']:.2f}")

# Check test vectors
with open('/kaggle/working/vectors/test_vectors.jsonl', 'r') as f:
    test_vectors = [json.loads(line) for line in f]

print(f"\nTotal test vectors: {len(test_vectors)}")

# File sizes
!ls -lh /kaggle/working/vectors/

## Cell 7: Save Model and Vectors

In [None]:
import shutil

# Copy model
shutil.copytree('model', '/kaggle/working/trained_pipeline_model')

# Vectors already in /kaggle/working/vectors/

print("✓ Model saved to: /kaggle/working/trained_pipeline_model")
print("✓ Vectors saved to: /kaggle/working/vectors/")
print("\nDownload from Output panel →")

!ls -lh /kaggle/working/

## Cell 8: Optional - Visualize Vectors (Sample)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Work on the first 100 records loaded by the verification cell.
# Records without a 'valence' field are coloured as 5.0.
sample_records = train_vectors[:100]
vectors = np.array([record['vector'] for record in sample_records])
valences = [record.get('valence', 5.0) for record in sample_records]

# Project the vectors onto their first two principal components.
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(vectors)

# Scatter the 2-D projection, coloured by valence.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    vectors_2d[:, 0],
    vectors_2d[:, 1],
    c=valences,
    cmap='RdYlGn',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label='Valence')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Vector Embeddings (PCA) colored by Valence')
ax.grid(True, alpha=0.3)
plt.show()

print(f"Explained variance: {pca.explained_variance_ratio_}")