## 1. Setup Environment

In [None]:
# Detect the runtime: inside Colab the `google.colab` package is already loaded
import sys
IN_COLAB = 'google.colab' in sys.modules
print(f"Running in Colab: {IN_COLAB}")

# Pick the compute device once; later cells read DEVICE
import torch
has_gpu = torch.cuda.is_available()
DEVICE = "cuda" if has_gpu else "cpu"
print(f"Device: {DEVICE}")
if has_gpu:
    # Report the name of the first CUDA device
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Decide where the project lives; in Colab this requires mounting Google Drive first
if not IN_COLAB:
    PROJECT_PATH = ".."  # Local path
else:
    from google.colab import drive
    drive.mount('/content/drive')

    # Project folder inside the mounted Drive
    PROJECT_PATH = "/content/drive/MyDrive/ML_Sentiment_Analysis"

In [None]:
# Install required packages
!pip install -q nlpaug transformers sentencepiece spacy pandas tqdm

# Download spaCy Romanian model
!python -m spacy download ro_core_news_sm -q

In [None]:
# Clone/setup project (if in Colab and not already present)
import os

if IN_COLAB:
    # Option 1: Clone from GitHub (uncomment if using git)
    # !git clone https://github.com/YOUR_USERNAME/sentiment_analysis.git
    # %cd sentiment_analysis
    
    # Option 2: Use project from Drive
    %cd {PROJECT_PATH}
    
# Verify project structure
!ls -la

In [None]:
# Make the current working directory importable so the `src` package resolves
import sys
sys.path.insert(0, '.')

# Smoke-test the project imports that the rest of the notebook relies on
from src.data.augmentations import (
    back_translate,
    contextual_word_replacement,
)
from src.preprocessing.text import (
    tokenize,
    detokenize,
)
print("✓ Imports successful")

## 2. Explore Dataset

In [None]:
import pandas as pd

# Read the processed training split (path is relative to the working directory)
train_df = pd.read_csv("data/processed/train.csv")
print(f"Training samples: {len(train_df)}")

# Per-class sample counts; label 1 is reported as Positive, anything else as Negative
class_counts = train_df['label'].value_counts()
print("\nClass distribution:")
for label in class_counts.index:
    count = class_counts[label]
    pct = count / len(train_df) * 100
    if label == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"  {sentiment} (label={label}): {count} ({pct:.1f}%)")

# Ratio of the largest class to the smallest class
largest, smallest = class_counts.max(), class_counts.min()
imbalance_ratio = largest / smallest
print(f"\nImbalance ratio: {imbalance_ratio:.2f}:1")

In [None]:
# Print the first three texts of each class, truncated to 100 characters
negative_texts = train_df.loc[train_df['label'] == 0, 'text']
print("Sample NEGATIVE reviews:")
for text in negative_texts.head(3):
    print(f"  • {text[:100]}...")

positive_texts = train_df.loc[train_df['label'] == 1, 'text']
print("\nSample POSITIVE reviews:")
for text in positive_texts.head(3):
    print(f"  • {text[:100]}...")

## 3. Test Augmentation Functions

In [None]:
# Smoke-test back-translation on one Romanian sentence.
# Per the original note, the first run downloads the M2M100 model (~2GB).
test_text = "Produsul este foarte bun, recomand cu încredere!"
tokens = tokenize(test_text)
print(f"Original: {test_text}")
print(f"Tokens: {tokens}")

# Token list in -> augmented token list out -> joined back into a string
print("\nTesting back-translation (may take a minute on first run)...")
aug_tokens = back_translate(tokens, device=DEVICE)
aug_text = detokenize(aug_tokens)

print(f"Back-translated: {aug_text}")

In [None]:
# Smoke-test contextual word replacement.
# Reuses `test_text` and `tokens` from the back-translation cell.
print(f"Original: {test_text}")
print("\nTesting contextual word replacement...")

aug_tokens = contextual_word_replacement(
    tokens,
    n_replacements=2,
    device=DEVICE,
)
aug_text = detokenize(aug_tokens)
print(f"Contextual: {aug_text}")

## 4. Run Offline Augmentation

Choose your augmentation strategy:

| Strategy | Speed | Quality | Use Case |
|----------|-------|---------|----------|
| `back_translate` | Slow | High | Best paraphrasing |
| `contextual` | Medium | High | Semantic variation |
| `synonym` | Fast | Medium | Simple word replacement |
| `swap` + `delete` | Very Fast | Low | Structural noise |

In [None]:
# Settings for the offline augmentation run; the command-building cell reads these
AUGMENTATION_CONFIG = dict(
    input="data/processed/train.csv",
    output="data/augmented/train_balanced.csv",
    # Options: back_translate, contextual, synonym, swap, delete
    augment=["back_translate"],
    # Balance classes
    balance=True,
    # 1.0 = perfectly balanced
    target_ratio=1.0,
    # Only augment minority class
    minority_only=True,
    device=DEVICE,
    seed=42,
)

# Echo the configuration, one "key: value" line per setting
print("Augmentation Configuration:")
for option_name in AUGMENTATION_CONFIG:
    print(f"  {option_name}: {AUGMENTATION_CONFIG[option_name]}")

In [None]:
# Build the shell command for the offline augmentation script.
# The arguments are collected as a list and joined with shlex.join so every
# value is shell-quoted: a path containing spaces or shell metacharacters is
# passed to the script as one argument instead of being split or interpreted.
import shlex

cmd_args = [
    "python", "-m", "src.data.offline_augmentation",
    "--input", str(AUGMENTATION_CONFIG["input"]),
    "--output", str(AUGMENTATION_CONFIG["output"]),
    # One or more augmentation names, each passed as its own argument
    "--augment", *(str(name) for name in AUGMENTATION_CONFIG["augment"]),
    "--device", str(AUGMENTATION_CONFIG["device"]),
    "--seed", str(AUGMENTATION_CONFIG["seed"]),
]

# Optional flags are read with .get() so a missing key simply means "off"
if AUGMENTATION_CONFIG.get("balance"):
    cmd_args += ["--balance", "--target_ratio", str(AUGMENTATION_CONFIG["target_ratio"])]

if AUGMENTATION_CONFIG.get("minority_only"):
    cmd_args.append("--minority_only")

cmd = shlex.join(cmd_args)

print("Command to run:")
print(cmd)

In [None]:
# Run augmentation (this may take 30-60 minutes depending on dataset size and GPU)
# Executes the command string stored in `cmd` through IPython's `!` shell escape;
# `cmd` must already be defined by the command-building cell.
!{cmd}

## 5. Verify Results

In [None]:
# Read back the CSV written by the augmentation run and compare sizes
output_path = AUGMENTATION_CONFIG['output']
aug_df = pd.read_csv(output_path)

n_augmented = len(aug_df)
n_original = len(train_df)
print(f"Augmented dataset: {n_augmented} samples")
print(f"Original dataset: {n_original} samples")
# Growth factor of the augmented set relative to the original training set
print(f"Expansion: {n_augmented / n_original:.2f}x")

In [None]:
# Class distribution comparison: original vs augmented dataset, side by side
import os

import matplotlib.pyplot as plt

CLASS_NAMES = ['Negative (0)', 'Positive (1)']
CLASS_COLORS = ['#ff6b6b', '#51cf66']


def _plot_class_counts(ax, counts, total, title, label_offset=100):
    """Draw one bar per class on `ax` and annotate each bar.

    Args:
        ax: Matplotlib axes to draw on.
        counts: Per-class counts, ordered to match CLASS_NAMES (sorted by label).
        total: Number of rows in the dataset, used as the percentage denominator.
        title: Title for the axes.
        label_offset: Vertical gap (in count units) between bar top and its label.
    """
    ax.bar(CLASS_NAMES, counts.values, color=CLASS_COLORS)
    ax.set_title(title)
    ax.set_ylabel('Count')
    for i, v in enumerate(counts.values):
        ax.text(i, v + label_offset, f'{v}\n({v/total*100:.1f}%)', ha='center')


# sort_index() orders the counts by label so they line up with CLASS_NAMES
orig_counts = train_df['label'].value_counts().sort_index()
aug_counts = aug_df['label'].value_counts().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
_plot_class_counts(axes[0], orig_counts, len(train_df), 'Original Dataset')
_plot_class_counts(axes[1], aug_counts, len(aug_df), 'Augmented Dataset')

fig.tight_layout()
# savefig does not create missing directories; make sure plots/ exists first
os.makedirs('plots', exist_ok=True)
fig.savefig('plots/augmentation_balance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nImbalance ratio:")
print(f"  Original: {orig_counts.max()/orig_counts.min():.2f}:1")
print(f"  Augmented: {aug_counts.max()/aug_counts.min():.2f}:1")

In [None]:
# Show augmented vs original breakdown (only when the CSV carries an
# 'augmented' column)
import os

if 'augmented' in aug_df.columns:
    # Rows: label; columns: value of the 'augmented' column; cells: sample count
    aug_breakdown = aug_df.groupby(['label', 'augmented']).size().unstack(fill_value=0)
    print("Samples by type:")
    print(aug_breakdown)

    # Visualize on an explicit figure/axes instead of the implicit current figure
    fig, ax = plt.subplots(figsize=(8, 5))
    aug_breakdown.plot(kind='bar', stacked=True, ax=ax,
                       color=['#74c0fc', '#ffd43b'])
    ax.set_title('Original vs Augmented Samples by Class')
    ax.set_xlabel('Label')
    ax.set_ylabel('Count')
    ax.legend(['Original', 'Augmented'])
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Negative (0)', 'Positive (1)'], rotation=0)
    fig.tight_layout()
    # savefig does not create missing directories; make sure plots/ exists first
    os.makedirs('plots', exist_ok=True)
    fig.savefig('plots/augmentation_breakdown.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Print up to five augmented rows belonging to the training set's minority class
print("Sample AUGMENTED texts (minority class):")
print("="*60)

# The minority label is taken from the original training data
minority_label = train_df['label'].value_counts().idxmin()
is_minority = aug_df['label'] == minority_label
# When the 'augmented' column is absent, .get() yields False and nothing is selected
is_augmented = aug_df.get('augmented', False) == True
augmented_samples = aug_df[is_minority & is_augmented]

for _, row in augmented_samples.head(5).iterrows():
    print(f"\n[{row.get('aug_type', 'augmented')}]")
    # Texts longer than 150 characters are truncated and suffixed with "..."
    if len(row['text']) > 150:
        print(f"  {row['text'][:150]}...")
    else:
        print(f"  {row['text']}")

## 6. Copy to Drive (Optional)

In [None]:
# If running locally, copy augmented data to Drive for later use
if IN_COLAB:
    import shutil
    
    drive_output = "/content/drive/MyDrive/ML_Sentiment_Analysis/data/augmented/"
    os.makedirs(drive_output, exist_ok=True)
    
    # Copy files
    shutil.copy(output_path, drive_output)
    clean_path = output_path.replace('.csv', '_clean.csv')
    if os.path.exists(clean_path):
        shutil.copy(clean_path, drive_output)
    
    print(f"✓ Copied augmented data to {drive_output}")
    !ls -la {drive_output}

## 7. Next Steps

Now you can train your model on the augmented dataset:

```bash
# Use the augmented data directory
python -m src.run_experiment \
    --model bilstm \
    --data_dir data/augmented \
    --epochs 20 \
    --device cuda
```

**Note:** When using pre-augmented data:
- DON'T use `--balance_classes` (already balanced)
- DON'T use `--expand_factor` (already expanded)
- You CAN still use light online augmentation (`--augment swap`) for variety