## 1. Check GPU Availability

In [None]:
import torch

# Report the runtime so the captured output documents the environment used.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# DEVICE is substituted into the experiment commands later on (--device).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

if DEVICE == "cpu":
    # No accelerator: tell the user how to enable one in Colab.
    print("‚ö†Ô∏è No GPU detected! Go to Runtime ‚Üí Change runtime type ‚Üí GPU")
else:
    # Describe device 0: its name and total memory in GB.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4
GPU Memory: 15.8 GB


## 2. Mount Google Drive (for persistent storage)

In [None]:
import sys
import os

# Attach Google Drive so that artifacts are written outside the Colab VM's disk
from google.colab import drive
drive.mount('/content/drive')

# Project root on Drive, with one sub-folder per artifact type
BASE_DIR = "/content/drive/MyDrive/ML_Sentiment_Analysis"
for artifact_subdir in ('checkpoints', 'results', 'plots'):
    os.makedirs(os.path.join(BASE_DIR, artifact_subdir), exist_ok=True)

print("‚úÖ Google Drive mounted!")
print(f"üìÅ Base directory: {BASE_DIR}")

IN_COLAB: True
üìÅ Using /content for storage
‚ö†Ô∏è Remember to download results before session ends!
BASE_DIR: /content


## 3. Clone/Upload Project

In [None]:
# Fetch a fresh copy of the project into /content and make it the working directory.
# Re-running this cell deletes the previous checkout first, including any local
# edits made inside it.
import os

# ==================== CONFIGURE YOUR REPO HERE ====================
GITHUB_REPO = "https://github.com/R3dP4ndaXD/sentiment_analysis.git"
# ==================================================================

# repo_name only decides the local folder name under /content; it is a separate
# literal and is not derived from GITHUB_REPO.
repo_name = "sentiment_analysis"
target_dir = f'/content/{repo_name}'

# Ensure we are in a stable directory before attempting to clone
# This helps avoid issues if the previous working directory was deleted
os.chdir('/content')

# Remove existing directory if it exists
if os.path.exists(target_dir):
    !rm -rf {target_dir}

# Clone from GitHub
# A failing `!` shell line does not raise in Python; if the clone did not create
# target_dir, the os.chdir below is what fails.
!git clone {GITHUB_REPO} {target_dir}

# Change to project directory (later cells use paths relative to it)
os.chdir(target_dir)
print(f"‚úÖ Cloned {GITHUB_REPO}")
print(f"üìÅ Working directory: {os.getcwd()}")

üì• Cloning from https://github.com/R3dP4ndaXD/sentiment_analysis...
‚úÖ Downloaded and extracted to: /content/sentiment_analysis

üìÅ Working directory: /content/sentiment_analysis


In [4]:
# Verify project structure: list the current working directory and its src/ folder.
# Both paths are relative, so this relies on the working directory already being
# the cloned project (set by os.chdir in the clone cell).
!ls -la
print("\nüìÇ Source directory:")
!ls -la src/

total 44
drwxr-xr-x 7 root root 4096 Jan  3 21:05 .
drwxr-xr-x 1 root root 4096 Jan  3 21:05 ..
-rw-r--r-- 1 root root 6714 Jan  3 21:05 cerinta.md
drwxr-xr-x 3 root root 4096 Jan  3 21:05 data
drwxr-xr-x 2 root root 4096 Jan  3 21:05 .github
-rw-r--r-- 1 root root  593 Jan  3 21:05 .gitignore
drwxr-xr-x 2 root root 4096 Jan  3 21:05 notebooks
-rw-r--r-- 1 root root  153 Jan  3 21:05 requirements.txt
drwxr-xr-x 9 root root 4096 Jan  3 21:05 results
drwxr-xr-x 8 root root 4096 Jan  3 21:05 src

üìÇ Source directory:
total 56
drwxr-xr-x 8 root root  4096 Jan  3 21:05 .
drwxr-xr-x 7 root root  4096 Jan  3 21:05 ..
-rw-r--r-- 1 root root  3687 Jan  3 21:05 config.py
drwxr-xr-x 2 root root  4096 Jan  3 21:05 data
drwxr-xr-x 2 root root  4096 Jan  3 21:05 embeddings
drwxr-xr-x 2 root root  4096 Jan  3 21:05 evaluate
drwxr-xr-x 2 root root  4096 Jan  3 21:05 models
drwxr-xr-x 2 root root  4096 Jan  3 21:05 preprocessing
-rw-r--r-- 1 root root 19259 Jan  3 21:05 run_experiment.py
drwxr-xr-x 2

## 4. Install Dependencies

In [5]:
# Install from requirements.txt if exists
!pip install -q -r requirements.txt 2>/dev/null || echo "No requirements.txt found"

# Install core dependencies
!pip install -q torch pandas scikit-learn matplotlib seaborn spacy

# Download Romanian spaCy model
!python -m spacy download ro_core_news_sm -q

print("‚úÖ Dependencies installed!")

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.9/12.9 MB[0m [31m127.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('ro_core_news_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
‚úÖ Dependencies installed!


## 5. Download Dataset

In [10]:
import os
import pandas as pd

# Create data directories
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

# Download ro_sent dataset
TRAIN_URL = "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/train.csv"
TEST_URL = "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/test.csv"

!wget -q -O data/raw/train.csv "{TRAIN_URL}" 2>/dev/null || echo "Downloading train.csv..."
!wget -q -O data/raw/test.csv "{TEST_URL}" 2>/dev/null || echo "Downloading test.csv..."

# Check if download succeeded, if not try alternative method
if not os.path.exists('data/raw/train.csv') or os.path.getsize('data/raw/train.csv') < 1000:
    print("Trying alternative download method...")
    # Use datasets library as fallback
    !pip install -q datasets
    from datasets import load_dataset
    dataset = load_dataset("dumitrescustefan/ro_sent")
    dataset['train'].to_pandas().to_csv('data/raw/train.csv', index=False)
    dataset['test'].to_pandas().to_csv('data/raw/test.csv', index=False)

# Verify download
train_df = pd.read_csv('data/raw/train.csv')
test_df = pd.read_csv('data/raw/test.csv')
print(f"‚úÖ Train samples: {len(train_df)}")
print(f"‚úÖ Test samples: {len(test_df)}")
print(f"\nColumns: {train_df.columns.tolist()}")
print(f"\nLabel distribution (train):")
print(train_df['label'].value_counts())

‚úÖ Train samples: 17941
‚úÖ Test samples: 11005

Columns: ['index', 'text', 'label']

Label distribution (train):
label
1    11094
0     6847
Name: count, dtype: int64


In [None]:
# Create train/val/test split
import os

from sklearn.model_selection import train_test_split

# Directory for the processed splits; the Simple RNN experiment cells pass it
# to the training script via --data_dir.
DATA_DIR = "data/processed"

# Share of the original training set held out for validation.
# NOTE(review): the previous comment described this split as 90/10 while the
# code used 0.15; the code's value is kept here — confirm 0.15 is intended.
VAL_FRACTION = 0.15

# Stratified split of the original training set into train/val (85/15),
# with a fixed random_state so the split is reproducible.
train_data, val_data = train_test_split(
    train_df,
    test_size=VAL_FRACTION,
    random_state=42,
    stratify=train_df['label']
)

# Save processed splits under DATA_DIR so the files always live where
# --data_dir points; the official test set is copied through unchanged.
os.makedirs(DATA_DIR, exist_ok=True)
train_data.to_csv(os.path.join(DATA_DIR, 'train.csv'), index=False)
val_data.to_csv(os.path.join(DATA_DIR, 'val.csv'), index=False)
test_df.to_csv(os.path.join(DATA_DIR, 'test.csv'), index=False)

print(f"‚úÖ Train: {len(train_data)} | Val: {len(val_data)} | Test: {len(test_df)}")

‚úÖ Train: 16146 | Val: 1795 | Test: 11005


## 6. Run Experiments

Configure and run your training experiments below.

In [None]:
# Paths for persistent storage on Google Drive.
# All of them are derived from BASE_DIR (defined in the Drive-mount cell) so the
# storage root is configured in exactly one place.
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
RESULTS_DIR = os.path.join(BASE_DIR, "results")
PLOTS_DIR = os.path.join(BASE_DIR, "plots")

# Create directories if they don't exist
for artifact_dir in (CHECKPOINT_DIR, RESULTS_DIR, PLOTS_DIR):
    os.makedirs(artifact_dir, exist_ok=True)

print(f"Checkpoint dir: {CHECKPOINT_DIR}")
print(f"Results dir: {RESULTS_DIR}")
print(f"Plots dir: {PLOTS_DIR}")

# ==================== EMBEDDINGS CONFIG ====================
# Download Romanian fastText embeddings (run once - ~4.5GB)
# !wget -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.bin.gz
# !gunzip cc.ro.300.bin.gz
# !mv cc.ro.300.bin {BASE_DIR}/

# Passed to the experiment commands as --pretrained_embeddings.
FASTTEXT_PATH = os.path.join(BASE_DIR, "cc.ro.300.bin")
# NOTE(review): FREEZE_EMBEDDINGS is not referenced by any command visible in this
# notebook; --freeze_embeddings only appears as a commented-out flag.
FREEZE_EMBEDDINGS = False
# ============================================================


Checkpoint dir: /content/checkpoints
Results dir: /content/results
Plots dir: /content/plots


### Experiment 1: LSTM Baseline

In [None]:
# Experiment 1: run the project's CLI module (src/run_experiment.py) with --model lstm.
# IPython substitutes DEVICE, CHECKPOINT_DIR, RESULTS_DIR, PLOTS_DIR and FASTTEXT_PATH
# from earlier cells into the shell command, so those cells must have been run first.
# NOTE(review): unlike the Simple RNN cells, no --data_dir is passed here, so the
# script presumably falls back to its own default data location — confirm it
# matches DATA_DIR.
!python -m src.run_experiment \
    --model lstm \
    --hidden_dim 256 \
    --num_layers 2 \
    --epochs 20 \
    --batch_size 64 \
    --lr 0.001 \
    --device {DEVICE} \
    --experiment_name lstm_baseline \
    --checkpoint_dir {CHECKPOINT_DIR} \
    --results_dir {RESULTS_DIR} \
    --plots_dir {PLOTS_DIR} \
    --pretrained_embeddings {FASTTEXT_PATH}

Running: python -m src.run_experiment --model lstm --hidden_dim 256 --num_layers 2 --epochs 20 --batch_size 64 --lr 0.001 --device cuda --experiment_name lstm_baseline --checkpoint_dir /content/checkpoints --results_dir /content/results --plots_dir /content/plots --pretrained_embeddings /content/cc.ro.300.bin
Working directory: /content/sentiment_analysis


KeyboardInterrupt: 

### Experiment 2: BiLSTM with Attention

In [None]:
# Experiment 2: same command line as the LSTM baseline except for --model
# (bilstm_attention) and --experiment_name.
# Requires DEVICE, CHECKPOINT_DIR, RESULTS_DIR, PLOTS_DIR and FASTTEXT_PATH from
# earlier cells; IPython substitutes them into the shell command.
# NOTE(review): no --data_dir is passed, so the script presumably uses its own
# default data location — confirm it matches DATA_DIR.
!python -m src.run_experiment \
    --model bilstm_attention \
    --hidden_dim 256 \
    --num_layers 2 \
    --epochs 20 \
    --batch_size 64 \
    --lr 0.001 \
    --device {DEVICE} \
    --experiment_name bilstm_attention \
    --checkpoint_dir {CHECKPOINT_DIR} \
    --results_dir {RESULTS_DIR} \
    --plots_dir {PLOTS_DIR} \
    --pretrained_embeddings {FASTTEXT_PATH}

Running: python -m src.run_experiment --model bilstm_attention --hidden_dim 256 --num_layers 2 --epochs 20 --batch_size 64 --lr 0.001 --device cuda --experiment_name bilstm_attention --checkpoint_dir /content/checkpoints --results_dir /content/results --plots_dir /content/plots


CompletedProcess(args=['python', '-m', 'src.run_experiment', '--model', 'bilstm_attention', '--hidden_dim', '256', '--num_layers', '2', '--epochs', '20', '--batch_size', '64', '--lr', '0.001', '--device', 'cuda', '--experiment_name', 'bilstm_attention', '--checkpoint_dir', '/content/checkpoints', '--results_dir', '/content/results', '--plots_dir', '/content/plots'], returncode=1)

### Experiment 3: LSTM with Augmentation

In [None]:
# Experiment 3: the LSTM baseline command plus two augmentation flags
# (--augment random_swap, --aug_prob 0.1), stored under the name lstm_augmented.
# Requires DEVICE, CHECKPOINT_DIR, RESULTS_DIR, PLOTS_DIR and FASTTEXT_PATH from
# earlier cells; IPython substitutes them into the shell command.
# NOTE(review): no --data_dir is passed, so the script presumably uses its own
# default data location — confirm it matches DATA_DIR.
!python -m src.run_experiment \
    --model lstm \
    --hidden_dim 256 \
    --num_layers 2 \
    --epochs 20 \
    --batch_size 64 \
    --lr 0.001 \
    --augment random_swap \
    --aug_prob 0.1 \
    --device {DEVICE} \
    --experiment_name lstm_augmented \
    --checkpoint_dir {CHECKPOINT_DIR} \
    --results_dir {RESULTS_DIR} \
    --plots_dir {PLOTS_DIR} \
    --pretrained_embeddings {FASTTEXT_PATH}

### Experiment 4: Simple RNN


In [None]:
# Simple RNN run without augmentation (--augment none), with the full set of
# command-line flags written out explicitly.
# Requires DEVICE, DATA_DIR, CHECKPOINT_DIR, PLOTS_DIR, RESULTS_DIR and FASTTEXT_PATH
# from earlier cells; IPython substitutes them into the shell command.
# NOTE(review): --experiment_name is the generic "custom_experiment", while the
# augmented variant below uses "simple_rnn_aug_eda" — consider a matching name.
!python -m src.run_experiment \
    --model simple_rnn \
    --embedding_dim 300 \
    --hidden_dim 256 \
    --num_layers 2 \
    --dropout 0.3 \
    --pooling last \
    --epochs 20 \
    --batch_size 64 \
    --lr 1e-3 \
    --weight_decay 1e-5 \
    --optimizer adamw \
    --scheduler plateau \
    --gradient_clip 1.0 \
    --max_seq_len 128 \
    --min_freq 2 \
    --max_vocab_size 50000 \
    --augment none \
    --pretrained_embeddings {FASTTEXT_PATH} \
    --early_stopping 5 \
    --checkpoint_metric val_f1 \
    --experiment_name custom_experiment \
    --device {DEVICE} \
    --data_dir {DATA_DIR} \
    --checkpoint_dir {CHECKPOINT_DIR} \
    --plots_dir {PLOTS_DIR} \
    --results_dir {RESULTS_DIR}

    # Optional flags, kept for reference only; they are comments and are not passed.
    # To use one, move it into the command above, before its last line, keeping the
    # trailing line-continuation backslash.
    # NOTE(review): the "/" after --evaluate_only looks like a typo for the
    # continuation backslash, and --resume has no continuation at all.
    #--bidirectional \
    #--freeze_embeddings \
    #--resume {PATH}
    #--evaluate_only /
    #--checkpoint {PATH} \
    #--verbose \
    #--no_plots \

### Experiment 5: Simple RNN with EDA Augmentation



In [None]:
# Simple RNN run with augmentation enabled. Compared with the non-augmented
# Simple RNN command, only the augmentation flags (--augment eda, --aug_prob 0.1,
# --aug_mode one_of) and --experiment_name differ.
# Requires DEVICE, DATA_DIR, CHECKPOINT_DIR, PLOTS_DIR, RESULTS_DIR and FASTTEXT_PATH
# from earlier cells; IPython substitutes them into the shell command.
!python -m src.run_experiment \
    --model simple_rnn \
    --embedding_dim 300 \
    --hidden_dim 256 \
    --num_layers 2 \
    --dropout 0.3 \
    --pooling last \
    --epochs 20 \
    --batch_size 64 \
    --lr 1e-3 \
    --weight_decay 1e-5 \
    --optimizer adamw \
    --scheduler plateau \
    --gradient_clip 1.0 \
    --max_seq_len 128 \
    --min_freq 2 \
    --max_vocab_size 50000 \
    --augment eda \
    --aug_prob 0.1 \
    --aug_mode one_of \
    --pretrained_embeddings {FASTTEXT_PATH} \
    --early_stopping 5 \
    --checkpoint_metric val_f1 \
    --experiment_name simple_rnn_aug_eda \
    --device {DEVICE} \
    --data_dir {DATA_DIR} \
    --checkpoint_dir {CHECKPOINT_DIR} \
    --plots_dir {PLOTS_DIR} \
    --results_dir {RESULTS_DIR}

    # Optional flags, kept for reference only; they are comments and are not passed.
    # To use one, move it into the command above, before its last line, keeping the
    # trailing line-continuation backslash.
    # NOTE(review): the "/" after --evaluate_only looks like a typo for the
    # continuation backslash, and --resume has no continuation at all.
    #--bidirectional \
    #--freeze_embeddings \
    #--resume {PATH}
    #--evaluate_only /
    #--checkpoint {PATH} \
    #--verbose \
    #--no_plots \

## 7. View Results

In [None]:
import json
import pandas as pd
from pathlib import Path

# Gather <RESULTS_DIR>/<experiment>/summary.json files into a list of dicts,
# tagging each record with the name of the folder it came from.
results_dir = Path(RESULTS_DIR)
summaries = []

for exp_dir in (results_dir.iterdir() if results_dir.exists() else ()):
    summary_file = exp_dir / 'summary.json'
    # Skip stray files and experiment folders that have no summary yet.
    if not (exp_dir.is_dir() and summary_file.exists()):
        continue
    with open(summary_file) as f:
        summary = json.load(f)
    summary['experiment'] = exp_dir.name
    summaries.append(summary)

if not summaries:
    print("No results found yet. Run experiments first!")
else:
    df = pd.DataFrame(summaries)
    banner = "=" * 60
    print("\n" + banner)
    print("EXPERIMENT RESULTS SUMMARY")
    print(banner)
    # Show the headline columns, best validation F1 first.
    report_columns = ['experiment', 'model', 'best_val_f1', 'best_val_acc', 'epochs_trained']
    display(df[report_columns].sort_values('best_val_f1', ascending=False))

In [None]:
# Render every PNG found under PLOTS_DIR, grouped by experiment folder
from IPython.display import Image, display
from pathlib import Path

plots_dir = Path(PLOTS_DIR)
if not plots_dir.exists():
    print(f"Plots directory not found: {PLOTS_DIR}")
else:
    # Only sub-folders count as experiments; both levels are visited in sorted order.
    experiment_dirs = [entry for entry in sorted(plots_dir.iterdir()) if entry.is_dir()]
    for exp_dir in experiment_dirs:
        print(f"\nüìä {exp_dir.name}")
        print("-" * 40)
        for plot in sorted(exp_dir.glob('*.png')):
            print(f"\n{plot.name}:")
            display(Image(filename=str(plot), width=600))

## 8. Download Results to Local Machine

In [None]:
# Zip and download all results
!zip -r /content/experiment_results.zip {RESULTS_DIR} {PLOTS_DIR}

from google.colab import files
files.download('/content/experiment_results.zip')
print("‚úÖ Results downloaded!")

# Note: Results are also saved to Google Drive at:
print(f"\nüìÅ Results persist in Google Drive:")
print(f"   {RESULTS_DIR}")
print(f"   {PLOTS_DIR}")