In [None]:
# 📦 Environment Setup
!pip install -q transformers datasets mlflow seqeval
!pip install -q git+https://github.com/huggingface/transformers.git  # Optional: latest
!pip install -q dvc[gdrive]
!git clone https://github.com/Teshager21/amharic-telegram-ecommerce-ner.git
%cd amharic-telegram-ecommerce-ner

# 📁 Mount Google Drive (Optional)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ⚙️ DVC Setup (if needed to pull model/data from Google Drive)
!pip install -q dvc[gdrive]

In [None]:
# 📎 Upload your service account JSON
from google.colab import files

# The upload result is bound to `uploaded`. In the cells visible here nothing reads
# it before a later cell calls files.upload() again and rebinds the same name.
uploaded = files.upload()  # Select your `dvc-drive-remote-XXXX.json`

In [None]:
!dvc remote add -d gdrive_remote gdrive://1UveBgDaVcNQi1T-fLA-Kz03hPRqJ_ayk

In [None]:
# 💾 Upload service account key and point DVC to it

from google.colab import files

# Upload your service account JSON key file from local machine
uploaded = files.upload()

# Get the filename of the uploaded file (assumes single file upload)
filename = next(iter(uploaded))
key_path = f"/content/{filename}"

# Check that the DVC remote exists before modifying
!dvc remote list

# Modify the DVC remote config to use the uploaded service account key
!dvc remote modify gdrive_remote gdrive_service_account_json_file_path {key_path}

print(f"✅ DVC remote 'gdrive_remote' now uses service account JSON at: {key_path}")


In [None]:
# 🔍 Check DVC remote config before pulling data
# Prints the remotes DVC currently knows about.
!dvc remote list

# Optionally, check detailed config
# Dumps the repository-level DVC config file verbatim.
!cat .dvc/config

# Pull data and model if remote exists
import subprocess

def dvc_pull_if_remote_exists(remote_name="gdrive_remote"):
    """Run ``dvc pull`` against ``remote_name`` if that remote is configured.

    Remote names are read from the first whitespace-separated token of every
    non-blank line printed by ``dvc remote list``.

    Args:
        remote_name: Name of the DVC remote that must exist and is pulled from.

    Returns:
        None. Progress, the captured ``dvc pull`` output and any error text
        are reported with ``print``.
    """
    listing = subprocess.getoutput("dvc remote list")
    # Blank or whitespace-only lines have no first token; skipping them avoids
    # the IndexError that ``line.split()[0]`` would raise.
    remote_names = {
        tokens[0] for tokens in map(str.split, listing.splitlines()) if tokens
    }
    if remote_name not in remote_names:
        print(f"❌ Remote '{remote_name}' not found. Please configure your DVC remote first.")
        return

    print(f"✅ Found remote '{remote_name}'. Pulling data...")
    # Pull explicitly from the remote that was just checked rather than from
    # whichever remote happens to be the default.
    # NOTE(review): confirm DVC honours DVC_GDRIVE_USE_SERVICE_ACCOUNT; the
    # documented switch is the remote option `gdrive_use_service_account`.
    result = subprocess.run(
        ["dvc", "pull", "-r", remote_name],
        env={**os.environ, "DVC_GDRIVE_USE_SERVICE_ACCOUNT": "true"},
        capture_output=True,
        text=True,
    )
    print(result.stdout)
    if result.returncode != 0:
        print("❌ Error during dvc pull:")
        print(result.stderr)

# `os` has to be bound before the call below: the function reads os.environ
# when it runs, not when it is defined.
import os
# Called with the default remote name, "gdrive_remote".
dvc_pull_if_remote_exists()

In [None]:
# Colab cell to run your fine-tuning main.py with Hydra config overrides

!python src/train/main.py \
  model.name_or_path=xlm-roberta-base \
  training.epochs=3 \
  training.batch_size=16 \
  data.conll_file=data/labeled/train.conll \
  output_dir=models/ner \
  logging.level=INFO


In [None]:
# 🧠 Evaluate NER Model
from src.eval.evaluate import evaluate

# Tag inventory: the outside tag "O" first, then a B-/I- pair for each entity type
# in the order PRODUCT, PRICE, LOC.
LABEL_LIST = ["O"] + [
    f"{prefix}-{entity}"
    for entity in ("PRODUCT", "PRICE", "LOC")
    for prefix in ("B", "I")
]

In [None]:
# Inputs for the evaluation step.

# Test dataset in CoNLL format.
TEST_DATA_PATH = "data/processed/ner_test.conll"

# Directory containing the fine-tuned model.
MODEL_PATH = "models/ner"


In [None]:
# Run evaluation
import traceback

try:
    evaluate(
        model_path=MODEL_PATH,
        data_path=TEST_DATA_PATH,
        label_list=LABEL_LIST
    )
except Exception as e:
    # Deliberately non-fatal so the remaining cells can still run, but print the
    # full traceback as well: the one-line message alone hides where it failed.
    print(f"Evaluation failed: {e}")
    traceback.print_exc()


In [None]:
# Launch MLflow UI in Colab
import subprocess

from google.colab import output

MLFLOW_PORT = 5000

# Start the server as a child process of the kernel instead of `!mlflow ui ... &`,
# so it keeps running after this cell finishes and does not depend on the
# temporary shell that `!` spawns. Its log output is discarded.
mlflow_ui = subprocess.Popen(
    ["mlflow", "ui", "--port", str(MLFLOW_PORT)],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.STDOUT,
)

# To open the UI, click the link below after running this cell:
print(f"MLflow UI should be running on port {MLFLOW_PORT}")
# The kernel's localhost is not reachable from the browser; this renders a link
# that proxies the port through Colab.
output.serve_kernel_port_as_window(MLFLOW_PORT)


In [None]:
# 📊 View MLflow Run (Optional)
# Long-format listing, with human-readable sizes, of the mlruns/ directory
# relative to the current working directory.
!ls -lh mlruns/