In [39]:
# 📦 1. Install required dependencies
!pip install -q "dvc[gdrive]" transformers datasets mlflow seqeval
!pip install -q git+https://github.com/huggingface/transformers.git
!pip show transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Name: transformers
Version: 4.53.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers


In [None]:
import os
from getpass import getpass

# 1. Set API key and switch to online mode
# SECURITY: never commit an API key to a notebook. Any key that was previously
# stored in this cell must be treated as leaked and revoked.
# The key is taken from the environment when already set; otherwise it is
# requested interactively so that it never appears in the saved notebook.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")
os.environ["WANDB_MODE"] = "online"

In [40]:
# 🧠 2. Clone the repo from the correct branch
# 🚫 Remove the nested repo folder if it exists
!rm -rf amharic-telegram-ecommerce-ner

# ✅ Then re-clone cleanly
!git clone --branch task-3 https://github.com/Teshager21/amharic-telegram-ecommerce-ner.git

# Move into the clean project directory
%cd amharic-telegram-ecommerce-ner

Cloning into 'amharic-telegram-ecommerce-ner'...
remote: Enumerating objects: 234, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 234 (delta 24), reused 42 (delta 16), pack-reused 171 (from 1)[K
Receiving objects: 100% (234/234), 55.20 KiB | 1.00 MiB/s, done.
Resolving deltas: 100% (88/88), done.
/content/amharic-telegram-ecommerce-ner/amharic-telegram-ecommerce-ner/amharic-telegram-ecommerce-ner/amharic-telegram-ecommerce-ner/amharic-telegram-ecommerce-ner/amharic-telegram-ecommerce-ner/amharic-telegram-ecommerce-ner/amharic-telegram-ecommerce-ner


In [41]:
# 🔐 3. Upload service account key and configure DVC remote
from google.colab import files
import shutil, os
from pathlib import Path

# Upload the file
uploaded = files.upload()
filename = next(iter(uploaded))  # get the uploaded filename

# Move it to a safe path (avoid special characters)
key_path = Path("/content/sa_key.json")
shutil.move(filename, key_path)

# Configure DVC to use the uploaded key
!dvc remote modify gdrive_remote gdrive_use_service_account true
!dvc remote modify gdrive_remote gdrive_service_account_json_file_path {key_path}

Saving dvc-drive-remote-8f00f1ce2758.json to dvc-drive-remote-8f00f1ce2758.json
[0m[0m

In [42]:
# ✅ 4. Pull data using DVC (with service account)
import subprocess

def dvc_pull_with_feedback(check: bool = False) -> None:
    """Run ``dvc pull -v`` and print whether it succeeded.

    The command's output is captured rather than streamed. On success only a
    confirmation line is printed; on failure the captured stdout and stderr
    are both printed so the cause is visible.

    Args:
        check: When True, raise after reporting a failed pull so that later
            cells do not run against missing data. Defaults to False, which
            only prints the failure.

    Raises:
        RuntimeError: If ``check`` is True and ``dvc pull`` exits non-zero.
    """
    print("📥 Pulling DVC-tracked data from Google Drive remote...")
    result = subprocess.run(["dvc", "pull", "-v"], capture_output=True, text=True)
    if result.returncode == 0:
        print("✅ DVC pull successful.")
        return

    print("❌ DVC pull failed:")
    # stdout is captured too, so surface it as well instead of dropping it.
    if result.stdout:
        print(result.stdout)
    print(result.stderr)
    if check:
        raise RuntimeError(f"dvc pull exited with status {result.returncode}")

# Pull the data now. The helper reports the outcome by printing, so check the
# printed status before running the training cell.
dvc_pull_with_feedback()


📥 Pulling DVC-tracked data from Google Drive remote...
✅ DVC pull successful.


In [None]:
# ✅ Colab cell to run fine-tuning with updated Hydra config (no old keys)
# Runs the training entry point with src/ on PYTHONPATH and passes
# key=value overrides on the command line. The paths are relative, so this
# must be executed from the repository root.
# NOTE(review): output_dir here should match the model path used by the
# evaluation cells — confirm if either is changed.
!PYTHONPATH=src python src/train/main.py \
  model.name_or_path=xlm-roberta-base \
  training.epochs=3 \
  training.batch_size=16 \
  training.eval_strategy=epoch \
  data.train_file=data/labeled/train_split.conll \
  data.eval_file=data/labeled/eval_split.conll \
  output_dir=models/ner \
  logging.level=INFO

In [None]:
# 🧠 Evaluate NER Model
from src.eval.evaluate import evaluate

# Tag inventory passed to evaluate(): the outside tag "O" first, followed by a
# B-/I- pair for each entity type, in this exact order.
LABEL_LIST = ["O"] + [
    f"{prefix}-{entity}"
    for entity in ("PRODUCT", "PRICE", "LOC")
    for prefix in ("B", "I")
]

In [None]:
# Paths for model and test data (relative, so they resolve against the
# current working directory).
MODEL_PATH = "models/ner"  # Path to your fine-tuned model directory
# NOTE(review): this file is not one of the data files referenced by the
# training cell — confirm that it exists after `dvc pull`.
TEST_DATA_PATH = "data/processed/ner_test.conll"  # Path to your test dataset in CoNLL format


In [None]:
# Run evaluation
try:
    evaluate(
        model_path=MODEL_PATH,
        data_path=TEST_DATA_PATH,
        label_list=LABEL_LIST,
    )
except Exception as e:
    print(f"Evaluation failed: {e}")
    # Re-raise so the full traceback is shown and a "Run all" stops here
    # instead of continuing on to publish results from a failed evaluation.
    raise


In [None]:
# Launch MLflow UI in Colab
import subprocess

# Start the server as a detached child process. A trailing "&" on a "!" shell
# line is not a reliable way to background a process from a notebook: it is
# either rejected or keeps the cell blocked, depending on the front end.
# Keep the handle so the server can be stopped later with
# mlflow_ui_process.terminate().
mlflow_ui_process = subprocess.Popen(
    ["mlflow", "ui", "--port", "5000"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.STDOUT,
)

# The server listens on the runtime's local port 5000; this cell does not
# print a clickable link.
print("MLflow UI should be running on port 5000")


In [None]:
# 📊 View MLflow Run (Optional)
# Lists the contents of the local mlruns/ directory with human-readable sizes.
# The path is relative, so this must run from the directory that contains
# mlruns/ (presumably the repository root — confirm where training wrote it).
!ls -lh mlruns/

In [None]:
# Version the MLflow run directory with DVC and record the pointer file in git.
# Order matters: `dvc add` creates mlruns.dvc (and updates .gitignore), which
# the `git add` below then stages.
# NOTE(review): on a fresh Colab runtime `git commit` needs user.name and
# user.email configured, and `git push` needs credentials — confirm both are
# set up before relying on this cell.
!dvc add mlruns
!dvc push
!git add mlruns.dvc .gitignore
!git commit -m "feat:track MLflow experiment logs"
!git push