In [None]:
# Install necessary packages
!pip install transformers datasets seqeval mlflow --quiet

# Mount Google Drive to access data and save models
from google.colab import drive
drive.mount('/content/drive')

# Set paths - adjust to your paths in Google Drive or local
MODEL_OUTPUT_DIR = "/content/drive/MyDrive/ner_model_output"
TRAIN_FILE = "/content/drive/MyDrive/dataset/train.conll.txt"
EVAL_FILE = "/content/drive/MyDrive/dataset/eval.conll.txt"

import os
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)


In [None]:
# If you have your repo on GitHub:
!git clone https://github.com/your-username/your-repo.git
%cd your-repo

# Or upload your src folder manually to Colab, then:
import sys
sys.path.append("/content/your-repo/src")  # Adjust if needed


In [None]:
# Imports for this step (project modules resolve via the src folder on sys.path)
from train.trainer import NerTrainer, tokenize_and_align_labels
from models.ner_model import load_model_and_tokenizer
from datasets import load_dataset
import pandas as pd
from data.conll_loader import load_conll_data

# Read the train and eval files with the project's CoNLL loader
train_df, eval_df = (load_conll_data(path) for path in (TRAIN_FILE, EVAL_FILE))

# Tag inventory: "O" plus a B-/I- pair per entity type.
# A tag's position in this list is the integer id assigned to it below.
LABEL_LIST = [
    "O",
    "B-PRODUCT",
    "I-PRODUCT",
    "B-PRICE",
    "I-PRICE",
    "B-LOC",
    "I-LOC",
]
label2id = {label: idx for idx, label in enumerate(LABEL_LIST)}

# Load the base checkpoint with one output class per tag
model_name = "xlm-roberta-base"
model, tokenizer = load_model_and_tokenizer(
    model_name,
    num_labels=len(LABEL_LIST),
)

# Run both splits through the project's tokenize-and-align helper
train_dataset, eval_dataset = (
    tokenize_and_align_labels(frame, tokenizer, label2id)
    for frame in (train_df, eval_df)
)

# Build the trainer, run training, and report whatever train() returns
trainer = NerTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    output_dir=MODEL_OUTPUT_DIR,
)
metrics = trainer.train()
print("Training complete. Metrics:", metrics)


In [None]:
# Run the project's evaluation entry point against the output directory
# written by training, using the held-out eval file and the same tag list.
from eval.evaluate import evaluate

evaluate(model_path=MODEL_OUTPUT_DIR, data_path=EVAL_FILE, label_list=LABEL_LIST)

In [None]:
# List saved models
!ls -lh $MODEL_OUTPUT_DIR