In [4]:
!pip install transformers accelerate peft bitsandbytes datasets sentencepiece scikit-learn torch pdfplumber gradio huggingface_hub

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.1-py3-non

In [2]:
from huggingface_hub import login

# Hugging Face login. It is called without arguments, so no token is written in
# the notebook itself (never commit a real token to a public repository).
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# --- 2 · Imports y configuración básica ---
import re
import unicodedata
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# --- 3 · Normalisation and clean-up helpers ---
# Regex character class listing the code points U+202F, U+00A0, U+2009, U+200A
# and U+2060; _normalise_spaces substitutes each match with a plain space.
_SPACE_VARIANTS = r"[\u202f\u00a0\u2009\u200a\u2060]"

def _normalise_apostrophes(text: str) -> str:
    return text.replace("´", "'").replace("’", "'")

def _normalise_spaces(text: str, collapse: bool = True) -> str:
    """Unify space variants, apply NFKC normalisation and trim the result.

    Args:
        text: Input string.
        collapse: When true, runs of two or more plain spaces become one space.

    Returns:
        The normalised string without leading or trailing whitespace.
    """
    unified = unicodedata.normalize("NFKC", re.sub(_SPACE_VARIANTS, " ", text))
    if not collapse:
        return unified.strip()
    return re.sub(r"[ ]{2,}", " ", unified).strip()

def _clean_timex(ent: str) -> str:
    ent = ent.replace("</s>", "").strip()
    return re.sub(r"[\.]+$", "", ent)

# --- 4 · First model (NER) ---
# Repository id handed to the tokenizer / token-classification loaders below.
MODEL1_ID = "Rhulli/Roberta-ner-temporal-expresions-secondtrain"
# Maps predicted class indices to the O / B-TIMEX / I-TIMEX tag names.
ID2LABEL = {0: "O", 1: "B-TIMEX", 2: "I-TIMEX"}

def load_ner_model():
    """Load the tokenizer and token-classification model named by ``MODEL1_ID``.

    The model is put in eval mode and moved to CUDA when a GPU is available.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    ner_tokenizer = AutoTokenizer.from_pretrained(MODEL1_ID)
    ner_model = AutoModelForTokenClassification.from_pretrained(MODEL1_ID)
    ner_model.eval()
    target = "cuda" if torch.cuda.is_available() else None
    if target is not None:
        ner_model.to(target)
    return ner_tokenizer, ner_model

def extract_timex(text: str, tokenizer, model):
    """Extract temporal expressions from ``text`` with a token-classification model.

    Args:
        text: Raw input text; apostrophes and spaces are normalised first.
        tokenizer: Tokenizer matching ``model``.
        model: Token-classification model whose class indices follow ``ID2LABEL``.

    Returns:
        list[str]: Cleaned entity strings in order of appearance. Entities that
        are empty after cleaning are omitted.

    Note:
        The text is tokenised with ``truncation=True``, so anything beyond the
        tokenizer's maximum length is not analysed.
    """
    text = _normalise_spaces(_normalise_apostrophes(text))
    # Follow the device of the model that was passed in, instead of assuming
    # that "CUDA is available" implies the model lives on the GPU.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # .tolist() yields plain Python ints for the ID2LABEL lookup.
    preds = torch.argmax(logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    spans = []      # one token list per detected entity, in order
    current = None  # token list of the entity being built, if any
    for tok, lab_id in zip(tokens, preds):
        lab = ID2LABEL.get(lab_id, "O")
        if lab == "B-TIMEX":
            # A new B tag always starts a fresh entity, closing any open one.
            current = [tok]
            spans.append(current)
        elif lab == "I-TIMEX" and current is not None:
            current.append(tok)
        else:
            # "O", or an I tag with no open entity: nothing to extend.
            current = None

    cleaned = (
        _clean_timex(tokenizer.convert_tokens_to_string(span).strip())
        for span in spans
    )
    # Drop entities that became empty after cleaning (e.g. a tagged "</s>").
    return [ent for ent in cleaned if ent]

# --- 5 · Second model (normalisation) ---
BASE_MODEL_ID = "google/gemma-2b-it"
ADAPTER_REPO_ID = "Rhulli/gemma-2b-it-TIMEX3"

# 4-bit NF4 quantisation settings with float16 as the compute dtype; passed as
# `quantization_config` when the base model is loaded.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

def load_norm_model():
    """Load the quantised base LM and wrap it with the adapter from ``ADAPTER_REPO_ID``.

    Returns:
        A ``(tokenizer, model)`` tuple: the tokenizer is loaded from the adapter
        repository and the model is the PEFT-wrapped base model in eval mode.
    """
    backbone = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID, quantization_config=quant_config, device_map="auto"
    )
    norm_tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID, use_fast=True)
    adapted = PeftModel.from_pretrained(backbone, ADAPTER_REPO_ID, device_map="auto")
    adapted.eval()
    return norm_tokenizer, adapted

def normalize_timex(expressions, dct, tokenizer, model):
    """Generate one normalisation string per temporal expression.

    Args:
        expressions: Iterable of expression strings.
        dct: Anchor date inserted into every prompt.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Generative model exposing ``device`` and ``generate``.

    Returns:
        list[str]: For each expression, the newly generated text cut at the
        first ``<end_of_turn>`` marker and stripped.
    """
    stop_marker = "<end_of_turn>"
    eos_id = tokenizer.convert_tokens_to_ids(stop_marker)

    def build_prompt(expr):
        # Chat-style prompt: a user turn with the task, then an open model turn.
        return (
            f"<start_of_turn>user\n"
            f"Tu tarea es normalizar la expresión temporal al formato TIMEX3, utilizando la fecha de anclaje (DCT) cuando sea necesaria.\n"
            f"Fecha de Anclaje (DCT): {dct}\n"
            f"Expresión Original: {expr}<end_of_turn>\n"
            f"<start_of_turn>model\n"
        )

    answers = []
    for expr in expressions:
        encoded = tokenizer(build_prompt(expr), return_tensors="pt").to(model.device)
        generated = model.generate(**encoded, max_new_tokens=64, eos_token_id=eos_id)
        # Decode only the tokens produced after the prompt.
        prompt_len = encoded.input_ids.shape[1]
        answer = tokenizer.decode(generated[0, prompt_len:], skip_special_tokens=False)
        answers.append(answer.partition(stop_marker)[0].strip())
    return answers

# --- 6 · Anchor date (DCT) and test sentences ---
# Anchor date passed to normalize_timex for every expression.
dct = "2025-06-11"
# Sample texts run through the pipeline below.
frases = [
    "Ms. Vance reports that the dry cough began approximately three months ago, around early March 2025. Initially, the cough was mild and occurred only intermittently, perhaps a few times each day, particularly in the morning upon waking. However, over the past four weeks, the cough has become more frequent and persistent, now occurring throughout the day and sometimes even at night, disrupting her sleep two to three times per week. She denies any fever or chills recently."


]

# --- 7 · Run the pipeline ---
# Load both models once, then print every detected expression together with
# its normalisation for each sample text.
ner_tok, ner_model = load_ner_model()
norm_tok, norm_model = load_norm_model()

separator = "-" * 40
for text in frases:
    print(f"Texto: {text}")
    entidades = extract_timex(text, ner_tok, ner_model)
    for ent in entidades:
        norm = normalize_timex([ent], dct, norm_tok, norm_model)[0]
        print(f"Expresión temporal: {ent}")
        print(f"Normalización: {norm}")
    print(separator)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/29.5M [00:00<?, ?B/s]

Texto: Ms. Vance reports that the dry cough began approximately three months ago, around early March 2025. Initially, the cough was mild and occurred only intermittently, perhaps a few times each day, particularly in the morning upon waking. However, over the past four weeks, the cough has become more frequent and persistent, now occurring throughout the day and sometimes even at night, disrupting her sleep two to three times per week. She denies any fever or chills recently.
Expresión temporal: early March 2025
Normalización: [TIMEX3 type="DATE" value="2025-03"]early March 2025[/TIMEX3]
Expresión temporal: each day
Normalización: [TIMEX3 type="SET" value="P1D" freq="1D"]each day[/TIMEX3]
Expresión temporal: morning
Normalización: [TIMEX3 type="TIME" value="T08:00"]morning[/TIMEX3]
Expresión temporal: past four weeks
Normalización: [TIMEX3 type="DATE" value="2025-05-25"]past four weeks[/TIMEX3]
Expresión temporal: night
Normalización: [TIMEX3 type="TIME" value="T00:00"]night[/TIMEX3]
E

In [5]:
import os
import re
import unicodedata
import torch
import gradio as gr
import pdfplumber
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import PeftModel
from huggingface_hub import login


# --- Normalisation and clean-up helpers ---
# Regex character class listing the code points U+202F, U+00A0, U+2009, U+200A
# and U+2060; _normalise_spaces substitutes each match with a plain space.
_SPACE_VARIANTS = r"[\u202f\u00a0\u2009\u200a\u2060]"

def _normalise_apostrophes(text: str) -> str:
    return text.replace("´", "'").replace("’", "'")

def _normalise_spaces(text: str, collapse: bool = True) -> str:
    """Replace space variants, NFKC-normalise, optionally squeeze spaces, then strip.

    Args:
        text: Input string.
        collapse: When true, runs of two or more plain spaces become one space.

    Returns:
        The normalised, stripped string.
    """
    with_plain_spaces = re.sub(_SPACE_VARIANTS, " ", text)
    result = unicodedata.normalize("NFKC", with_plain_spaces)
    result = re.sub(r"[ ]{2,}", " ", result) if collapse else result
    return result.strip()

def _clean_timex(ent: str) -> str:
    ent = ent.replace("</s>", "").strip()
    return re.sub(r"[\.]+$", "", ent)

# --- Model identifiers ---
# NER_ID / BASE_ID / ADAPTER_ID are the repository ids passed to the loaders in
# load_models; ID2LABEL maps predicted class indices to tag names.
NER_ID      = "Rhulli/Roberta-ner-temporal-expresions-secondtrain"
ID2LABEL    = {0: "O", 1: "B-TIMEX", 2: "I-TIMEX"}
BASE_ID     = "google/gemma-2b-it"
ADAPTER_ID  = "Rhulli/gemma-2b-it-TIMEX3"

# --- Quantisation settings for the normalisation model ---
# 4-bit NF4 quantisation with float16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

def load_models():
    """Load the NER model and the adapter-wrapped normalisation model.

    Every Hub request passes ``token=True`` so that the locally stored Hugging
    Face credentials are used.

    Returns:
        tuple: ``(ner_tok, ner_mod, norm_tok, norm_mod, base_mod)`` — the NER
        tokenizer and model, the normalisation tokenizer, the PEFT-wrapped
        model, and the quantised base model it wraps.
    """
    # `token` is the supported replacement for the deprecated `use_auth_token`.
    ner_tok = AutoTokenizer.from_pretrained(NER_ID, token=True)
    ner_mod = AutoModelForTokenClassification.from_pretrained(NER_ID, token=True)
    ner_mod.eval()
    if torch.cuda.is_available():
        ner_mod.to("cuda")

    # Base causal LM, loaded with the 4-bit quantisation settings.
    base_mod = AutoModelForCausalLM.from_pretrained(
        BASE_ID,
        quantization_config=quant_config,
        device_map="auto",
        token=True,
    )

    # Normalisation adapter on top of the base model; the tokenizer is taken
    # from the adapter repository.
    norm_tok = AutoTokenizer.from_pretrained(ADAPTER_ID, use_fast=True, token=True)
    norm_mod = PeftModel.from_pretrained(
        base_mod,
        ADAPTER_ID,
        device_map="auto",
        token=True,
    )
    norm_mod.eval()

    return ner_tok, ner_mod, norm_tok, norm_mod, base_mod

# Initial model load (module-level objects used by the functions below)
ner_tok, ner_mod, norm_tok, norm_mod, base_mod = load_models()
# Id of the "<end_of_turn>" token, passed as eos_token_id during generation.
eos_id = norm_tok.convert_tokens_to_ids("<end_of_turn>")

# --- Lectura de archivos ---
def read_file(file_obj) -> str:
    """Return the text content of a ``.pdf`` or plain-text file.

    Args:
        file_obj: A filesystem path (``str`` or ``os.PathLike``) or an object
            whose ``name`` attribute holds the path.

    Returns:
        For PDFs, the extracted text of every page that yields any, each page
        followed by a newline. For any other file, its bytes decoded as UTF-8,
        falling back to Latin-1 when they are not valid UTF-8.
    """
    # Accept plain paths as well as upload objects. Path-like objects are
    # checked first because pathlib's ``.name`` is only the final component.
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
    else:
        path = file_obj.name

    if path.lower().endswith('.pdf'):
        with pdfplumber.open(path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            # Pages without extractable text are skipped.
            return ''.join(f"{txt}\n" for txt in page_texts if txt)

    with open(path, 'rb') as f:
        data = f.read()
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this fallback cannot fail.
        return data.decode('latin-1')

# --- Procesamiento de texto ---
def extract_timex(text: str):
    """Extract temporal expressions from ``text`` with the module-level NER model.

    Args:
        text: Raw input text; apostrophes and spaces are normalised first.

    Returns:
        list[str]: Cleaned entity strings in order of appearance. Entities that
        are empty after cleaning are omitted.

    Note:
        The text is tokenised with ``truncation=True``, so anything beyond the
        tokenizer's maximum length is not analysed.
    """
    text_norm = _normalise_spaces(_normalise_apostrophes(text))
    # Put the inputs wherever the NER model actually lives.
    inputs = ner_tok(text_norm, return_tensors="pt", truncation=True).to(ner_mod.device)
    with torch.no_grad():
        logits = ner_mod(**inputs).logits

    # .tolist() yields plain Python ints for the ID2LABEL lookup.
    preds = torch.argmax(logits, dim=2)[0].tolist()
    tokens = ner_tok.convert_ids_to_tokens(inputs["input_ids"][0])

    spans = []      # one token list per detected entity, in order
    current = None  # token list of the entity being built, if any
    for tok, lab in zip(tokens, preds):
        tag = ID2LABEL.get(lab, "O")
        if tag == "B-TIMEX":
            # A new B tag always starts a fresh entity, closing any open one.
            current = [tok]
            spans.append(current)
        elif tag == "I-TIMEX" and current is not None:
            current.append(tok)
        else:
            # "O", or an I tag with no open entity: nothing to extend.
            current = None

    cleaned = (
        _clean_timex(ner_tok.convert_tokens_to_string(span).strip())
        for span in spans
    )
    # Drop entities that became empty after cleaning (e.g. a tagged "</s>").
    return [ent for ent in cleaned if ent]

def normalize_timex(expr: str, dct: str) -> str:
    """Generate the normalisation tag for one temporal expression.

    Args:
        expr: Temporal expression to normalise.
        dct: Anchor date inserted into the prompt.

    Returns:
        The newly generated text up to the first ``<end_of_turn>`` marker,
        stripped, with ``[`` / ``]`` turned into ``<`` / ``>``.
    """
    stop_marker = "<end_of_turn>"
    prompt = (
        f"<start_of_turn>user\n"
        f"Tu tarea es normalizar la expresión temporal al formato TIMEX3, utilizando la fecha de anclaje (DCT) cuando sea necesaria.\n"
        f"Fecha de Anclaje (DCT): {dct}\n"
        f"Expresión Original: {expr}<end_of_turn>\n"
        f"<start_of_turn>model\n"
    )
    encoded = norm_tok(prompt, return_tensors="pt").to(norm_mod.device)
    generated = norm_mod.generate(**encoded, max_new_tokens=64, eos_token_id=eos_id)

    # Decode only the tokens produced after the prompt.
    prompt_len = encoded.input_ids.shape[1]
    answer = norm_tok.decode(generated[0, prompt_len:], skip_special_tokens=False)
    bracket_tag = answer.partition(stop_marker)[0].strip()
    return bracket_tag.translate(str.maketrans("[]", "<>"))

# --- Pipeline principal ---
def run_pipeline(files, raw_text, dct):
    """Extract and normalise temporal expressions from free text and uploaded files.

    Args:
        files: A list of uploaded files, a single file, or a falsy value.
        raw_text: Free text typed by the user (may be empty).
        dct: Anchor date forwarded to ``normalize_timex``.

    Returns:
        tuple: ``(df, "")`` where ``df`` has the columns ``Expresión`` and
        ``Normalización``; rows from ``raw_text`` come first, then one file
        after another. The second element is always the empty string.
    """
    def rows_for(text):
        # Each non-blank line is analysed on its own.
        for line in text.splitlines():
            if not line.strip():
                continue
            for expr in extract_timex(line):
                yield {
                    'Expresión': expr,
                    'Normalización': normalize_timex(expr, dct)
                }

    if isinstance(files, list):
        file_list = files
    elif files:
        file_list = [files]
    else:
        file_list = []

    rows = []
    if raw_text:
        rows.extend(rows_for(raw_text))
    for f in file_list:
        rows.extend(rows_for(read_file(f)))

    df = pd.DataFrame(rows)
    if df.empty:
        # Keep the expected headers even when nothing was found.
        df = pd.DataFrame([], columns=['Expresión', 'Normalización'])

    return df, ""

# --- Gradio interface ---
# Layout: files + anchor date + run button on the left, free text on the right,
# results table, log box and CSV export underneath.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## TIMEX Extractor & Normalizer

        Esta aplicación permite extraer expresiones temporales de textos o archivos (.txt, .pdf)
        y normalizarlas a formato TIMEX3.

        **Cómo usar:**
        - Sube uno o varios archivos en la columna izquierda.
        - Ajusta la *Fecha de Anclaje (DCT)* justo debajo de los archivos.
        - Escribe o pega tu texto en la columna derecha.
        - Pulsa **Procesar** para ver los resultados en la tabla debajo.

        **Columnas de salida:**
        - *Expresión*: la frase temporal extraída.
        - *Normalización*: la etiqueta TIMEX3 generada.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            files     = gr.File(file_types=['.txt', '.pdf'], file_count='multiple', label='Archivos (.txt, .pdf)')
            dct_input = gr.Textbox(value="2025-06-11", label="Fecha de Anclaje (YYYY-MM-DD)")
            run_btn   = gr.Button("Procesar")
        with gr.Column(scale=2):
            raw_text  = gr.Textbox(lines=15, placeholder='Pega o escribe aquí tu texto...', label='Texto libre')

    output_table = gr.Dataframe(headers=['Expresión', 'Normalización'], label="Resultados", type="pandas")
    output_logs  = gr.Textbox(label="Logs", lines=5, interactive=False)

    download_btn    = gr.Button("Descargar CSV")
    csv_file_output = gr.File(label="Descargar resultados en CSV", visible=False)

    run_btn.click(
        fn=run_pipeline,
        inputs=[files, raw_text, dct_input],
        outputs=[output_table, output_logs]
    )

    def export_csv(df):
        """Write the results table to a CSV file and reveal the download widget.

        Args:
            df: Contents of the results table as a pandas DataFrame.

        Returns:
            A Gradio update that sets the file component's value to the CSV
            path and makes it visible.
        """
        import tempfile  # local import: only needed for the export

        # A fresh directory per export keeps concurrent users of the shared app
        # from overwriting each other's file, while the download keeps its name.
        csv_path = os.path.join(tempfile.mkdtemp(prefix="timex_"), "resultados.csv")
        df.to_csv(csv_path, index=False)
        return gr.update(value=csv_path, visible=True)

    download_btn.click(
        fn=export_csv,
        inputs=[output_table],
        outputs=[csv_file_output]
    )

# Launch once the layout context has been closed.
demo.launch()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/29.5M [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://91d52a14a0fe508c48.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
