In [None]:

# Mount Google Drive under /content/drive so later cells can read files stored there.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls /content/drive/MyDrive/DATA/ote_acd.csv

ls: cannot access '/content/drive/MyDrive/DATA/ote_acd.csv': No such file or directory


In [None]:
# Download the small Portuguese spaCy pipeline that spacy.load("pt_core_news_sm") needs in later cells.
# NOTE(review): the installer output asks for a runtime restart before the package can be loaded.
!python -m spacy download pt_core_news_sm


Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re
import pandas as pd
import unicodedata
import ast
import spacy

# ===============================
# Clase de limpieza de texto
# ===============================
class TextProcessing(object):
    """Text-normalisation helpers for review strings."""

    # Every character that is deleted outright, grouped as: symbols and currency,
    # punctuation and quotes, brackets and question marks, arithmetic operators.
    _STRIP_CHARS = (
        '©×⇔_»«~#$€Â�¬'
        ',;:!¡’‘”“"\'`'
        '}{[]()<>?¿°|'
        '/-+*=^%&'
    )
    # re.escape keeps characters such as ']' '^' and '-' literal inside the class.
    _STRIP_RE = re.compile('[' + re.escape(_STRIP_CHARS) + ']')
    # An integer or decimal number together with the whitespace that follows it.
    _NUMBER_RE = re.compile(r'\b\d+(?:\.\d+)?\s+')

    @staticmethod
    def remove_patterns(text: str) -> str:
        """Strip symbols and whitespace-terminated numbers from ``text`` and lowercase it.

        Args:
            text: the string to clean.

        Returns:
            The cleaned, lowercased string. If cleaning raises (for example
            because ``text`` is not a string), the error is printed and
            ``text`` is returned unchanged.
        """
        try:
            without_symbols = TextProcessing._STRIP_RE.sub('', text)
            without_numbers = TextProcessing._NUMBER_RE.sub('', without_symbols)
            return without_numbers.lower()
        except Exception as e:
            print('Error remove_patterns: {0}'.format(e))
            return text

# ===============================
# Load the full file
# ===============================
def _parse_literal(value):
    """Turn a string cell into the Python literal it spells; pass anything else through."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value


df = pd.read_csv(
    '/content/drive/MyDrive/Datas_Notebooks/ote_acd.csv',
    sep=';',
    encoding='utf-8-sig',
    on_bad_lines='skip',
)
df['aspect'] = df['aspect'].apply(_parse_literal)
df['text_clean'] = df['text'].apply(lambda raw: TextProcessing.remove_patterns(str(raw)))

# ===============================
# Select a sample
# ===============================
# First 2886 rows, copied so that columns added later do not touch `df`.
df_sample = df.head(2886).copy()

# ===============================
# spaCy pipeline used to extract noun chunks
# ===============================
nlp = spacy.load("pt_core_news_sm")

def extract_noun_chunks(text):
    """Return the noun chunks the module-level ``nlp`` pipeline finds in ``text``, joined by single spaces."""
    return ' '.join(chunk.text for chunk in nlp(text).noun_chunks)

# Rebuild each review from its noun chunks only and store the result in a new column
noun_chunk_text = df_sample['text_clean'].apply(extract_noun_chunks)
df_sample['text_noun_chunks'] = noun_chunk_text

# Print the first rows to check the result
preview_columns = ['text', 'text_noun_chunks', 'aspect', 'target_opinion_terms']
print(df_sample[preview_columns].head(10))


                                                text  \
0  Hotel com condições gerais muito más. Infraest...   
1  Hotel com condições gerais muito más. Infraest...   
2  Hotel com condições gerais muito más. Infraest...   
3  Hotel com condições gerais muito más. Infraest...   
4  Hotel com condições gerais muito más. Infraest...   
5  Hotel com condições gerais muito más. Infraest...   
6  Hotel com condições gerais muito más. Infraest...   
7  Hotel com condições gerais muito más. Infraest...   
8  O hotel segue corretamente os padrões do Mercu...   
9  O hotel segue corretamente os padrões do Mercu...   

                                    text_noun_chunks  \
0  hotel condições gerais infraestruturas bastant...   
1  hotel condições gerais infraestruturas bastant...   
2  hotel condições gerais infraestruturas bastant...   
3  hotel condições gerais infraestruturas bastant...   
4  hotel condições gerais infraestruturas bastant...   
5  hotel condições gerais infraestruturas basta

In [None]:
import ast


def _literal_if_str(value):
    """Evaluate a string as a Python literal; return non-strings unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


def _join_terms(terms):
    """Join a non-empty list of terms with ', '; anything else becomes None."""
    if isinstance(terms, list) and len(terms) > 0:
        return ', '.join(terms)
    return None


# Make sure every 'aspect' value is a dict: parse strings first, then replace anything else with {}
parsed_aspects = df_sample['aspect'].apply(_literal_if_str)
df_sample['aspect'] = parsed_aspects.apply(lambda value: value if isinstance(value, dict) else {})

# Pull the dictionary fields out into their own columns
for field in ('term', 'start_pos', 'end_pos'):
    df_sample[field] = df_sample['aspect'].apply(lambda aspect, key=field: aspect.get(key))

# Make sure each 'target_opinion_terms' entry is a real list rather than its string form
df_sample['target_opinion_terms'] = df_sample['target_opinion_terms'].apply(_literal_if_str)

# Join all opinion terms with commas
df_sample['opinion_term'] = df_sample['target_opinion_terms'].apply(_join_terms)

# Show the result
result_columns = ['text', 'term', 'start_pos', 'end_pos', 'opinion_term']
print(df_sample[result_columns].head(10))




                                                text                 term  \
0  Hotel com condições gerais muito más. Infraest...                Hotel   
1  Hotel com condições gerais muito más. Infraest...      Infraestruturas   
2  Hotel com condições gerais muito más. Infraest...             elevador   
3  Hotel com condições gerais muito más. Infraest...        casa de banho   
4  Hotel com condições gerais muito más. Infraest...                   tv   
5  Hotel com condições gerais muito más. Infraest...              cortina   
6  Hotel com condições gerais muito más. Infraest...  isolamento acústico   
7  Hotel com condições gerais muito más. Infraest...                 caro   
8  O hotel segue corretamente os padrões do Mercu...          localização   
9  O hotel segue corretamente os padrões do Mercu...  isolamento acústico   

   start_pos  end_pos  opinion_term  
0          0        5           más  
1         38       53    degradadas  
2         79       87           sem  


In [None]:
import spacy

# Load the Portuguese model (the same one loaded earlier in the notebook).
# NOTE(review): this rebinds the existing `nlp` name to a freshly loaded copy of the same model.
nlp = spacy.load("pt_core_news_sm")

def apply_bio_tagging(text, term, start_pos, end_pos):
    """Tokenize ``text`` with the module-level ``nlp`` pipeline and BIO-tag one aspect term.

    A token belongs to the term when its character span lies completely inside
    ``[start_pos, end_pos]``. The first such token is tagged ``B-TERM`` and every
    later one ``I-TERM``; all remaining tokens are tagged ``O``.

    Args:
        text: the review text the offsets refer to.
        term: the aspect term; when it is not a string the row has no term.
        start_pos: character offset where the term starts (None or NaN when missing).
        end_pos: character offset where the term ends (None or NaN when missing).

    Returns:
        A ``(tokens, bio_tags)`` tuple of two lists of equal length.
    """
    doc = nlp(text)
    tokens = [token.text for token in doc]

    # Without a usable term every token is outside. NaN is the only value that
    # differs from itself, which is how missing offsets show up in a float column.
    has_term = (
        isinstance(term, str)
        and start_pos is not None
        and end_pos is not None
        and start_pos == start_pos
        and end_pos == end_pos
    )
    if not has_term:
        return tokens, ['O'] * len(tokens)

    bio_tags = []
    inside_term = False
    for token in doc:
        token_start = token.idx
        token_end = token_start + len(token)

        if token_start >= start_pos and token_end <= end_pos:
            # The first token inside the span opens the term even when the
            # annotated start offset does not fall exactly on a token boundary;
            # otherwise the span would consist of I-TERM tags with no B-TERM.
            bio_tags.append('I-TERM' if inside_term else 'B-TERM')
            inside_term = True
        else:
            bio_tags.append('O')

    return tokens, bio_tags


In [None]:
# Tag every row: one new column receives the tokens, the other the matching BIO labels
def _tag_row(row):
    """Run apply_bio_tagging on one row and return its two lists as a two-element Series."""
    tokens, tags = apply_bio_tagging(row['text'], row['term'], row['start_pos'], row['end_pos'])
    return pd.Series([tokens, tags])


df_sample[['text_tokens', 'text_bio']] = df_sample.apply(_tag_row, axis=1)

# Look at a few examples
df_sample[['text_tokens', 'text_bio']].head()


Unnamed: 0,text_tokens,text_bio
0,"[Hotel, com, condições, gerais, muito, más, .,...","[B-TERM, O, O, O, O, O, O, O, O, O, O, O, O, O..."
1,"[Hotel, com, condições, gerais, muito, más, .,...","[O, O, O, O, O, O, O, B-TERM, O, O, O, O, O, O..."
2,"[Hotel, com, condições, gerais, muito, más, .,...","[O, O, O, O, O, O, O, O, O, O, O, O, B-TERM, O..."
3,"[Hotel, com, condições, gerais, muito, más, .,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-T..."
4,"[Hotel, com, condições, gerais, muito, más, .,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np

# Local import: only this cell needs the group-aware splitter.
from sklearn.model_selection import GroupShuffleSplit

label_list = ['O', 'B-TERM', 'I-TERM']
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Split before computing the weights, so test rows never influence them.
# The frame holds one row per (review, aspect) pair, so the same review text occurs in
# several rows. A row-level split puts copies of one review on both sides; grouping by
# the review text keeps every review entirely in train or entirely in test.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_rows, test_rows = next(group_splitter.split(df_sample, groups=df_sample['text']))
train_df = df_sample.iloc[train_rows]
test_df = df_sample.iloc[test_rows]

# Flatten the training tag sequences into one list of labels, then into label ids
all_train_labels = [tag for seq in train_df['text_bio'] for tag in seq]
train_label_ids = [label_to_id[tag] for tag in all_train_labels]

# Compute the weights on the training labels only. Passing the full id range (rather than
# the ids that happen to occur) keeps weight i aligned with label_list[i].
# NOTE(review): compute_class_weight is expected to raise if a label never occurs in training — confirm.
class_weights = compute_class_weight(
    'balanced',
    classes=np.arange(len(label_list)),
    y=train_label_ids,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Show the result
print("📊 Pesos de clase (calculados sobre entrenamiento):", dict(zip(label_list, class_weights)))


📊 Pesos de clase (calculados sobre entrenamiento): {'O': np.float64(0.3377534833237563), 'B-TERM': np.float64(29.9917139119058), 'I-TERM': np.float64(168.97051597051598)}


In [None]:
!pip install --upgrade transformers


Collecting transformers
  Using cached transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
Using cached transformers-4.52.3-py3-none-any.whl (10.5 MB)
Installing collected packages: transformers
Successfully installed transformers-4.52.3


In [None]:
from transformers import BertTokenizerFast
# Fast tokenizer for the "neuralmind/bert-base-portuguese-cased" checkpoint;
# tokenize_and_align_labels relies on the word_ids() mapping of its encodings.
tokenizer = BertTokenizerFast.from_pretrained("neuralmind/bert-base-portuguese-cased")

def tokenize_and_align_labels(example):
    """Encode one pre-tokenized example and attach a label id to every position.

    Positions that map to no word get -100. The first piece of a word gets the
    id of the word's BIO tag. Later pieces of the same word get the same id,
    except that a ``B-TERM`` word continues as ``I-TERM``.

    Args:
        example: mapping with 'text_tokens' (list of words) and 'text_bio'
            (one BIO tag per word).

    Returns:
        The tokenizer encoding with an added "labels" entry.
    """
    encoding = tokenizer(
        example['text_tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128,
    )

    labels = []
    last_word = None
    for word_idx in encoding.word_ids():
        if word_idx is None:
            label_id = -100
        else:
            tag = example['text_bio'][word_idx]
            continues_word = word_idx == last_word
            if continues_word and tag == 'B-TERM':
                label_id = label_to_id['I-TERM']
            else:
                label_id = label_to_id[tag]
        labels.append(label_id)
        last_word = word_idx

    encoding["labels"] = labels
    return encoding


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
!pip install datasets




In [None]:
from datasets import Dataset
# Wrap the training frame in a Hugging Face Dataset, then run tokenize_and_align_labels
# over it so every example carries the tokenizer outputs and the aligned "labels".
dataset = Dataset.from_pandas(train_df)
dataset = dataset.map(tokenize_and_align_labels)


Map:   0%|          | 0/2308 [00:00<?, ? examples/s]

In [None]:
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification
from transformers import EarlyStoppingCallback
import torch.nn as nn
from sklearn.model_selection import KFold
import numpy as np

# Custom Trainer that applies per-class weights to the token-classification loss
class CustomTrainer(Trainer):
    """Trainer whose loss is a cross-entropy over token labels, optionally class-weighted.

    Args:
        class_weights_tensor: 1-D float tensor with one weight per label id, or
            None to train with an unweighted loss.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, class_weights_tensor=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights_tensor = class_weights_tensor

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: the model being trained; called as ``model(**inputs)``.
            inputs: batch mapping; its "labels" entry is removed before the forward pass.
            return_outputs: when True, return ``(loss, outputs)`` instead of the loss alone.
            **kwargs: accepted and ignored, so extra arguments passed by the caller do not fail.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # The default class_weights_tensor=None means "no weighting"; only move
        # the weights to the logits' device when there are any.
        weight = None
        if self.class_weights_tensor is not None:
            weight = self.class_weights_tensor.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)

        # The number of classes is read from the logits, so this also works when
        # `model` is a wrapper that does not expose `.config`.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


# Custom metrics: label vocabulary and its two lookup tables
label_list = ['O', 'B-TERM', 'I-TERM']
id_to_label = dict(enumerate(label_list))
label_to_id = {label: idx for idx, label in id_to_label.items()}

def compute_metrics(p):
    """Compute token-level metrics, ignoring positions labelled -100.

    Args:
        p: a ``(predictions, labels)`` pair. ``predictions`` holds class scores
            with the classes on axis 2; ``labels`` holds the gold label ids with
            -100 at positions that must be ignored.

    Returns:
        A dict with 'accuracy' and macro-averaged 'precision', 'recall' and 'f1',
        plus '<label>_precision', '<label>_recall', '<label>_f1' and
        '<label>_support' for every entry of ``label_list``.
    """
    predictions, labels = p
    preds = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Keep only the positions that carry a real label (boolean mask keeps row-major order)
    keep = labels != -100
    true_labels = labels[keep]
    true_preds = preds[keep]

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_preds, average='macro', zero_division=0
    )
    acc = accuracy_score(true_labels, true_preds)

    # Per-label metrics. The label ids are passed explicitly: without them the report
    # derives the classes from the data and fails when a label is absent from both the
    # gold and the predicted ids of this evaluation set.
    label_ids = list(range(len(label_list)))
    report = classification_report(
        true_labels,
        true_preds,
        labels=label_ids,
        target_names=label_list,
        output_dict=True,
        zero_division=0,
    )
    detailed_metrics = {}
    for label in label_list:
        detailed_metrics[f'{label}_precision'] = report[label]['precision']
        detailed_metrics[f'{label}_recall'] = report[label]['recall']
        detailed_metrics[f'{label}_f1'] = report[label]['f1-score']
        detailed_metrics[f'{label}_support'] = report[label]['support']

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        **detailed_metrics
    }


# K-fold training
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
    print(f"\n🔁 Fold {fold + 1}")
    train_split = dataset.select(train_idx)
    val_split = dataset.select(val_idx)

    # Every fold starts from the pretrained checkpoint with a new classification head
    model = AutoModelForTokenClassification.from_pretrained(
        "neuralmind/bert-base-portuguese-cased",
        num_labels=len(label_list),
        id2label=id_to_label,
        label2id=label_to_id
    )

    # A fresh callback per fold, so nothing the callback tracks is shared between folds.
    # NOTE(review): the callback presumably keeps its patience counter on the instance — confirm.
    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=3,  # stop when the metric has not improved for 3 evaluations
        early_stopping_threshold=0.0
    )

    training_args = TrainingArguments(
        output_dir=f"./results_fold_{fold}",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        # `evaluation_strategy` is rejected by the installed transformers release
        # (TypeError: unexpected keyword argument); `eval_strategy` is the current name.
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=10,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=200,
        logging_dir=f"./logs_fold_{fold}",
        logging_steps=50,
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=val_split,
        tokenizer=tokenizer,
        class_weights_tensor=class_weights_tensor,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping]
    )

    trainer.train()
    eval_metrics = trainer.evaluate()
    print(f"\n📊 Métricas Fold {fold+1}: {eval_metrics}")




🔁 Fold 1


Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# Renumber the test rows 0..n-1 so the later cells can address them with test_df.loc[i, ...]
test_df = test_df.reset_index(drop=True)


In [None]:
# NOTE(review): this loop only rebinds `tokens` on every pass and keeps nothing. It looks like a
# leftover check that the labels 0..len(test_df)-1 resolve after the index reset — confirm before removing.
for i in range(len(test_df)):
    tokens = test_df.loc[i, 'text_tokens']  # or test_df.iloc[i]['text_tokens']


In [None]:
from datasets import Dataset

# Convert test_df to a Dataset and tokenize it with the same function used for the training data
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_and_align_labels)


In [None]:
# Run the model on the tokenized test set.
# NOTE(review): `trainer` is whatever the training loop assigned last, i.e. the trainer of the final fold.
predictions = trainer.predict(test_dataset)
logits = predictions.predictions  # class scores returned by the model
labels = predictions.label_ids  # gold label ids; -100 marks positions to skip when decoding
pred_ids = np.argmax(logits, axis=-1)  # highest-scoring label id at every position
true_ids = labels


In [None]:
def infer_start_end_positions(tokens, pred_tags, original_text):
    """Extract the tagged terms and their character positions from BIO-tagged tokens.

    Tokens are located in ``original_text`` from left to right. A ``B-TERM`` tag
    opens a term, directly following ``I-TERM`` tags extend it, and any other
    tag (or an ``I-TERM`` with no open term) closes it. The term text is the
    slice of ``original_text`` between the recorded offsets, so it always
    matches the returned positions even when the tokens are not separated by
    single spaces (for example around hyphens).

    Args:
        tokens: the tokens, in text order.
        pred_tags: one BIO tag per token.
        original_text: the text the tokens were taken from.

    Returns:
        A list of ``(term, start, end)`` tuples, where ``end`` is exclusive.
    """
    terms = []
    span_start = None
    span_end = None
    pointer = 0

    def close_span():
        """Store the open term, if any, and reset the span state."""
        nonlocal span_start, span_end
        if span_start is not None:
            terms.append((original_text[span_start:span_end].strip(), span_start, span_end))
        span_start = None
        span_end = None

    for token, tag in zip(tokens, pred_tags):
        norm_token = token.strip()

        # Skip whitespace so the search starts where the next token can begin
        while pointer < len(original_text) and original_text[pointer].isspace():
            pointer += 1

        idx = original_text.find(norm_token, pointer)
        if idx == -1:
            continue  # token not found in the text; leave the state untouched
        token_end = idx + len(norm_token)

        if tag == "B-TERM":
            close_span()
            # A token that is empty after stripping cannot open a term
            if norm_token:
                span_start, span_end = idx, token_end
        elif tag == "I-TERM" and span_start is not None:
            span_end = token_end
        else:
            close_span()
        pointer = token_end

    close_span()
    return terms


In [None]:
decoded_results = []

for i in range(len(test_df)):
    tokens = test_df.loc[i, 'text_tokens']
    text = test_df.loc[i, 'text']
    true_bio = test_df.loc[i, 'text_bio']

    # Re-encode with the same settings used to build test_dataset, to recover the
    # position -> word mapping for this row
    encoding = tokenizer(tokens, is_split_into_words=True, truncation=True, padding='max_length', max_length=128)
    word_ids = encoding.word_ids()

    pred_bio = []
    token_output = []
    previous_word_idx = None

    for j, word_idx in enumerate(word_ids):
        # Keep one prediction per word: the one made on the word's first piece.
        # tokenize_and_align_labels gives every piece a real label, so the -100 test
        # alone lets each word through once per piece, which repeats words and shifts
        # them against text_bio.
        is_first_piece = word_idx is not None and word_idx != previous_word_idx
        previous_word_idx = word_idx
        if not is_first_piece or true_ids[i][j] == -100:
            continue
        pred_tag = id_to_label[pred_ids[i][j]]
        token = tokens[word_idx]
        pred_bio.append(pred_tag)
        token_output.append(token)

    term_spans = infer_start_end_positions(token_output, pred_bio, text)

    decoded_results.append({
        'text': text,
        'text_tokens': token_output,
        # Truncation can drop trailing words; cut the gold tags to the words that were kept
        'text_bio': true_bio[:len(token_output)],
        'text_bio_prediction': pred_bio,
        'terms_extracted': [t[0] for t in term_spans],
        'start_positions': [t[1] for t in term_spans],
        'end_positions': [t[2] for t in term_spans]
    })


In [None]:
# Collect the decoded records into a frame and write them to an Excel workbook (without the index column)
df_results = pd.DataFrame(decoded_results)
df_results.to_excel("resultados_test_con_posiciones.xlsx", index=False)
print("✅ Resultados guardados con posiciones en 'resultados_test_con_posiciones.xlsx'")
