Paso 1

In [37]:
!pip install torch torchvision
!pip install tqdm



In [66]:
import os
import requests
import zipfile

# URL del archivo GloVe
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = "glove.6B.zip"
glove_extract_path = "glove"

# Descarga el archivo zip de GloVe si no existe
if not os.path.exists(glove_zip_path):
    print("Descargando GloVe...")
    response = requests.get(glove_url, stream=True)
    with open(glove_zip_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

# Extrae el contenido del archivo zip
if not os.path.exists(glove_extract_path):
    print("Extrayendo GloVe...")
    with zipfile.ZipFile(glove_zip_path, "r") as zip_ref:
        zip_ref.extractall(glove_extract_path)

print("GloVe descargado y extraído correctamente.")

GloVe descargado y extraído correctamente.


Paso 2

In [67]:
import numpy as np

def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Carga de un archivo de embeddings
embedding_file = "glove/glove.6B.100d.txt"
concept_embeddings = load_embeddings(embedding_file)

Paso 3

In [68]:
def retrofit(embeddings, lexicon, alpha=0.5, iterations=10):
    retrofitted_embeddings = embeddings.copy()
    for _ in range(iterations):
        for word, neighbors in lexicon.items():
            if word not in embeddings:
                continue
            updated_vector = (alpha * embeddings[word])
            count = 0
            for neighbor in neighbors:
                if neighbor in embeddings:
                    updated_vector += ((1 - alpha) * embeddings[neighbor])
                    count += 1
            if count > 0:
                retrofitted_embeddings[word] = updated_vector / (count + alpha)
    return retrofitted_embeddings

# Ejemplo de léxico semántico
semantic_lexicon = {
    "person": ["human", "man", "woman", "boy", "girl", "child", "adult", "worker", "teacher", "student"],
    "city": ["metropolis", "town", "urban", "capital", "village", "borough", "municipality", "district", "region", "suburb"],
    "state": ["province", "region", "territory", "area", "zone", "division", "domain", "realm", "land", "nation"],
    "country": ["nation", "sovereign", "republic", "kingdom", "state", "land", "territory", "empire", "federation", "domain"],
    "organization": ["company", "corporation", "institution", "agency", "firm", "society", "association", "foundation", "enterprise", "business"],
    "animal": ["dog", "cat", "lion", "tiger", "elephant", "zebra", "giraffe", "horse", "cow", "sheep"],
    "vehicle": ["car", "truck", "bus", "bicycle", "motorcycle", "train", "airplane", "boat", "ship", "scooter"],
    "technology": ["computer", "internet", "software", "hardware", "mobile", "robot", "AI", "blockchain", "cloud", "server"],
    "emotion": ["happiness", "sadness", "anger", "fear", "love", "joy", "grief", "envy", "shame", "pride"],
    "food": ["bread", "milk", "cheese", "meat", "fish", "fruit", "vegetable", "rice", "pasta", "egg"],
    "color": ["red", "blue", "green", "yellow", "purple", "pink", "orange", "brown", "black", "white"],
    "weather": ["rain", "sun", "snow", "storm", "cloud", "wind", "fog", "hail", "thunder", "lightning"],
    "nature": ["tree", "flower", "river", "mountain", "ocean", "forest", "desert", "valley", "hill", "lake"],
    "sport": ["soccer", "basketball", "tennis", "golf", "cricket", "rugby", "swimming", "cycling", "running", "baseball"],
    "profession": ["doctor", "engineer", "nurse", "lawyer", "scientist", "artist", "musician", "actor", "chef", "writer"],
    "building": ["house", "apartment", "office", "school", "hospital", "store", "factory", "church", "museum", "library"],
    "tool": ["hammer", "wrench", "screwdriver", "drill", "saw", "shovel", "pliers", "knife", "axe", "ruler"],
    "science": ["physics", "chemistry", "biology", "mathematics", "astronomy", "geology", "genetics", "robotics", "ecology", "quantum"],
    "art": ["painting", "sculpture", "drawing", "photography", "dance", "theater", "film", "literature", "poetry", "design"],
    "body": ["head", "arm", "leg", "eye", "ear", "nose", "mouth", "hand", "foot", "chest"],
    "education": ["school", "college", "university", "class", "lecture", "teacher", "student", "exam", "degree", "homework"],
    "money": ["cash", "currency", "dollar", "euro", "yen", "coin", "bank", "credit", "account", "loan"],
    "time": ["minute", "hour", "day", "week", "month", "year", "decade", "century", "millennium", "moment"],
    "relation": ["family", "friend", "colleague", "partner", "enemy", "neighbor", "spouse", "child", "parent", "sibling"]
}

# Aplicar retrofitting
retrofitted_embeddings = retrofit(concept_embeddings, semantic_lexicon)

Paso 4

In [69]:
import pandas as pd

dataNER = pd.read_csv("dataNER.csv")

PREPROCESAMIENTO DE DATA

In [70]:
import re
import ast

# Función para limpiar las comillas dobles y convertir a listas reales
def clean_tags_column(tags_column):
    cleaned_tags = []
    for tag_entry in tags_column:
        if isinstance(tag_entry, str):
            cleaned_entry = re.sub(r"^\"|\]\"$", "", tag_entry)  # Eliminar comillas externas
            cleaned_entry = ast.literal_eval(cleaned_entry)  # Convertir la cadena a una lista real
            cleaned_tags.append(cleaned_entry)
        else:
            cleaned_tags.append(tag_entry)
    return cleaned_tags

dataNER["Tags"] = clean_tags_column(dataNER["Tags"])

dataNER["Tags"].tolist()

[['B-PER', 'O', 'O', 'O', 'B-LOC', 'I-LOC'],
 ['B-PER', 'O', 'O', 'O', 'B-LOC'],
 ['B-PER', 'I-PER', 'O', 'B-LOC'],
 ['I-MISC', 'B-MISC', 'O', 'B-LOC'],
 ['B-PER', 'O', 'O', 'B-LOC'],
 ['B-PER', 'O', 'O', 'B-LOC'],
 ['I-ORG', 'O', 'O', 'B-DATE'],
 ['B-PER', 'O', 'O', 'B-LOC'],
 ['B-PER', 'B-DATE', 'O', 'I-LOC'],
 ['B-PER', 'O', 'O', 'B-LOC'],
 ['I-DATE', 'O', 'B-LOC', 'B-LOC'],
 ['B-PER', 'O', 'I-DATE', 'B-LOC'],
 ['B-PER', 'O', 'B-MISC', 'B-LOC'],
 ['B-PER', 'O', 'O', 'I-DATE', 'B-LOC'],
 ['B-PER', 'I-ORG', 'O', 'B-LOC'],
 ['B-PER', 'B-PER', 'B-DATE', 'B-LOC'],
 ['B-PER', 'O', 'O', 'O', 'B-LOC'],
 ['B-PER', 'O', 'O', 'B-LOC'],
 ['B-PER', 'O', 'O', 'B-LOC'],
 ['B-PER', 'O', 'O', 'B-LOC'],
 ['I-LOC', 'O', 'O', 'B-LOC'],
 ['I-MISC', 'O', 'O', 'B-LOC'],
 ['B-PER', 'O', 'O', 'O', 'B-LOC'],
 ['B-PER', 'O', 'O', 'B-LOC'],
 ['B-PER', 'O', 'O', 'I-PER', 'B-LOC', 'I-PER'],
 ['B-PER', 'O', 'O', 'B-LOC'],
 ['I-DATE', 'B-LOC', 'O', 'B-LOC', 'I-ORG'],
 ['I-ORG', 'O', 'O', 'B-LOC'],
 ['B-PER', 'O', 

In [86]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Añadir token especial <PAD> para rellenado
retrofitted_embeddings["<PAD>"] = np.zeros(100)

# Crear diccionarios de palabras y etiquetas
word_to_ix = {word: i for i, word in enumerate(retrofitted_embeddings.keys())}
tag_to_ix = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-ORG": 3,
    "I-ORG": 4,
    "B-LOC": 5,
    "I-LOC": 6,
    "B-MISC": 7,
    "I-MISC": 8,
    "B-PROD": 9,
    "I-PROD": 10,
}
# Añadir índice para el padding de etiquetas
pad_tag_idx = len(tag_to_ix)
tag_to_ix["<PAD>"] = pad_tag_idx

ix_to_tag = {v: k for k, v in tag_to_ix.items()}

# Tokenizar las oraciones en listas de palabras
sentences = [sentence.split() for sentence in dataNER["Sentence"].tolist()]
tags = dataNER["Tags"].tolist()

# Verificar que cada oración y su conjunto de etiquetas tengan la misma longitud
for i in range(len(sentences)):
    if len(sentences[i]) != len(tags[i]):
        print(f"Advertencia: La oración y las etiquetas en el índice {i} no coinciden en longitud.")
        print(f"Oración: {sentences[i]}")
        print(f"Etiquetas: {tags[i]}")

# Dividir datos en entrenamiento y validación
train_sentences, val_sentences, train_tags, val_tags = train_test_split(
    sentences, tags, test_size=0.2, random_state=42
)

# Dataset personalizado
class NERDataset(Dataset):
    def __init__(self, sentences, tags, word_to_ix, tag_to_ix):
        self.sentences = [[word_to_ix.get(word.lower(), word_to_ix["<PAD>"]) for word in sentence] for sentence in sentences]
        self.tags = [[tag_to_ix.get(tag, pad_tag_idx) for tag in tag_list] for tag_list in tags]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.sentences[idx], dtype=torch.long),
            torch.tensor(self.tags[idx], dtype=torch.long)
        )

# Rellenado dinámico
def collate_fn(batch):
    sentences, tags = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word_to_ix["<PAD>"])
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=pad_tag_idx)
    return sentences_padded, tags_padded

# Crear datasets y DataLoaders
train_dataset = NERDataset(train_sentences, train_tags, word_to_ix, tag_to_ix)
val_dataset = NERDataset(val_sentences, val_tags, word_to_ix, tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# Definición del modelo
class NERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, pretrained_embeddings):
        super(NERModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(torch.tensor(pretrained_embeddings))
        self.embedding.weight.requires_grad = False  # Congelar embeddings

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentences):
        embeds = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = tag_space  # Sin softmax para CrossEntropyLoss
        return tag_scores

# Cargar embeddings preentrenados como tensor
pretrained_embeddings = np.array([retrofitted_embeddings[word] for word in word_to_ix.keys()])

# Inicializar modelo, pérdida y optimizador
embedding_dim = 100  # Dimensión de los embeddings
hidden_dim = 128
vocab_size = len(word_to_ix)
tagset_size = len(tag_to_ix)

model = NERModel(embedding_dim, hidden_dim, vocab_size, tagset_size, pretrained_embeddings)
loss_function = nn.CrossEntropyLoss(ignore_index=pad_tag_idx)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Función para calcular métricas de rendimiento
def calculate_metrics(predictions, labels):
    # Convertir tensores a numpy arrays
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    # Obtener las etiquetas únicas presentes en los datos
    unique_labels = np.unique(labels)
    labels_list = unique_labels.tolist()

    # Obtener los nombres de las etiquetas correspondientes
    target_names = [ix_to_tag[i] for i in labels_list]

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='macro')
    report = classification_report(labels, predictions, labels=labels_list, target_names=target_names, zero_division=0)
    return accuracy, f1, report

# Función de entrenamiento
def train_model(model, train_loader, val_loader, loss_function, optimizer, epochs=200):
    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for sentences, tags in train_loader:
            optimizer.zero_grad()
            tag_scores = model(sentences)

            # Aplanar las dimensiones para calcular la pérdida
            loss = loss_function(tag_scores.view(-1, tagset_size), tags.view(-1))

            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # Evaluación en el conjunto de validación
        model.eval()
        total_val_loss = 0
        all_predictions = []
        all_labels = []
        with torch.no_grad():
            for sentences, tags in val_loader:
                tag_scores = model(sentences)
                loss = loss_function(tag_scores.view(-1, tagset_size), tags.view(-1))
                total_val_loss += loss.item()

                predictions = torch.argmax(tag_scores, dim=2)

                # Aplicar máscara para ignorar posiciones de padding
                mask = tags != pad_tag_idx

                # Aplicar máscara y aplanar
                masked_predictions = predictions[mask]
                masked_labels = tags[mask]

                all_predictions.append(masked_predictions)
                all_labels.append(masked_labels)

        avg_val_loss = total_val_loss / len(val_loader)

        # Concatena todas las predicciones y etiquetas
        all_predictions = torch.cat(all_predictions)
        all_labels = torch.cat(all_labels)

        # Calcula métricas
        accuracy, f1, report = calculate_metrics(all_predictions, all_labels)

        print(f"Época {epoch + 1}/{epochs}")
        print(f"Pérdida de entrenamiento: {avg_train_loss:.4f}")
        print(f"Pérdida de validación: {avg_val_loss:.4f}")
        print(f"Precisión: {accuracy:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print("Reporte de clasificación:")
        print(report)
        print("-" * 50)

# Entrenamiento del modelo
train_model(model, train_loader, val_loader, loss_function, optimizer, epochs=10)

# Función para evaluar una oración de prueba
def evaluate_model(model, sentence, word_to_ix, ix_to_tag):
    model.eval()
    with torch.no_grad():
        inputs = torch.tensor([word_to_ix.get(word.lower(), word_to_ix["<PAD>"]) for word in sentence], dtype=torch.long).unsqueeze(0)
        tag_scores = model(inputs)
        predicted_tags = torch.argmax(tag_scores, dim=2).squeeze().tolist()
        # Mapea los índices predichos a etiquetas
        if isinstance(predicted_tags, int):  # Si solo hay un elemento
            predicted_tags = [ix_to_tag.get(predicted_tags, "<PAD>")]
        else:
            predicted_tags = [ix_to_tag.get(tag, "<PAD>") for tag in predicted_tags]
        return predicted_tags

test_sentence = ["maria", "visits", "new", "york"]
print("Oración de prueba:", test_sentence)
print("Etiquetas predichas:", evaluate_model(model, test_sentence, word_to_ix, ix_to_tag))


Época 1/10
Pérdida de entrenamiento: 1.5737
Pérdida de validación: 1.5705
Precisión: 0.6027
F1-Score: 0.2250
Reporte de clasificación:
              precision    recall  f1-score   support

           O       0.64      0.90      0.75      2670
       B-PER       0.62      0.82      0.71      1401
       I-PER       0.00      0.00      0.00       276
       B-ORG       0.00      0.00      0.00       294
       I-ORG       0.00      0.00      0.00       281
       B-LOC       0.55      0.75      0.64      1363
       I-LOC       0.38      0.39      0.38       411
      B-MISC       0.00      0.00      0.00       285
      I-MISC       0.00      0.00      0.00       302
      B-PROD       0.00      0.00      0.00       259
      I-PROD       0.00      0.00      0.00       306

    accuracy                           0.60      7848
   macro avg       0.20      0.26      0.23      7848
weighted avg       0.45      0.60      0.51      7848

--------------------------------------------------
É