In [2]:
import torch
import json
import numpy as np
from pathlib import Path
from datasets import Dataset, Features, Value, ClassLabel
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import evaluate
from rapidfuzz import fuzz
import json
import unicodedata
import spacy


In [None]:
def remove_similar(text_list, threshold=95):
    """
    Removes elements that are highly similar to each other.
    threshold: similarity percentage (0–100)
    """
    unique_texts = []
    for text in text_list:
        is_duplicate = False
        for u_text in unique_texts:
            # Levenshtein similarity ratio
            if fuzz.ratio(text, u_text) >= threshold:
                is_duplicate = True
                break
        if not is_duplicate:
            unique_texts.append(text)
    return unique_texts


In [None]:
# Load spaCy models for Spanish and English
# Since the chatbot's training corpus contains both Spanish and English data,
# we need separate lemmatization dictionaries for each language.
nlp_es = spacy.load("es_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

In [None]:

# Define paths for the original FAQ file and the normalized output file
FAQ_PATH = "./faqs/faqs.json"
OUTPUT_PATH = "./faqs/faqs_normalized.json"

# Load the raw FAQ data
with open(FAQ_PATH, "r", encoding="utf-8") as f:
    faq_data = json.load(f)["faqs"]

def normalize_text(text: str, lang: str = "es") -> str:
    """
    Normalizes a text string by applying:
    - Lowercasing
    - Removing question marks
    - Accent removal
    - Lemmatization
    """
    # Convert text to lowercase
    text = text.lower()
    
    # Remove question marks (both Spanish and English forms)
    text = text.replace("¿", "").replace("?", "")
    
    # Lowercase again and remove accents/diacritics
    text = text.lower()
    text = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )

    # Select the spaCy model depending on the language
    # Since the dataset contains Spanish and English questions,
    # each language must use its own lemmatization dictionary.
    nlp = nlp_es if lang == "es" else nlp_en
    doc = nlp(text)

    # Apply lemmatization token by token
    lemmatized = " ".join([token.lemma_ for token in doc])
    
    # Remove unnecessary double spaces
    lemmatized = " ".join(lemmatized.split())
    
    return lemmatized

# Process and normalize all questions for each FAQ entry
for faq in faq_data:
    for lang in ["es", "en"]:
        if lang in faq["questions"]:
            # Apply text normalization to each question
            normalized = [normalize_text(q, lang) for q in faq["questions"][lang]]
            
            # Remove exact duplicates while preserving order
            normalized = list(dict.fromkeys(normalized))
            
            # Remove near-duplicate questions (similarity > 95%)
            normalized = remove_similar(normalized, threshold=95)
            
            faq["questions"][lang] = normalized

# Save the normalized FAQ dataset
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump({"faqs": faq_data}, f, ensure_ascii=False, indent=4)

print(f"Archivo normalizado guardado en {OUTPUT_PATH}")


Archivo normalizado guardado en ./faqs/faqs_normalized.json


In [None]:
# =========================================================
# 1. Load normalized FAQs
# =========================================================
FAQ_PATH = "./faqs/faqs_normalized.json"

# Read the normalized FAQ dataset
with open(FAQ_PATH, "r", encoding="utf-8") as f:
    faq_data = json.load(f)["faqs"]

# Prepare containers for questions, labels, and label mappings
questions = []
labels = []
label2id = {}
id2label = {}

# Create label mappings and expand all Spanish and English questions
for idx, faq in enumerate(faq_data):
    # Map internal index → FAQ ID and FAQ ID → internal index
    label2id[str(idx)] = faq["id"]
    id2label[str(faq["id"])] = idx

    # Detect whether the FAQ uses the old format ("question") or the new one ("questions")
    q_data = faq.get("questions") or faq.get("question")

    for lang in ["es", "en"]:
        if lang in q_data:
            # If the entry is a string (old format), convert it into a list
            q_list = q_data[lang] if isinstance(q_data[lang], list) else [q_data[lang]]

            # Add each question to the dataset, associated with its label
            for q in q_list:
                questions.append(q)
                labels.append(idx)


In [None]:
# =========================================================
# 2. Tokenization
# =========================================================
MODEL_NAME = "bert-base-multilingual-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    # Tokenize each text input using the multilingual BERT tokenizer
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=64
    )

# Number of unique classes (each distinct FAQ entry is its own class)
num_classes = len(set(labels))

# Define the dataset features for Hugging Face Datasets
features = Features({
    "text": Value("string"),
    "label": ClassLabel(num_classes=num_classes)
})

# =========================================================
# 3. Create dataset with stratification
# =========================================================
# Each different question corresponds to a separate class.
# Stratification ensures that all classes remain properly represented
# in both the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    questions,
    labels,
    test_size=0.2,
    stratify=labels,      # <- stratify by class label
    random_state=42
)

# Build Hugging Face Datasets for train/test
train_data = Dataset.from_dict({"text": X_train, "label": y_train}, features=features)
test_data  = Dataset.from_dict({"text": X_test, "label": y_test}, features=features)

# Tokenize both datasets
train_dataset = train_data.map(tokenize, batched=True)
test_dataset  = test_data.map(tokenize, batched=True)

# Format datasets for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# =========================================================
# 4. Define the model
# =========================================================
num_labels = num_classes

# Detect GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a multilingual BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
)

# =========================================================
# 5. Training configuration
# =========================================================
training_args = TrainingArguments(
    output_dir="./faq_model_2",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs_2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,   # increased training epochs for better performance
    weight_decay=0.01,
    load_best_model_at_end=True
)

# Accuracy metric
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return accuracy.compute(predictions=preds, references=p.label_ids)

# =========================================================
# 6. Trainer
# =========================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)




Map:   0%|          | 0/1061 [00:00<?, ? examples/s]

Map:   0%|          | 0/266 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# =========================================================
# 7. Train the model
# =========================================================
# Start the training process using the Trainer configuration.
# This will run the training loop, evaluation per epoch,
# and automatically store the best model if configured.
trainer.train()

# =========================================================
# 8. Save the trained model
# =========================================================
# After training finishes, save both the model and tokenizer
# so they can be loaded later for inference or deployment.
model.save_pretrained("./faq_model_2")
tokenizer.save_pretrained("./faq_model_2")


  0%|          | 0/536 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 3.291703939437866, 'eval_accuracy': 0.22180451127819548, 'eval_runtime': 11.5192, 'eval_samples_per_second': 23.092, 'eval_steps_per_second': 1.476, 'epoch': 1.0}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 2.5876176357269287, 'eval_accuracy': 0.5977443609022557, 'eval_runtime': 10.635, 'eval_samples_per_second': 25.012, 'eval_steps_per_second': 1.598, 'epoch': 2.0}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 1.9940024614334106, 'eval_accuracy': 0.7180451127819549, 'eval_runtime': 11.5092, 'eval_samples_per_second': 23.112, 'eval_steps_per_second': 1.477, 'epoch': 3.0}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 1.5976312160491943, 'eval_accuracy': 0.8120300751879699, 'eval_runtime': 10.6237, 'eval_samples_per_second': 25.038, 'eval_steps_per_second': 1.6, 'epoch': 4.0}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 1.3220584392547607, 'eval_accuracy': 0.8308270676691729, 'eval_runtime': 10.311, 'eval_samples_per_second': 25.798, 'eval_steps_per_second': 1.649, 'epoch': 5.0}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 1.1496819257736206, 'eval_accuracy': 0.868421052631579, 'eval_runtime': 10.675, 'eval_samples_per_second': 24.918, 'eval_steps_per_second': 1.593, 'epoch': 6.0}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 1.0533047914505005, 'eval_accuracy': 0.8834586466165414, 'eval_runtime': 10.5794, 'eval_samples_per_second': 25.143, 'eval_steps_per_second': 1.607, 'epoch': 7.0}
{'loss': 2.0333, 'grad_norm': 7.578559875488281, 'learning_rate': 1.3432835820895524e-06, 'epoch': 7.46}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 1.0237741470336914, 'eval_accuracy': 0.8796992481203008, 'eval_runtime': 8.8997, 'eval_samples_per_second': 29.889, 'eval_steps_per_second': 1.91, 'epoch': 8.0}
{'train_runtime': 2878.1513, 'train_samples_per_second': 2.949, 'train_steps_per_second': 0.186, 'train_loss': 1.9653314405412816, 'epoch': 8.0}


('./faq_model_2\\tokenizer_config.json',
 './faq_model_2\\special_tokens_map.json',
 './faq_model_2\\vocab.txt',
 './faq_model_2\\added_tokens.json',
 './faq_model_2\\tokenizer.json')

In [None]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# --- Load spaCy models for lemmatization ---
# We load both Spanish and English models because the chatbot
# needs to normalize and lemmatize user input depending on the language.
nlp_es = spacy.load("es_core_news_sm")  # Spanish
nlp_en = spacy.load("en_core_web_sm")   # English

# --- Text normalization function ---
def normalize_text(text: str, lang: str = "es") -> str:
    """
    Normalize text before feeding it to the classifier:
    - Convert to lowercase
    - Remove question marks
    - Remove accents
    - Lemmatize using the appropriate language model
    """
    # Lowercase
    text = text.lower()
    
    # Remove question marks
    text = text.replace("¿", "").replace("?", "")
    
    # Remove accents/diacritics
    text = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )

    # Lemmatization
    nlp = nlp_es if lang == "es" else nlp_en
    doc = nlp(text)
    lemmatized = " ".join([token.lemma_ for token in doc])
    
    # Remove double spaces
    lemmatized = " ".join(lemmatized.split())
    
    return lemmatized


# --- Load trained model and tokenizer ---
# These are the artifacts saved after fine-tuning the multilingual BERT model.
model = AutoModelForSequenceClassification.from_pretrained("./faq_model_2")
tokenizer = AutoTokenizer.from_pretrained("./faq_model_2")

# --- Create inference pipeline ---
# The Hugging Face pipeline simplifies running predictions on text.
clf = pipeline(
    "text-classification", 
    model=model, 
    tokenizer=tokenizer, 
    device=0 if torch.cuda.is_available() else -1   # Use GPU if available
)

# --- Prediction function with normalization ---
def get_answer(user_question: str, lang: str = "es"):
    # Normalize user input before passing it to the model
    clean_q = normalize_text(user_question, lang=lang)
    
    # Run model inference
    pred = clf(clean_q, truncation=True, max_length=64)[0]
    
    # Extract predicted class index and confidence score
    label_idx = int(pred["label"].replace("LABEL_", ""))
    confidence = pred["score"]
    
    # Retrieve original FAQ ID from mapping
    faq_id = label2id[str(label_idx)]
    
    # Look up the corresponding answer in the JSON data
    for faq in faq_data:
        if faq["id"] == faq_id:
            return faq.get("answer", "No answer available."), confidence
    
    return "I couldn't find an answer.", confidence


# =========================================================
# Example usage
# =========================================================
pregunta = "que es el SP 500"
respuesta, confianza = get_answer(pregunta, lang="es")

print(f"Pregunta original: {pregunta}")
print(f"Respuesta: {respuesta}")
print(f"Confianza: {confianza:.2%}")


DEBUG Prediction: [{'label': 'LABEL_23', 'score': 0.4423360526561737}]
{'0': 1, '1': 2, '2': 4, '3': 6, '4': 9, '5': 10, '6': 11, '7': 12, '8': 15, '9': 17, '10': 26, '11': 28, '12': 30, '13': 33, '14': 36, '15': 37, '16': 41, '17': 53, '18': 54, '19': 55, '20': 62, '21': 67, '22': 69, '23': 72, '24': 100, '25': 101, '26': 102, '27': 605, '28': 200, '29': 201, '30': 202, '31': 203, '32': 204, '33': 300, '34': 301, '35': 400, '36': 401, '37': 402, '38': 403, '39': 500, '40': 503, '41': 504, '42': 600}
{'1': 0, '2': 1, '4': 2, '6': 3, '9': 4, '10': 5, '11': 6, '12': 7, '15': 8, '17': 9, '26': 10, '28': 11, '30': 12, '33': 13, '36': 14, '37': 15, '41': 16, '53': 17, '54': 18, '55': 19, '62': 20, '67': 21, '69': 22, '72': 23, '100': 24, '101': 25, '102': 26, '605': 27, '200': 28, '201': 29, '202': 30, '203': 31, '204': 32, '300': 33, '301': 34, '400': 35, '401': 36, '402': 37, '403': 38, '500': 39, '503': 40, '504': 41, '600': 42}
Pregunta original: que es el SP 500
Respuesta: {'es': 'El S