## Hugging Face Forum Semantic Search with FAISS

### Import Libraries

In [1]:
from datasets import load_dataset, Dataset
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoTokenizer, AutoModel
import faiss


  from .autonotebook import tqdm as notebook_tqdm


### Data Loading

In [2]:
# Directory containing JSON files
DATASET_DIR = "huggingface_forum"
data_files = {"train": os.path.join(DATASET_DIR, "*.json")}  # Use glob pattern to load all JSON files

# Load the dataset
huggingface_forum_dataset = load_dataset("json", data_files=data_files, split="train")


### Data Preprocessing

In [3]:
def simplify_entry(entry):
    # Retain only the responses from each entry
    simplified_entry = {
        "responses": [response["reply"] for response in entry.get("responses", [])]
    }
    return simplified_entry

huggingface_forum_dataset = huggingface_forum_dataset.map(simplify_entry)

# Specify columns to keep and remove unnecessary ones
columns = huggingface_forum_dataset.column_names
columns_to_keep = ["title", "link", "initial_post", "responses"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
huggingface_forum_dataset = huggingface_forum_dataset.remove_columns(columns_to_remove)


### Data Formatting for Analysis

In [4]:
# Set dataset to pandas format for easier manipulation
huggingface_forum_dataset.set_format("pandas")
df = huggingface_forum_dataset[:]

# Explode responses for individual analysis
res_df = df.explode("responses", ignore_index=True)

# Add response length column for filtering
res_df["response_length"] = res_df["responses"].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

# Convert back to Hugging Face Dataset format
res_df = Dataset.from_pandas(res_df)

# Filter responses based on length
res_df = res_df.filter(lambda x: x["response_length"] > 15)


Filter: 100%|██████████| 40193/40193 [00:00<00:00, 224540.53 examples/s]


### Concatenate Text Fields

In [5]:
def concatenate_text(examples):
    return {
        "text": examples["title"] + " \n " + examples["initial_post"] + " \n " + examples["responses"]
    }

res_df = res_df.map(concatenate_text)


Map: 100%|██████████| 22988/22988 [00:01<00:00, 15969.45 examples/s]


## Embedding Generation with Transformers

In [6]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(text_list, padding=True, truncation=True, return_tensors="pt")
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

# Generate a sample embedding to check the shape
embedding = get_embeddings(res_df["text"][0])
print(f"Embedding shape: {embedding.shape}")


Embedding shape: torch.Size([1, 768])


### Add Embeddings to the Dataset

In [7]:
# Add embeddings as a new column
embeddings_dataset = res_df.map(lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]})


Map: 100%|██████████| 22988/22988 [04:40<00:00, 81.82 examples/s] 


### Create and add FAISS index directly to the Dataset

In [9]:
embeddings_dataset.add_faiss_index(column="embeddings")
print("FAISS index successfully added to the dataset.")

100%|██████████| 23/23 [00:00<00:00, 258.42it/s]

FAISS index successfully added to the dataset.





## Define and embed a query for semantic search

In [17]:
def search_faiss(query, top_k=5):
    # Generate the query embedding as a NumPy array
    query_embedding = get_embeddings([query]).cpu().detach().numpy()
    
    # Perform search on FAISS index
    scores, samples = embeddings_dataset.get_nearest_examples("embeddings", query_embedding, k=top_k)
    
    # Display search results
    results = []
    for i in range(len(samples["title"])):
        result = {
            "title": samples["title"][i],
            "link": samples["link"][i],
            "initial_post": samples["initial_post"][i],
            "response": samples["responses"][i]
        }
        results.append(result)
    return results

In [18]:
# Test with a sample query
query = "What model is best for document information extraction?"
search_results = search_faiss(query)

# Display results
for result in search_results:
    print(f"Title: {result['title']}\nLink: {result['link']}\nInitial Post: {result['initial_post']}\nResponse: {result['response']}\n")

Title: What model will fit better for Email Parsing and Data Extraction
Link: https://discuss.huggingface.co/t/what-model-will-fit-better-for-email-parsing-and-data-extraction/70371
Initial Post: Hi,Firstly, Apologies if that is not the right section in the forum… (quite new here)Now, I am working in a new project, well in a new idea to automate a current very manual process:Lots of emails come in which are manually processed by “a human” to extract data from them which are then copied in a tabulate software (excel) or similar.I can clearly identify the attributes/fields I want to extract from the text/emails.I have too the types of “text/emails”The “problem” is to be able to extract that information accurately.Email Example:"Hello my friends.we have two guys arriving tomorrow 23/01/2024 around 1pm from Madrid, flight ABC123Another one people leaving on sunday same week around 2am to Instanbul, flight CBA321Name of the arrving ones: Jose Mateo Feliz y Ana Triste del Carmen.Name of the 