# In [None]:
import json
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def load_json(file_path):
    """
    Reads a UTF-8 encoded JSON file and returns the decoded document.

    Parameters:
        file_path (str): Location of the JSON file to read.

    Returns:
        The Python object produced by parsing the file (whatever the
        top-level JSON value decodes to).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def combine_data(*datasets):
    """
    Combines multiple datasets into a single structured dictionary and removes duplicates.

    Entries whose category is not one of the predefined categories are
    collected under "Other". Within each category duplicates are dropped
    while the first-seen order is kept, so the result is identical from
    run to run for the same inputs.

    Parameters:
        *datasets (dict): Variable number of dictionaries mapping a category
            name to a list of hashable entries.

    Returns:
        dict: A combined dictionary where data is organized into predefined categories.

    Raises:
        TypeError: If an entry is not hashable.
    """
    combined_data = {"Attractions": [], "Dining": [], "Shopping": [], "Transportation": [], "Other": []}
    for dataset in datasets:
        for category, content in dataset.items():
            # Unknown categories fall back to the catch-all "Other" bucket.
            target = category if category in combined_data else "Other"
            combined_data[target].extend(content)
    # dict.fromkeys deduplicates while keeping insertion order; a plain set()
    # would reorder the entries differently on every interpreter run.
    return {
        category: list(dict.fromkeys(entries))
        for category, entries in combined_data.items()
    }

def clean_and_segment_text(data):
    """
    Normalizes whitespace in every text and flattens the data into
    (category, text) pairs.

    Parameters:
        data (dict): Maps each category name to a list of text strings.

    Returns:
        list: (category, cleaned_text) tuples, in iteration order, for every
        text whose cleaned form is longer than 20 characters.
    """
    pairs = []
    for category, texts in data.items():
        for raw in texts:
            # Trim the ends, then collapse every whitespace run to one space.
            normalized = re.sub(r'\s+', ' ', raw.strip())
            if len(normalized) <= 20:
                # Too short to be a useful segment.
                continue
            pairs.append((category, normalized))
    return pairs

def generate_embeddings(data, model_name="all-MiniLM-L6-v2"):
    """
    Encodes the text of each entry with a SentenceTransformer model.

    Parameters:
        data (list): Sequence of entries whose second element (index 1) is
            the text to embed, e.g. (category, text) tuples.
        model_name (str): Name of the SentenceTransformer model to load.

    Returns:
        tuple: (embeddings, texts) where embeddings is the value returned by
        the model's encode() call and texts is the list of strings that were
        encoded, in the same order as the input entries.
    """
    encoder = SentenceTransformer(model_name)
    sentences = [item[1] for item in data]
    return encoder.encode(sentences), sentences

def store_embeddings(embeddings, texts, categories):
    """
    Stores embeddings in a FAISS index along with their metadata.

    Row i of the index corresponds to metadata[i], so the three inputs must
    describe the same number of items.

    Parameters:
        embeddings (numpy.ndarray): 2-D array of generated embeddings, one row per text.
        texts (list): List of text strings corresponding to the embeddings.
        categories (list): List of categories corresponding to the texts.

    Returns:
        tuple: A FAISS index and metadata list.

    Raises:
        ValueError: If embeddings, texts and categories do not all have the
            same number of entries.
    """
    # FAISS only accepts C-contiguous float32 matrices; this is a no-op when
    # the input already has that layout.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if not (vectors.shape[0] == len(texts) == len(categories)):
        # A mismatch would silently pair index rows with the wrong metadata.
        raise ValueError(
            "embeddings, texts and categories must have the same length "
            f"(got {vectors.shape[0]}, {len(texts)} and {len(categories)})"
        )
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)  # Initialize FAISS index with L2 distance metric
    index.add(vectors)  # Add embeddings to the index
    metadata = [
        {"text": text, "category": category}
        for text, category in zip(texts, categories)
    ]
    return index, metadata

def save_metadata(metadata, file_path):
    """
    Writes the metadata to disk as UTF-8 encoded, indented JSON.

    Parameters:
        metadata (list): Metadata dictionaries to serialize.
        file_path (str): Destination path; an existing file is overwritten.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(metadata, sink, indent=4, ensure_ascii=False)

# Paths to input and output files.
# Forward slashes are used on purpose: in a regular string literal "\f" and
# "\v" are escape sequences (form feed / vertical tab), so the backslash
# spellings of these paths pointed at files that cannot exist. Forward
# slashes are accepted as separators on Windows as well.
changi_file = "data/gold/final_changi_data.json"
# NOTE(review): the previous literal had no separator between "gold" and
# "final_jewel_data.json"; assumed to live next to the Changi file - confirm.
jewel_file = "data/gold/final_jewel_data.json"
metadata_output = "data/metadata.json"
index_output = "data/vector_index.faiss"

# Load and combine datasets
changi_data = load_json(changi_file)
jewel_data = load_json(jewel_file)
combined_data = combine_data(changi_data, jewel_data)

# Clean and segment text data
segmented_data = clean_and_segment_text(combined_data)
categories = [entry[0] for entry in segmented_data]

# Generate embeddings for the text data; the returned texts are in the same
# order as segmented_data, and therefore aligned with categories.
embeddings, texts = generate_embeddings(segmented_data)

# Store embeddings in a FAISS index and save metadata
index, metadata = store_embeddings(embeddings, texts, categories)
faiss.write_index(index, index_output)  # Save the FAISS index to a file
save_metadata(metadata, metadata_output)  # Save metadata to a JSON file

print(f"Embeddings stored in {index_output} and metadata in {metadata_output}.")
