In [10]:
import json
import numpy as np
import torch
import torch.nn.functional as F
import faiss
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [4]:
import os
import json
import glob

def load_json_file(filepath):
    """
    Attempt to load a JSON value from a file.

    The file content may be wrapped in a Markdown code fence: a leading and/or
    trailing triple backtick (```) is removed before parsing. If the opening
    fence carries a language tag on its own line (for example ```json), the
    tag line is dropped as well when the content does not parse with it.

    Parameters:
        filepath (str): Path of the file to read (decoded as UTF-8).

    Returns:
        The decoded JSON value, or None if the content is truncated or is
        otherwise not valid JSON. Decoding failures are silent: nothing is
        printed or raised for them.

    Raises:
        OSError, UnicodeDecodeError: propagated from opening/reading the file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read().strip()

    # Remove triple backticks if present at the beginning and/or end.
    had_opening_fence = content.startswith("```")
    if had_opening_fence:
        content = content[3:]
    if content.endswith("```"):
        content = content[:-3]
    content = content.strip()

    # Always try the unfenced content as-is first so that files which parsed
    # before keep producing exactly the same result.
    candidates = [content]
    if had_opening_fence:
        # "```json\n{...}" leaves "json\n{...}" behind; retry without the tag line.
        tag_line, newline, remainder = content.partition("\n")
        if newline and tag_line.strip().isalpha():
            candidates.append(remainder.strip())

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Truncated or malformed JSON: fall through to the next candidate.
            continue
    return None

def load_json_files(filepaths):
    """
    Load every path in `filepaths` with load_json_file, in order.

    Files for which load_json_file yields None are left out, so the returned
    list can be shorter than the input.
    """
    # Lazily parse each file, then keep only the ones that produced a value.
    parsed = (load_json_file(path) for path in filepaths)
    return [obj for obj in parsed if obj is not None]

# Directory holding the EPD JSON files. Set the EPD_JSON_DIR environment
# variable to point somewhere else; the original local path is the fallback.
directory_path = os.environ.get(
    "EPD_JSON_DIR",
    "/home/stirunag/Downloads/Downloaded_EPD_JSON-20250327T001140Z-001/Downloaded_EPD_JSON",
)
# glob does not guarantee any ordering; sort so the record order (and therefore
# the row ids of anything built from `data`) is reproducible between runs.
all_files = sorted(glob.glob(os.path.join(directory_path, "*.json")))
data = load_json_files(all_files)
print(f"Loaded {len(data)} JSON objects.")
# load_json_file drops undecodable files silently, so surface how many were lost.
skipped_count = len(all_files) - len(data)
if skipped_count:
    print(f"Skipped {skipped_count} of {len(all_files)} files that could not be parsed.")


Loaded 3961 JSON objects.


In [6]:
# Show the top-level field names of the first loaded record.
data[0].keys()

dict_keys(['document_type', 'document_id', 'document_issue_date', 'document_valid_date', 'document_privacy_tag', 'product_names', 'product_ids', 'product_description', 'From', 'To', 'Verifier', 'epd_module_declaration', 'declared_unit', 'epd_impacts', 'additional_information'])

In [25]:
def _field_to_text(value):
    """Flatten one record field into a single string; missing or None gives ''."""
    if value is None:
        return ""
    if isinstance(value, str):
        # Joining a bare string would put a space between every character.
        return value
    try:
        # Stringify each element so a stray number does not discard the whole field.
        return " ".join(str(part) for part in value if part is not None)
    except TypeError:
        # Not iterable (e.g. a single number): use its string form.
        return str(value)


# Create a list to store text representations; texts[i] describes processed_data[i].
processed_data = data
texts = []
for item in tqdm(processed_data):
    # Combine relevant fields: you can customize this as needed.
    # All three fields go through the same None/str/list handling.
    product_names = _field_to_text(item.get("product_names"))
    product_desc = _field_to_text(item.get("product_description"))
    product_ids = _field_to_text(item.get("product_ids"))
    combined_text = product_names + ". " + product_desc + ". " + product_ids
    texts.append(combined_text)


# ---------- Step 2: Compute Sentence Embeddings Using a Local Model ----------
# Identifier passed to both from_pretrained calls so tokenizer and model match.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# Module-level tokenizer and encoder; get_embedding below reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dim 1, weighting each token by its mask value."""
    # The per-token embeddings are the first element of the model output.
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding axis so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(1)
    # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
    token_counts = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts

def get_embedding(text):
    """Encode `text` with the module-level tokenizer/model and return the L2-normalized pooled tensor."""
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        output = model(**tokens)
    pooled = mean_pooling(output, tokens['attention_mask'])
    # Unit-length rows make the dot product of two embeddings their cosine similarity.
    return F.normalize(pooled, p=2, dim=1)

# Embed each text individually and collect the per-text NumPy arrays.
embeddings_list = [get_embedding(entry).cpu().numpy() for entry in tqdm(texts)]

# Concatenate the per-text arrays row-wise into one float32 matrix.
all_embeddings = np.vstack(embeddings_list).astype('float32')


100%|██████████| 3961/3961 [00:00<00:00, 381064.23it/s]


In [26]:
# ---------- Step 3: Build a FAISS Index for Cosine Similarity ----------
# The vector width is the second axis of the stacked embedding matrix.
embedding_dim = all_embeddings.shape[1]
# Use an inner product index. With normalized embeddings, this equals cosine similarity.
faiss_index = faiss.IndexFlatIP(embedding_dim)
# Rows are added in the order of all_embeddings (one per entry of texts/processed_data),
# which is what lets search_products use a returned index to look up processed_data.
faiss_index.add(all_embeddings)
print("FAISS index built with", faiss_index.ntotal, "vectors.")

FAISS index built with 3961 vectors.


In [23]:
# ---------- Step 4: Search the Index and Retrieve Corresponding JSONs ----------
def search_products(query, k=3):
    """
    Search for the k most similar products to the query based on cosine similarity.

    Parameters:
      query (str): Free-text query, embedded with get_embedding.
      k (int): Maximum number of hits to return.

    Returns:
      list of (matched_product, similarity_score) tuples, best match first.
      similarity_score is the inner product rescaled as (score + 1) / 2, so it
      lies in [0, 1]; it is NOT the raw cosine value. The list holds fewer
      than k entries when the index cannot supply k hits, and is empty for
      k <= 0.
    """
    if k <= 0:
        return []

    query_emb = get_embedding(query).cpu().numpy().astype('float32')
    # Perform inner product search.
    similarities, indices = faiss_index.search(query_emb, k)

    # Transform from [-1, 1] to [0, 1]; clip so float rounding on near-identical
    # vectors cannot push a score just outside the documented range.
    similarities = np.clip((similarities + 1) / 2, 0.0, 1.0)

    results = []
    for idx, sim in zip(indices[0], similarities[0]):
        # FAISS pads missing hits with label -1; indexing processed_data with -1
        # would silently return the last record as a bogus match.
        if idx < 0:
            continue
        results.append((processed_data[idx], sim))
    return results


In [46]:
# Example query:
query = "Reinforcing Steel per Release"
results = search_products(query, k=3)

print("Search results:")
for product, sim in results:
    print("Product Names:", product.get("product_names"))
    print("Product Description:", product.get("product_description"))
    print("Product ID:", product.get("product_ids"))
    # search_products returns (cosine + 1) / 2 rather than the raw cosine,
    # so the label states the rescaling instead of calling it cosine similarity.
    print("Similarity (cosine rescaled to [0, 1]):", sim)
    print("-" * 40)

Search results:
Product Names: ['Reinforcing Steel meshes']
Product Description: ['Reinforcing steel is encased in concrete in order to improve the tensile strength of the latter in structures bearing axial or bending loads. The steel is relatively simple and comprises about 99 % iron. The reinforcing-steel products consist of steel meshes with a diameter of 5 mm to 12 mm. Steel reinforcing will normally last over the life of the concrete structure. It is 100 % recyclable in the event that the structure is demolished.']
Product ID: ['Further processed concrete reinforcing based upon steel conforming to SS-EN 10080:2005 and SS 212540:2014']
Cosine Similarity: 0.8271304
----------------------------------------
Product Names: ['Cut and bent bar in concrete reinforcing steel']
Product Description: ['Reinforcing steel is encased in concrete in order to improve the tensile strength of the latter in structures bearing axial or bending loads. The steel is relatively simple and comprises about 

In [30]:
import faiss
import pickle
import json

# Assume `faiss_index` is your built FAISS index and `processed_data` is your JSON mapping.

# ---------- Save FAISS index ----------
index_filename = "../model/faiss_index.index"
# Create the target directory up front; writing fails if it does not exist yet.
os.makedirs(os.path.dirname(index_filename), exist_ok=True)
faiss.write_index(faiss_index, index_filename)
print(f"FAISS index saved to {index_filename}")

# ---------- Save the JSON mapping ----------
# We assume processed_data is a list of your JSON objects in the same order as you built the FAISS index.
# NOTE: the mapping is pickled, so only load this file back from a trusted location.
mapping_filename = "../model/json_mapping.pkl"
os.makedirs(os.path.dirname(mapping_filename), exist_ok=True)
with open(mapping_filename, "wb") as f:
    pickle.dump(processed_data, f)
print(f"JSON mapping saved to {mapping_filename}")

# # ---------- Later, to load them back ----------
# # Load the FAISS index:
# loaded_index = faiss.read_index(index_filename)
# print("FAISS index loaded:", loaded_index.ntotal, "vectors.")
#
# # Load the JSON mapping:
# with open(mapping_filename, "rb") as f:
#     loaded_data = pickle.load(f)
# print("Loaded JSON mapping with", len(loaded_data), "records.")

FAISS index saved to ../model/faiss_index.index
JSON mapping saved to ../model/json_mapping.pkl


In [37]:
def get_impact_info(epd_json, impact_filter='global warming'):
    """
    Searches the 'epd_impacts' list of the JSON for an impact row
    where the 'impact_category' contains the given filter (case-insensitive).

    Rows that are not dictionaries are skipped, and a missing or null
    'epd_impacts' / 'impact_category' is treated as empty instead of raising.

    Parameters:
      epd_json (dict): The matched EPD JSON object.
      impact_filter (str): The substring to match in the impact_category.

    Returns:
      dict or None: The first matching impact row, or None if not found.
    """
    needle = impact_filter.lower()
    # `or []` also covers a key that is present but null.
    impacts = epd_json.get("epd_impacts") or []
    for impact in impacts:
        if not isinstance(impact, dict):
            continue
        # A null category becomes "", so .lower() is always safe to call.
        category = impact.get("impact_category") or ""
        if needle in str(category).lower():
            return impact
    return None

In [47]:
# Take the record (first tuple element) of the second-ranked hit in `results`.
# NOTE(review): the hard-coded index 1 raises IndexError when fewer than two hits exist.
matched_json = results[1][0]

In [49]:
# Display the selected record.
matched_json

{'document_type': 'EPD',
 'document_id': 'S-P-02039',
 'document_issue_date': '2020-08-11',
 'document_valid_date': '2025-07-05',
 'document_privacy_tag': 'public',
 'product_names': ['Cut and bent bar in concrete reinforcing steel'],
 'product_ids': ['Further processed concrete reinforcing based upon steel conforming to SS-EN 10080:2005 and SS 212540:2014'],
 'product_description': ['Reinforcing steel is encased in concrete in order to improve the tensile strength of the latter in structures bearing axial or bending loads. The steel is relatively simple and comprises about 99 % iron. The reinforcing-steel products consist of cut-to-length and bent pieces starting from bars in long length or coil supplied from a steel plant. The bar surface is often ribbed in order to facilitate bonding between steel and concrete. The cut and bent parts can be of standard shape or bespoke in accord with drawings supplied by a construction company. In many instances, parts are welded or otherwise joined

In [50]:
impact_data = get_impact_info(matched_json, impact_filter="global warming")
if not impact_data:
    print("No matching impact data found.")
else:
    print("Matched Impact Data:")
    print("Impact Category:", impact_data.get("impact_category"))
    print("Unit:", impact_data.get("unit"))
    # Keep every entry whose key begins with "A"; nothing is excluded,
    # so an aggregate key such as "A1_A3_total" is included too.
    impact_keys = {
        stage: value
        for stage, value in impact_data.items()
        if stage.startswith("A")
    }
    print("Impact Values:", impact_keys)

Matched Impact Data:
Impact Category: Global Warming Potential (GWP)
Unit: kg CO2-eq.
Impact Values: {'A1': 0.414, 'A2': 0.0383, 'A3': 0.00831, 'A4': 0.0367, 'A5': 0.000625, 'A1_A3_total': 0.46061}
