## Metadata Creation

In [84]:
# Load the working dataframe from the Excel export. Later cells read the
# "File Name", "Product Name", "Supplier Name" and per-section columns from it.
df = pd.read_excel('df_with_metadata.xlsx')

In [85]:
# The 16 section headings, in order. Each heading is also the name of the
# dataframe column that holds that section's text.
sections = [
    "Identification",
    "Hazards identification",
    "Composition/information on ingredients",
    "First aid measures",
    "Firefighting measures",
    "Accidental release measures",
    "Handling and storage",
    "Exposure controls/personal protection",
    "Physical and chemical properties",
    "Stability and reactivity",
    "Toxicological information",
    "Ecological information",
    "Disposal considerations",
    "Transport information",
    "Regulatory information",
    "Other information",
]


def generate_processed_metadata(row):
    """Build one document record per section for a single dataframe row.

    Args:
        row: A mapping-like row (e.g. a pandas Series) that has one entry per
            name in ``sections`` plus "File Name", "Product Name" and
            "Supplier Name".

    Returns:
        A list with one dict per section, in ``sections`` order. Each dict has
        a "page_content" string (empty when the cell is null) and a "metadata"
        dict carrying the file name, product name, supplier and the 1-based
        section number.
    """
    def build_entry(number, heading):
        # Null / NaN cells become an empty string so every record has text.
        text = row[heading]
        return {
            "page_content": text if pd.notnull(text) else "",
            "metadata": {
                "File Name": row["File Name"],
                "product_name": row["Product Name"],
                "supplier": row["Supplier Name"],
                "section_id": number,
            },
        }

    return [
        build_entry(number, heading)
        for number, heading in enumerate(sections, start=1)
    ]

# Apply the function row by row (axis=1) and store the resulting list of
# section records in a new 'processed_metadata' column. This mutates `df` in place.
df['processed_metadata'] = df.apply(generate_processed_metadata, axis=1)



In [88]:
# df['processed_metadata'][49]

In [87]:
import json

def validate_json_structure(row):
    """
    Check that one `processed_metadata` cell has the expected structure.

    Despite the name, nothing is serialised to JSON; only the Python structure
    is inspected. A value is valid when:
    1. It is a list of exactly 16 items.
    2. Every item is a dict with 'page_content' and 'metadata' keys.
    3. 'metadata' is a dict containing 'File Name', 'product_name',
       'supplier' and 'section_id'.
    4. 'section_id' is an int (not a bool) between 1 and 16 inclusive.

    Args:
        row: The value of a single `processed_metadata` cell.

    Returns:
        True if every check passes, otherwise False. Malformed input never
        raises; it is reported as invalid.
    """
    required_fields = {'File Name', 'product_name', 'supplier', 'section_id'}

    # The container must be a list with one entry per section.
    if not isinstance(row, list) or len(row) != 16:
        return False

    for item in row:
        if not isinstance(item, dict):
            return False
        if 'page_content' not in item or 'metadata' not in item:
            return False

        metadata = item['metadata']
        # Guard before calling .keys(): a None / list / str here must be
        # reported as invalid rather than raising AttributeError.
        if not isinstance(metadata, dict):
            return False
        if not required_fields.issubset(metadata.keys()):
            return False

        section_id = metadata['section_id']
        # bool is a subclass of int, so True would otherwise pass as section 1.
        if isinstance(section_id, bool) or not isinstance(section_id, int):
            return False
        if not 1 <= section_id <= 16:
            return False

    return True

# Apply validation function and get summary: one boolean per row, stored in a
# new 'is_valid_json' column on `df`.
df['is_valid_json'] = df['processed_metadata'].apply(validate_json_structure)

# Check if all rows are valid
all_valid = df['is_valid_json'].all()

# Output the result: the overall flag plus the True/False counts per row
all_valid, df['is_valid_json'].value_counts()


(True,
 True    50
 Name: is_valid_json, dtype: int64)

##

##

## Chroma DB set-up

### Imports and OpenAI credentials

In [89]:
import getpass
import json
import os

import chromadb
import openai
import pandas as pd
from chromadb.config import Settings
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Set OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable so that no
# secret is stored in the notebook. When the variable is not set, prompt for
# it without echoing the input.
# NOTE: any key that was ever saved in this notebook should be treated as
# compromised and revoked/rotated with the provider.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

openai.api_key = os.environ['OPENAI_API_KEY']


In [90]:
# Setup ChromaDB client: a persistent client backed by a local directory,
# which is created if it does not exist yet.
chroma_db_path = "Chroma_db_storage"
os.makedirs(chroma_db_path, exist_ok=True)
client = chromadb.PersistentClient(path=chroma_db_path)

# Define collection name and initialize collection.
# Re-running this cell is destructive: any existing collection with this name
# is deleted before a new, empty one is created.
collection_name = "openai_sds_embeddings_metadata"
try:
    client.delete_collection(name=collection_name)
except Exception as e:
    # NOTE(review): every failure is reported as "not found" and `e` is unused;
    # a different error (e.g. a storage problem) would be hidden here and only
    # surface, if at all, in create_collection below. Consider logging `e`.
    print("No existing collection found to delete.")
collection = client.create_collection(name=collection_name)


No existing collection found to delete.


In [91]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for `text`, or None on failure.

    Args:
        text: The text to embed. Empty or None input is not sent to the API
            (the embeddings endpoint rejects empty strings) and yields None.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding as returned by the API (a list of floats), or None when
        `text` is empty or the request fails. Failures are printed, not raised,
        so callers can skip the affected document and continue.
    """
    if not text:
        return None

    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: `openai.Embedding` was removed; use the module-level
            # client, which picks up `openai.api_key`, and attribute access on
            # the typed response object.
            response = openai.embeddings.create(input=text, model=model)
            return response.data[0].embedding
        # Legacy openai<1.0 interface with a dict-like response.
        response = openai.Embedding.create(input=text, model=model)
        return response['data'][0]['embedding']
    except Exception as e:
        # Deliberately best-effort: report and let the caller handle None.
        print(f"Embedding error: {e}")
        return None


In [92]:
def store_sds_documents_to_chromadb(df):
    """Embed every section record in `df` and add it to the Chroma `collection`.

    For each row, every entry of the row's 'processed_metadata' list is
    embedded with `get_embedding` and added to the module-level `collection`
    under the id "<row index>_<section_id>". Sections for which no embedding
    is returned are skipped and reported in the final summary instead of being
    silently dropped.

    Args:
        df: DataFrame with a 'processed_metadata' column whose cells are lists
            of dicts with 'page_content' and 'metadata' keys.

    Returns:
        None. A one-line summary is printed when the run finishes.
    """
    stored_count = 0
    skipped_ids = []

    for index, row in df.iterrows():
        for section in row['processed_metadata']:
            page_content = section['page_content']
            metadata = section['metadata']
            doc_id = f"{index}_{metadata['section_id']}"

            embedding = get_embedding(page_content)
            if not embedding:
                # No vector (empty text or API failure): record it and move on.
                skipped_ids.append(doc_id)
                continue

            collection.add(
                embeddings=[embedding],
                documents=[page_content],
                ids=[doc_id],
                metadatas=[metadata]
            )
            stored_count += 1

    if skipped_ids:
        print(
            f"Stored {stored_count} sections in ChromaDB; "
            f"skipped {len(skipped_ids)} without an embedding: {skipped_ids}"
        )
    else:
        print("All sections stored in ChromaDB.")


In [None]:
# Embed and store every section of every row. This requests one embedding per
# section via get_embedding, so the cell can be slow on large frames.
store_sds_documents_to_chromadb(df)


In [1]:
def retrieve_section(product_name, supplier, section_id, query_parameter=None):
    """Fetch one stored section from the Chroma `collection` by its metadata.

    Args:
        product_name: Value to match against the 'product_name' metadata field.
        supplier: Value to match against the 'supplier' metadata field.
        section_id: Section number to match against the 'section_id' field.
        query_parameter: Optional free text. When given (and it can be
            embedded with `get_embedding`), the matching sections are ranked
            by similarity to this text and the closest one is returned.
            Otherwise a plain metadata lookup is performed.

    Returns:
        A dict with 'product_name', 'supplier', 'section_id',
        'query_parameter' and 'page_content', or None (after printing a
        message) when nothing matches the filters.
    """
    # Chroma expects several metadata conditions to be combined under a single
    # logical operator rather than passed as a multi-key dict.
    filters = {
        "$and": [
            {"product_name": product_name},
            {"supplier": supplier},
            {"section_id": section_id},
        ]
    }

    # The stored vectors come from get_embedding, so the query text must be
    # embedded the same way before it can be compared against them.
    query_embedding = get_embedding(query_parameter) if query_parameter else None

    if query_embedding:
        results = collection.query(
            query_embeddings=[query_embedding],
            where=filters,
            n_results=1
        )
        # query() returns one result list per query embedding; unwrap the first.
        documents = (results.get('documents') or [[]])[0]
        metadatas = (results.get('metadatas') or [[]])[0]
    else:
        # Pure metadata lookup: get() needs no embedding and returns flat lists.
        results = collection.get(where=filters, limit=1)
        documents = results.get('documents') or []
        metadatas = results.get('metadatas') or []

    # Extract and return the relevant data
    if not documents or not metadatas:
        print("No matching section found.")
        return None

    metadata = metadatas[0]
    return {
        "product_name": metadata['product_name'],
        "supplier": metadata['supplier'],
        "section_id": metadata['section_id'],
        "query_parameter": query_parameter,
        "page_content": documents[0]
    }


In [None]:
# Example to retrieve based on specific criteria: look up section 1 for one
# product/supplier pair, passing the section heading as the optional query text.
result = retrieve_section(
    product_name="4-Aminopyridine",
    supplier="/UNDERTAKING.",
    section_id=1,
    query_parameter="Identification"
)
print(result)
