In [None]:
# Mount Google Drive at /content/drive; the folders used later in this notebook
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install openai




In [None]:
!pip install --upgrade openai




In [None]:
import os
import json
import numpy as np
import getpass
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

# Ask for the API key at run time (getpass does not echo the input) so the
# secret is never stored in the notebook source.
api_key = getpass.getpass("🔐 Enter your OpenAI API key: ")
# Module-level client shared by get_embedding() and query_chatgpt() below.
client = OpenAI(api_key=api_key)

# Drive folders: JSON deposit models (input), CSV files whose names supply the
# deposit types (input), and the destination for generated knowledge files.
json_folder = '/content/drive/MyDrive/final_json'
csv_folder = '/content/drive/MyDrive/Geochem_Split_3'
output_folder = '/content/drive/MyDrive/Geochem_Knowledge'
# Create the output folder on first run; a no-op when it already exists.
os.makedirs(output_folder, exist_ok=True)

def load_json_text(filepath):
    """Load a JSON model file and flatten it into text suitable for embedding.

    Args:
        filepath: Path to a JSON file whose top-level value is an object.

    Returns:
        A ``(text, data)`` tuple. ``data`` is the parsed JSON object. ``text``
        starts with the ``Model_Name`` value (empty if absent) on its own line,
        followed by one ``key: value`` line per entry, skipping the
        ``ai_modification_log`` key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read explicitly as UTF-8 so non-ASCII characters are decoded the same way
    # regardless of the platform's default locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # The model name is placed first (and repeated in the key/value lines below).
    text = f"{data.get('Model_Name', '')}\n" + "\n".join(
        f"{k}: {v}" for k, v in data.items() if k != "ai_modification_log"
    )
    return text, data

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text``.

    Newlines in ``text`` are turned into spaces, the result is sent as a
    single-item batch to the embeddings endpoint of the module-level
    ``client``, and the embedding of that one item is returned.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(model=model, input=[single_line])
    first_item = result.data[0]
    return first_item.embedding

def _strip_code_fence(reply):
    """Return ``reply`` without a Markdown code fence that wraps the whole text.

    Only the unambiguous form is unwrapped: an opening fence line (optionally
    carrying a language tag such as ``json``), at least one content line, and a
    closing fence on its own line. Anything else is returned unchanged.
    """
    lines = reply.strip().split("\n")
    if len(lines) < 3:
        return reply
    opening, closing = lines[0], lines[-1].strip()
    tag = opening[3:].strip()
    fenced = (
        opening.startswith("```")
        and closing == "```"
        and (tag == "" or tag.isalnum())
    )
    if not fenced:
        return reply
    return "\n".join(lines[1:-1]).strip()


def query_chatgpt(deposit_type, json_knowledge):
    """Ask the chat model to extract knowledge for one deposit type.

    Builds a prompt containing ``json_knowledge`` (pretty-printed as JSON) and
    ``deposit_type``, sends it as a single user message through the
    module-level ``client``, and returns the text of the first choice.

    Args:
        deposit_type: Deposit type name to focus the extraction on.
        json_knowledge: JSON-serialisable object describing a deposit model.

    Returns:
        The model's reply. When the reply is wrapped in a Markdown code fence
        the fence is removed, because callers write the result straight into a
        ``.json`` file and a fenced reply would not be parsable as JSON. A
        non-string reply (for example ``None``) is returned as-is.
    """
    prompt = f"""
You are a geologist AI assistant. The following is a JSON object describing a mineral deposit model:

{json.dumps(json_knowledge, indent=2)}

Based on this description, extract and summarize knowledge relevant to the following deposit type:
"{deposit_type}"

Respond with only relevant information that matches this deposit type, in JSON format.
"""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    content = response.choices[0].message.content
    # Leave non-text content untouched so callers see exactly what came back.
    if not isinstance(content, str):
        return content
    return _strip_code_fence(content)

# Parallel lists: json_embeddings[i] is the embedding of the model stored in
# json_data_list[i], which holds (file name, parsed JSON) pairs.
json_embeddings = []
json_data_list = []

print("📦 Generating embeddings for JSON models...")
for model_file in os.listdir(json_folder):
    # Anything that is not a JSON model file is ignored.
    if not model_file.endswith(".json"):
        continue
    model_text, model_data = load_json_text(os.path.join(json_folder, model_file))
    json_embeddings.append(get_embedding(model_text))
    json_data_list.append((model_file, model_data))

print("\n🔍 Matching CSV deposit types to JSON models...")
# Iterate in sorted order so the processing sequence (and the log) is
# reproducible; os.listdir() returns entries in arbitrary order.
for csv_file in sorted(os.listdir(csv_folder)):
    if not csv_file.endswith(".csv"):
        continue

    # Remove only the trailing extension, so a ".csv" occurring elsewhere in
    # the file name is left alone. The deposit type is the file stem without
    # the "Saved" marker.
    csv_stem = os.path.splitext(csv_file)[0]
    deposit_type = csv_stem.replace("Saved", "").strip()
    print(f"\n🔎 Processing: {deposit_type}")

    # Best-effort per file: a failure is reported and the loop moves on.
    try:
        # Pick the JSON model whose embedding is most similar to the embedding
        # of the deposit type name.
        deposit_embedding = get_embedding(deposit_type)
        sims = cosine_similarity([deposit_embedding], json_embeddings)[0]
        best_idx = int(np.argmax(sims))
        best_model_name, best_model_data = json_data_list[best_idx]

        print(f"✅ Best matched model: {best_model_name} (similarity={sims[best_idx]:.3f})")

        matched_knowledge = query_chatgpt(deposit_type, best_model_data)

        output_path = os.path.join(output_folder, f"{csv_stem}_knowledge.json")
        # Write as UTF-8 explicitly: model replies may contain non-ASCII
        # characters, and the default encoding depends on the platform locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(matched_knowledge)
        print(f"📁 Saved: {output_path}")

    except Exception as e:
        print(f"❌ Error with {csv_file}: {e}")


🔐 Enter your OpenAI API key: ··········
📦 Generating embeddings for JSON models...

🔍 Matching CSV deposit types to JSON models...

🔎 Processing: Au
✅ Best matched model: Descriptive model of carbonate-hosted Au-Ag_ocr_output_final.json (similarity=0.330)
📁 Saved: /content/drive/MyDrive/Geochem_Knowledge/Au_knowledge.json

🔎 Processing: Black shale
✅ Best matched model: Descriptive model of emerald veins_ocr_output_final.json (similarity=0.443)
📁 Saved: /content/drive/MyDrive/Geochem_Knowledge/Black shale_knowledge.json

🔎 Processing: Breccia pipe U
✅ Best matched model: Descriptive model of diamond pipes_ocr_output_final.json (similarity=0.463)
📁 Saved: /content/drive/MyDrive/Geochem_Knowledge/Breccia pipe U_knowledge.json

🔎 Processing: Carbonatite
✅ Best matched model: Descriptbe model of carbonatite deposits_ocr_output_final.json (similarity=0.605)
📁 Saved: /content/drive/MyDrive/Geochem_Knowledge/Carbonatite_knowledge.json

🔎 Processing: Cu
✅ Best matched model: Descriptive model 