In [39]:
import os
import pickle
import pandas as pd
import google.generativeai as genai
from tqdm import tqdm

# Setup
PICKLE_FOLDER = './PickleFiles'
OUTPUT_CSV = './kg_triples.csv'
GENERATION_MODEL = 'gemini-2.0-flash'
MAX_PAGES_PER_DOC = 10  # Limit to avoid too many tokens
OUTPUT_JSON = './kg_triples.json'
CHECKPOINT_FILE = './checkpoint.json'



In [40]:
# Authenticate Gemini
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel(GENERATION_MODEL)

def load_documents(pickle_folder):
    files = [f for f in os.listdir(pickle_folder) if f.endswith('.pkl')]
    documents = []
    for file in files:
        with open(os.path.join(pickle_folder, file), 'rb') as f:
            data = pickle.load(f)
            if 'pages' in data:
                documents.append({'filename': file, 'pages': data['pages']})
    return documents

def build_prompt(text_chunk):
    return f"""
Given the following text, extract clear and factual knowledge graph triples in the format:
(subject) — [predicate] → (object)

Text: {text_chunk}


Only output the list of triples. Ignore speculative or vague statements. Avoid duplicates.
"""

def extract_triples_from_text(text):
    prompt = build_prompt(text)
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        print(f"Error generating response: {e}")
        return ""

def parse_triples(output_text):
    triples = []
    for line in output_text.split("\n"):
        if '—' in line and '→' in line:
            try:
                subject, rest = line.split('—')
                predicate, obj = rest.split('→')
                triples.append({
                    'subject': subject.strip(" ()"),
                    'predicate': predicate.strip(" []"),
                    'object': obj.strip(" ()")
                })
            except ValueError:
                continue  # Skip badly formatted lines
    return triples

In [41]:
def load_existing_triples(output_json):
    if not os.path.exists(output_json):
        return [], set()
    with open(output_json, 'r') as f:
        data = json.load(f)
        processed = {item['source_file'] for item in data}
        return data, processed
    
documents = load_documents(PICKLE_FOLDER)
existing_data, processed_files = load_existing_triples(OUTPUT_JSON)
new_triples = []



In [42]:
import json

In [43]:
def load_documents(folder):
    return [
        {'filename': f, 'path': os.path.join(folder, f)}
        for f in os.listdir(folder)
        if f.endswith('.pkl')
    ]

def load_pickle(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

def load_json_file(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            return json.load(f)
    return []

def save_json_file(data, file_path):
    with open(file_path, 'w') as f:
        json.dump(data, f, indent=2)

In [44]:
import time
for doc in tqdm(documents, desc="Processing documents"):
    if doc['filename'] in processed_files:
        continue  # Already done

    try:
        
        data = load_pickle('./PickleFiles/'+doc['filename'])
        pages = data.get('pages', [])[:MAX_PAGES_PER_DOC]
    except Exception as e:
        print(f"❌ Failed to load {doc['filename']}: {e}")
        continue

    for i, page in enumerate(pages):
        triples_text = extract_triples_from_text(page)
        triples = parse_triples(triples_text)
        for t in triples:
            t['source_file'] = doc['filename']
            t['page_number'] = i
        new_triples.extend(triples)

    # Update checkpoint after successful file
    processed_files.add(doc['filename'])
    save_json_file(list(processed_files), CHECKPOINT_FILE)
    time.sleep(50)


Processing documents: 100%|██████████| 14/14 [15:00<00:00, 64.33s/it]


In [45]:
all_triples = load_json_file(OUTPUT_JSON)

all_triples.extend(new_triples)
save_json_file(all_triples, OUTPUT_JSON)

print(f"\n✅ {len(new_triples)} new triples saved to {OUTPUT_JSON}")
print(f"📍 Checkpoint updated: {CHECKPOINT_FILE}")


✅ 1437 new triples saved to ./kg_triples.json
📍 Checkpoint updated: ./checkpoint.json


In [3]:
import json
json.load(open(OUTPUT_JSON, 'r'))

[{'subject': 'The Future of Jobs Report 2020',
  'predicate': 'publisher',
  'object': 'World Economic Forum',
  'source_file': 'WEF_Future_of_Jobs_2020.pdf.pkl',
  'page_number': 0},
 {'subject': 'The Future of Jobs Report 2020',
  'predicate': 'publication date',
  'object': 'October 2020',
  'source_file': 'WEF_Future_of_Jobs_2020.pdf.pkl',
  'page_number': 0},
 {'subject': 'cover',
  'predicate': 'depicts',
  'object': "modern building's curved facade",
  'source_file': 'WEF_Future_of_Jobs_2020.pdf.pkl',
  'page_number': 0},
 {'subject': "modern building's curved facade",
  'predicate': 'background color',
  'object': 'light blue',
  'source_file': 'WEF_Future_of_Jobs_2020.pdf.pkl',
  'page_number': 0},
 {'subject': 'Report: "The Future of Jobs"',
  'predicate': 'title',
  'object': 'The Future of Jobs',
  'source_file': 'WEF_Future_of_Jobs_2020.pdf.pkl',
  'page_number': 1},
 {'subject': 'Report: "The Future of Jobs"',
  'predicate': 'hasPart',
  'object': 'Part 1: "Tracking the F

In [1]:
from neo4j import GraphDatabase

# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = "neo4j+s://bd2636a6.databases.neo4j.io"
AUTH = ("neo4j", "Ahw-ABDgxBlTbISCbDTbzO4rG_Rub1vZYmowzkb47H0")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

ValueError: Cannot resolve address bd2636a6.databases.neo4j.io:7687

In [51]:
def upload_triples(json_file_path):
    with open(json_file_path, 'r') as f:
        triples = json.load(f)
    
    with driver.session() as session:
        for triple in triples:
            subject = triple['subject']
            predicate = triple['predicate'].upper().replace(" ", "_")  # Neo4j relationship types uppercase & no spaces
            object_ = triple['object']
            source_file = triple.get('source_file', '')
            page_number = triple.get('page_number', -1)

            # Cypher query to merge nodes and relationship
            cypher_query = f"""
            MERGE (s:Entity {{name: $subject}})
            MERGE (o:Entity {{name: $object}})
            MERGE (s)-[r:{predicate} {{
                source_file: $source_file,
                page_number: $page_number
            }}]->(o)
            """

            session.run(
                cypher_query,
                subject=subject,
                object=object_,
                source_file=source_file,
                page_number=page_number
            )

    print("Upload complete!")

In [52]:
upload_triples("kg_triples.json")

  with driver.session() as session:


CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input '-': expected a parameter, '&', '*', ':', 'WHERE', ']', '{' or '|' (line 4, column 28 (offset: 119))
"            MERGE (s)-[r:CO-SUPERVISOR {"
                            ^}