* Install required libraries

In [5]:
!pip install -q groq PyPDF2

* Import reuired libraries

In [6]:
import os
import re
import json
import time
import PyPDF2
from google.colab import userdata
from groq import Groq


In [8]:
# ============================
# Initialize Groq API Client
# ============================
def initialize_groq_client(api_key):
    """Initialize the Groq API client."""
    return Groq(api_key=api_key)


# ============================
# Extract Text from PDF
# ============================
def extract_text_from_pdf(pdf_path):
    """Extract text from each page of a PDF."""
    text_pages = []
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            text = page.extract_text()
            if text:
                text_pages.append(text.strip())  # Remove extra spaces
    return text_pages


# ============================
# Preprocess Text
# ============================
def preprocess_text(text):
    """Preprocess text by converting to lowercase and removing special characters."""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespaces
    text = re.sub(r'[^a-z0-9\s\.,;:?!-]', '', text)  # Keep alphanumeric and punctuation
    return text


# ============================
# Extract Knowledge Graph from Text using Groq API
# ============================
def get_knowledge_graph(client, preprocessed_text, retries=3):
    """
    Extract structured entities and relationships from text using Groq API.
    Returns a JSON object containing extracted entities and relationships.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "You are an AI assistant that extracts structured knowledge from text. "
                                   "Your task is to extract entities and relationships from the given text, "
                                   "and return them in a structured JSON format as follows:\n\n"
                                   "{\n"
                                   '  "entities": [\n'
                                   '    {"name": "Entity1", "properties": {"property1": "value1", "property2": "value2"}},\n'
                                   '    {"name": "Entity2", "properties": {"propertyA": "valueA"}}\n'
                                   '  ],\n'
                                   '  "relationships": [\n'
                                   '    {"subject": "Entity1", "relationship": "relates_to", "object": "Entity2"}\n'
                                   '  ]\n'
                                   '}\n\n'
                                   "Ensure that each subject and object are individual entities."
                    },
                    {
                        "role": "user",
                        "content": f"""Extract entities and relationships from the following text:

                        {preprocessed_text}

                        Respond strictly with a valid JSON object and nothing else—no explanations, introductions, or extra text."""
                    }
                ],
                model="llama3-70b-8192",
                temperature=0.5,
                top_p=1,
                stop=None,
                stream=False,
            )

            # Parse the response
            extracted_json = response.choices[0].message.content.strip()
            parsed_data = json.loads(extracted_json)

            # Validate the structure
            if "entities" in parsed_data and "relationships" in parsed_data:
                return parsed_data

        except json.JSONDecodeError:
            print(f"Attempt {attempt+1}: Failed to parse JSON. Retrying...")
            time.sleep(2)  # Wait before retrying

        except Exception as e:
            print(f"Error during API call: {e}")
            return None

    print("Failed to extract structured data after retries.")
    return None


# ============================
# Process PDF and Extract Knowledge Graph
# ============================
def process_pdf_and_extract_kg(pdf_path, client):
    """Extracts structured knowledge from a PDF file."""
    text_pages = extract_text_from_pdf(pdf_path)
    print("Text extraction completed.")

    knowledge_graph = []

    for idx, page_text in enumerate(text_pages):
        print(f"Processing Page {idx + 1}/{len(text_pages)}...")

        # Preprocess the text
        preprocessed_text = preprocess_text(page_text)

        # Get knowledge graph data
        kg_data = get_knowledge_graph(client, preprocessed_text)

        if kg_data:
            knowledge_graph.append(kg_data)
            # print(f"Extraction for Page {idx + 1} completed.")
        else:
            print(f"Skipping Page {idx + 1} due to parsing issues.")

    return knowledge_graph


# ============================
# Save Knowledge Graph to JSON File
# ============================
def save_knowledge_graph(knowledge_graph, output_file):
    """Save the extracted knowledge graph to a JSON file."""
    with open(output_file, "w") as f:
        json.dump(knowledge_graph, f, indent=2)
    print(f"Knowledge graph extraction completed. Results saved to {output_file}")


# ============================
# Main Execution
# ============================
if __name__ == "__main__":
    # Set your API key

    API_KEY = userdata.get('GROQ_API_KEY')

    # Set input and output file paths
    PDF_PATH = "/content/drive/MyDrive/KnowledgeDatabase/R048r12e.pdf"
    OUTPUT_FILE = "/content/drive/MyDrive/KnowledgeGraphResults/kgf1.json"

    # Initialize API client
    client = initialize_groq_client(API_KEY)

    # Process the PDF and extract the knowledge graph
    knowledge_graph = process_pdf_and_extract_kg(PDF_PATH, client)

    # Save the extracted knowledge graph
    save_knowledge_graph(knowledge_graph, OUTPUT_FILE)


Text extraction completed.
Processing Page 1/131...
Processing Page 2/131...
Failed to extract structured data after retries.
Skipping Page 2 due to parsing issues.
Processing Page 3/131...
Processing Page 4/131...
Processing Page 5/131...
Processing Page 6/131...
Processing Page 7/131...
Processing Page 8/131...
Processing Page 9/131...
Processing Page 10/131...
Processing Page 11/131...
Processing Page 12/131...
Processing Page 13/131...
Processing Page 14/131...
Processing Page 15/131...
Processing Page 16/131...
Processing Page 17/131...
Processing Page 18/131...
Processing Page 19/131...
Processing Page 20/131...
Processing Page 21/131...
Processing Page 22/131...
Processing Page 23/131...
Processing Page 24/131...
Processing Page 25/131...
Processing Page 26/131...
Processing Page 27/131...
Processing Page 28/131...
Processing Page 29/131...
Processing Page 30/131...
Processing Page 31/131...
Processing Page 32/131...
Processing Page 33/131...
Processing Page 34/131...
Processing