In [None]:
import json
import os
from whoosh.fields import Schema, TEXT, ID, NUMERIC
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser
import tika
from tika import parser
import ipfshttpclient
from pyvis.network import Network
from datetime import datetime

# Initialize Tika
# NOTE(review): confirm that the installed tika client still requires this
# call before parser.from_file() can be used.
tika.initVM()

# Configure IPFS client
# Module-level handle shared by send_metadata_to_ipfs() and
# fetch_metadata_from_ipfs(); the multiaddr points at 127.0.0.1, TCP port 5001.
client = ipfshttpclient.connect('/ip4/127.0.0.1/tcp/5001')

# Directory for Whoosh index
# The path is relative, so it resolves against the current working directory.
# It is created here, when the cell runs, if nothing exists at that path yet.
INDEX_DIR = "index"
if not os.path.exists(INDEX_DIR):
    os.mkdir(INDEX_DIR)

# Define the schema
# `cid`, `size` and `full_text` have their own field types; every other field
# is declared as TEXT(stored=True), so those are generated from a name list.
# `full_text` is the only field declared with stored=False.
schema = Schema(
    cid=ID(stored=True),
    size=NUMERIC(stored=True),
    full_text=TEXT(stored=False),
    **{
        field_name: TEXT(stored=True)
        for field_name in (
            "project_account",
            "name",
            "filetype",
            "title",
            "creator",
            "language",
            "subject",
            "description",
            "publisher",
            "date",
            "abstract",
            "format",
            "created",
            "modified",
        )
    },
)

# Step 1: Extract and Normalize Metadata
def extract_and_normalize_metadata(file_path):
    """Parse a file with Tika and return its string-valued metadata.

    Args:
        file_path: Path of the document handed to ``parser.from_file``.

    Returns:
        A new dict whose keys are the lower-cased Tika metadata keys plus
        ``"full_text"`` (the stripped document text, ``""`` when Tika
        extracted none). Only entries whose value is a ``str`` are kept, so
        multi-valued (list) metadata entries are dropped.
    """
    parsed = parser.from_file(file_path)
    # The "metadata"/"content" keys may be present with a None value (e.g. a
    # document without extractable text), in which case a dict.get() default
    # is not applied; `or` covers both the missing and the None case.
    # Copying also avoids mutating the dict owned by the parser result.
    metadata = dict(parsed.get("metadata") or {})
    metadata["full_text"] = (parsed.get("content") or "").strip()
    return {
        key.lower(): value
        for key, value in metadata.items()
        if isinstance(value, str)
    }

# Step 2: Index Metadata
def index_metadata(metadata):
    """Add one document built from ``metadata`` to the Whoosh index in INDEX_DIR.

    The index is created with the module-level ``schema`` the first time and
    re-opened on later calls, so documents accumulate across calls.

    Args:
        metadata: Mapping of schema field names to values. Missing text
            fields default to ``""``; a missing or empty ``size`` becomes 0.

    Raises:
        ValueError: If ``size`` is present but cannot be converted to ``int``.
    """
    # Local import: exists_in is not part of the file-level whoosh imports.
    from whoosh.index import exists_in

    os.makedirs(INDEX_DIR, exist_ok=True)

    # Ask Whoosh whether an index exists instead of probing for a file name:
    # calling create_in() on a populated directory starts a fresh, empty
    # index and discards the documents added earlier.
    if exists_in(INDEX_DIR):
        ix = open_dir(INDEX_DIR)
    else:
        ix = create_in(INDEX_DIR, schema)

    # The writer context manager commits on success and cancels (releasing
    # the index write lock) if add_document raises.
    with ix.writer() as writer:
        writer.add_document(
            project_account=metadata.get("project_account", ""),
            cid=metadata.get("cid", ""),
            name=metadata.get("name", ""),
            size=int(metadata.get("size") or 0),
            filetype=metadata.get("filetype", ""),
            title=metadata.get("title", ""),
            creator=metadata.get("creator", ""),
            language=metadata.get("language", ""),
            subject=metadata.get("subject", ""),
            description=metadata.get("description", ""),
            publisher=metadata.get("publisher", ""),
            date=metadata.get("date", ""),
            abstract=metadata.get("abstract", ""),
            format=metadata.get("format", ""),
            created=metadata.get("created", ""),
            modified=metadata.get("modified", ""),
            full_text=metadata.get("full_text", ""),
        )

# Step 3: Send Metadata JSON to IPFS
def send_metadata_to_ipfs(metadata):
    """Store ``metadata`` as a JSON document on IPFS and return the result.

    Args:
        metadata: JSON-serialisable object (here: the metadata dict).

    Returns:
        Whatever ``client.add_json`` returns for the stored document; the
        workflow below treats it as the document's CID.
    """
    # add_json() serialises the object itself. Passing an already-encoded
    # JSON string would store a JSON *string*, and get_json() would then hand
    # back a str instead of the original dict.
    return client.add_json(metadata)

# Step 4: Search Metadata by Keyword
def search_metadata(keyword):
    """Return the stored ``cid`` of every hit for ``keyword`` in ``full_text``.

    Args:
        keyword: Query string, parsed by a QueryParser whose default field is
            ``full_text``.

    Returns:
        A list with the ``cid`` value of each hit, in result order.
    """
    index = open_dir(INDEX_DIR)
    parsed_query = QueryParser("full_text", schema=index.schema).parse(keyword)

    # NOTE(review): search() is called without a limit argument, so the
    # library's default result limit applies — confirm that is intended.
    with index.searcher() as searcher:
        cids = []
        for hit in searcher.search(parsed_query):
            # Stored fields must be read while the searcher is still open.
            cids.append(hit["cid"])
    return cids

# Step 5: Fetch Metadata from IPFS
def fetch_metadata_from_ipfs(cid):
    """Load the JSON document stored under ``cid`` via the shared IPFS client.

    Args:
        cid: Identifier passed unchanged to ``client.get_json``.

    Returns:
        The value returned by ``client.get_json(cid)``.
    """
    document = client.get_json(cid)
    return document

# Step 6: Build and Display Knowledge Graph
def build_and_display_knowledge_graph(project_id, related_data):
    """Render a star-shaped graph around ``project_id`` to knowledge_graph.html.

    Each entry of ``related_data`` contributes nodes linked from the project
    node by an edge labelled with the entry's key:

    * dict value  -> one node per item (id = item value, label = item key)
    * list/tuple/set value -> one node per element (id = element,
      label = ``str(element)``)
    * any other value -> a single node (id = value, label = key)

    Args:
        project_id: Id and label of the central node.
        related_data: Mapping of relation name to related value(s).
    """
    net = Network(height="750px", width="100%", directed=True)
    net.add_node(project_id, label=project_id, color="lightblue")

    for key, value in related_data.items():
        if isinstance(value, dict):  # For nested files
            targets = [(sub_value, sub_key) for sub_key, sub_value in value.items()]
        elif isinstance(value, (list, tuple, set)):
            # A collection cannot be used as a single node id (the sample
            # data's "Keywords" list made add_node fail), so every element
            # becomes its own node.
            targets = [(item, str(item)) for item in value]
        else:
            targets = [(value, key)]

        for node_id, node_label in targets:
            net.add_node(node_id, label=node_label)
            net.add_edge(project_id, node_id, label=key)

    net.show("knowledge_graph.html")

# Script Execution - Consolidated Workflow
file_path = "upload/Aeroacoustic-airfoil-shape-optimization-enhance_2023_Expert-Systems-with-App.pdf"

# Step 1: Extract and Normalize Metadata
print("Step 1: Extracting and Normalizing Metadata")
metadata = extract_and_normalize_metadata(file_path)
print("Extracted Metadata:", json.dumps(metadata, indent=2))

# Step 3: Send Metadata JSON to IPFS
# Deliberately executed before Step 2: the CID only exists after the upload,
# and it has to be written into the index so that the search in Step 4 can
# return CIDs that Step 5 is able to fetch. Indexing first stored an empty
# `cid` for every document.
print("\nStep 3: Sending Metadata JSON to IPFS")
metadata_cid = send_metadata_to_ipfs(metadata)
print("Metadata CID:", metadata_cid)

# Step 2: Index Metadata
# A copy carrying the CID is indexed; `metadata` itself is left unchanged.
print("\nStep 2: Indexing Metadata")
index_metadata({**metadata, "cid": metadata_cid})
print("Metadata indexed successfully.")

# Step 4: Search Metadata by Keyword
keyword = "sample_keyword"  # Replace with an actual keyword relevant to your metadata
print("\nStep 4: Searching Metadata by Keyword")
metadata_cids = search_metadata(keyword)
print("Search Results (Metadata CIDs):", metadata_cids)

# Step 5: Fetch Metadata from IPFS
# Only the first hit is fetched.
if metadata_cids:
    print("\nStep 5: Fetching Metadata from IPFS")
    fetched_metadata = fetch_metadata_from_ipfs(metadata_cids[0])
    print("Fetched Metadata from IPFS:", json.dumps(fetched_metadata, indent=2))
else:
    print("\nNo metadata found for the specified keyword.")

# Step 6: Build and Display Knowledge Graph
project_id = metadata.get("project_account", "default_project_id")  # Fallback if no project account is in metadata
print("\nStep 6: Building and Displaying Knowledge Graph")
# Hard-coded demonstration data; it is not derived from the parsed document.
sample_related_data = {
    "Owner": "Researcher_A",
    "Funding Agency": "Agency_X",
    "Files": {
        "File 1": "file_cid_1",
        "File 2": "file_cid_2"
    },
    "Keywords": ["Keyword_1", "Keyword_2"],
    "Affiliated Institute": "Institute_Y"
}
build_and_display_knowledge_graph(project_id, sample_related_data)
print("Knowledge graph created and saved as 'knowledge_graph.html'. Open this file to view the graph.")
print("\nWorkflow Complete!")


In [1]:
from tika import parser
import json

# Step 1: Extract metadata and content with Tika
def parse_document(file_path):
    """Parse a document with Tika and return ``(metadata, content)``.

    Args:
        file_path: Path of the document handed to ``parser.from_file``.

    Returns:
        A tuple of the metadata dict (``{}`` when Tika reports none) and the
        extracted text with surrounding whitespace stripped (``""`` when
        Tika reports none).
    """
    # Parse the document with Tika
    raw_data = parser.from_file(file_path)
    # The keys may be present with a None value (e.g. no extractable text);
    # a dict.get() default is not applied then, so fall back with `or`.
    metadata = raw_data.get("metadata") or {}
    content = (raw_data.get("content") or "").strip()
    return metadata, content

# Step 2: Filter and format metadata to use Dublin Core keys
def format_to_dublin_core_json(metadata, content):
    """Arrange parsed metadata and text into a Dublin Core keyed dict.

    Metadata keys are matched case-insensitively against the fifteen Dublin
    Core elements, either as a bare element name (``"Title"``) or already
    carrying the ``dc:`` prefix (``"dc:title"``). Matches are stored under
    ``"metadata"`` with the ``dc:<element>`` key; everything else is kept,
    under its original key, in ``"additionalMetadata"`` (only present when
    there is at least one such entry). If several input keys map to the same
    element, the one iterated last wins.

    Args:
        metadata: Mapping of metadata key (str) to value.
        content: Document text, stored unchanged under ``"content"``.

    Returns:
        A dict with ``"@context"``, ``"content"``, ``"metadata"`` and,
        optionally, ``"additionalMetadata"``.
    """
    # Define the Dublin Core terms
    dublin_core_terms = {
        "title": "dc:title",
        "creator": "dc:creator",
        "subject": "dc:subject",
        "description": "dc:description",
        "publisher": "dc:publisher",
        "contributor": "dc:contributor",
        "date": "dc:date",
        "type": "dc:type",
        "format": "dc:format",
        "identifier": "dc:identifier",
        "source": "dc:source",
        "language": "dc:language",
        "relation": "dc:relation",
        "coverage": "dc:coverage",
        "rights": "dc:rights",
    }
    prefix = "dc:"

    # Map metadata to Dublin Core JSON structure
    dublin_core_json = {
        "@context": "http://purl.org/dc/elements/1.1/",
        "content": content,
        "metadata": {},
    }

    for key, value in metadata.items():
        term = key.lower()
        # Parsers commonly emit the elements already namespaced ("dc:title");
        # without stripping the prefix those keys never match the table and
        # would all be filed as unmapped metadata.
        if term.startswith(prefix):
            term = term[len(prefix):]

        dc_key = dublin_core_terms.get(term)
        if dc_key:
            dublin_core_json["metadata"][dc_key] = value
        else:
            # Add unmapped metadata under "additionalMetadata" for reference
            dublin_core_json.setdefault("additionalMetadata", {})[key] = value

    return dublin_core_json

# Example usage
# Runs the two helpers above on one PDF and prints the result; the names
# assigned here are notebook globals.
file_path = "upload/Aeroacoustic-airfoil-shape-optimization-enhance_2023_Expert-Systems-with-App.pdf"

# Parse the document
metadata, content = parse_document(file_path)

# Format data to Dublin Core JSON
dublin_core_json = format_to_dublin_core_json(metadata, content)

# Display Dublin Core JSON output
# The full extracted text is part of the structure, so the printed output
# contains the whole document body.
print(json.dumps(dublin_core_json, indent=4))

# Example output without IPFS handling for simplicity


2024-11-10 21:15:27,800 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
2024-11-10 21:15:33,813 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
2024-11-10 21:15:34,909 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


{
    "@context": "http://purl.org/dc/elements/1.1/",
    "content": "Aeroacoustic airfoil shape optimization enhanced by autoencoders\n\n\nExpert Systems With Applications 217 (2023) 119513\n\nA\n0\nn\n\nContents lists available at ScienceDirect\n\nExpert Systems With Applications\n\njournal homepage: www.elsevier.com/locate/eswa\n\nAeroacoustic airfoil shape optimization enhanced by autoencoders\nJiaqing Kou a,\u2217, Laura Botero-Bol\u00edvar b, Rom\u00e1n Ballano a, Oscar Marino a, Leandro de Santana b,\nEusebio Valero a,c, Esteban Ferrer a,c\n\na ETSIAE-UPM-School of Aeronautics, Universidad Polit\u00e9cnica de Madrid, Plaza Cardenal Cisneros 3, E-28040 Madrid, Spain\nb Department of Thermal Fluid Engineering, University of Twente, PO Box 217, 7522 NB Enschede, The Netherlands\nc Center for Computational Simulation, Universidad Polit\u00e9cnica de Madrid, Campus de Montegancedo, Boadilla del Monte, 28660, Madrid, Spain\n\nA R T I C L E I N F O\n\nKeywords:\nAeroacoustics\nOptimiza

In [3]:
from tika import parser
from ipfs_functions import *
import json

# Step 1: Extract metadata and content with Tika
def parse_document(file_path):
    """Parse a document with Tika and return ``(metadata, content)``.

    Args:
        file_path: Path of the document handed to ``parser.from_file``.

    Returns:
        A tuple of the metadata dict (``{}`` when Tika reports none) and the
        extracted text with surrounding whitespace stripped (``""`` when
        Tika reports none).
    """
    raw_data = parser.from_file(file_path)
    # The keys may be present with a None value (e.g. no extractable text);
    # a dict.get() default is not applied then, so fall back with `or`.
    metadata = raw_data.get("metadata") or {}
    content = (raw_data.get("content") or "").strip()
    return metadata, content

# Step 2: Convert to JSON-LD format
def format_to_jsonld(metadata, content):
    """Wrap parsed metadata and text in a JSON-LD shaped dict.

    Args:
        metadata: Mapping of metadata key to value.
        content: Document text, stored unchanged under ``"content"``.

    Returns:
        A dict typed ``schema:CreativeWork`` with an inline ``@context``, the
        text under ``"content"``, and under ``"metadata"`` one
        ``schema:PropertyValue`` entry per metadata item, in the mapping's
        iteration order.
    """
    # Prefix and term definitions embedded in every generated document.
    context = {
        "dc": "http://purl.org/dc/elements/1.1/",
        "schema": "http://schema.org/",
        "content": "schema:text",
        "metadata": "schema:additionalType",
    }

    property_values = [
        {
            "@type": "schema:PropertyValue",
            "schema:propertyID": name,
            "schema:value": val,
        }
        for name, val in metadata.items()
    ]

    return {
        "@context": context,
        "@type": "schema:CreativeWork",
        "content": content,
        "metadata": property_values,
    }

# Step 3: Save JSON-LD (function retained if needed later)
def save_jsonld(data, output_path):
    """Write ``data`` as indented JSON to ``output_path``.

    Missing parent directories of ``output_path`` are created first, so a
    target such as ``datasets/output.jsonld`` works on a fresh checkout.

    Args:
        data: JSON-serialisable object.
        output_path: Destination file path; an existing file is overwritten.
    """
    # Local import: this cell's imports do not include os.
    import os

    parent_dir = os.path.dirname(output_path)
    if parent_dir:  # empty when output_path has no directory component
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the file independent of the platform default.
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=2)

# Example usage with IPFS functions
# Input document for Tika and the local path the JSON-LD copy is written to.
file_path = "upload/Aeroacoustic-airfoil-shape-optimization-enhance_2023_Expert-Systems-with-App.pdf"
output_path = "datasets/output.jsonld"

# Parse the document
metadata, content = parse_document(file_path)

# Format data to JSON-LD
jsonld_data = format_to_jsonld(metadata, content)

# Display JSON-LD output
# Printing the dict directly shows its Python repr (single-quoted keys),
# not serialized JSON.
print(jsonld_data)

# Assuming IPFS upload/download functions are defined
# NOTE(review): upload_json_to_ipfs / download_json_from_ipfs are not defined
# in this cell; presumably they come from the `ipfs_functions` star import —
# confirm.
metadata_cid = upload_json_to_ipfs(jsonld_data)  # Upload JSON-LD to IPFS
print("Metadata CID:", metadata_cid)

# Retrieve JSON-LD metadata from IPFS
file_metadata = download_json_from_ipfs(metadata_cid)
print("Retrieved file metadata from IPFS:", file_metadata)


# Also write the same JSON-LD document to output_path on local disk.
save_jsonld (jsonld_data, output_path)

{'@context': {'dc': 'http://purl.org/dc/elements/1.1/', 'schema': 'http://schema.org/', 'content': 'schema:text', 'metadata': 'schema:additionalType'}, '@type': 'schema:CreativeWork', 'content': "Aeroacoustic airfoil shape optimization enhanced by autoencoders\n\n\nExpert Systems With Applications 217 (2023) 119513\n\nA\n0\nn\n\nContents lists available at ScienceDirect\n\nExpert Systems With Applications\n\njournal homepage: www.elsevier.com/locate/eswa\n\nAeroacoustic airfoil shape optimization enhanced by autoencoders\nJiaqing Kou a,∗, Laura Botero-Bolívar b, Román Ballano a, Oscar Marino a, Leandro de Santana b,\nEusebio Valero a,c, Esteban Ferrer a,c\n\na ETSIAE-UPM-School of Aeronautics, Universidad Politécnica de Madrid, Plaza Cardenal Cisneros 3, E-28040 Madrid, Spain\nb Department of Thermal Fluid Engineering, University of Twente, PO Box 217, 7522 NB Enschede, The Netherlands\nc Center for Computational Simulation, Universidad Politécnica de Madrid, Campus de Montegancedo, Bo