In [1]:
!pip install rdflib nltk pdfplumber scikit-learn spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
from rdflib import Graph, Namespace, RDF, OWL, RDFS

# Load the ontology
ontology_path = "hi_ontology.ttl"  # Update with the correct path
g = Graph()
g.parse(ontology_path, format="turtle")

# Define namespace
HI = Namespace("http://www.semanticweb.org/hi_ontology#")

# Extract classes
existing_classes = [str(s.split("#")[-1]) for s in g.subjects(RDF.type, OWL.Class)]

# Extract object properties
existing_object_properties = [str(s.split("#")[-1]) for s in g.subjects(RDF.type, OWL.ObjectProperty)]

# Extract data properties
existing_data_properties = [str(s.split("#")[-1]) for s in g.subjects(RDF.type, OWL.DatatypeProperty)]

# Print summary
print("✅ Existing Classes:", existing_classes)
print("✅ Existing Object Properties:", existing_object_properties)
print("✅ Existing Data Properties:", existing_data_properties)

✅ Existing Classes: ['http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Context', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Domain', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Endgoal', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/EthicalConsideration', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/ArtificialAgent', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/InformationProcessing', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Human', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Interaction', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/InteractionTask', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/ProcessingMethod', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/ProcessingTa

In [3]:
import os
import pdfplumber
import re
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download("stopwords")
nlp = spacy.load("en_core_web_sm")

# Function to extract text from all PDFs in a folder
def extract_text_from_pdfs(pdf_folder):
    extracted_texts = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            file_path = os.path.join(pdf_folder, filename)
            with pdfplumber.open(file_path) as pdf:
                text = " ".join([page.extract_text() for page in pdf.pages if page.extract_text()])
                extracted_texts.append(text)
    return extracted_texts

# Function to clean and preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words("english")]
    return " ".join(tokens)

# Extract key terms using TF-IDF
def extract_keywords_tfidf(texts, num_keywords=20):
    vectorizer = TfidfVectorizer(max_features=500, stop_words="english", ngram_range=(1,2))
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.sum(axis=0).tolist()[0]
    keyword_scores = dict(zip(feature_names, tfidf_scores))
    sorted_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)
    return [keyword for keyword, _ in sorted_keywords[:num_keywords]]

# Extract named entities (AI-related terms, datasets, models)
def extract_named_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.label_ in {"ORG", "PRODUCT", "EVENT"}]
    return list(set(entities))

# Provide the path to your PDFs folder
pdf_folder_path = "pdfs"  # Update this to your actual folder path

# Extract and process text
raw_texts = extract_text_from_pdfs(pdf_folder_path)
processed_texts = [preprocess_text(text) for text in raw_texts]

# Get key terms
tfidf_keywords = extract_keywords_tfidf(processed_texts)
named_entities = []
for text in raw_texts:
    named_entities.extend(extract_named_entities(text))

# Print extracted key concepts
print("🔹 Extracted Key Concepts:")
print("📌 Top Keywords:", tfidf_keywords)
print("📌 Named Entities:", list(set(named_entities)))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🔹 Extracted Key Concepts:
📌 Top Keywords: ['trust', 'ai', 'agent', 'rl', 'design', 'learning', 'human', 'acc', 'user', 'pvossenetalaconversationalagent', 'sampling', 'figure', 'cid', 'available', 'al', 'data', 'model', 'cluster', 'uncertainty', 'wtchangetalhumancenteredaifordementiacare']
📌 Named Entities: ['Reinforcement Learning', '0.002 0.173 0.0 0.0 0.0', 'QR-Code', 'PatrizioPelliccione', 'DST', 'International\nJournal of Mental Health', 'UsingaKruskal-Wallisranksum', 'Train/Val/Testsize', 'Benda', 'Boehm-DavisDA.EffectsofAgeandCongestionInformationAccuracyofAdvancedTraveler\nInformationSystemsonUserTrustandCompliance', 'Atari', 'M.E.Taylor/ReinforcementLearningRequiresHuman-in-the-LoopFramingandApproaches 353', 'NormanSadeh', 'SantaClara', 'NY', 'SunY', 'MDPandtrytolearnahigherperformingpolicy', 'AI', 'CHI Conference', '0.716 0.0', 'TheRoleofTrustin\nHuman-RobotInteraction', '0.776 0.487', '0.855 0.631 0.51 0.341', 'World Health Organization', 'Fig.1', 'EUR', 'Acc <Acc', 'Massimil

In [4]:
# Convert extracted keywords to a set for comparison
extracted_keywords_set = set(tfidf_keywords)
extracted_named_entities_set = set(named_entities)

# Convert existing ontology classes and properties into sets
existing_classes_set = set([s.split("#")[-1] for s in existing_classes])
existing_object_properties_set = set([s.split("#")[-1] for s in existing_object_properties])
existing_data_properties_set = set([s.split("#")[-1] for s in existing_data_properties])

# Find missing classes
missing_classes = extracted_keywords_set - existing_classes_set

# Find missing object properties
missing_object_properties = extracted_keywords_set - existing_object_properties_set

# Find missing data properties (assuming no existing ones)
missing_data_properties = extracted_keywords_set - existing_data_properties_set if existing_data_properties else extracted_keywords_set

# Print results
print("❌ Missing Classes:", missing_classes)
print("❌ Missing Object Properties:", missing_object_properties)
print("❌ Missing Data Properties:", missing_data_properties)

❌ Missing Classes: {'figure', 'sampling', 'uncertainty', 'model', 'available', 'cid', 'design', 'human', 'agent', 'ai', 'cluster', 'wtchangetalhumancenteredaifordementiacare', 'al', 'acc', 'user', 'trust', 'pvossenetalaconversationalagent', 'rl', 'learning', 'data'}
❌ Missing Object Properties: {'figure', 'sampling', 'uncertainty', 'model', 'available', 'cid', 'design', 'human', 'agent', 'ai', 'cluster', 'wtchangetalhumancenteredaifordementiacare', 'al', 'acc', 'user', 'trust', 'pvossenetalaconversationalagent', 'rl', 'learning', 'data'}
❌ Missing Data Properties: {'figure', 'sampling', 'uncertainty', 'model', 'available', 'cid', 'design', 'human', 'agent', 'ai', 'cluster', 'wtchangetalhumancenteredaifordementiacare', 'al', 'acc', 'user', 'trust', 'pvossenetalaconversationalagent', 'rl', 'learning', 'data'}


In [5]:
import urllib.parse
from rdflib import URIRef, Graph, Namespace, RDF, RDFS, OWL, XSD, Literal

# Define the namespace (update it based on your ontology's namespace)
HI = Namespace("http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/")

# Initialize RDF graph
g = Graph()
g.bind("hi", HI)

# Function to clean concept names (replace spaces and special chars, ensure valid URI)
def clean_name(name):
    return urllib.parse.quote(name.replace(" ", "_").replace("-", "_").replace("/", "_"))

# Define safe mappings for problematic words
safe_mappings = {
    "acc": "Accuracy",
    "ai": "Artificial_Intelligence",
    "rl": "Reinforcement_Learning",
    "cid": "Concept_ID",
    "al": "Algorithm",
    "pvossenetalaconversationalagent": "Conversational_Agent",
    "wtchangetalhumancenteredaifordementiacare": "Human_Centered_AI_Dementia",
}

# Function to get safe name
def get_safe_name(name):
    return safe_mappings.get(name, clean_name(name))

# Add missing classes
for class_name in missing_classes:
    class_name_clean = get_safe_name(class_name)
    class_uri = HI[class_name_clean]
    g.add((class_uri, RDF.type, OWL.Class))
    g.add((class_uri, RDFS.label, Literal(class_name_clean)))  
    g.add((class_uri, RDFS.comment, Literal(f"Automatically added class: {class_name_clean}")))  

# Add missing object properties
for prop in missing_object_properties:
    prop_clean = get_safe_name(prop)
    prop_uri = HI[prop_clean]
    g.add((prop_uri, RDF.type, OWL.ObjectProperty))
    g.add((prop_uri, RDFS.domain, HI["AIModel"]))  
    g.add((prop_uri, RDFS.range, HI["AIConcept"]))  
    g.add((prop_uri, RDFS.label, Literal(prop_clean)))  
    g.add((prop_uri, RDFS.comment, Literal(f"Automatically added object property: {prop_clean}")))

# Add missing data properties
for prop in missing_data_properties:
    prop_clean = get_safe_name(prop)
    prop_uri = HI[prop_clean]
    g.add((prop_uri, RDF.type, OWL.DatatypeProperty))
    g.add((prop_uri, RDFS.domain, HI["AIModel"]))  
    g.add((prop_uri, RDFS.range, XSD.string))  
    g.add((prop_uri, RDFS.label, Literal(prop_clean)))  
    g.add((prop_uri, RDFS.comment, Literal(f"Automatically added data property: {prop_clean}")))  

# Save the updated ontology
updated_ontology_path = "updated_hi_ontology.ttl"
g.serialize(destination=updated_ontology_path, format="turtle")

print(f"✅ Ontology updated successfully! Saved as {updated_ontology_path}")

✅ Ontology updated successfully! Saved as updated_hi_ontology.ttl


In [None]:
extracted_titles=[
    "On the Interdependence of Reliance Behavior and Accuracy in AI-Assisted Decision-Making",
"Value-aware active learning",
"Human-Centered AI for Dementia Care: Using Reinforcement Learning for Personalized Interventions Support in Eating and Drinking Scenarios",
"Reinforcement Learning Requires Human-in-the-Loop Framing and Approaches",
"Trust in Clinical AI: Expanding the Unit of Analysis",
"A Hybrid Intelligence Approach to Training Generative Design Assistants: Partnership Between Human Experts and AI Enhanced Co-Creative Tools",
"Exploring the Dynamic Nature of Trust Using Interventions in a Human-AI Collaborative Task",
"A Conversational Agent for Structured Diary Construction Enabling Monitoring of Functioning & Well-being", 
"Knowledge Graphs in Support of Human-Machine Intelligence",
"Exosoul: ethical profiling in the digital world",
"Landmarks in Case-based Reasoning: From Theory to Data",
"Validation of a Measure of Trust in Artificial Intelligence",
]

In [14]:
import requests

def get_metadata_from_doi(doi):
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url).json()
    if "message" in response:
        metadata = response["message"]
        return {
            "title": metadata.get("title", ["Unknown"])[0],
            "authors": ", ".join([author.get("given", "") + " " + author.get("family", "") 
                                  for author in metadata.get("author", [])]),
            "year": metadata.get("published-print", {}).get("date-parts", [[None]])[0][0],
            "journal": metadata.get("container-title", ["Unknown"])[0],
            "doi": metadata.get("DOI", ""),
            "url": metadata.get("URL", "")
        }
    return None

doi_list = ["10.3233/FAIA230074",
            "10.3233/FAIA230085",
            "10.3233/FAIA240185",
            "10.3233/FAIA230098",
            "10.3233/FAIA220192",
            "10.3233/FAIA230078",
            "10.3233/FAIA220200",
            "10.3233/FAIA240204"]  # Replace with your DOIs
papers_metadata = [get_metadata_from_doi(doi) for doi in doi_list]

# Print results
for paper in papers_metadata:
    print(f"📌 {paper['title']} ({paper['year']})")
    print(f"   📝 Authors: {paper['authors']}")
    print(f"   📖 Journal: {paper['journal']}")
    print(f"   🔗 DOI: {paper['doi']}")
    print(f"   🌐 URL: {paper['url']}\n")

📌 On the Interdependence of Reliance Behavior and Accuracy in AI-Assisted Decision-Making (2023)
   📝 Authors: Jakob Schoeffer, Johannes Jakubik, Michael Voessing, Niklas Kuehl, Gerhard Satzger
   📖 Journal: Frontiers in Artificial Intelligence and Applications
   🔗 DOI: 10.3233/faia230074
   🌐 URL: https://doi.org/10.3233/faia230074

📌 Value-Aware Active Learning (2023)
   📝 Authors: Burcu Sayin, Jie Yang, Andrea Passerini, Fabio Casati
   📖 Journal: Frontiers in Artificial Intelligence and Applications
   🔗 DOI: 10.3233/faia230085
   🌐 URL: https://doi.org/10.3233/faia230085

📌 Human-Centered AI for Dementia Care: Using Reinforcement Learning for Personalized Interventions Support in Eating and Drinking Scenarios (2024)
   📝 Authors: Wen-Tseng Chang, Shihan Wang, Stephanie Kramer, Michel Oey, Somaya Ben Allouch
   📖 Journal: Frontiers in Artificial Intelligence and Applications
   🔗 DOI: 10.3233/faia240185
   🌐 URL: https://doi.org/10.3233/faia240185

📌 Reinforcement Learning Require

In [9]:
import requests
import time


# Semantic Scholar API endpoint
API_URL = "https://api.semanticscholar.org/graph/v1/paper/search"

# Function to fetch metadata
def get_paper_metadata(title):
    params = {
        "query": title,
        "fields": "title,authors,year,doi,venue,abstract"
    }
    response = requests.get(API_URL, params=params)
    if response.status_code == 200:
        data = response.json()
        if "data" in data and len(data["data"]) > 0:
            paper = data["data"][0]  # Take the first matching result
            return {
                "title": paper.get("title", "N/A"),
                "authors": ", ".join([author["name"] for author in paper.get("authors", [])]),
                "year": paper.get("year", "N/A"),
                "doi": paper.get("doi", "N/A"),
                "venue": paper.get("venue", "N/A"),
                "abstract": paper.get("abstract", "N/A")
            }
    else:
        print(f"❌ Error fetching metadata for: {title}")
    return None

# Fetch metadata for all papers
papers_metadata = []
for title in extracted_titles:
    metadata = get_paper_metadata(title)
    if metadata:
        papers_metadata.append(metadata)
    time.sleep(1)  # Respect API rate limits

# Print the structured list
print("📄 Research Papers Metadata:")
for paper in papers_metadata:
    print(paper)

❌ Error fetching metadata for: Value-aware active learning
❌ Error fetching metadata for: Human-Centered AI for Dementia Care: Using Reinforcement Learning for Personalized Interventions Support in Eating and Drinking Scenarios
❌ Error fetching metadata for: Reinforcement Learning Requires Human-in-the-Loop Framing and Approaches
❌ Error fetching metadata for: Trust in Clinical AI: Expanding the Unit of Analysis
❌ Error fetching metadata for: A Hybrid Intelligence Approach to Training Generative Design Assistants: Partnership Between Human Experts and AI Enhanced Co-Creative Tools
❌ Error fetching metadata for: Exploring the Dynamic Nature of Trust Using Interventions in a Human-AI Collaborative Task
❌ Error fetching metadata for: A Conversational Agent for Structured Diary Construction Enabling Monitoring of Functioning & Well-being
❌ Error fetching metadata for: Knowledge Graphs in Support of Human-Machine Intelligence
❌ Error fetching metadata for: Exosoul: ethical profiling in the 

In [15]:
from rdflib import Graph, Namespace, RDF, RDFS, OWL, XSD, Literal, URIRef

# Define the namespace (Ensure it matches your ontology)
HI = Namespace("http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/")

# Initialize RDF graph
g = Graph()
g.bind("hi", HI)

# Define new class for research papers
ResearchPaper = HI["ResearchPaper"]
g.add((ResearchPaper, RDF.type, OWL.Class))
g.add((ResearchPaper, RDFS.label, Literal("Research Paper")))
g.add((ResearchPaper, RDFS.comment, Literal("Class representing research papers in the ontology.")))

# Sample extracted key concepts (from TF-IDF + Named Entities)
key_concepts = ["Trust", "AI", "Reinforcement Learning", "Human-AI Interaction", "Uncertainty", "Knowledge Graphs"]

# Create instances for concepts (ensure they are part of ontology classes)
for concept in key_concepts:
    concept_uri = HI[concept.replace(" ", "_")]
    g.add((concept_uri, RDF.type, HI["Concept"]))  # Assuming Concept class exists
    g.add((concept_uri, RDFS.label, Literal(concept)))
    g.add((concept_uri, RDFS.comment, Literal(f"Automatically added concept: {concept}")))


# Define object properties
hasConcept = HI["hasConcept"]
g.add((hasConcept, RDF.type, OWL.ObjectProperty))
g.add((hasConcept, RDFS.label, Literal("hasConcept")))
g.add((hasConcept, RDFS.comment, Literal("Links a research paper to a concept")))

# Define data properties
hasTitle = HI["hasTitle"]
hasAuthor = HI["hasAuthor"]
hasYear = HI["hasYear"]
hasDOI = HI["hasDOI"]

for prop in [hasTitle, hasAuthor, hasYear, hasDOI]:
    g.add((prop, RDF.type, OWL.DatatypeProperty))
    g.add((prop, RDFS.domain, ResearchPaper))
    g.add((prop, RDFS.range, XSD.string if prop != hasYear else XSD.integer))

# Add instances for research papers
for paper in papers_metadata:
    paper_uri = HI[paper["title"].replace(" ", "_")]
    g.add((paper_uri, RDF.type, ResearchPaper))
    g.add((paper_uri, hasTitle, Literal(paper["title"])))
    g.add((paper_uri, hasAuthor, Literal(paper["authors"])))
    g.add((paper_uri, hasYear, Literal(paper["year"], datatype=XSD.integer)))
    g.add((paper_uri, hasDOI, Literal(paper["doi"])))

    # Link paper to concepts
    for concept in paper["related_concepts"]:
        concept_uri = HI[concept.replace(" ", "_")]
        g.add((paper_uri, hasConcept, concept_uri))

# Save the updated ontology with instances
updated_ontology_path = "updated_hi_ontology_with_instances.ttl"
g.serialize(destination=updated_ontology_path, format="turtle")

print(f"✅ Ontology updated with RDF instances! Saved as {updated_ontology_path}")

KeyError: 'related_concepts'

In [None]:
from rdflib import Graph, Namespace, RDF, RDFS, OWL, XSD, Literal, URIRef

# Define the namespace (Ensure it matches your ontology)
HI = Namespace("http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/")

# Initialize RDF graph
g = Graph()
g.bind("hi", HI)

# Define ResearchPaper class if not already defined
ResearchPaper = HI["ResearchPaper"]
g.add((ResearchPaper, RDF.type, OWL.Class))
g.add((ResearchPaper, RDFS.label, Literal("Research Paper")))
g.add((ResearchPaper, RDFS.comment, Literal("Class representing research papers in the ontology.")))


# Define new data properties
hasAbstract = HI["hasAbstract"]
hasVenue = HI["hasVenue"]
hasCitations = HI["hasCitations"]
hasKeywords = HI["hasKeywords"]
hasExternalLink = HI["hasExternalLink"]

# Add data properties to ontology
for prop in [hasAbstract, hasVenue, hasKeywords, hasExternalLink]:
    g.add((prop, RDF.type, OWL.DatatypeProperty))
    g.add((prop, RDFS.domain, ResearchPaper))
    g.add((prop, RDFS.range, XSD.string))

# Add citations as an integer property
g.add((hasCitations, RDF.type, OWL.DatatypeProperty))
g.add((hasCitations, RDFS.domain, ResearchPaper))
g.add((hasCitations, RDFS.range, XSD.integer))

# Define object properties
hasConcept = HI["hasConcept"]
g.add((hasConcept, RDF.type, OWL.ObjectProperty))
g.add((hasConcept, RDFS.label, Literal("hasConcept")))
g.add((hasConcept, RDFS.comment, Literal("Links a research paper to a concept")))

# Add instances for research papers
for paper in papers_metadata:
    paper_uri = HI[paper["title"].replace(" ", "_")]
    g.add((paper_uri, RDF.type, ResearchPaper))
    g.add((paper_uri, HI["hasTitle"], Literal(paper["title"])))
    g.add((paper_uri, HI["hasAuthor"], Literal(paper["authors"])))
    g.add((paper_uri, HI["hasYear"], Literal(paper["year"], datatype=XSD.integer)))
    g.add((paper_uri, HI["hasDOI"], Literal(paper["doi"])))
    g.add((paper_uri, hasCitations, Literal(paper["citations"], datatype=XSD.integer)))
    g.add((paper_uri, hasKeywords, Literal(", ".join(paper["keywords"]))))
    g.add((paper_uri, hasExternalLink, Literal(paper["external_link"])))

    # Link paper to concepts
    for concept in paper["keywords"]:
        concept_uri = HI[concept.replace(" ", "_")]
        g.add((paper_uri, hasConcept, concept_uri))

# Save the updated ontology with enriched metadata
updated_ontology_path = "updated_hi_ontology_with_metadata.ttl"
g.serialize(destination=updated_ontology_path, format="turtle")

print(f"✅ Ontology updated with enriched metadata! Saved as {updated_ontology_path}")

KeyError: 'keywords'