this is the project shared in github

In [11]:
!pip install rdflib nltk pdfplumber scikit-learn spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [12]:
from rdflib import Graph, Namespace, RDF, OWL, RDFS

# Load the ontology
ontology_path = "hi_ontology.ttl"  # Update with the correct path
g = Graph()
g.parse(ontology_path, format="turtle")

# Define namespace
HI = Namespace("http://www.semanticweb.org/hi_ontology#")

# Extract classes
existing_classes = [str(s.split("#")[-1]) for s in g.subjects(RDF.type, OWL.Class)]

# Extract object properties
existing_object_properties = [str(s.split("#")[-1]) for s in g.subjects(RDF.type, OWL.ObjectProperty)]

# Extract data properties
existing_data_properties = [str(s.split("#")[-1]) for s in g.subjects(RDF.type, OWL.DatatypeProperty)]

# Print summary
print("✅ Existing Classes:", existing_classes)
print("✅ Existing Object Properties:", existing_object_properties)
print("✅ Existing Data Properties:", existing_data_properties)

✅ Existing Classes: ['http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Context', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Domain', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Endgoal', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/EthicalConsideration', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/ArtificialAgent', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/InformationProcessing', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Human', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Interaction', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/InteractionTask', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/ProcessingMethod', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/ProcessingTa

In [13]:
import os
import pdfplumber
import re
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download("stopwords")
nlp = spacy.load("en_core_web_sm")

# Function to extract text from all PDFs in a folder
def extract_text_from_pdfs(pdf_folder):
    extracted_texts = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            file_path = os.path.join(pdf_folder, filename)
            with pdfplumber.open(file_path) as pdf:
                text = " ".join([page.extract_text() for page in pdf.pages if page.extract_text()])
                extracted_texts.append(text)
    return extracted_texts

# Function to clean and preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words("english")]
    return " ".join(tokens)

# Extract key terms using TF-IDF
def extract_keywords_tfidf(texts, num_keywords=20):
    vectorizer = TfidfVectorizer(max_features=500, stop_words="english", ngram_range=(1,2))
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.sum(axis=0).tolist()[0]
    keyword_scores = dict(zip(feature_names, tfidf_scores))
    sorted_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)
    return [keyword for keyword, _ in sorted_keywords[:num_keywords]]

# Extract named entities (AI-related terms, datasets, models)
def extract_named_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.label_ in {"ORG", "PRODUCT", "EVENT"}]
    return list(set(entities))

# Provide the path to your PDFs folder
pdf_folder_path = "pdfs"  # Update this to your actual folder path

# Extract and process text
raw_texts = extract_text_from_pdfs(pdf_folder_path)
processed_texts = [preprocess_text(text) for text in raw_texts]

# Get key terms
tfidf_keywords = extract_keywords_tfidf(processed_texts)
named_entities = []
for text in raw_texts:
    named_entities.extend(extract_named_entities(text))

# Print extracted key concepts
print("🔹 Extracted Key Concepts:")
print("📌 Top Keywords:", tfidf_keywords)
print("📌 Named Entities:", list(set(named_entities)))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🔹 Extracted Key Concepts:
📌 Top Keywords: ['trust', 'ai', 'agent', 'rl', 'design', 'learning', 'human', 'acc', 'user', 'pvossenetalaconversationalagent', 'sampling', 'figure', 'cid', 'available', 'al', 'data', 'model', 'cluster', 'uncertainty', 'wtchangetalhumancenteredaifordementiacare']
📌 Named Entities: ['YunfengZhang', 'Official Journal', 'Parker W', 'Hatice Ferhan Odabasi', 'al', 'Fitzpatrick', 'ElianeSommerfeld', 'Current Psychology', 'Instagram', 'the Clinical AI Deployment Cycle', 'User Modeling', 'JoshuaDMiller', 'AssociationforComputing', 'DZ', 'CRC Press', 'p<.001∗∗∗)andAIoutputtrust(r=.76', 'AI\nGeometrically', 'Antaki JF', 'Robben S', 'Alessandro Acquisti', 'the EFMI Working Group for Assessment of Health Information Systems', 'SocialScience', 'Acosta-ZazuetaG.A3Dshapegenerativemethodforaestheticprod-', '.315 F', 'Yearb\nMed Inform', 'Fig.1', 'Feedback Interface', 'P7', 'Ingeneral', 'Agrawal M', '0.392', 'AakritiKumar', 'Thelocationmapislaidonagrid', 'SantaClara', '• Itisv

In [14]:
# Convert extracted keywords to a set for comparison
extracted_keywords_set = set(tfidf_keywords)
extracted_named_entities_set = set(named_entities)

# Convert existing ontology classes and properties into sets
existing_classes_set = set([s.split("#")[-1] for s in existing_classes])
existing_object_properties_set = set([s.split("#")[-1] for s in existing_object_properties])
existing_data_properties_set = set([s.split("#")[-1] for s in existing_data_properties])

# Find missing classes
missing_classes = extracted_keywords_set - existing_classes_set

# Find missing object properties
missing_object_properties = extracted_keywords_set - existing_object_properties_set

# Find missing data properties (assuming no existing ones)
missing_data_properties = extracted_keywords_set - existing_data_properties_set if existing_data_properties else extracted_keywords_set

# Print results
print("❌ Missing Classes:", missing_classes)
print("❌ Missing Object Properties:", missing_object_properties)
print("❌ Missing Data Properties:", missing_data_properties)

❌ Missing Classes: {'agent', 'sampling', 'al', 'uncertainty', 'data', 'learning', 'cid', 'figure', 'trust', 'human', 'available', 'design', 'acc', 'ai', 'wtchangetalhumancenteredaifordementiacare', 'user', 'model', 'rl', 'pvossenetalaconversationalagent', 'cluster'}
❌ Missing Object Properties: {'agent', 'sampling', 'al', 'uncertainty', 'data', 'learning', 'cid', 'figure', 'trust', 'human', 'available', 'design', 'acc', 'ai', 'wtchangetalhumancenteredaifordementiacare', 'user', 'model', 'rl', 'pvossenetalaconversationalagent', 'cluster'}
❌ Missing Data Properties: {'agent', 'sampling', 'al', 'uncertainty', 'data', 'learning', 'cid', 'figure', 'trust', 'human', 'available', 'design', 'acc', 'ai', 'wtchangetalhumancenteredaifordementiacare', 'user', 'model', 'rl', 'pvossenetalaconversationalagent', 'cluster'}


In [15]:
import urllib.parse
from rdflib import URIRef, Graph, Namespace, RDF, RDFS, OWL, XSD, Literal

# Define the namespace (update it based on your ontology's namespace)
HI = Namespace("http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/")

# Initialize RDF graph
g = Graph()
g.bind("hi", HI)

# Function to clean concept names (replace spaces and special chars, ensure valid URI)
def clean_name(name):
    return urllib.parse.quote(name.replace(" ", "_").replace("-", "_").replace("/", "_"))

# Define safe mappings for problematic words
safe_mappings = {
    "acc": "Accuracy",
    "ai": "Artificial_Intelligence",
    "rl": "Reinforcement_Learning",
    "cid": "Concept_ID",
    "al": "Algorithm",
    "pvossenetalaconversationalagent": "Conversational_Agent",
    "wtchangetalhumancenteredaifordementiacare": "Human_Centered_AI_Dementia",
}

# Function to get safe name
def get_safe_name(name):
    return safe_mappings.get(name, clean_name(name))

# Add missing classes
for class_name in missing_classes:
    class_name_clean = get_safe_name(class_name)
    class_uri = HI[class_name_clean]
    g.add((class_uri, RDF.type, OWL.Class))
    g.add((class_uri, RDFS.label, Literal(class_name_clean)))  
    g.add((class_uri, RDFS.comment, Literal(f"Automatically added class: {class_name_clean}")))  

# Add missing object properties
for prop in missing_object_properties:
    prop_clean = get_safe_name(prop)
    prop_uri = HI[prop_clean]
    g.add((prop_uri, RDF.type, OWL.ObjectProperty))
    g.add((prop_uri, RDFS.domain, HI["AIModel"]))  
    g.add((prop_uri, RDFS.range, HI["AIConcept"]))  
    g.add((prop_uri, RDFS.label, Literal(prop_clean)))  
    g.add((prop_uri, RDFS.comment, Literal(f"Automatically added object property: {prop_clean}")))

# Add missing data properties
for prop in missing_data_properties:
    prop_clean = get_safe_name(prop)
    prop_uri = HI[prop_clean]
    g.add((prop_uri, RDF.type, OWL.DatatypeProperty))
    g.add((prop_uri, RDFS.domain, HI["AIModel"]))  
    g.add((prop_uri, RDFS.range, XSD.string))  
    g.add((prop_uri, RDFS.label, Literal(prop_clean)))  
    g.add((prop_uri, RDFS.comment, Literal(f"Automatically added data property: {prop_clean}")))  

# Save the updated ontology
updated_ontology_path = "updated_hi_ontology.ttl"
g.serialize(destination=updated_ontology_path, format="turtle")

print(f"✅ Ontology updated successfully! Saved as {updated_ontology_path}")

✅ Ontology updated successfully! Saved as updated_hi_ontology.ttl


In [5]:
extracted_titles=[
    "On the Interdependence of Reliance Behavior and Accuracy in AI-Assisted Decision-Making",
"Value-aware active learning",
"Human-Centered AI for Dementia Care: Using Reinforcement Learning for Personalized Interventions Support in Eating and Drinking Scenarios",
"Reinforcement Learning Requires Human-in-the-Loop Framing and Approaches",
"Trust in Clinical AI: Expanding the Unit of Analysis",
"A Hybrid Intelligence Approach to Training Generative Design Assistants: Partnership Between Human Experts and AI Enhanced Co-Creative Tools",
"Exploring the Dynamic Nature of Trust Using Interventions in a Human-AI Collaborative Task",
"A Conversational Agent for Structured Diary Construction Enabling Monitoring of Functioning & Well-being", 
"Knowledge Graphs in Support of Human-Machine Intelligence",
"Exosoul: ethical profiling in the digital world",
"Landmarks in Case-based Reasoning: From Theory to Data",
"Validation of a Measure of Trust in Artificial Intelligence",
]

In [6]:
import requests

def get_metadata_from_doi(doi):
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url).json()
    if "message" in response:
        metadata = response["message"]
        return {
            "title": metadata.get("title", ["Unknown"])[0],
            "authors": ", ".join([author.get("given", "") + " " + author.get("family", "") 
                                  for author in metadata.get("author", [])]),
            "year": metadata.get("published-print", {}).get("date-parts", [[None]])[0][0],
            "journal": metadata.get("container-title", ["Unknown"])[0],
            "doi": metadata.get("DOI", ""),
            "url": metadata.get("URL", "")
        }
    return None

doi_list = ["10.3233/FAIA230074",
            "10.3233/FAIA230085",
            "10.3233/FAIA240185",
            "10.3233/FAIA230098",
            "10.3233/FAIA220192",
            "10.3233/FAIA230078",
            "10.3233/FAIA220200",
            "10.3233/FAIA240204","","10.3233/FAIA220209","10.3233/FAIA240195"]  # Replace with your DOIs
papers_metadata = [get_metadata_from_doi(doi) for doi in doi_list]

# Print results
for paper in papers_metadata:
    print(f"📌 {paper['title']} ({paper['year']})")
    print(f"   📝 Authors: {paper['authors']}")
    print(f"   📖 Journal: {paper['journal']}")
    print(f"   🔗 DOI: {paper['doi']}")
    print(f"   🌐 URL: {paper['url']}\n")

📌 On the Interdependence of Reliance Behavior and Accuracy in AI-Assisted Decision-Making (2023)
   📝 Authors: Jakob Schoeffer, Johannes Jakubik, Michael Voessing, Niklas Kuehl, Gerhard Satzger
   📖 Journal: Frontiers in Artificial Intelligence and Applications
   🔗 DOI: 10.3233/faia230074
   🌐 URL: https://doi.org/10.3233/faia230074

📌 Value-Aware Active Learning (2023)
   📝 Authors: Burcu Sayin, Jie Yang, Andrea Passerini, Fabio Casati
   📖 Journal: Frontiers in Artificial Intelligence and Applications
   🔗 DOI: 10.3233/faia230085
   🌐 URL: https://doi.org/10.3233/faia230085

📌 Human-Centered AI for Dementia Care: Using Reinforcement Learning for Personalized Interventions Support in Eating and Drinking Scenarios (2024)
   📝 Authors: Wen-Tseng Chang, Shihan Wang, Stephanie Kramer, Michel Oey, Somaya Ben Allouch
   📖 Journal: Frontiers in Artificial Intelligence and Applications
   🔗 DOI: 10.3233/faia240185
   🌐 URL: https://doi.org/10.3233/faia240185

📌 Reinforcement Learning Require

In [None]:
from rdflib import Graph, Namespace, RDF, RDFS, OWL, XSD, Literal, URIRef
import urllib.parse

# 📌 Define the namespace (Ensure it matches your ontology)
ontology_path = "updated_hi_ontology.ttl"  # Update with your actual ontology file
HI = Namespace("http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/")

# 📌 Load existing ontology before modification
g = Graph()
try:
    g.parse(ontology_path, format="turtle")  # Load existing ontology
    print(f"✅ Successfully loaded existing ontology: {ontology_path}")
except Exception as e:
    print(f"⚠ Error loading ontology: {e}. Creating a new ontology.")

g.bind("hi", HI)  # Ensure the namespace is bound

# 📌 Define the ResearchPaper class if not already in ontology
ResearchPaper = HI["ResearchPaper"]
if (ResearchPaper, RDF.type, OWL.Class) not in g:
    g.add((ResearchPaper, RDF.type, OWL.Class))
    g.add((ResearchPaper, RDFS.label, Literal("Research Paper")))
    g.add((ResearchPaper, RDFS.comment, Literal("Class representing research papers in the ontology.")))

# 📌 Define new data properties if not already in ontology
data_properties = {
    "hasAbstract": XSD.string,
    "hasVenue": XSD.string,
    "hasKeywords": XSD.string,
    "hasExternalLink": XSD.string,
    "hasCitations": XSD.integer,
    "hasTitle": XSD.string,
    "hasAuthor": XSD.string,
    "hasYear": XSD.integer,
    "hasDOI": XSD.string
}

for prop, dtype in data_properties.items():
    prop_uri = HI[prop]
    if (prop_uri, RDF.type, OWL.DatatypeProperty) not in g:
        g.add((prop_uri, RDF.type, OWL.DatatypeProperty))
        g.add((prop_uri, RDFS.domain, ResearchPaper))
        g.add((prop_uri, RDFS.range, dtype))
        g.add((prop_uri, RDFS.label, Literal(prop)))
        g.add((prop_uri, RDFS.comment, Literal(f"Automatically added data property: {prop}")))

# 📌 Define object properties if not already in ontology
hasConcept = HI["hasConcept"]
if (hasConcept, RDF.type, OWL.ObjectProperty) not in g:
    g.add((hasConcept, RDF.type, OWL.ObjectProperty))
    g.add((hasConcept, RDFS.label, Literal("hasConcept")))
    g.add((hasConcept, RDFS.comment, Literal("Links a research paper to a concept")))

# 📌 Sample metadata for research papers (Update this list dynamically)
papers_metadata = [
    {"title": "On the Interdependence of Reliance Behavior and Accuracy in AI-Assisted Decision-Making",
     "authors": "John Doe, Jane Smith",
     "year": 2023,
     "doi": "10.3233/FAIA230074",
     "url": "https://doi.org/10.3233/FAIA230074",
     "venue": "FAIA",
     "abstract": "This paper explores the interdependence of reliance behavior and accuracy in AI-assisted decision-making.",
     "keywords": "AI, Decision-Making, Trust, Human-AI Interaction"},
    
    {"title": "Value-aware active learning",
     "authors": "Alice Brown, Bob White",
     "year": 2022,
     "doi": "10.3233/FAIA230085",
     "url": "https://doi.org/10.3233/FAIA230085",
     "venue": "FAIA",
     "abstract": "This study introduces a novel approach to active learning that integrates value-awareness for improved model training.",
     "keywords": "Machine Learning, Active Learning, Model Training, Value-aware Learning"}
]

# 📌 Add instances for research papers (only if not already present)
for paper in papers_metadata:
    paper_uri = HI[urllib.parse.quote(paper["title"].replace(" ", "_"))]

    if (paper_uri, RDF.type, ResearchPaper) not in g:
        g.add((paper_uri, RDF.type, ResearchPaper))
        g.add((paper_uri, HI["hasTitle"], Literal(paper["title"])))
        g.add((paper_uri, HI["hasAuthor"], Literal(paper["authors"])))
        g.add((paper_uri, HI["hasYear"], Literal(paper["year"], datatype=XSD.integer)))
        g.add((paper_uri, HI["hasDOI"], Literal(paper["doi"])))
        g.add((paper_uri, HI["hasExternalLink"], Literal(paper["url"])))
        g.add((paper_uri, HI["hasVenue"], Literal(paper["venue"])))
        g.add((paper_uri, HI["hasAbstract"], Literal(paper["abstract"])))
        g.add((paper_uri, HI["hasKeywords"], Literal(paper["keywords"])))

# 📌 Save the updated ontology back to the same file
g.serialize(destination=ontology_path, format="turtle")
print(f"✅ Ontology successfully updated and saved in {ontology_path}")

✅ Successfully loaded existing ontology: updated_hi_ontology.ttl
✅ Ontology successfully updated and saved in updated_hi_ontology.ttl
