In [1]:
!pip install rdflib nltk pdfplumber scikit-learn spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
from rdflib import Graph, Namespace, RDF, OWL, RDFS

# Load the ontology
ontology_path = "hi_ontology.ttl"  # Update with the correct path
g = Graph()
g.parse(ontology_path, format="turtle")

# Define namespace
HI = Namespace("http://www.semanticweb.org/hi_ontology#")

# Extract classes
existing_classes = [str(s.split("#")[-1]) for s in g.subjects(RDF.type, OWL.Class)]

# Extract object properties
existing_object_properties = [str(s.split("#")[-1]) for s in g.subjects(RDF.type, OWL.ObjectProperty)]

# Extract data properties
existing_data_properties = [str(s.split("#")[-1]) for s in g.subjects(RDF.type, OWL.DatatypeProperty)]

# Print summary
print("✅ Existing Classes:", existing_classes)
print("✅ Existing Object Properties:", existing_object_properties)
print("✅ Existing Data Properties:", existing_data_properties)

✅ Existing Classes: ['http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Context', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Domain', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Endgoal', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/EthicalConsideration', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/ArtificialAgent', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/InformationProcessing', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Human', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/Interaction', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/InteractionTask', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/ProcessingMethod', 'http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/ProcessingTa

In [3]:
import os
import pdfplumber
import re
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download("stopwords")
nlp = spacy.load("en_core_web_sm")

# Function to extract text from all PDFs in a folder
def extract_text_from_pdfs(pdf_folder):
    extracted_texts = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            file_path = os.path.join(pdf_folder, filename)
            with pdfplumber.open(file_path) as pdf:
                text = " ".join([page.extract_text() for page in pdf.pages if page.extract_text()])
                extracted_texts.append(text)
    return extracted_texts

# Function to clean and preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words("english")]
    return " ".join(tokens)

# Extract key terms using TF-IDF
def extract_keywords_tfidf(texts, num_keywords=20):
    vectorizer = TfidfVectorizer(max_features=500, stop_words="english", ngram_range=(1,2))
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.sum(axis=0).tolist()[0]
    keyword_scores = dict(zip(feature_names, tfidf_scores))
    sorted_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)
    return [keyword for keyword, _ in sorted_keywords[:num_keywords]]

# Extract named entities (AI-related terms, datasets, models)
def extract_named_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.label_ in {"ORG", "PRODUCT", "EVENT"}]
    return list(set(entities))

# Provide the path to your PDFs folder
pdf_folder_path = "pdfs"  # Update this to your actual folder path

# Extract and process text
raw_texts = extract_text_from_pdfs(pdf_folder_path)
processed_texts = [preprocess_text(text) for text in raw_texts]

# Get key terms
tfidf_keywords = extract_keywords_tfidf(processed_texts)
named_entities = []
for text in raw_texts:
    named_entities.extend(extract_named_entities(text))

# Print extracted key concepts
print("🔹 Extracted Key Concepts:")
print("📌 Top Keywords:", tfidf_keywords)
print("📌 Named Entities:", list(set(named_entities)))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saahithshetty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🔹 Extracted Key Concepts:
📌 Top Keywords: ['trust', 'ai', 'agent', 'rl', 'design', 'learning', 'human', 'acc', 'user', 'pvossenetalaconversationalagent', 'sampling', 'figure', 'cid', 'available', 'al', 'data', 'model', 'cluster', 'uncertainty', 'wtchangetalhumancenteredaifordementiacare']
📌 Named Entities: ['Individual Differences,100:85–94,2016', 'forDET', 'Caetano', 'M.E.Taylor/ReinforcementLearningRequiresHuman-in-the-LoopFramingandApproaches 353', 'the Certifiably Optimal Rule Lists', 'Acc', 'Harv JL & Tech', 'Monitoringpatientswell', 'AI\nW', 'GreenewaldK', 'Taya R Cohen', 'Singh P, Pomerantz S, Doyle S, Kakarmath S', 'Digital Humanism', '0.002 0.0', 'Antecedents of Trust', 'Idonothavetheabilitytoadjusttoindividualpreferencesintheway', 'AIsystemandemphasizetheimportanceofmaintaininguserengagementandreducing', 'Hegde N', 'International Journal of Human', 'ACM Conference on Computer Supported Cooperative Work\n', 'Artificial Intelligence (AI', 'HCOMP', '0.04', 'LimitationsandConclus

In [4]:
# Convert extracted keywords to a set for comparison
extracted_keywords_set = set(tfidf_keywords)
extracted_named_entities_set = set(named_entities)

# Convert existing ontology classes and properties into sets
existing_classes_set = set([s.split("#")[-1] for s in existing_classes])
existing_object_properties_set = set([s.split("#")[-1] for s in existing_object_properties])
existing_data_properties_set = set([s.split("#")[-1] for s in existing_data_properties])

# Find missing classes
missing_classes = extracted_keywords_set - existing_classes_set

# Find missing object properties
missing_object_properties = extracted_keywords_set - existing_object_properties_set

# Find missing data properties (assuming no existing ones)
missing_data_properties = extracted_keywords_set - existing_data_properties_set if existing_data_properties else extracted_keywords_set

# Print results
print("❌ Missing Classes:", missing_classes)
print("❌ Missing Object Properties:", missing_object_properties)
print("❌ Missing Data Properties:", missing_data_properties)

❌ Missing Classes: {'wtchangetalhumancenteredaifordementiacare', 'model', 'figure', 'learning', 'available', 'human', 'cid', 'user', 'cluster', 'data', 'al', 'uncertainty', 'pvossenetalaconversationalagent', 'trust', 'sampling', 'rl', 'design', 'acc', 'agent', 'ai'}
❌ Missing Object Properties: {'wtchangetalhumancenteredaifordementiacare', 'model', 'figure', 'learning', 'available', 'human', 'cid', 'user', 'cluster', 'data', 'al', 'uncertainty', 'pvossenetalaconversationalagent', 'trust', 'sampling', 'rl', 'design', 'acc', 'agent', 'ai'}
❌ Missing Data Properties: {'wtchangetalhumancenteredaifordementiacare', 'model', 'figure', 'learning', 'available', 'human', 'cid', 'user', 'cluster', 'data', 'al', 'uncertainty', 'pvossenetalaconversationalagent', 'trust', 'sampling', 'rl', 'design', 'acc', 'agent', 'ai'}


In [7]:
import urllib.parse
from rdflib import URIRef, Graph, Namespace, RDF, RDFS, OWL, XSD, Literal

# Define the namespace (update it based on your ontology's namespace)
HI = Namespace("http://www.semanticweb.org/vbr240/ontologies/2022/4/untitled-ontology-51/")

# Initialize RDF graph
g = Graph()
g.bind("hi", HI)

# Function to clean concept names (replace spaces and special chars, ensure valid URI)
def clean_name(name):
    return urllib.parse.quote(name.replace(" ", "_").replace("-", "_").replace("/", "_"))

# Define safe mappings for problematic words
safe_mappings = {
    "acc": "Accuracy",
    "ai": "Artificial_Intelligence",
    "rl": "Reinforcement_Learning",
    "cid": "Concept_ID",
    "al": "Algorithm",
    "pvossenetalaconversationalagent": "Conversational_Agent",
    "wtchangetalhumancenteredaifordementiacare": "Human_Centered_AI_Dementia",
}

# Function to get safe name
def get_safe_name(name):
    return safe_mappings.get(name, clean_name(name))

# Add missing classes
for class_name in missing_classes:
    class_name_clean = get_safe_name(class_name)
    class_uri = HI[class_name_clean]
    g.add((class_uri, RDF.type, OWL.Class))
    g.add((class_uri, RDFS.label, Literal(class_name_clean)))  
    g.add((class_uri, RDFS.comment, Literal(f"Automatically added class: {class_name_clean}")))  

# Add missing object properties
for prop in missing_object_properties:
    prop_clean = get_safe_name(prop)
    prop_uri = HI[prop_clean]
    g.add((prop_uri, RDF.type, OWL.ObjectProperty))
    g.add((prop_uri, RDFS.domain, HI["AIModel"]))  
    g.add((prop_uri, RDFS.range, HI["AIConcept"]))  
    g.add((prop_uri, RDFS.label, Literal(prop_clean)))  
    g.add((prop_uri, RDFS.comment, Literal(f"Automatically added object property: {prop_clean}")))

# Add missing data properties
for prop in missing_data_properties:
    prop_clean = get_safe_name(prop)
    prop_uri = HI[prop_clean]
    g.add((prop_uri, RDF.type, OWL.DatatypeProperty))
    g.add((prop_uri, RDFS.domain, HI["AIModel"]))  
    g.add((prop_uri, RDFS.range, XSD.string))  
    g.add((prop_uri, RDFS.label, Literal(prop_clean)))  
    g.add((prop_uri, RDFS.comment, Literal(f"Automatically added data property: {prop_clean}")))  

# Save the updated ontology
updated_ontology_path = "updated_hi_ontology.ttl"
g.serialize(destination=updated_ontology_path, format="turtle")

print(f"✅ Ontology updated successfully! Saved as {updated_ontology_path}")

✅ Ontology updated successfully! Saved as updated_hi_ontology.ttl
