In [1]:
import pandas as pd
import numpy as np
import os
import urllib.parse  # for parsing strings to URI's
from urllib.parse import quote

#RDF libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace, Dataset, BNode  # basic RDF handling
from rdflib.namespace import FOAF, RDFS, XSD  # most common namespaces
from rdflib.tools.rdf2dot import rdf2dot

#Visualizing
from graphviz import Source


### Defining Namespaces and Dataset and a graph called KG

In [2]:
# Namespaces for our vocabulary items (schema information, existing vocabulary, etc.)
ACAD = Namespace('http://acad.io/schema#')
ACADDATA = Namespace('http://acad.io/data#')
VIVO = Namespace('http://vivoweb.org/ontology/core#')
DC = Namespace('http://purl.org/dc/terms/')
OWL = Namespace('http://www.w3.org/2002/07/owl#')


In [3]:
# Initialize a dataset and bind namespaces
dataset = Dataset()
dataset.bind('ACAD', ACAD)
dataset.bind('ACADDATA', ACADDATA)
dataset.bind('VIVO', VIVO)
dataset.bind('DC', DC)
dataset.bind('OWL', OWL)

kg = dataset.graph()

In [4]:
# Load the externally defined schema into the default graph (context) of the dataset
dataset.default_context.parse('vocabulary.ttl', format='turtle')

<Graph identifier=urn:x-rdflib:default (<class 'rdflib.graph.Graph'>)>

In [5]:
kg.parse('GraphData.ttl', format='turtle')

<Graph identifier=https://rdflib.github.io/.well-known/genid/rdflib/Nf691c79527384d6192bdd168443c60de (<class 'rdflib.graph.Graph'>)>

In [6]:
from tika import parser # Must have a java(7 or 7+) runtime installed as well
import spotlight
import os
import re



In [7]:
os.walk("COURSES/COURSES")

<generator object _walk at 0x0000026DB3179270>

####  Recusively getting all PDFs for every course we have the data for within the COURSES/ folder.

In [8]:
pdfs = [os.path.join(dp, f) for dp, dn, filenames in os.walk("COURSES/COURSES") for f in filenames if (os.path.splitext(f)[1] == '.pdf')]

In [9]:
pdfs


['COURSES/COURSES\\COMP6481_PPS\\Comp6481-Winter-2024_course_outline.pdf',
 'COURSES/COURSES\\COMP6481_PPS\\Lecture_1\\Other_Material\\Tutorial_1.pdf',
 'COURSES/COURSES\\COMP6481_PPS\\Lecture_1\\Slides\\Chapter1.pdf',
 'COURSES/COURSES\\COMP6481_PPS\\Lecture_2\\Other_Material\\Tutorial_2.pdf',
 'COURSES/COURSES\\COMP6481_PPS\\Lecture_2\\Slides\\Chapter2.pdf',
 'COURSES/COURSES\\COMP6481_PPS\\Lecture_3\\Other_Material\\Tutorial_3.pdf',
 'COURSES/COURSES\\COMP6481_PPS\\Lecture_3\\Slides\\Chapter3.pdf',
 'COURSES/COURSES\\COMP6741_IS\\course_outline.pdf',
 'COURSES/COURSES\\COMP6741_IS\\Lecture_1\\Other_Material\\Project_Assignment1.pdf',
 'COURSES/COURSES\\COMP6741_IS\\Lecture_1\\Readings\\syllabus.pdf',
 'COURSES/COURSES\\COMP6741_IS\\Lecture_1\\Slides\\week1.pdf',
 'COURSES/COURSES\\COMP6741_IS\\Lecture_2\\Readings\\Worksheet2.pdf',
 'COURSES/COURSES\\COMP6741_IS\\Lecture_2\\Slides\\Chapter_2.pdf',
 'COURSES/COURSES\\COMP6741_IS\\Lecture_2\\Worksheets\\Worksheet2_quest.pdf',
 'COURSES

In [10]:
# Creating the text file to save topics
courseTopicsTxt = open("courseTopics.txt", "w")

In [11]:
courseTopicsTxt

<_io.TextIOWrapper name='courseTopics.txt' mode='w' encoding='cp1252'>

In [12]:
for pdf in pdfs:
    pdf = pdf.replace("\\", "/")
    
    # Skip Outlines
    if "outline" in pdf.lower():
       continue
    
    # Opening PDF file
    parsed_pdf = parser.from_file(pdf) #sample.pdf
    print("Processing " + pdf)

    # Saving content of PDF
    # To get the text only, use parsed_pdf['text'] - parsed_pdf['content'] returns string
    data = parsed_pdf['content']

    # Linking of content to dbpedia resource
    annotations = spotlight.annotate('https://api.dbpedia-spotlight.org/en/annotate', data, confidence=0.4, support=20)

    # To keep duplicates from being written to the file
    linesSeen = set() # Holds lines already seen

    # Adding the topics
    for elt in annotations:
        try:
            # Writing the topic data in the text file - topicLabel topic_dbpedia_URI PDF_URI COURSE-COMPONENT-#
            url = elt.get("URI")
            name = url.split('/')[-1]
            course = pdf.split('/')[2]
            lecture = pdf.split('/')[3]
            #line = re.sub('[^A-Za-z0-9_-]+', '', elt.get("URI").replace("http://dbpedia.org/resource/", "")) + " " + elt.get("URI") + " " + pdf + " " + pdf.split("/")[2] + "\n"
            line = f"{name} {url} {course} {lecture} \n"
            #print(line)
            if line not in linesSeen and not line == "": # If the line is not a duplicate and it is not empty, add it to the topics file
                courseTopicsTxt.write(line)
                linesSeen.add(line)
        except Exception as e:
            url = elt.get("URI")
            name = url.split('/')[-1]
            line = f"{name} {url} {pdf}"
            print(pdf)
            print(f"Error in {elt}. Skipping")


# Showing where the new file can be found
print("The Course Topics File has been saved as " + courseTopicsTxt.name + " in " + os.getcwd())

# Closing and saving the text file with the data
courseTopicsTxt.close()



2024-04-13 19:43:31,374 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


Processing COURSES/COURSES/COMP6481_PPS/Lecture_1/Other_Material/Tutorial_1.pdf
Processing COURSES/COURSES/COMP6481_PPS/Lecture_1/Slides/Chapter1.pdf
Processing COURSES/COURSES/COMP6481_PPS/Lecture_2/Other_Material/Tutorial_2.pdf
Processing COURSES/COURSES/COMP6481_PPS/Lecture_2/Slides/Chapter2.pdf
Processing COURSES/COURSES/COMP6481_PPS/Lecture_3/Other_Material/Tutorial_3.pdf
Processing COURSES/COURSES/COMP6481_PPS/Lecture_3/Slides/Chapter3.pdf
Processing COURSES/COURSES/COMP6741_IS/Lecture_1/Other_Material/Project_Assignment1.pdf
Processing COURSES/COURSES/COMP6741_IS/Lecture_1/Readings/syllabus.pdf
Processing COURSES/COURSES/COMP6741_IS/Lecture_1/Slides/week1.pdf
Processing COURSES/COURSES/COMP6741_IS/Lecture_2/Readings/Worksheet2.pdf
Processing COURSES/COURSES/COMP6741_IS/Lecture_2/Slides/Chapter_2.pdf
Processing COURSES/COURSES/COMP6741_IS/Lecture_2/Worksheets/Worksheet2_quest.pdf
Processing COURSES/COURSES/COMP6741_IS/Lecture_3/Readings/worksheet2.pdf
Processing COURSES/COURSES/C

In [19]:
def add_topic(kg, topic_name, course_id, lecture_number, topic_link, course_uri):
    """
    Add a topic to the knowledge graph.
    """
    topic_uri = URIRef(topic_name.replace(' ', '_'))
    print(topic_uri)
    
    if "_" in lecture_number:
        lecture_number.replace("_", "")

    lecture_uri = URIRef(course_uri + '_' + lecture_number)
    print(lecture_uri)
    
    # Add triples for the topic
    kg.add((topic_uri, RDF.type, ACAD.Topic))
    kg.add((topic_uri, ACAD.topicName, Literal(topic_name, datatype=XSD.string)))
    kg.add((topic_uri, ACAD.hasTopicLink, Literal(topic_link, datatype=XSD.string)))

    # Connect the topic to a lecture
    kg.add((topic_uri, ACAD.hasProvenanceInformation, lecture_uri))
    
    # Connect the topic to the course
    kg.add((course_uri, ACAD.coversTopic, topic_uri))


### Adding Topic Triples

In [21]:
all_topics = open("courseTopics.txt").readlines()
for topic in all_topics:
    topic = topic.replace("\n", "")
    topic = topic.split()
    label = topic[0]
    uri = topic[1]
    course = topic[2]
    lecture = topic[3]
    # add_topic(kg, "Polymorphism", "COMP6481_PPS", "Lecture2", "https://www.wikidata.org/wiki/Q907364", URIRef(ACADDATA + 'COMP6481_PPS'))
    print((label, course, lecture, uri, URIRef(ACADDATA + course)))
    add_topic(kg, label, course, lecture, uri, URIRef(ACADDATA + course))
    break

    # print(g.serialize(format='turtle').decode('UTF-8')) # For testing
kg.serialize('GraphData2.ttl', format='turtle')

('Inheritance', 'COMP6481_PPS', 'Lecture_1', 'http://dbpedia.org/resource/Inheritance', rdflib.term.URIRef('http://acad.io/data#COMP6481_PPS'))
Inheritance
http://acad.io/data#COMP6481_PPS_Lecture_1


<Graph identifier=https://rdflib.github.io/.well-known/genid/rdflib/Nf691c79527384d6192bdd168443c60de (<class 'rdflib.graph.Graph'>)>