# Convert to JSON Format

Converts old-style Turtle files (from previous runs) into a common JSON format.

In [1]:
%pip install rdflib

Looking in indexes: https://pypi.org/simple, https://PyPi-Local:****@pkgs.dev.azure.com/av360/_packaging/PyPi-Local/pypi/simple/
Note: you may need to restart the kernel to use updated packages.


In [3]:
from os.path import isfile, join
from os import listdir
from pathlib import Path
import sys
from rdflib import RDF, SDO, Graph, URIRef
import logging
import json

from rdflib import Literal

# Namespace passed as the `base` argument to URIRef when building the
# predicate and type IRIs used by getSubj and constructDict.
ns = "http://schema.org"
# Input directory: loadGraph resolves file names against it and the driver
# loop lists its contents.
base = "C:/Users/patrick.maue/BE O365 MS-Cloud/BG365 - Bildungsplattform - General/Daten/YoutubeVideos/youtube/videos"
# Output directory: store writes one `video_<id>.json` file per record here.
save = "C:/Users/patrick.maue/Documents/data/json/video"

In [43]:



def loadGraph(fileStr: str) -> Graph:
    """Read one file from the input directory and parse it as N3.

    Args:
        fileStr: File name, resolved against the module-level ``base``
            directory.

    Returns:
        A new ``Graph`` holding the triples parsed from the file content.

    Raises:
        OSError: If the file cannot be opened or read.
        Any exception ``Graph.parse`` raises for content it cannot parse is
        propagated unchanged.
    """
    file: Path = Path(base) / fileStr
    # logging interpolates its arguments with %-style formatting; a "{}"
    # placeholder combined with a positional argument makes the handler
    # report a formatting error instead of emitting the message.
    logging.info("Consuming file '%s'", fileStr)
    # Read through a context manager so the handle is closed deterministically.
    with open(file, encoding="utf-8") as handle:
        read = handle.read()

    g = Graph()
    g.parse(format="n3", data=read)
    return g

def getSubj(g: Graph):
    """Return the first subject whose rdf:type is the ``video`` IRI.

    The type IRI is built from the value ``"video"`` with the module-level
    ``ns`` as base. ``None`` is returned when the graph has no such subject.
    """
    video_type = URIRef(base=ns, value="video")
    matches = iter(g.subjects(RDF.type, video_type))
    return next(matches, None)

def getIdentifier(g: Graph) -> str:
    """Return the fourth colon-separated segment of the video subject.

    The subject found by ``getSubj`` is converted to a string and split on
    ``":"``; an ``IndexError`` is raised when it has fewer than four segments.
    """
    segments = str(getSubj(g)).split(":")
    return segments[3]
    

def store(value: dict, directory=None):
    """Write one record as pretty-printed JSON to ``video_<id>.json``.

    Args:
        value: Record to serialize. Its ``"id"`` entry is concatenated into
            the output file name, so it must be a string.
        directory: Directory to write into. When omitted, the module-level
            ``save`` path is used.

    Raises:
        KeyError: If ``value`` has no ``"id"`` entry.
        TypeError: If the ``"id"`` entry cannot be concatenated with a string.
        OSError: If the output file cannot be written.
    """
    if directory is None:
        directory = save
    identifier = value["id"]
    # Serialize the record that was passed in. Dumping the module-level
    # ``data`` variable instead only works when the caller happens to have
    # bound that exact global to the same record.
    serialized = json.dumps(value, ensure_ascii=False, indent=4)
    target: Path = Path(directory) / ("video_" + identifier + ".json")
    with open(target, "w", encoding="utf-8") as outfile:
        outfile.write(serialized)

def constructDict(g: Graph) -> dict:
    """Build the common JSON-style record for the video described by ``g``.

    The video subject is looked up with ``getSubj``; every field is then read
    from that subject via ``g.value`` using a predicate IRI built from the
    property name and a namespace base.

    Args:
        g: Graph parsed from one video file.

    Returns:
        A dict with the keys ``type``, ``id``, ``publishedAt``, ``title``,
        ``description``, ``tags``, ``categoryId``, ``defaultLanguage``,
        ``licensed``, ``duration``, ``thumbnailUrl``, ``statistics`` and
        ``channel``. Properties missing from the graph are ``None``;
        ``defaultLanguage``, ``licensed`` and the ``channel`` fields are
        always ``None``.
    """
    sub = getSubj(g)

    def prop(name: str, namespace: str = ns):
        # Single value of the named property on the video subject, or None.
        return g.value(sub, URIRef(name, namespace))

    def text(name: str):
        # Plain-string form of a property. A missing value stays None so it
        # is not turned into the literal string "None" by str().
        found = prop(name)
        return None if found is None else str(found)

    tag_predicate = URIRef("https://eagl.azurewebsite.org/schema#Tag")
    tags = [str(obj) for obj in g.objects(sub, tag_predicate)]

    return {
        "type": "video",
        "id": prop("identifier"),
        "publishedAt": prop("publishedOn"),
        "title": text("title"),
        "description": text("description"),
        "tags": tags,
        # NOTE(review): this property is resolved against the https base
        # while all others use the http `ns` - kept as is, confirm it matches
        # the input data.
        "categoryId": prop("CategoryCode", "https://schema.org/"),
        "defaultLanguage": None,
        "licensed": None,
        "duration": prop("duration"),
        "thumbnailUrl": prop("thumbnailUrl"),
        "statistics": {
            "viewCount": prop("interactionCount"),
            "likeCount": prop("upvoteCount"),
            "favoriteCount": prop("ratingCount"),
            "commentCount": prop("commentCount"),
        },
        "channel": {
            "id": None,
            "title": None,
        },
    }

print("Starting")

# Batch conversion: every regular file directly inside `base` is parsed,
# mapped to a record and written out as JSON. For a one-off run, apply the
# same three steps to a single file name such as "7ypjTWq5dU8.n3".
onlyfiles = list(filter(lambda entry: isfile(join(base, entry)), listdir(base)))
for file in onlyfiles:
    g = loadGraph(file)
    data = constructDict(g)
    store(data)






Starting
