# Create Turtle files

Reads the caption JSON files and converts each one into a Turtle (TTL) file (Linked Data Fragments). All TTL files are written back to disk.

In [1]:
%pip install rdflib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


# Globals and imports

In [13]:
from os.path import isfile, join
from os import listdir
from pathlib import Path
import sys
from rdflib import RDF, SDO, Graph, URIRef, BNode
import logging
import json

# Schema.org namespace root. Not referenced by any cell visible in this notebook.
ns = "http://schema.org"
# Input directory: the main script loads every regular file directly inside it as JSON.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
base = "D:\\Scraped\\youtube\\json\\captions"
# Output directory: store() writes one "captions_<id>.ttl" file here per input file.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
save = "D:\\Scraped\\youtube\\rdf\\captions"

# Class Definitions

In [3]:
from __future__ import annotations

from rdflib import Graph, URIRef
from rdflib.term import Node, URIRef, Literal
from rdflib.namespace import FOAF, XSD, URIRef, RDF, DCTERMS, ORG


class GraphBuilder:
    """Fluent helper that assembles an rdflib ``Graph`` around a "current" subject.

    Call ``current_node`` first to choose the subject; ``add_literal``,
    ``add_literal_with_lang``, ``add_datetime``, ``add_type`` and ``link_node``
    then attach triples to that subject. Every mutating method returns the
    builder itself so calls can be chained; ``build`` returns the graph.
    """

    # Subject that the add_* / link_node methods attach triples to. It is only
    # assigned by current_node(), so calling those methods before it raises
    # AttributeError. Annotated as Node (not URIRef) because blank nodes are
    # valid subjects too.
    node: Node

    def __init__(self):
        """Start with an empty graph and no current subject."""
        self.graph = Graph()

    def current_node(self, subj: Node, node_type: Node) -> GraphBuilder:
        """Make ``subj`` the current subject and assert its ``rdf:type``.

        Args:
            subj: Subject node (URI reference or blank node) for later triples.
            node_type: Class to assert via ``rdf:type``.

        Returns:
            This builder, for chaining.
        """
        self.node = subj
        return self.add_type(node_type)

    def add_literal(self, predicate: Node, value: str) -> GraphBuilder:
        """Attach a plain literal to the current subject.

        Falsy values (``None``, ``""``) are skipped so optional input fields
        do not produce empty triples.

        Returns:
            This builder, for chaining.
        """
        if value:
            self.graph.add((self.node, predicate, Literal(value)))
        return self

    def add_literal_with_lang(self, predicate: Node, value: str, language: str) -> GraphBuilder:
        """Attach a language-tagged literal to the current subject.

        Falsy values are skipped, as in ``add_literal``.

        Args:
            predicate: Property to use.
            value: Literal text.
            language: Language tag passed to ``Literal(lang=...)``, e.g. ``"de"``.

        Returns:
            This builder, for chaining.
        """
        if value:
            self.graph.add((self.node, predicate, Literal(value, lang=language)))
        return self

    def link_node(self, predicate: Node, node_id: str, node_type: Node) -> GraphBuilder:
        """Link the current subject to another resource and type that resource.

        Adds ``<node_id> rdf:type node_type`` and
        ``<current> predicate <node_id>``. Nothing is added when ``node_id``
        is falsy.

        Returns:
            This builder, for chaining.
        """
        if node_id:
            linked_node = URIRef(node_id)
            self.graph.add((linked_node, RDF.type, node_type))
            self.graph.add((self.node, predicate, linked_node))
        return self

    def link(self, predicate: Node, source: URIRef, target: URIRef) -> GraphBuilder:
        """Add the triple ``source predicate target``, independent of the current subject.

        Returns:
            This builder, for chaining.
        """
        self.graph.add((source, predicate, target))
        return self

    def add_type(self, node_type: URIRef) -> GraphBuilder:
        """Assert an ``rdf:type`` on the current subject.

        Returns:
            This builder, for chaining.
        """
        self.graph.add((self.node, RDF.type, node_type))
        return self

    def add_datetime(self, predicate: Node, value: str) -> GraphBuilder:
        """Attach a date or date-time literal to the current subject.

        Date-only strings are typed ``xsd:date`` (unchanged behaviour).
        Strings that carry a time part (contain ``"T"``, as in
        ``2023-01-31T12:00:00Z``) are typed ``xsd:dateTime``, because such a
        lexical form is not a valid ``xsd:date``. Falsy values are skipped.

        Returns:
            This builder, for chaining.
        """
        if value:
            has_time_part = isinstance(value, str) and "T" in value
            datatype = XSD.dateTime if has_time_part else XSD.date
            self.graph.add((self.node, predicate, Literal(value, datatype=datatype)))
        return self

    def build(self):
        """Return the accumulated ``Graph`` (the live object, not a copy)."""
        return self.graph


# Functions

In [35]:
from urllib import parse


def construct_captions_graph(data: dict) -> str:
    """Render one caption record as a Turtle document.

    A blank node typed ``schema:VideoObject`` receives the record's
    ``video_id`` as ``schema:identifier`` and the ``de`` / ``en`` entries as
    language-tagged ``schema:caption`` literals. Missing or empty entries are
    skipped by the builder.

    Args:
        data: Parsed caption JSON; the keys read are ``video_id``, ``de``
            and ``en``.

    Returns:
        The graph serialized with ``format="ttl"``.
    """
    video = BNode()
    builder = (
        GraphBuilder()
        .current_node(video, SDO.VideoObject)
        .add_literal(SDO.identifier, data.get('video_id'))
        .add_literal_with_lang(SDO.caption, data.get("de"), "de")
        .add_literal_with_lang(SDO.caption, data.get("en"), "en")
    )
    return builder.build().serialize(format="ttl")

def store(value: str, id: str):
    """Write ``value`` to ``captions_<id>.ttl`` inside the ``save`` directory.

    The output directory is created first if it does not exist, so the first
    run on a fresh machine no longer fails with ``FileNotFoundError``. An
    existing file with the same name is overwritten.

    Args:
        value: Text to write (UTF-8 encoded).
        id: Identifier embedded in the file name.
    """
    target_dir = Path(save)
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / ("captions_" + id + ".ttl")
    target.write_text(value, encoding="utf-8")


# Main Script

In [36]:
# Convert every regular file directly inside `base` to a Turtle file in `save`.
onlyfiles = [f for f in listdir(base) if isfile(join(base, f))]
for filename in onlyfiles:
    file: Path = Path(base) / filename

    # The context manager closes the handle after parsing; previously one
    # open file handle was leaked per input file.
    with file.open(mode='r', encoding="utf-8") as content:
        data: dict = json.load(content)

    serialized = construct_captions_graph(data)

    # Named `video_id` rather than `id` so the builtin `id()` is not shadowed
    # for the rest of the kernel session.
    video_id = data["video_id"]
    store(serialized, video_id)