In [198]:
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS, XSD, FOAF, BNode

In [199]:
# schema.org vocabulary: source of the standard classes and properties reused below.
SCHEMA = Namespace('http://schema.org/')
# Project-local namespace under which all custom classes, properties and
# instance nodes in this notebook are minted.
MYNS = Namespace('http://dsci558.org/rahulKhannaFakeNamespace/')

In [200]:
# Single in-memory graph; both the schema triples and the per-movie triples
# added in later cells accumulate here.
g = Graph()

In [201]:
# Register a readable prefix for every vocabulary used in this notebook.
prefix_bindings = (
    ("rdf", RDF),
    ("rdfs", RDFS),
    ("xsd", XSD),
    ("foaf", FOAF),
    ("schema", SCHEMA),
    ("myns", MYNS),
)
for prefix_label, prefix_namespace in prefix_bindings:
    g.bind(prefix_label, prefix_namespace)

In [151]:
import json
import ujson

In [152]:
# Read the IMDb<->AFI linkage file; the loop further down reads the
# "imdb_movie" and "afi_movie" keys of each element.
with open("Rahul_Khanna_hw03_imdb_afi_el.json") as linkage_file:
    joined_data = json.load(linkage_file)

In [153]:
def deserialize_jl(input_file):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        input_file: path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.
    """
    with open(input_file, "r") as f:
        # Filter on strip() rather than comparing against "\n": a line that
        # contains only spaces or tabs is also blank and would otherwise be
        # handed to the parser, which raises on it.
        return [ujson.loads(line) for line in f if line.strip()]

In [154]:
# Parse every record of the IMDb JSON Lines dump into a list of objects.
imdb_data = deserialize_jl("imdb.jl")

In [155]:
# Parse every record of the AFI JSON Lines dump into a list of objects.
afi_data = deserialize_jl("afi.jl")

In [156]:
# Index the AFI records by their "url" field for direct lookup; when a URL
# repeats, the later record replaces the earlier one.
key_afi_data = {afi_record["url"]: afi_record for afi_record in afi_data}
# The list itself is no longer needed once the index exists.
del afi_data

In [157]:
# Index the IMDb records by their "url" field for direct lookup; when a URL
# repeats, the later record replaces the earlier one.
key_imdb_data = {imdb_record["url"]: imdb_record for imdb_record in imdb_data}
# The list itself is no longer needed once the index exists.
del imdb_data

In [202]:
# Declare the vocabulary for the movie graph: two helper classes, the custom
# Movie properties, and the Movie class itself.
#
# The Movie class description is generated from the same property tables as
# the property declarations, so the names on the class always match the
# properties that are declared here (camelCase, e.g. myns:yearPublished).

# -- Helper classes ----------------------------------------------------------
node_uri = URIRef(MYNS['ProductionCompany'])
g.add((node_uri, RDF.type, SCHEMA.Class))
g.add((node_uri, RDFS.subClassOf, SCHEMA.Organization))
g.add((node_uri, SCHEMA.name, XSD.string))

node_uri = URIRef(MYNS['Genre'])
g.add((node_uri, RDF.type, SCHEMA.Class))
g.add((node_uri, SCHEMA.name, XSD.string))

# -- Custom Movie properties: (property, range) ------------------------------
movie_custom_properties = (
    (MYNS['imdbAggRating'], SCHEMA.AggregateRating),
    (MYNS['imdbMetascore'], XSD.int),
    (MYNS['grossIncome'], XSD.float),
    (MYNS['cinematographer'], SCHEMA.Person),
    # The year is stored as a Python int literal, which rdflib types as
    # xsd:integer, so the range is declared as an integer, not a string.
    (MYNS['yearPublished'], XSD.integer),
    (MYNS['movieGenre'], MYNS.Genre),
)
for property_uri, range_uri in movie_custom_properties:
    g.add((property_uri, RDF.type, RDF.Property))
    g.add((property_uri, RDFS.domain, MYNS.Movie))
    g.add((property_uri, RDFS.range, range_uri))

# -- schema.org properties reused on Movie: (property, expected value type) --
movie_schema_org_properties = (
    (SCHEMA.productionCompany, MYNS.ProductionCompany),
    # schema.org spells this type "Date" (capitalised).
    (SCHEMA.datePublished, SCHEMA.Date),
    (SCHEMA.name, XSD.string),
    (SCHEMA.contentRating, XSD.string),
    (SCHEMA.duration, XSD.duration),
    (SCHEMA.producer, SCHEMA.Person),
    (SCHEMA.author, SCHEMA.Person),
)

# -- Movie class -------------------------------------------------------------
node_uri = URIRef(MYNS['Movie'])
g.add((node_uri, RDF.type, SCHEMA.Class))
g.add((node_uri, RDFS.subClassOf, SCHEMA.Movie))
for property_uri, range_uri in movie_schema_org_properties + movie_custom_properties:
    g.add((node_uri, property_uri, range_uri))

In [180]:
from dateutil import parser
import datetime

In [160]:
# Scratch dump: writes whatever triples g holds when this cell runs to
# 'blah.ttl' in Turtle syntax.
g.serialize('blah.ttl', format="turtle")

In [161]:
# g = Graph()

In [203]:
def _slugify(text):
    """Replace spaces with hyphens so `text` can be used as a URI path segment."""
    return text.replace(" ", "-")


def _get_or_create_named_node(cache, path_prefix, label, rdf_type):
    """Return the node for `label`, minting and describing it on first sight.

    Args:
        cache: dict mapping label -> URIRef for nodes already added to ``g``;
            updated in place when a new node is minted.
        path_prefix: path segment placed after the MYNS base, e.g. "producer".
        label: display name; stored as schema:name and slugified into the URI.
        rdf_type: class URI asserted via rdf:type on a newly minted node.

    Returns:
        The URIRef identifying the node for `label`.
    """
    node = cache.get(label)
    if node is None:
        node = URIRef(MYNS["{}/{}".format(path_prefix, _slugify(label))])
        g.add((node, RDF.type, rdf_type))
        g.add((node, SCHEMA.name, Literal(label)))
        cache[label] = node
    return node


def _runtime_to_iso_duration(runtime):
    """Convert a runtime string into an ISO 8601 duration, e.g. "142 min" -> "PT2H22M".

    The last four characters are dropped and the remainder parsed as whole
    minutes (presumably the suffix is " min" -- TODO confirm against imdb.jl).
    The "T" designator is required: without it "M" denotes months, not minutes.
    """
    total_minutes = int(runtime[:len(runtime) - 4])
    hours, minutes = divmod(total_minutes, 60)
    if hours:
        return "PT{}H{}M".format(hours, minutes)
    return "PT{}M".format(minutes)


# One cache per node family so each distinct name is minted exactly once.
created_producers = {}
created_writers = {}
creatred_cina = {}  # (sic) spelling kept in case another cell refers to this name
created_prod_company = {}
created_genre = {}

# Date stamped into aggregate-rating URIs; computed once so every rating node
# produced by this run carries the same date.
rating_date = str(datetime.datetime.now().date())

# AFI fields that link a movie to a named node:
# (AFI key, cache, URI path prefix, node class, movie -> node predicate)
afi_linked_fields = (
    ("production_company", created_prod_company, "production-company",
     MYNS.ProductionCompany, SCHEMA.productionCompany),
    ("producer", created_producers, "producer", SCHEMA.Person, SCHEMA.producer),
    ("writer", created_writers, "writer", SCHEMA.Person, SCHEMA.author),
    ("cinematographer", creatred_cina, "cinematographer",
     SCHEMA.Person, MYNS.cinematographer),
)

for entry in joined_data:
    imdb_url = entry["imdb_movie"]
    afi_url = entry["afi_movie"]
    # The id is taken as the second-to-last "/"-separated segment
    # (assumes URLs of the form ".../<id>/" -- TODO confirm).
    imdb_id = imdb_url.split("/")[-2]
    movie = URIRef(MYNS["movie/{}".format(_slugify(imdb_id))])
    g.add((movie, RDF.type, MYNS.Movie))

    # AFI-sourced facts exist only for movies that were linked to an AFI page.
    if afi_url:
        afi_data = key_afi_data[afi_url]

        # Production company, producer, writer, cinematographer
        for afi_key, cache, path_prefix, rdf_type, predicate in afi_linked_fields:
            if afi_key in afi_data:
                linked_node = _get_or_create_named_node(
                    cache, path_prefix, afi_data[afi_key], rdf_type
                )
                g.add((movie, predicate, linked_node))

        # Release date; strings of four characters or fewer are skipped
        # (presumably bare years, which are covered by the IMDb "year" field).
        if "release_date" in afi_data:
            date_string = afi_data["release_date"]
            if len(date_string) > 4:
                date_obj = parser.parse(date_string)
                g.add((movie, SCHEMA.datePublished, Literal(date_obj)))

    imdb_data = key_imdb_data[imdb_url]

    # Release year
    if "year" in imdb_data:
        year = int(imdb_data["year"])
        g.add((movie, MYNS.yearPublished, Literal(year)))

    # Genres: a ", "-separated list, lower-cased so spelling variants share a node
    if "genre" in imdb_data:
        for genre_name in imdb_data["genre"].lower().split(", "):
            genre = _get_or_create_named_node(
                created_genre, "genre", genre_name, MYNS.Genre
            )
            g.add((movie, MYNS.movieGenre, genre))

    # Title
    if "name" in imdb_data:
        g.add((movie, SCHEMA.name, Literal(imdb_data["name"])))

    # "certificate" -> content rating
    if "certificate" in imdb_data:
        g.add((movie, SCHEMA.contentRating, Literal(imdb_data["certificate"])))

    # Runtime, emitted as a typed xsd:duration literal
    if "runtime" in imdb_data:
        duration = _runtime_to_iso_duration(imdb_data["runtime"])
        g.add((movie, SCHEMA.duration, Literal(duration, datatype=XSD.duration)))

    # IMDb aggregate rating
    if "votes" in imdb_data and "rating" in imdb_data:
        movie_votes_int = int(imdb_data["votes"].replace(",", ""))
        rating = float(imdb_data["rating"])

        # Keyed by IMDb id rather than title: the title is only available when
        # the record has a "name" field, and two films sharing a title would
        # otherwise collapse onto a single rating node.
        agg_rating = URIRef(
            MYNS["aggRating/{}-{}".format(_slugify(imdb_id), rating_date)]
        )
        g.add((agg_rating, RDF.type, SCHEMA.AggregateRating))
        g.add((agg_rating, SCHEMA.ratingCount, Literal(movie_votes_int)))
        g.add((agg_rating, SCHEMA.ratingValue, Literal(rating)))
        g.add((agg_rating, SCHEMA.itemReviewed, movie))

        g.add((movie, MYNS.imdbAggRating, agg_rating))

    # IMDb metascore
    if "metascore" in imdb_data:
        imdb_metascore = int(imdb_data["metascore"])
        g.add((movie, MYNS.imdbMetascore, Literal(imdb_metascore)))

    # Gross income: the first and last characters are dropped before parsing
    # (presumably a currency sign and a magnitude suffix such as "$...M", in
    # which case the stored number is in those units -- TODO confirm).
    if "gross" in imdb_data:
        gross_income_string = imdb_data["gross"]
        gross_income = float(gross_income_string[1:-1])
        g.add((movie, MYNS.grossIncome, Literal(gross_income)))

In [204]:
# Write the graph (schema plus per-movie triples added above) to the
# deliverable file in Turtle syntax.
g.serialize('Rahul_Khanna_hw03_movie_triples.ttl', format="turtle")