In [1]:
import pandas as pd #for handling csv and csv contents
from rdflib import Graph, Literal, RDFS, RDF, URIRef, Namespace, Dataset #basic RDF handling
from rdflib.namespace import FOAF , XSD #most common namespaces
import urllib.parse #for parsing strings to URI's
from iribaker import to_iri
import math

In [2]:
# Base IRI of the schema; every IRI minted in this notebook is derived from it.
base_link = "http://www.gra.fo/schema/untitled-ekg#"

# Prefix strings: `data` is used for instance resources, `vocab` for schema
# terms (the vocabulary uses the schema base unchanged).
# NOTE(review): base_link already ends in '#', so resource IRIs look like
# '...untitled-ekg#/resource/<name>' -- confirm this is the intended shape.
data = f"{base_link}/resource/"
vocab = base_link

# rdflib namespace helpers built from the two prefixes.
DATA = Namespace(data)
VOCAB = Namespace(vocab)

# The named graph that receives the generated triples is identified by the base IRI.
graph_uri = URIRef(base_link)

dataset = Dataset()
for prefix, namespace in (('g20data', DATA), ('g20vocab', VOCAB)):
    dataset.bind(prefix, namespace)

## load TBOX
# `g` is the named graph used by the following cells; the TBOX file itself is
# parsed into the dataset's default context, not into `g`.
g = dataset.graph(graph_uri)
dataset.default_context.parse('TBOX_withoutLabels.ttl', format='turtle')

<Graph identifier=urn:x-rdflib:default (<class 'rdflib.graph.Graph'>)>

In [36]:
# Read the venue table (conferences and journals) and add every venue, plus
# the research areas it relates to, to the graph `g`.
# The first column is used as the venue name and the last column as a
# ';'-separated list of area names.
data_folder = "data/Processed_data/"
df = pd.read_csv(data_folder+"Conf_Jour_Area.csv")

# Keep journals and conferences in separate frames. Previously the journal
# selection was immediately overwritten, so journals were never processed and
# conferences were typed as both Conference and Journal.
jour_df = df.loc[df['Type']=='Journal']
conf_df = df.loc[df['Type']=='Conference']

# (rows, class in the vocabulary, predicate linking the venue to an area)
venue_specs = (
    (jour_df, "Journal", "journal_related_to"),
    (conf_df, "Conference", "conference_related_to"),
)

for venue_df, venue_class, related_to in venue_specs:
    for _, venue_r in venue_df.iterrows():
        # .iloc makes the positional access explicit; integer keys on a
        # string-labelled Series are deprecated in pandas.
        venue_title = venue_r.iloc[0]
        venue = URIRef(to_iri(data+venue_title))
        venue_name = Literal(venue_title, datatype=XSD['string'])

        g.add((venue, RDFS.label, venue_name))
        g.add((venue, RDF.type, VOCAB[venue_class]))

        # Venue - related_to -> Area
        for area_title in venue_r.iloc[-1].split(";"):
            area_title = area_title.strip()
            if not area_title:
                # A trailing or doubled ';' would otherwise mint a nameless area node.
                continue
            area = URIRef(to_iri(data+area_title))
            # Label with the area's name, not with its URI.
            area_name = Literal(area_title, datatype=XSD['string'])

            g.add((area, RDFS.label, area_name))
            g.add((venue, VOCAB[related_to], area))

In [37]:
# Cite_fake_New.csv is the paper table; the publication year comes from the
# 'Year' column of Redundant_Columns.csv (pandas aligns it on the row index).
df1 = pd.read_csv(f"{data_folder}Redundant_Columns.csv")
df = pd.read_csv(f"{data_folder}Cite_fake_New.csv")
df['Paper Year'] = df1['Year']

# Rename the CSV values so they can be used directly as vocabulary terms.
paper_type_names = {
    'Full Paper': 'FullPaper',
    'Short Paper': 'Shortpaper',
    'Demo Paper': 'Demopaper',
}
conference_type_names = {
    "Regular Conference": "Regularconference",
    "Expert Group": "Expertgroup",
}
df['Paper Type'] = df['Paper Type'].replace(paper_type_names)
df['Conference Type'] = df['Conference Type'].replace(conference_type_names)

In [38]:
def _labelled_node(name):
    """Return the resource URI for `name` after adding its rdfs:label to `g`."""
    node = URIRef(to_iri(data+name))
    g.add((node, RDFS.label, Literal(name, datatype=XSD.string)))
    return node


# Add one paper per row: its label, year and type, its authors and areas, and
# the conference/journal it was submitted to together with the venue's
# proceeding/volume and its two chairs/editors.
for _,row in df.iterrows():
    # Columns are read by position; .iloc makes that explicit (integer keys
    # on a string-labelled Series are deprecated in pandas).
    authors = [a.strip() for a in row.iloc[1].split(", ")]
    title, source_title, proceeding, volume = row.iloc[2:6]
    areas = [a.strip() for a in row.iloc[6].split(";")]
    cj_type = row.iloc[7]
    c_type = row.iloc[8]
    p_type = row.iloc[9]
    is_approved = row.iloc[10] == 'Yes'
    leader1 = row.iloc[11]
    leader2 = row.iloc[12]
    p_year = int(row.iloc[13])

    paper_n = _labelled_node(title)
    year_n = Literal(p_year, datatype=XSD.integer)
    g.add((paper_n, VOCAB['paperyear'], year_n))
    g.add((paper_n, RDF.type, VOCAB[p_type]))

    for author in authors:
        author_n = _labelled_node(author)
        g.add((author_n, VOCAB['writes'], paper_n))

    for area in areas:
        # The area is labelled with its name and the paper points at the area
        # node itself (previously the label held the URI text and the paper
        # was linked to that literal instead of the node).
        area_n = _labelled_node(area)
        g.add((paper_n, VOCAB['paper_related_to'], area_n))

    if cj_type == 'Conference':
        conf_n = URIRef(to_iri(data+source_title))
        g.add((paper_n, VOCAB['submitted_to_conference'], conf_n))
        g.add((conf_n, RDF.type, VOCAB[c_type]))

        # A missing CSV cell is read as a float NaN, hence the type check.
        if not isinstance(proceeding, str):
            raise KeyError("Proceeding not defined for a conference paper")
        proceeding_n = _labelled_node(proceeding)
        g.add((proceeding_n, VOCAB['belongs_to_conference'], conf_n))
        if is_approved:
            g.add((paper_n, VOCAB['published_in_proceeding'], proceeding_n))

        for leader in (leader1, leader2):
            chair_n = _labelled_node(leader)
            g.add((conf_n, VOCAB['handled_by_chair'], chair_n))

    elif cj_type == 'Journal':
        jour_n = URIRef(to_iri(data+source_title))
        g.add((paper_n, VOCAB['submitted_to_journal'], jour_n))

        if not isinstance(volume, str):
            raise KeyError("Volume not defined for a journal paper")
        volume_n = _labelled_node(volume)
        g.add((volume_n, VOCAB['belongs_to_journal'], jour_n))
        if is_approved:
            g.add((paper_n, VOCAB['published_in_volume'], volume_n))

        # Both editors get a label (the first editor's label used to be
        # computed but never added to the graph).
        for leader in (leader1, leader2):
            editor_n = _labelled_node(leader)
            g.add((jour_n, VOCAB['handled_by_editor'], editor_n))

In [39]:
# Rebind `df` to the review table; the paper table loaded earlier is no longer needed.
df = pd.read_csv(f"{data_folder}review_New.csv")

In [41]:
# Add one review decision per row: a decision node carrying the review text
# and the accept flag, linked to the reviewer who submitted it, the paper it
# is about, and the chair/editor who assigned the reviewer.
for _,row in df.iterrows():
    # Columns are read by position; .iloc makes that explicit (integer keys
    # on a string-labelled Series are deprecated in pandas).
    title = row.iloc[6]
    # read_csv may deliver this flag as int 1, float 1.0 or the string '1'
    # depending on the column contents; comparing the raw value to '1' is
    # always False for a numeric column, so normalise to text first.
    decision = str(row.iloc[7]).strip() in ('1', '1.0')
    reviewer = row.iloc[9]
    review_text = row.iloc[10]
    assigned_by = row.iloc[11]
    is_conference = row.iloc[2] == 'Conference'

    paper_n = URIRef(to_iri(data+title))

    reviewer_n = URIRef(to_iri(data+reviewer))
    # Label the reviewer with the reviewer's own name (it used to be the paper title).
    reviewer_name_n = Literal(reviewer, datatype=XSD['string'])
    g.add((reviewer_n, RDFS.label, reviewer_name_n))

    # One decision node per (reviewer, paper) pair, identified by a descriptive name.
    decision_name = f"decision by {reviewer} for {title}"
    decision_n = URIRef(to_iri(data+decision_name))
    decision_literal = Literal(decision_name, datatype=XSD.string)
    g.add((decision_n, RDFS.label, decision_literal))
    review_text_n = Literal(review_text, datatype=XSD.string)
    g.add((decision_n, VOCAB["review_text"], review_text_n))
    is_accepted_n = Literal(decision, datatype=XSD.boolean)
    g.add((decision_n, VOCAB["is_accepted"], is_accepted_n))

    leader_n = URIRef(to_iri(data+assigned_by))

    # Chairs assign reviewers for conferences; any other venue type is
    # handled as a journal, where an editor does the assigning.
    if is_conference:
        g.add((leader_n, VOCAB["chair_assigns"], reviewer_n))
    else:
        g.add((leader_n, VOCAB["editor_assigns"], reviewer_n))

    g.add((reviewer_n, VOCAB["submits"], decision_n))
    g.add((decision_n, VOCAB["about"], paper_n))


In [42]:
# Write the triples collected in the named graph `g` to abox1.ttl as Turtle.
g.serialize(destination='abox1.ttl', format='ttl')

<Graph identifier=http://www.gra.fo/schema/untitled-ekg# (<class 'rdflib.graph.Graph'>)>