In [1]:
!pip install rdflib
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import re
import rdflib
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import random
import pandas as pd
import numpy as np
import json
from transformers import pipeline
import csv
from sklearn.metrics import pairwise_distances

In [3]:
WDT = Namespace('http://www.wikidata.org/prop/direct/')


def dataloader():
    print('############# Start loading data #############')

    ner_pipeline = pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english')

    # Build the graph
    graph = rdflib.Graph()
    graph.parse('data/14_graph.nt', format='turtle')

    # Load embedding dictionaries
    with open('data/entity_ids.del', 'r') as ifile:
        ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
        id2ent = {v: k for k, v in ent2id.items()}
    with open('data/relation_ids.del', 'r') as ifile:
        rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
        id2rel = {v: k for k, v in rel2id.items()}

    triple_df = pd.read_csv('data/14_graph.tsv', sep='\t', names=["entity1", "relation", "entity2"])
    entity_emb = np.load('data/entity_embeds.npy')
    relation_emb = np.load('data/relation_embeds.npy')

    ent2imb = {str(ent): str(imb) for ent, imb in graph.subject_objects(WDT.P345)}

    ent2lbl = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
    lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}

    # Load multimedia dataset
    f = open('data/images.json')
    mediadata = json.load(f)

    # Load crowdsource dataset
    crowd_df = pd.read_csv('data/crowd_data.tsv', sep='\t')

    print('Data loading done.')
    return graph, ent2id, id2ent, rel2id, id2rel, ent2lbl, lbl2ent, triple_df, entity_emb, \
           relation_emb, ent2imb, mediadata, crowd_df, ner_pipeline

In [4]:
def questionprocessor(question, ner_pipeline):
    if question.find('VI -') != -1:
        question = question.replace('-', '–')

    if question.find('ecommend') != -1:
        qtype = 'Recommend'
    elif (question.find('picture') != -1) or (question.find('like') != -1) or (question.find('figure') != -1):
        qtype = 'Multimedia'
    else:
        print('Please choose a question type: 1. KG 2. Embedding')
        tmp = input()
        if tmp == '1':
            qtype = 'KG'
        else:
            qtype = 'Embedding'

    if question.find('of') != -1:
        sub1 = "of "
        sub2 = " ?"
        idx1 = question.find(sub1)
        idx2 = question.find(sub2)
        movie = question[idx1 + len(sub1): idx2]
        return qtype, movie
    else:
        movies = []
        entities = ner_pipeline(question, aggregation_strategy="simple")
        for entity in entities:
            movies.append(entity['word'])

    if question.find('ecommend') != -1:
        return qtype, movies
    else:
        return qtype, movies[0]

In [66]:
def factual(question, graph, movies, ent2lbl, lbl2ent, ans_df):
    WD = Namespace('http://www.wikidata.org/entity/')
    WDT = Namespace('http://www.wikidata.org/prop/direct/')

    if question.find('of') != -1:
        sub1 = " the "
        sub2 = " of "
        idx1 = question.find(sub1)
        idx2 = question.find(sub2)
        relation = question[idx1 + len(sub1): idx2]
        print('The relation is', relation)

    if question.find('direct') != -1:
        relation = 'director'

    query_relURI = '''
        SELECT ?rel WHERE{{
            ?rel rdfs:label "{}"@en.
            }}'''.format(relation) 

    relURIList = list(graph.query(query_relURI))
    for idx, relURI in enumerate(relURIList):
        rel_tmp = relURI[0].n3()
        if WDT in rel_tmp:
            rel = rel_tmp

    mov = lbl2ent[movies].n3()

    ent1 = re.sub('<|>', '', mov)
    ent2 = re.sub('<|>', '', rel)
    crowd_idx1 = 'wd:'+re.findall(r'http://www.wikidata.org/entity/(.*)', ent1)[0]
    crowd_idx2 = 'wdt:'+re.findall(r'http://www.wikidata.org/prop/direct/(.*)', ent2)[0]
    
    if (crowd_idx1 in ans_df['Input1ID'].values) & (crowd_idx2 in ans_df['Input2ID'].values):
        tmp = ans_df.loc[ans_df['Input1ID']==crowd_idx1]
        ans = tmp['Input3ID'].values[0]
        if ans.startswith('wd:'):
            ans = ent2lbl[rdflib.term.URIRef(ent1)]
        print('The answer is',ans+', according to the crowd, who had an inter-rater agreement of:', tmp['Kappa'].values[0])
        print('The answer distribution is:', tmp['Correct'].values[0],'support vote and', (3-tmp['Correct'].values[0]), 'reject vote.')

    else:
        # rel = '<'+rel+'>'

        idxs = triple_df[(triple_df['entity1'] == mov) & (triple_df['relation'] == rel)].index.values

        entity2 = triple_df['entity2'].iloc[idxs[0]]
        entity2 = re.sub('<|>', '', entity2)
        entity2_lbl = ent2lbl[rdflib.term.URIRef(entity2)]

        answers = [
            'A: I think it is ' + entity2_lbl,
            'A: ' + entity2_lbl + ' is the ' + relation + ' of ' + movies
        ]

        print(random.choice(answers))

In [67]:
def embedding(question, graph, movies, ent2id, id2ent, rel2id, id2rel, triple_df, entity_emb, relation_emb):
    WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')

    if question.find('the') != -1:
        sub1 = " the "
        sub2 = " of "
        idx1 = question.find(sub1)
        idx2 = question.find(sub2)
        relation = question[idx1 + len(sub1): idx2]
        print('relation is ', relation)

    if question.find('direct') != -1:
        relation = 'director'

    query_relURI = '''
        SELECT ?rel WHERE{{
            ?rel rdfs:label "{}"@en.
            }}'''.format(relation)

    relURI = []
    relURIList = list(graph.query(query_relURI))
    for idx, relURI in enumerate(relURIList):
        tmp = str(relURI[0])
        if WDT in tmp:
            rel = tmp

    mov = str(lbl2ent[movies].n3())

    rel_id = rel2id[rdflib.term.URIRef(rel)]
    mov_id = ent2id[rdflib.term.URIRef(re.sub('<|>','',mov))]

    rel = '<'+rel+'>'

    topN = 3
    rel_emb = np.atleast_2d(relation_emb[rel_id])
    rel_dist = pairwise_distances(rel_emb, relation_emb)
    relation2 = []
    for idx in rel_dist.argsort().reshape(-1)[:3]:
        relation2.append(str(id2rel[idx].n3()))

    idxs = 0
    idxs = triple_df[(triple_df['entity1'] == mov) & (triple_df['relation'] == rel)].index.values
    length = len(idxs)
    if length == 0:
        idxs = triple_df[(triple_df['entity1'] == mov) & (triple_df['relation'] == relation2[1])].index.values

    entity2 = triple_df['entity2'].iloc[idxs[0]]
    entity2 = re.sub('<|>','',entity2)
    entity2_id = ent2id[rdflib.term.URIRef(entity2)]

    # TransE
    topN = 3
    emb = np.atleast_2d(entity_emb[entity2_id])
    dist = pairwise_distances(emb, entity_emb)
    entity2 = []
    for idx in dist.argsort().reshape(-1)[:3]:
        print(ent2lbl[id2ent[idx]])

In [7]:
def multimedia(graph, movies, mediadata, ner_pipeline):
    WDT = Namespace('http://www.wikidata.org/prop/direct/')
    lbl2ent = {str(lbl): str(ent) for ent, lbl in graph.subject_objects(RDFS.label)}
    ent2imb = {str(ent): str(imb) for ent, imb in graph.subject_objects(WDT.P345)}

    entities = ner_pipeline(question, aggregation_strategy="simple")
    for entity in entities:
        lbl = entity['word']

    ent = lbl2ent[lbl]
    imb = ent2imb[ent]

    if imb[:2] == 'tt':
        for item in mediadata:
            if imb in item['movie']:
                print(item['img'])
                break
    elif imb[:2] == 'nm':
        for item in mediadata:
            if (imb in item['cast']) & (len(item['cast'])==1):
                print(item['img'])
                break
    else:
        print('Not a movie or human.')

In [8]:
def recommend(question, graph, movies, ent2lbl, triple_df):
    mov_list = []

    mov_list = []
    for mov in movies:
        mov_lbl = [str(s) for s,  in graph.query('''
            SELECT ?movie WHERE {
                ?movie rdfs:label '%s'@en .
            }'''%mov)]
        if len(mov_lbl)!= 0:
            mov_list.append(mov_lbl)
    
    for i in range(len(mov_list)):
        for j in range(len(mov_list[i])):
            mov_list[i][j] = '<' + mov_list[i][j] + '>'

    dfs = []
    for i in range(len(mov_list)):
        df = []
        df = triple_df.loc[triple_df['entity1'].isin(mov_list[i])]
        dfs.append(df)

    for i in range(1, len(mov_list)):
        dfs[i] = pd.merge(dfs[i-1], dfs[i], on=["relation", "entity2"])
    
    rel_df = dfs[len(mov_list)-1]
    rel_df = rel_df.drop_duplicates(subset=['relation', 'entity2'])
    entity2 = rel_df['entity2'].values.tolist()

    common = []
    for ent in entity2:
        if '<' in ent:
            ent = re.sub('<|>','',ent)
            lbl = ent2lbl[rdflib.term.URIRef(ent)]
        else:
            lbl = ent
        common.append(lbl)

    entity1 = []
    entity1 = triple_df['entity1'].loc[triple_df['entity2'].isin(entity2)]  
    entity1 = entity1.value_counts()[len(mov_list):len(mov_list)+3].index.tolist()  

    answers = []
    for ent in entity1:
        ent = re.sub('<|>','',ent)
        lbl = ent2lbl[rdflib.term.URIRef(ent)]
        answers.append(lbl)

    print(answers[0]+', '+answers[1]+', '+answers[2])

In [9]:
def crowdsource(crowd_df):

    crowd_df.drop(['Title','Reward','AssignmentId','AssignmentStatus'], inplace=True, axis=1)
    crowd_df['LifetimeApprovalRate'] = crowd_df['LifetimeApprovalRate'].str.rstrip('%').astype('float') / 100.0
    crowd_df = crowd_df.loc[(crowd_df['WorkTimeInSeconds'] >= 50) & (crowd_df['LifetimeApprovalRate'] >= 0.7)]
    crowd_df.drop(['WorkerId','WorkTimeInSeconds','LifetimeApprovalRate'], inplace=True, axis=1)
    ans_df = crowd_df.groupby(['HITId']).first()

    rate = []
    ans_df['Correct'] = None

    for i in range(1, len(ans_df)+1):
        # Get the specific group
        df = crowd_df.loc[crowd_df['HITId']== i]

        corr_count = int(df['AnswerID'][df['AnswerID']==1].count())
        incorr_count = int(df['AnswerID'][df['AnswerID']==2].count())
        ans_df['Correct'][i] = corr_count


        rate.append([corr_count, incorr_count])
        if (corr_count < incorr_count):
            ans_df['AnswerLabel'][i] = 'INCORRECT'

            fixValueLoc = df['FixValue'].first_valid_index()
            fixPositionLoc = df['FixPosition'].first_valid_index()

            if fixValueLoc is not None:

                fixPosition = crowd_df['FixPosition'][fixPositionLoc]
                fixValue = crowd_df['FixValue'][fixValueLoc]

                if fixPosition == 'Subject':
                    if fixValue.startswith('Q'):
                        ans_df['Input1ID'][i] = 'wd:'+fixValue
                    else:
                        ans_df['Input1ID'][i] = fixValue
                elif fixPosition == 'Predicate':
                    if fixValue.startswith('P'):
                        ans_df['Input2ID'][i] = 'wdt:'+fixValue
                    else:
                        ans_df['Input2ID'][i] = fixValue
                else:
                    if fixValue.startswith('Q'):
                        ans_df['Input3ID'][i] = 'wd:'+fixValue
                    else:
                        ans_df['Input3ID'][i] = fixValue
        
        else:
            ans_df['AnswerLabel'][i] = 'CORRECT'

    def checkInput(rate, n):
        """ 
        Check correctness of the input matrix
        @param rate - ratings matrix
        @return n - number of raters
        @throws AssertionError 
        """
        N = len(rate)
        k = len(rate[0])
        assert all(len(rate[i]) == k for i in range(k)), "Row length != #categories)"
        assert all(isinstance(rate[i][j], int) for i in range(N) for j in range(k)), "Element not integer" 
        assert all(sum(row) == n for row in rate), "Sum of ratings != #raters)"

    def fleissKappa(rate,n):
        """ 
        Computes the Kappa value
        @param rate - ratings matrix containing number of ratings for each subject per category 
        [size - N X k where N = #subjects and k = #categories]
        @param n - number of raters   
        @return fleiss' kappa
        """

        N = len(rate)
        k = len(rate[0])
        print("#raters = ", n, ", #subjects = ", N, ", #categories = ", k)
        checkInput(rate, n)

        #mean of the extent to which raters agree for the ith subject 
        PA = sum([(sum([i**2 for i in row])- n) / (n * (n - 1)) for row in rate])/N
        print("PA = ", PA)
        
        # mean of squares of proportion of all assignments which were to jth category
        PE = sum([j**2 for j in [sum([rows[i] for rows in rate])/(N*n) for i in range(k)]])
        print("PE =", PE)
        
        kappa = -float("inf")
        try:
            kappa = (PA - PE) / (1 - PE)
            kappa = float("{:.3f}".format(kappa))
        except ZeroDivisionError:
            print("Expected agreement = 1")

        print("Fleiss' Kappa =", kappa)
        
        return kappa

    len1 = len(ans_df[ans_df['HITTypeId']=='7QT'])
    len2 = len(ans_df[ans_df['HITTypeId']=='8QT'])
    len3 = len(ans_df[ans_df['HITTypeId']=='9QT'])

    rate1 = rate[:len1]
    rate2 = rate[len1:len1+len2]
    rate3 = rate[len1+len2:]

    kappa1 = fleissKappa(rate1, 3)
    kappa2 = fleissKappa(rate2, 3)
    kappa3 = fleissKappa(rate3, 3)

    ans_df['Kappa'] = None
    ans_df['Kappa'][:len1] = kappa1
    ans_df['Kappa'][len1:len1+len2] = kappa2
    ans_df['Kappa'][len1+len2:] = kappa3

    return ans_df

In [10]:
graph, ent2id, id2ent, rel2id, id2rel, ent2lbl, lbl2ent, triple_df, \
entity_emb, relation_emb, ent2imb, mediadata, crowd_df, ner_pipeline = dataloader()
ans_df = crowdsource(crowd_df)

############# Start loading data #############
Data loading done.


In [71]:
question = input('Q: ')
qtype, movies = questionprocessor(question, ner_pipeline)

if qtype == 'KG':
    factual(question, graph, movies, ent2lbl, lbl2ent, ans_df)
elif qtype == 'Embedding':
    embedding(question, graph, movies, ent2id, id2ent, rel2id, id2rel, triple_df, entity_emb, relation_emb)
elif qtype == 'Multimedia':
    multimedia(graph, movies, mediadata, ner_pipeline)
elif qtype == 'Recommend':
    recommend(question, graph, movies, ent2lbl, triple_df)

Q: Who is the executive producer of X-Men: First Class?
Please choose a question type: 1. KG 2. Embedding
1
The relation is executive producer
The answer is X-Men: First Class, according to the crowd, who had an inter-rater agreement of: 0.263
The answer distribution is: 2 support vote and 1 reject vote.
