In [1]:
!pip install rdflib
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdflib
  Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)
[K     |████████████████████████████████| 500 kB 8.2 MB/s 
Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 244 kB/s 
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 43.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.w

In [2]:
import re
import rdflib
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import random
import pandas as pd
import numpy as np
import json
from transformers import pipeline
import csv
from sklearn.metrics import pairwise_distances

In [3]:
# Wikidata "direct property" namespace; dataloader() uses WDT.P345 to build ent2imb.
WDT = Namespace('http://www.wikidata.org/prop/direct/')


def dataloader():
    """Load every resource the chat agent needs from the 'drive/MyDrive' folder.

    Returns a 14-tuple, in this order:
        graph        - rdflib.Graph parsed from '14_graph.nt'
        ent2id       - {URIRef entity: int embedding row}
        id2ent       - inverse of ent2id
        rel2id       - {URIRef relation: int embedding row}
        id2rel       - inverse of rel2id
        ent2lbl      - {entity: str label} taken from rdfs:label triples
        lbl2ent      - inverse of ent2lbl (a later entity overwrites an earlier
                       one when two entities share a label)
        triple_df    - DataFrame with columns entity1 / relation / entity2
        entity_emb   - array loaded from 'entity_embeds.npy'
        relation_emb - array loaded from 'relation_embeds.npy'
        ent2imb      - {str entity: str object of WDT.P345}
        mediadata    - parsed content of 'images.json'
        crowd_df     - DataFrame read from 'crowd_data.tsv'
        ner_pipeline - transformers 'ner' pipeline

    All paths are relative to the current working directory.
    """
    print('############# Start loading data #############')

    ner_pipeline = pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english')

    # Build the graph
    graph = rdflib.Graph()
    graph.parse('drive/MyDrive/14_graph.nt', format='turtle')

    # Load embedding dictionaries; each row of the .del files is "<index>\t<URI>"
    with open('drive/MyDrive/entity_ids.del', 'r') as ifile:
        ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    id2ent = {v: k for k, v in ent2id.items()}
    with open('drive/MyDrive/relation_ids.del', 'r') as ifile:
        rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2rel = {v: k for k, v in rel2id.items()}

    triple_df = pd.read_csv('drive/MyDrive/14_graph.tsv', sep='\t', names=["entity1", "relation", "entity2"])
    entity_emb = np.load('drive/MyDrive/entity_embeds.npy')
    relation_emb = np.load('drive/MyDrive/relation_embeds.npy')

    ent2imb = {str(ent): str(imb) for ent, imb in graph.subject_objects(WDT.P345)}

    ent2lbl = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
    lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}

    # Load multimedia dataset (context manager so the file handle is closed)
    with open('drive/MyDrive/images.json') as ifile:
        mediadata = json.load(ifile)

    # Load crowdsource dataset
    crowd_df = pd.read_csv('drive/MyDrive/crowd_data.tsv', sep='\t')

    print('Data loading done.')
    return graph, ent2id, id2ent, rel2id, id2rel, ent2lbl, lbl2ent, triple_df, entity_emb, \
           relation_emb, ent2imb, mediadata, crowd_df, ner_pipeline

In [4]:
def questionprocessor(question, ner_pipeline):
    """Classify a question and extract the entity name(s) it is about.

    Parameters
    ----------
    question : str
        Raw user message.
    ner_pipeline : callable
        Called as ``ner_pipeline(question, aggregation_strategy="simple")``;
        must return an iterable of dicts with a ``'word'`` key. It is only
        invoked when the name cannot be read from an "... of <name>" phrase.

    Returns
    -------
    (qtype, movies)
        ``qtype`` is 'Recommend', 'Multimedia' or 'KG & Embedding'.
        For 'Recommend' ``movies`` is always a list of names (possibly empty);
        otherwise it is a single name, or '' when none could be found.
    """
    # Titles containing "VI -" are rewritten with an en dash in place of the hyphen.
    if question.find('VI -') != -1:
        question = question.replace('-', '–')

    if 'ecommend' in question:
        qtype = 'Recommend'
    elif ('picture' in question) or ('like' in question) or ('figure' in question):
        qtype = 'Multimedia'
    else:
        qtype = 'KG & Embedding'

    # "<something> of <name>?" -> take everything after the first " of ".
    # Matching the whole word avoids false hits such as "box office", and
    # recommendation questions are excluded because they must yield a list.
    marker = ' of '
    start = question.find(marker)
    if qtype != 'Recommend' and start != -1:
        movie = question[start + len(marker):].strip()
        # Drop a single sentence terminator only, so a title that itself ends
        # in punctuation ("...of Mamma Mia!?") keeps its own character.
        if movie and movie[-1] in '?.!':
            movie = movie[:-1].rstrip()
        return qtype, movie

    movies = [entity['word'] for entity in ner_pipeline(question, aggregation_strategy="simple")]

    if qtype == 'Recommend':
        return qtype, movies
    # No entity recognised: return an empty name instead of raising IndexError.
    return qtype, (movies[0] if movies else '')

In [5]:
def factual(question, graph, movies, ent2lbl, lbl2ent, ans_df):
    """Answer a factual question from the crowd data or, failing that, the KG triples.

    Parameters
    ----------
    question : str
        User question; the relation label is cut out of it
        ("... the <relation> of ..." / "... the <relation> by ...").
    graph : rdflib.Graph
        Queried to map the relation label to a property URI.
    movies : str
        Label of the subject entity (key into ``lbl2ent``).
    ent2lbl, lbl2ent : dict
        Entity URIRef -> label and label -> entity URIRef.
    ans_df : pandas.DataFrame
        Aggregated crowd answers; the columns Input1ID, Input2ID, Input3ID,
        Kappa and Correct are read.

    Returns
    -------
    str
        The answer sentence, or 'KG: No answer.\\n' if any step fails.

    Notes
    -----
    The KG fallback reads the module-level ``triple_df`` (it is not a parameter).
    """
    try:
        WD = Namespace('http://www.wikidata.org/entity/')
        WDT = Namespace('http://www.wikidata.org/prop/direct/')

        if question.find('of') != -1:
            sub1 = " the "
            sub2 = " of "
            idx1 = question.find(sub1)
            idx2 = question.find(sub2)
            relation = question[idx1 + len(sub1): idx2]
            print('The relation is', relation)

        if question.find('by') != -1:
            sub1 = " the "
            sub2 = " by "
            idx1 = question.find(sub1)
            idx2 = question.find(sub2)
            relation = question[idx1 + len(sub1): idx2]
            print('The relation is', relation)

        if question.find('direct') != -1:
            relation = 'director'

        # Bind the label as a query variable instead of pasting user text into
        # the SPARQL string (a quote in the question can no longer break it).
        rel_rows = graph.query(
            'SELECT ?rel WHERE { ?rel rdfs:label ?lbl . }',
            initBindings={'lbl': Literal(relation, lang='en')})
        for row in rel_rows:
            rel_tmp = row[0].n3()
            # Namespace is a str subclass, so this is a substring test: keep
            # only candidates in the direct-property namespace.
            if WDT in rel_tmp:
                rel = rel_tmp

        mov = lbl2ent[movies].n3()

        ent1 = re.sub('<|>', '', mov)
        ent2 = re.sub('<|>', '', rel)
        crowd_idx1 = 'wd:'+re.findall(r'http://www.wikidata.org/entity/(.*)', ent1)[0]
        crowd_idx2 = 'wdt:'+re.findall(r'http://www.wikidata.org/prop/direct/(.*)', ent2)[0]

        # Subject and predicate must match in the SAME crowd row; testing each
        # column on its own could select a row about a different predicate.
        tmp = ans_df.loc[(ans_df['Input1ID'] == crowd_idx1) & (ans_df['Input2ID'] == crowd_idx2)]

        if len(tmp) > 0:
            ans = str(tmp['Input3ID'].values[0])
            if ans.startswith('wd:'):
                # Show the label of the ANSWER entity (not of the subject);
                # keep the raw id when the entity has no label.
                ans = ent2lbl.get(WD[ans[len('wd:'):]], ans)
            message = 'The answer is '+ans+', according to the crowd, who had an inter-rater agreement of: '\
            +str(tmp['Kappa'].values[0])+ \
            '.\nThe answer distribution is: '+str(tmp['Correct'].values[0])+\
            ' support vote and '+str(3-tmp['Correct'].values[0])+' reject vote.'

        else:
            idxs = triple_df[(triple_df['entity1'] == mov) & (triple_df['relation'] == rel)].index.values

            entity2 = triple_df['entity2'].iloc[idxs[0]]
            entity2 = re.sub('<|>', '', entity2)
            entity2_lbl = ent2lbl[rdflib.term.URIRef(entity2)]

            answers = [
                'KG: I think it is ' + entity2_lbl+'\n',
                'KG: ' + entity2_lbl + ' is the ' + relation + ' of ' + movies+'\n'
            ]

            message = random.choice(answers)
    except Exception:
        # Best effort: any lookup failure (unknown label, relation, triple ...)
        # becomes a "no answer" reply; KeyboardInterrupt is no longer swallowed.
        message = 'KG: No answer.\n'
    return message

In [6]:
def embedding(question, graph, movies, ent2id, id2ent, rel2id, id2rel, triple_df, entity_emb, relation_emb):
    """Suggest answers that lie close to the KG answer in entity-embedding space.

    The relation label is cut out of the question ("... the <relation> of ..."),
    resolved to a property URI through ``graph``, and the tail entity of the
    (movie, relation) triple is looked up in ``triple_df``. If there is no such
    triple, the second-closest relation in ``relation_emb`` is tried instead.
    The labels of the ``topN`` entities nearest to that tail entity are returned.

    Parameters
    ----------
    question : str
    graph : rdflib.Graph
    movies : str
        Label of the subject entity.
    ent2id, id2ent, rel2id, id2rel : dict
        Mappings between URIRefs and embedding row indices.
    triple_df : pandas.DataFrame
        Columns entity1 / relation / entity2 (n3-style '<uri>' strings).
    entity_emb, relation_emb : array-like
        Embedding matrices indexed by the ids above.

    Returns
    -------
    str
        The suggestion sentence, or '\\n----------Embedding: No answer.' if any
        step fails.

    Notes
    -----
    Reads the module-level ``lbl2ent`` and ``ent2lbl`` (they are not parameters).
    """
    try:
        WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')
        topN = 3

        if question.find('the') != -1:
            sub1 = " the "
            sub2 = " of "
            idx1 = question.find(sub1)
            idx2 = question.find(sub2)
            relation = question[idx1 + len(sub1): idx2]
            print('relation is ', relation)

        if question.find('direct') != -1:
            relation = 'director'

        # Bind the label as a variable rather than formatting user text into SPARQL.
        rel_rows = graph.query(
            'SELECT ?rel WHERE { ?rel rdfs:label ?lbl . }',
            initBindings={'lbl': Literal(relation, lang='en')})
        for row in rel_rows:
            tmp = str(row[0])
            if WDT in tmp:  # substring test: direct-property namespace only
                rel = tmp

        mov = lbl2ent[movies].n3()

        rel_id = rel2id[rdflib.term.URIRef(rel)]
        # Not used further; the lookup is kept so that a movie without an
        # embedding id still ends in the "No answer" branch, as before.
        mov_id = ent2id[rdflib.term.URIRef(re.sub('<|>','',mov))]

        # Relations ordered by distance to the requested one.
        rel_emb = np.atleast_2d(relation_emb[rel_id])
        rel_dist = pairwise_distances(rel_emb, relation_emb)
        relation2 = [str(id2rel[idx].n3()) for idx in rel_dist.argsort().reshape(-1)[:topN]]

        rel = '<'+rel+'>'
        idxs = triple_df[(triple_df['entity1'] == mov) & (triple_df['relation'] == rel)].index.values
        if len(idxs) == 0:
            # No triple for the exact relation: retry with the next-closest one.
            idxs = triple_df[(triple_df['entity1'] == mov) & (triple_df['relation'] == relation2[1])].index.values

        entity2 = triple_df['entity2'].iloc[idxs[0]]
        entity2 = re.sub('<|>','',entity2)
        entity2_id = ent2id[rdflib.term.URIRef(entity2)]

        # Nearest neighbours of the tail entity in embedding space.
        emb = np.atleast_2d(entity_emb[entity2_id])
        dist = pairwise_distances(emb, entity_emb)
        answers = [ent2lbl[id2ent[idx]] for idx in dist.argsort().reshape(-1)[:topN]]

        message = '\n----------The answers suggested by embeddings are ' + ', '.join(answers)
    except Exception:
        # Best effort: any failed lookup becomes a "no answer" reply.
        message = '\n----------Embedding: No answer.'

    return message

In [7]:
def multimedia(question, graph, movies, mediadata, ner_pipeline):
    """Find an image for the entity named in the question.

    The last entity recognised by ``ner_pipeline`` is mapped to its WDT.P345
    value through ``graph``. Ids starting with 'tt' are searched in each
    item's 'movie' list; ids starting with 'nm' are searched in 'cast' lists
    that contain exactly one entry.

    Parameters
    ----------
    question : str
    graph : rdflib.Graph
    movies : unused; kept for a uniform call signature
    mediadata : iterable of dicts with 'movie', 'cast' and 'img' keys
    ner_pipeline : callable returning dicts with a 'word' key

    Returns
    -------
    str
        'image:' followed by the image path, or by 'No answer.' /
        'Not a movie or human.' when nothing suitable is found.
    """
    # Default reply. Without it `message` stayed unbound when the id was known
    # but no entry in `mediadata` matched, and the final return raised
    # UnboundLocalError outside the try block.
    message = 'No answer.'
    try:
        WDT = Namespace('http://www.wikidata.org/prop/direct/')
        # NOTE: both lookup tables are rebuilt from the whole graph on every call.
        lbl2ent = {str(lbl): str(ent) for ent, lbl in graph.subject_objects(RDFS.label)}
        ent2imb = {str(ent): str(imb) for ent, imb in graph.subject_objects(WDT.P345)}

        # The last recognised entity wins.
        entities = ner_pipeline(question, aggregation_strategy="simple")
        for entity in entities:
            lbl = entity['word']

        ent = lbl2ent[lbl]
        imb = ent2imb[ent]

        if imb[:2] == 'tt':
            for item in mediadata:
                if imb in item['movie']:
                    message = item['img']
                    break
        elif imb[:2] == 'nm':
            for item in mediadata:
                # only pictures whose cast list holds this one id
                if (imb in item['cast']) and (len(item['cast']) == 1):
                    message = item['img']
                    break
        else:
            message = 'Not a movie or human.'
    except Exception:
        message = 'No answer.'

    return 'image:'+message

In [8]:
def recommend(question, graph, movies, ent2lbl, triple_df):
    """Recommend up to three entities that share properties with the given movies.

    Steps:
      1. Resolve every name in ``movies`` to the entities carrying that English label.
      2. Collect the (relation, entity2) pairs common to all resolved movies.
      3. Count how often each subject in ``triple_df`` points at one of those
         objects and return the labels of the most frequent subjects that are
         not among the input movies.

    Parameters
    ----------
    question : unused; kept for a uniform call signature
    graph : rdflib.Graph
    movies : list of str
        Movie labels extracted from the question.
    ent2lbl : dict
        Entity URIRef -> label.
    triple_df : pandas.DataFrame
        Columns entity1 / relation / entity2 (n3-style '<uri>' strings).

    Returns
    -------
    str
        Comma-separated labels, or 'No answer.' when nothing can be recommended.
    """
    top_n = 3
    try:
        # The label is bound as a variable: titles containing quotes or
        # apostrophes ("Schindler's List") no longer break the query.
        label_query = 'SELECT ?movie WHERE { ?movie rdfs:label ?lbl . }'
        mov_list = []
        for mov in movies:
            rows = graph.query(label_query, initBindings={'lbl': Literal(mov, lang='en')})
            uris = ['<' + str(row[0]) + '>' for row in rows]
            if len(uris) != 0:
                mov_list.append(uris)

        if not mov_list:
            return 'No answer.'

        # Intersect the (relation, entity2) pairs of all input movies.
        shared = None
        for uris in mov_list:
            pairs = triple_df.loc[triple_df['entity1'].isin(uris), ['relation', 'entity2']].drop_duplicates()
            shared = pairs if shared is None else pd.merge(shared, pairs, on=["relation", "entity2"])
        entity2 = shared['entity2'].values.tolist()

        # Rank subjects by the number of shared objects they point at, and
        # leave out the input movies themselves by identity (the original
        # skipped the first len(mov_list) rows, assuming they were the inputs).
        known = {uri for uris in mov_list for uri in uris}
        counts = triple_df.loc[triple_df['entity2'].isin(entity2), 'entity1'].value_counts()
        candidates = counts[~counts.index.isin(known)].index.tolist()

        answers = []
        for ent in candidates:
            lbl = ent2lbl.get(rdflib.term.URIRef(re.sub('<|>', '', ent)))
            if lbl is None:
                continue  # entity without a label cannot be shown to the user
            answers.append(lbl)
            if len(answers) == top_n:
                break

        message = ', '.join(answers) if answers else 'No answer.'
    except Exception:
        # Best effort: any unexpected failure becomes a "no answer" reply.
        message = 'No answer.'

    return message

In [9]:
def crowdsource(crowd_df):
    """Aggregate raw crowd votes into one row per HIT.

    Processing steps:
      1. Discard votes with WorkTimeInSeconds < 50 or LifetimeApprovalRate < 70%.
      2. For each HIT count the votes with AnswerID == 1 (support) and
         AnswerID == 2 (reject). When rejects outnumber supports the HIT is
         labelled 'INCORRECT' and the first proposed fix (FixPosition /
         FixValue) is written into Input1ID / Input2ID / Input3ID; otherwise
         the HIT is labelled 'CORRECT'.
      3. Compute Fleiss' kappa per batch (HITTypeId), assuming three raters per
         HIT, and store it on every HIT of that batch.

    Parameters
    ----------
    crowd_df : pandas.DataFrame
        Raw votes. Columns read: HITId, HITTypeId, Title, Reward, AssignmentId,
        AssignmentStatus, WorkerId, WorkTimeInSeconds, LifetimeApprovalRate
        (strings such as '95%'), AnswerID, AnswerLabel, FixPosition, FixValue,
        Input1ID, Input2ID, Input3ID. The frame is NOT modified, so the
        function can be called repeatedly on the same input.

    Returns
    -------
    pandas.DataFrame
        Indexed by HITId, with the first surviving row of each HIT plus the
        columns 'Correct' (number of support votes) and 'Kappa'.

    Raises
    ------
    AssertionError
        If the surviving votes of a HIT do not add up to three.
    """

    def checkInput(rate, n):
        """
        Check correctness of the input matrix
        @param rate - ratings matrix
        @param n - number of raters
        @throws AssertionError
        """
        N = len(rate)
        k = len(rate[0])
        # every row (not just the first k of them) must have k categories
        assert all(len(row) == k for row in rate), "Row length != #categories)"
        assert all(isinstance(rate[i][j], int) for i in range(N) for j in range(k)), "Element not integer"
        assert all(sum(row) == n for row in rate), "Sum of ratings != #raters)"

    def fleissKappa(rate, n):
        """
        Computes the Kappa value
        @param rate - ratings matrix containing number of ratings for each subject per category
        [size - N X k where N = #subjects and k = #categories]
        @param n - number of raters
        @return fleiss' kappa, rounded to 3 decimals (-inf if expected agreement is 1)
        """

        N = len(rate)
        k = len(rate[0])
        print("#raters = ", n, ", #subjects = ", N, ", #categories = ", k)
        checkInput(rate, n)

        #mean of the extent to which raters agree for the ith subject
        PA = sum([(sum([i**2 for i in row])- n) / (n * (n - 1)) for row in rate])/N
        print("PA = ", PA)

        # mean of squares of proportion of all assignments which were to jth category
        PE = sum([j**2 for j in [sum([rows[i] for rows in rate])/(N*n) for i in range(k)]])
        print("PE =", PE)

        kappa = -float("inf")
        try:
            kappa = (PA - PE) / (1 - PE)
            kappa = float("{:.3f}".format(kappa))
        except ZeroDivisionError:
            print("Expected agreement = 1")

        print("Fleiss' Kappa =", kappa)

        return kappa

    n_raters = 3

    # Work on derived frames only; the caller's DataFrame stays untouched.
    votes = crowd_df.drop(columns=['Title', 'Reward', 'AssignmentId', 'AssignmentStatus'])
    votes = votes.assign(
        LifetimeApprovalRate=votes['LifetimeApprovalRate'].str.rstrip('%').astype('float') / 100.0)
    votes = votes.loc[(votes['WorkTimeInSeconds'] >= 50) & (votes['LifetimeApprovalRate'] >= 0.7)]
    votes = votes.drop(columns=['WorkerId', 'WorkTimeInSeconds', 'LifetimeApprovalRate'])

    ans_df = votes.groupby('HITId').first()
    ans_df['Correct'] = None

    # Iterate over the HITs that actually survived the filter rather than
    # assuming the ids are exactly 1..N.
    rate_by_hit = {}
    for hit_id, group in votes.groupby('HITId'):
        corr_count = int((group['AnswerID'] == 1).sum())
        incorr_count = int((group['AnswerID'] == 2).sum())
        ans_df.at[hit_id, 'Correct'] = corr_count
        rate_by_hit[hit_id] = [corr_count, incorr_count]

        if corr_count < incorr_count:
            ans_df.at[hit_id, 'AnswerLabel'] = 'INCORRECT'

            fixValueLoc = group['FixValue'].first_valid_index()
            fixPositionLoc = group['FixPosition'].first_valid_index()

            # A fix is applied only when both its value and its position are known.
            if fixValueLoc is not None and fixPositionLoc is not None:
                fixPosition = group.at[fixPositionLoc, 'FixPosition']
                fixValue = group.at[fixValueLoc, 'FixValue']

                if fixPosition == 'Subject':
                    column, id_letter, prefix = 'Input1ID', 'Q', 'wd:'
                elif fixPosition == 'Predicate':
                    column, id_letter, prefix = 'Input2ID', 'P', 'wdt:'
                else:
                    column, id_letter, prefix = 'Input3ID', 'Q', 'wd:'

                # Wikidata ids get their prefix; any other value is stored as is.
                if isinstance(fixValue, str) and fixValue.startswith(id_letter):
                    fixValue = prefix + fixValue
                ans_df.at[hit_id, column] = fixValue
        else:
            ans_df.at[hit_id, 'AnswerLabel'] = 'CORRECT'

    # One kappa per batch, selected by HITTypeId instead of by row position,
    # so the result does not depend on row order or on specific batch ids.
    ans_df['Kappa'] = None
    for hit_type in ans_df['HITTypeId'].unique():
        in_batch = ans_df['HITTypeId'] == hit_type
        batch_rate = [rate_by_hit[hit_id] for hit_id in ans_df.index[in_batch]]
        ans_df.loc[in_batch, 'Kappa'] = fleissKappa(batch_rate, n_raters)

    return ans_df

In [None]:
# Load all resources once and keep them as notebook-level globals; the agent
# functions below read them (some through parameters, some directly).
# dataloader() opens files under 'drive/MyDrive', so the Drive mount cell
# (which appears further down in this notebook) has to be run before this one.
graph, ent2id, id2ent, rel2id, id2rel, ent2lbl, lbl2ent, triple_df, \
entity_emb, relation_emb, ent2imb, mediadata, crowd_df, ner_pipeline = dataloader()
# Aggregate the raw crowd votes into one row per HIT (adds 'Correct' and 'Kappa').
ans_df = crowdsource(crowd_df)

In [13]:
def chatAgent(question):
    """Route a user question to the matching answer component.

    The question type and entity name(s) come from ``questionprocessor``.
    Factual questions get the knowledge-graph answer followed by the
    embedding-based suggestions; picture requests and recommendations are
    handled by their own functions.

    Relies on the notebook-level globals created by the data-loading cell
    (graph, lookup tables, embeddings, crowd answers, NER pipeline).

    Parameters
    ----------
    question : str
        Raw user message.

    Returns
    -------
    str
        The reply to send back to the user.
    """
    qtype, movies = questionprocessor(question, ner_pipeline)

    if qtype == 'Recommend':
        message = recommend(question, graph, movies, ent2lbl, triple_df)
    elif qtype == 'Multimedia':
        message = multimedia(question, graph, movies, mediadata, ner_pipeline)
    elif qtype == 'KG & Embedding':
        # The KG/crowd answer comes first; embedding suggestions are appended.
        kg_part = factual(question, graph, movies, ent2lbl, lbl2ent, ans_df)
        emb_part = embedding(question, graph, movies, ent2id, id2ent, rel2id, id2rel,
                             triple_df, entity_emb, relation_emb)
        message = kg_part + emb_part

    return message

In [11]:
# Mount Google Drive under /content/drive so the data files referenced as
# 'drive/MyDrive/...' by dataloader() can be read.
# NOTE(review): this cell is placed after the data-loading cell but has to be
# executed before it; presumably the working directory is /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Manual smoke test: type one question, run it through the agent and display
# the reply as the cell output.
que = input()
message = chatAgent(que)
message

KeyboardInterrupt: ignored

In [14]:
import time
import atexit
import getpass
import requests  # install the package via "pip install requests"
from collections import defaultdict

# url of the speakeasy server
url = 'https://speakeasy.ifi.uzh.ch'
# pause between two polling rounds in DemoBot.listen(), in seconds
listen_freq = 3


class DemoBot:
    """Polling client that connects ``chatAgent`` to the Speakeasy chat server.

    Logs in on construction, registers a logout at interpreter exit, and
    answers every new message of the other participant in ``listen()``.
    """

    def __init__(self, username, password):
        """Log in and prepare the per-room bookkeeping."""
        self.agent_details = self.login(username, password)
        self.session_token = self.agent_details['sessionToken']
        # room id -> seen messages (by ordinal), welcome flag, own alias in that room
        self.chat_state = defaultdict(lambda: {'messages': defaultdict(dict), 'initiated': False, 'my_alias': None})

        atexit.register(self.logout)

    def listen(self):
        """Poll all chatrooms forever and reply to every new message.

        Runs until interrupted; sleeps ``listen_freq`` seconds between rounds.
        """
        while True:
            # check for all chatrooms
            current_rooms = self.check_rooms(session_token=self.session_token)['rooms']
            for room in current_rooms:
                # ignore finished conversations
                if room['remainingTime'] > 0:
                    room_id = room['uid']
                    if not self.chat_state[room_id]['initiated']:
                        # send a welcome message and get the alias of the agent in the chatroom
                        self.post_message(room_id=room_id, session_token=self.session_token, message='Hi, you can send me any message and check if it is echoed in {} seconds.'.format(listen_freq))
                        self.chat_state[room_id]['initiated'] = True
                        self.chat_state[room_id]['my_alias'] = room['alias']

                    # check for all messages
                    all_messages = self.check_room_state(room_id=room_id, since=0, session_token=self.session_token)['messages']

                    # you can also use ["reactions"] to get the reactions of the messages: STAR, THUMBS_UP, THUMBS_DOWN

                    for message in all_messages:
                        if message['authorAlias'] != self.chat_state[room_id]['my_alias']:

                            # check if the message is new
                            if message['ordinal'] not in self.chat_state[room_id]['messages']:
                                self.chat_state[room_id]['messages'][message['ordinal']] = message
                                print('\t- Chatroom {} - new message #{}: \'{}\' - {}'.format(room_id, message['ordinal'], message['message'], self.get_time()))

                                ##### You should call your agent here and get the response message #####
                                try:
                                    m = chatAgent(message['message'])
                                    self.post_message(room_id=room_id, session_token=self.session_token, message=m)
                                    print(m)
                                except Exception as exc:
                                    # Keep serving other messages, but show what failed.
                                    # KeyboardInterrupt is not caught, so the loop can be stopped.
                                    print("Something went wrong.", repr(exc))

            time.sleep(listen_freq)

    def login(self, username, password):
        """POST the credentials to /api/login and return the decoded JSON reply."""
        agent_details = requests.post(url=url + "/api/login", json={"username": username, "password": password}).json()
        print('- User {} successfully logged in with session \'{}\'!'.format(agent_details['userDetails']['username'], agent_details['sessionToken']))
        return agent_details

    def check_rooms(self, session_token):
        """Return the decoded JSON reply of /api/rooms for this session."""
        return requests.get(url=url + "/api/rooms", params={"session": session_token}).json()

    def check_room_state(self, room_id, since, session_token):
        """Return the decoded JSON reply of /api/room/<room_id>/<since>."""
        return requests.get(url=url + "/api/room/{}/{}".format(room_id, since), params={"roomId": room_id, "since": since, "session": session_token}).json()

    def post_message(self, room_id, session_token, message):
        """Send ``message`` to the room; print an error if the server does not confirm it."""
        # A str request body must be Latin-1 encodable, which fails for text
        # such as an en dash or non-Latin titles, so send UTF-8 bytes instead.
        payload = message.encode('utf-8') if isinstance(message, str) else message
        tmp_des = requests.post(url=url + "/api/room/{}".format(room_id),
                                params={"roomId": room_id, "session": session_token}, data=payload).json()
        if tmp_des['description'] != 'Message received':
            print('\t\t Error: failed to post message: {}'.format(message))

    def get_time(self):
        """Return the local time formatted as 'HH:MM:SS, DD-MM-YYYY'."""
        return time.strftime("%H:%M:%S, %d-%m-%Y", time.localtime())

    def logout(self):
        """Call /api/logout and report when the server confirms it."""
        if requests.get(url=url + "/api/logout", params={"session": self.session_token}).json()['description'] == 'Logged out':
            print('- Session \'{}\' successfully logged out!'.format(self.session_token))

In [15]:
# The password is asked for at run time so that no secret is stored in the
# notebook. The password that used to be written here has been exposed and
# should be changed on the server.
username = 'running.hou_bot'
password = getpass.getpass('Speakeasy password for {}: '.format(username))
demobot = DemoBot(username, password)
# Blocks and polls the server until the cell is interrupted.
demobot.listen()

- User running.hou_bot successfully logged in with session 'node0mu2yuis9evenjgmde7jk47oy83295'!
	- Chatroom e5c5592d-aea8-4ee4-98d0-0df789f1667d - new message #1: ' Who is the director of Good Will Hunting?' - 17:11:21, 14-12-2022
The relation is director
relation is  director
KG: I think it is Gus Van Sant

----------The answers suggested by embeddings are Gus Van Sant, David Lynch, Scott Patrick Green
	- Chatroom 14160857-15f6-4381-8475-56323cb65214 - new message #1: 'Let me know what Sandra Bullock looks like.' - 17:11:53, 14-12-2022
image:3866/rm325111296.jpg
	- Chatroom 6260ff97-7d00-4111-9c4a-04835228130a - new message #0: 'What is the box office of The Princess and the Frog?' - 17:12:51, 14-12-2022
The relation is box office
relation is  box office
The answer is 267000000, according to the crowd, who had an inter-rater agreement of: 0.236.
The answer distribution is: 2 support vote and 1 reject vote.
----------Embedding: No answer.
	- Chatroom 6260ff97-7d00-4111-9c4a-0483522813

KeyboardInterrupt: ignored