# Simple search engine over the wikidataset

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
import urllib.request
import json

## 1. Get the data

In [9]:
url = 'http://island.ricerca.di.unimi.it/~alfio/shared/inforet/wikipeople.json'
# Read the payload inside a context manager so the HTTP response (and its
# underlying socket) is closed as soon as the bytes have been fetched.
with urllib.request.urlopen(url) as response:
    data = response.read()

In [10]:
# Decode the downloaded JSON payload into Python objects.
D = json.loads(data)

In [11]:
D[0]

{'docid': 'Q868_1',
 'entity': 'Q868',
 'target': 'Aristotle (ancient Greek philosopher)',
 'wikipage': 'https://en.wikipedia.org/wiki/Aristotle',
 'query': 'Aristotle philosopher',
 'document': 'Aristotle (/ærɪsˈtɒtəl/;[3] Greek: Ἀριστοτέλης Aristotélēs, pronounced\xa0[aristotélɛːs]; 384–322\xa0BC) was a Greek philosopher and polymath during the Classical period in Ancient Greece. Taught by Plato, he was the founder of the Lyceum, the Peripatetic school of philosophy, and the Aristotelian tradition. His writings cover many subjects including physics, biology, zoology, metaphysics, logic, ethics, aesthetics, poetry, theatre, music, rhetoric, psychology, linguistics, economics, politics, and government. Aristotle provided a complex synthesis of the various philosophies existing prior to him. It was above all from his teachings that the West inherited its intellectual lexicon, as well as problems and methods of inquiry. As a result, his philosophy has exerted a unique influence on almost

## 2. Search Engine

In [12]:
import nltk 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
class TfIdfSearch:
    """TF-IDF / cosine-similarity search over a list of document records.

    Parameters
    ----------
    dataset : iterable of dict
        Each record must provide the keys ``'document'`` (the text that is
        indexed), ``'entity'`` and ``'query'``.

    Attributes
    ----------
    vectorizer : TfidfVectorizer
        Vectorizer fitted on all documents, tokenizing with
        ``nltk.word_tokenize``.
    documents : list
        The ``'document'`` value of every record, in dataset order.
    entities : list
        The ``'entity'`` value of every record, aligned with ``documents``.
    query_texts : list of tuple
        De-duplicated ``(query, entity)`` pairs found in the dataset
        (order is not guaranteed, since it comes from a set).
    X : sparse matrix
        TF-IDF matrix with one row per entry of ``documents``.
    """

    def __init__(self, dataset):
        # token_pattern=None tells scikit-learn that the default regex is
        # intentionally unused because a custom tokenizer is supplied; without
        # it every fit emits a "token_pattern will not be used" warning.
        self.vectorizer = TfidfVectorizer(
            tokenizer=nltk.word_tokenize,
            token_pattern=None,
        )
        self.documents = [record['document'] for record in dataset]
        self.entities = [record['entity'] for record in dataset]
        self.query_texts = list(
            {(record['query'], record['entity']) for record in dataset}
        )
        self.X = self.vectorizer.fit_transform(self.documents)

    def search(self, query):
        """Rank every indexed document against ``query``.

        Parameters
        ----------
        query : str
            Free-text query; it is vectorized with the fitted vectorizer.

        Returns
        -------
        list of tuple
            ``(document_index, cosine_similarity)`` pairs for all documents,
            sorted by decreasing similarity. ``document_index`` indexes into
            ``self.documents`` / ``self.entities``.
        """
        query_vector = self.vectorizer.transform([query])
        # cosine_similarity returns a 1 x n_documents matrix; keep its only row.
        scores = cosine_similarity(query_vector, self.X)[0]
        return sorted(enumerate(scores), key=lambda pair: -pair[1])

## Example

In [23]:
# Build the search engine; the constructor fits the TF-IDF vectorizer on
# every document in D.
E = TfIdfSearch(D)

In [27]:
# E.query_texts holds (query, entity) pairs, so this maps query text -> entity id.
query_map = dict(E.query_texts)

In [32]:
q = 'Aristotle philosopher'
# Ranked (document index, similarity) pairs for the query, best match first.
R = E.search(q)
# Reverse lookup: entity id -> query text. If several queries map to the same
# entity, the one iterated last wins.
rev = {entity: query for query, entity in query_map.items()}

In [35]:
query_map[q]

'Q868'

In [36]:
rev['Q4791095']

'Aristotle painting Jusepe de Ribera'

In [31]:
# Show the ten best hits: document index, the entity it belongs to, and its score.
for doc_idx, score in R[:10]:
    print(doc_idx, E.entities[doc_idx], score)

73 Q868 0.23774831891870213
66 Q868 0.2229476440697656
75 Q868 0.22056981710633633
6 Q868 0.19931313742099963
69 Q868 0.19849925162252838
64 Q868 0.1943173384490755
214 Q4791095 0.1780438328415235
70 Q868 0.17005345749432937
82 Q868 0.16859913159333562
81 Q868 0.1574998278445402


In [39]:
top_k = 10
# Split the ranking at top_k: R_docs gets the texts of the first top_k hits,
# N_docs the texts of every remaining document (presumably the "relevant" and
# "non-relevant" sets for the term statistics computed further down).
top_hits, remaining_hits = R[:top_k], R[top_k:]
R_docs = [E.documents[doc_idx] for doc_idx, _ in top_hits]
N_docs = [E.documents[doc_idx] for doc_idx, _ in remaining_hits]

In [40]:
R_docs

['The 19th-century German philosopher Friedrich Nietzsche has been said to have taken nearly all of his political philosophy from Aristotle.[162] Aristotle rigidly separated action from production, and argued for the deserved subservience of some people ("natural slaves"), and the natural superiority (virtue, arete) of others. It was Martin Heidegger, not Nietzsche, who elaborated a new interpretation of Aristotle, intended to warrant his deconstruction of scholastic and philosophical tradition.[163]\n',
 'The immediate influence of Aristotle\'s work was felt as the Lyceum grew into the Peripatetic school. Aristotle\'s notable students included Aristoxenus, Dicaearchus, Demetrius of Phalerum, Eudemos of Rhodes, Harpalus, Hephaestion, Mnason of Phocis, Nicomachus, and Theophrastus. Aristotle\'s influence over Alexander the Great is seen in the latter\'s bringing with him on his expedition a host of zoologists, botanists, and researchers. He had also learned a great deal about Persian cu

## 3. Indexing

In [41]:
from collections import defaultdict

In [55]:
def _smoothed_token_counts(docs):
    """Count NLTK tokens across ``docs`` with every token starting at 1.

    The returned ``defaultdict`` yields 1 for unseen tokens, so a token that
    occurs ``c`` times is stored as ``c + 1`` (add-one smoothing).
    """
    counts = defaultdict(lambda: 1)
    for doc in docs:
        for token in nltk.word_tokenize(doc):
            counts[token] += 1
    return counts


# Smoothed token counts for the top-ranked documents (Ri) and for all the
# remaining documents (Ni).
Ri = _smoothed_token_counts(R_docs)
Ni = _smoothed_token_counts(N_docs)

# Normalisers used to turn the counts into probabilities.
# NOTE(review): the stored counts already include the +1 per token, so adding
# len(...) again counts the vocabulary size twice and the resulting
# probabilities sum to less than 1. Kept as-is so the recorded outputs stay
# valid; confirm whether this is intended.
Sr = sum(Ri.values()) + len(Ri)
Sn = sum(Ni.values()) + len(Ni)

In [56]:
def p(w):
    """Return the smoothed probabilities of token ``w`` in both document sets.

    Parameters
    ----------
    w : str
        Token to look up.

    Returns
    -------
    tuple of float
        ``(Ri[w] / Sr, Ni[w] / Sn)``; a token missing from a table is given
        the default smoothed count of 1.
    """
    # .get(w, 1) mirrors the default value of the defaultdict tables but,
    # unlike Ri[w] / Ni[w], does not insert unseen tokens as a side effect.
    # That keeps the tables consistent with the precomputed Sr / Sn and makes
    # it safe to call p() while iterating over either table.
    return Ri.get(w, 1) / Sr, Ni.get(w, 1) / Sn

In [57]:
p('century')

(0.0010901162790697674, 0.0004222877863111191)

In [61]:
# Per-token KL-divergence term p_rel * log(p_rel / p_non) for every token seen
# in the top-ranked documents.
KL = {}
for w in Ri:
    # Named p_rel / p_non rather than k / q so the query string stored in `q`
    # is not overwritten by a float when this cell runs.
    p_rel, p_non = p(w)
    KL[w] = p_rel * np.log(p_rel / p_non)

In [62]:
# Rank tokens by their KL score, largest first.
candidates = sorted(KL.items(), key=lambda item: -item[1])

In [63]:
candidates

[('Aristotle', 0.04121999901396165),
 ('philosopher', 0.01211403614173217),
 ('Ribera', 0.009236044993877219),
 ('philosophers', 0.007571272588582606),
 ('Russell', 0.0070644994379590285),
 ('Aristotelian', 0.006311920879716427),
 ('translations', 0.006057018070866085),
 ('Nietzsche', 0.005740378693412765),
 ('Averroes', 0.005740378693412765),
 ('Caravaggio', 0.005740378693412765),
 ('Boethius', 0.005740378693412765),
 ('Moerbeke', 0.005740378693412765),
 ('Antarctica', 0.005740378693412765),
 ('Raphael', 0.005740378693412765),
 ('Rembrandt', 0.005740378693412765),
 ('philosophy', 0.00545798986727868),
 ('Alexander', 0.00517907477513331),
 ('intellectual', 0.005049536703773142),
 ('Jonathan', 0.004984767668093057),
 ('Homer', 0.004984767668093057),
 ('Athens', 0.004725200146630395),
 ('Western', 0.004721316342372859),
 ('scholastic', 0.0045427635531495635),
 ('Nicomachus', 0.0045427635531495635),
 ('thinkers', 0.0045427635531495635),
 ('theology', 0.0045427635531495635),
 ('Aquinas', 0