In [3]:
import json
import numpy as np
from scipy.sparse import csr_matrix
from dominate import document
from dominate.tags import ul, li, a, h1, h2

In [104]:
class PersonalizedPageRank:
    
    def __init__(self, graphFile, contentFile):
        self.graph_file = graphFile
        self.content_file = contentFile
        self.generate_graph()
        self.generate_contents()
        
    def generate_graph(self):
        with open(self.graph_file, 'r') as f:
            # We have the graph encoded as an adjacency list in a JSON file 
            g = json.load(f)
            # The data structure read from JSON is already "good enough" for us
        self.graph = g
        self.num_nodes = len(g.keys())
        
    def generate_contents(self):
        with open(self.content_file, 'r') as f:
            # We have the graph encoded as an adjacency list in a JSON file 
            c = json.load(f)
            # The data structure read from JSON is already "good enough" for us
        self.contents = c
        
    def compute_stochastic_matrix(self):
        # we make a dictionary saving for each key in the graph
        # the corresponding index in the matrix
        key_to_pos = dict(zip(self.graph.keys(), range(0,self.num_nodes)))
        row = []
        col = []
        data = []

        for key in self.graph.keys():
            for edge in self.graph[key]:
                row.append(key_to_pos[key])
                col.append(key_to_pos[edge])
                data.append(1/len(self.graph[key]))
        R = csr_matrix((data, (row, col)), shape=(self.num_nodes, self.num_nodes))
    
        self.R = R
    
    def generate_seed(self, topic):
        key_to_pos = dict(zip(self.graph.keys(), range(0,self.num_nodes)))
        seeds = np.zeros(self.num_nodes)

        for key in self.contents.keys():
            lower_key_content = [x.lower() for x in self.contents[key]]
            mask = [topic in x for x in lower_key_content]
            #if topic in lower_key_content:
            if any(mask):
                seeds[key_to_pos[key]] = 1

        if (np.sum(seeds)!=0):
            self.J = seeds/sum(seeds)
        else: 
            self.J = seeds
        
    def PersonalizedPageRank_iteration(self, x, alpha):
        P = (1 - alpha) * self.R
        x_prime = (P.T).dot(x) + alpha * self.J
        return x_prime
    
    def compute_PersonalizedPageRank(self, topic, alpha, epsilon):
        dictionary = {}
        # We compute the transition matrix without the teleportation
        self.compute_stochastic_matrix()
        # The jump vector is imply a vector of ones divided by its length
        self.generate_seed(topic)
        if (np.sum(self.J)==0):
            print('There are no pages related to '+topic)
            x = np.zeros(self.num_nodes)
            return x, dictionary
        #J = np.ones(n)/n
        # The starting point can be a uniform distribution across all nodes
        # x = np.ones(n)/n
        # ...or a random stochastic vector
        x = np.random.rand(self.num_nodes)
        x = x/x.sum()
        # We can now iterate until the norm one of the changes in the
        # last iteration goes below epsilon
        err = np.inf # initially infinity
        while (err > epsilon):
            x_new = self.PersonalizedPageRank_iteration(x, alpha)
            err = (abs(x_new - x)).sum()
            x = x_new
        for i, k in enumerate(self.graph.keys()):
            dictionary[k] = x[i]
        
        return x, dictionary

In [105]:
class User:

    def __init__(self, name, surname):
        self.name = name
        self.surname = surname

        self.rates = {}

In [106]:
class IRsystem():

    def __init__(self, user, pagerank):
        self.user = user
        self.pagerank = pagerank

    def compute_weights(self):
        
        self.weights = np.array([self.user.rates[x] for x in self.user.rates.keys()])
        self.weights = self.weights/np.sum(self.weights) 
    
    
    def compute_final_pagerank(self, alpha, epsilon):
        
        self.compute_weights()
        xs = []
        
        for topic in self.user.rates.keys():
            self.pagerank.generate_seed(topic)
            x, _ = self.pagerank.compute_PersonalizedPageRank(topic, alpha, epsilon)
            xs.append(x)
        
        xs = np.array(xs)
        self.pagerank_vector = np.dot(self.weights, xs) 
        self.dict = {}
        for i, k in enumerate(self.pagerank.graph.keys()):
            self.dict[k] = self.pagerank_vector[i]
             
    def write_result(self):
        
        paths = []
        for w in sorted(self.dict, key=self.dict.get, reverse=True):
            not_useful_pages= ['User', 'Category', 'Wiki','Help','Image','Special','License', 'GFDL', 'language']
            mask = [x not in w for x in not_useful_pages]
            if all(mask):
                paths.append('simple/'+w)
                #print(w, self.dict[w])

        with document(title='Result') as doc:
            h1('Result')
            for path in paths:
                name = path[13:]
                name = name.replace('.html','')
                ul(li(a(name, href=path), __pretty=False))


        with open('result.html', 'w') as f:
            f.write(doc.render())

In [74]:
user = User(name='Roberto', surname='Corti')

for i in range(0, 5):
    topic = input('Topic '+str(i+1))

    rate = int(input('Rate '+topic+': '))

    while rate < 0 or rate > 5:
        rate = int(input('Not valid rate!!!\nRate '+topic+': '))

    user.rates[topic] = rate

Topic 1italy
Rate italy: 4
Topic 2dario
Rate dario: 3
Topic 3milan
Rate milan: 2
Topic 4rome
Rate rome: 1
Topic 5los angeles
Rate los angeles: 2


In [14]:
user.rates

{'italy': 1, 'spain': 5, 'flamenco': 4, 'music': 2, 'history': 3}

In [10]:
user = User(name='Roberto', surname='Corti')
ratings = [1, 2, 3, 1, 0, 5, 0]

for i, topic in enumerate(user.rates.keys()):
    user.rates[topic] = ratings[i]

In [11]:
user.rates

{}

In [102]:
p = PersonalizedPageRank(graphFile='wikipediaGraph_simple.json',
                         contentFile='wikipediaGraph_simple_contents.json')

In [240]:
x, _ = p.compute_PersonalizedPageRank('art', alpha=0.2, epsilon=0.001)

In [103]:
%%time

system = IRsystem(user=user, pagerank=p)

system.compute_final_pagerank(alpha=0.2, epsilon=0.001)

There are no pages related to dario
CPU times: user 4.56 s, sys: 43.7 ms, total: 4.6 s
Wall time: 4.51 s


In [87]:
%%time
system.write_result()

CPU times: user 1.39 s, sys: 15.7 ms, total: 1.41 s
Wall time: 1.41 s


In [76]:
sum(system.pagerank_vector)

nan

In [177]:
system.write_result()

In [71]:
import urllib.parse
s = urllib.parse.quote('result.html')
print(s)

result.html


# Test Part

In [24]:
def read_graph(filename):
    with open(filename, 'r') as f:
        # We have the graph encoded as an adjacency list in a JSON file 
        g = json.load(f)
        # The data structure read from JSON is already "good enough" for us
        return g

In [25]:
g = read_graph('wikipediaGraph_simple.json')
contents = read_graph('wikipediaGraph_simple_contents.json')
n = len(g.keys())

edges = 0
for x in g.keys():
    edges += len(g[x])

print('Number of nodes: ', n)
print('Number of edges: ', edges)

Number of nodes:  49142
Number of edges:  1235150


In [26]:
def compute_R(graph):
    n = len(graph.keys())
    # we make a dictionary saving for each key in the graph
    # the corresponding index in the matrix
    key_to_pos = dict(zip(graph.keys(), range(0,n)))
    row = []
    col = []
    data = []

    for key in g.keys():
        for edge in g[key]:
            row.append(key_to_pos[key])
            col.append(key_to_pos[edge])
            data.append(1/len(g[key]))
    R = csr_matrix((data, (row, col)), shape=(n, n))
    
    return R

In [27]:
def PageRank_iteration(x, R, J, alpha):
    n = len(x)
    P = (1 - alpha) * R
    x_prime = (P.T).dot(x) + alpha * J
    return x_prime

In [28]:
def compute_PageRank(graph, alpha, epsilon):
    n = len(graph.keys())
    # We compute the transition matrix without the teleportation
    R = compute_R(graph)
    # The jump vector is imply a vector of ones divided by its length
    J = np.ones(n)/n
    # The starting point can be a uniform distribution across all nodes
    # x = np.ones(n)/n
    # ...or a random stochastic vector
    x = np.random.rand(n)
    x = x/x.sum()
    # We can now iterate until the norm one of the changes in the
    # last iteration goes below epsilon
    err = np.inf # initially infinity
    while (err > epsilon):
        x_new = PageRank_iteration(x, R, J, alpha)
        err = (abs(x_new - x)).sum()
        print(err)
        x = x_new
    print("PageRank scores:")
    print(x.shape)
    dictionary = {}
    for i, k in enumerate(graph.keys()):
        #print(f"{k}: {x[0,i]}")
        dictionary[k] = x[i]
    return x, dictionary

In [47]:
x, d = compute_PageRank(g, 0.1, 0.001)

1.3327620469985153
0.5990622799152815
0.2024156612640305
0.06787624055880093
0.027143456935648805
0.012659829616234207
0.006221520023289405
0.003218270806647274
0.001759673260306674
0.0010517924166833898
0.0006995316157667597
PageRank scores:
(49142,)


In [48]:
for w in sorted(d, key=d.get, reverse=True):
    if 'User' not in w and 'Wiki' not in w and 'Help' not in w and 'Image' not in w:
        print(w, d[w])

c/a/t/Special~Categories_101d.html 0.01345646864333563
g/n/u/GNU_Free_Documentation_License_9d01.html 0.0065179869953776925
u/n/i/United_States_09d4.html 0.0022425868043349953
e/n/g/English_language.html 0.0021989252466322984
g/f/d/GFDL_75b9.html 0.001685863667791453
l/a/w/Law.html 0.0015180755866310747
e/n/c/Encyclopedia.html 0.0010080831186846345
i/n/t/Internet_Protocol_7ac7.html 0.0009743412530472346
b/o/o/Book.html 0.0008014914505259054
o/p/e/Open_content.html 0.0007817996575713396
m/o/n/Money.html 0.0007816406288380457
f/i/l/Film.html 0.0007480821516640918
c/o/p/Copyleft.html 0.0007299527654039442
f/r/e/French_language.html 0.0007294742169354009
2/0/0/2001.html 0.000711139272380194
g/e/r/German_language.html 0.0007110733669819079
d/i/c/Dictionary.html 0.0006717345682934263
r/u/l/Rule.html 0.0006709280242304423
s/o/n/Song.html 0.0006569509700390552
n/o/n/Non-profit.html 0.0006518402659861693
w/e/b/Website.html 0.0006459701309527916
m/o/v/Movie.html 0.0006439703263591564
f/a/i/Fair_

s/e/a/Season.html 1.985835696098855e-05
h/i/m/Himalaya.html 1.985517925068227e-05
i/n/f/Infection.html 1.9848038180755856e-05
1/6/5/1650s.html 1.9841202657953843e-05
h/a/m/Hamburg.html 1.9834790933871823e-05
e/l/i/Elizabeth_I_of_England_2ce6.html 1.9826992541735598e-05
w/a/t/Water_Deer_30cf.html 1.981952780760486e-05
1/9/0/1901.html 1.9816342352571423e-05
h/e/a/Category~Health_95b9.html 1.980794907798645e-05
a/r/i/Aristotle.html 1.9761060929821927e-05
m/a/r/Mars_(planet).html 1.975470373956852e-05
u/2E/s/U.S._postal_abbreviations_d7c6.html 1.974046727528844e-05
s/h/i/Shiva.html 1.972819960663449e-05
n/e/t/Net_jargon.html 1.9719822989132544e-05
d/e/s/Dessert.html 1.9717198422771955e-05
f/i/n/Finance.html 1.9716323230558217e-05
l/i/s/List_of_U.S._states_by_elevation_e223.html 1.970798026705032e-05
s/e/r/Service.html 1.969365477286548e-05
f/o/r/Forest.html 1.968792687885751e-05
n/a/p/Napoleon_Bonaparte_f92d.html 1.967963640833607e-05
1/9/1/1913.html 1.96608100558706e-05
1/6/6/1660.html 1.

f/o/u/Four.html 1.0779425171721907e-05
a/r/t/Art_film.html 1.0771876889845835e-05
l/o/c/Local_area_network.html 1.077136094253482e-05
t/r/a/Transparency.html 1.076662060035001e-05
g/e/n/Genocide.html 1.0757350249108612e-05
v/i/l/Vilnius.html 1.0756415822088923e-05
d/i/c/Dictatorship.html 1.0740600162514679e-05
s/t/u/Stuntman.html 1.073821560845533e-05
b/r/i/Brittany_Murphy_babd.html 1.0737146767730975e-05
t/o/k/Tokelau.html 1.0732042425717375e-05
n/o/r/Norfolk_Island_312a.html 1.0731921782194053e-05
c/o/l/Columbus.html 1.0725239580910637e-05
n/b/c/NBC_89b7.html 1.0719535353589384e-05
p/r/o/WP~PROTECT_6d34.html 1.0718271844235773e-05
f/e/r/Fertilization.html 1.0714366475494141e-05
j/2E/_/J._M._W._Turner_f57b.html 1.0710901478988562e-05
d/p/_/WP~DP_58bd.html 1.070984024801048e-05
g/r/e/Great_Fire_Of_London_f6fe.html 1.0709142489938966e-05
i/n/d/Category~India_948e.html 1.070589074099996e-05
s/o/n/Sony.html 1.070502906746113e-05
1/0/6/1066.html 1.0704665545206558e-05
h/i/s/History_of_slav

p/i/n/Pink_Floyd_f331.html 7.219331383193828e-06
s/e/r/Sermon.html 7.21914336813426e-06
g/r/e/Gregor_Mendel_b43a.html 7.218777961880696e-06
c/u/p/Cup.html 7.218446484036775e-06
l/u/s/Lusatian_Neisse_b9a5.html 7.217430014321214e-06
w/e/s/Westminster_Abbey_ded8.html 7.213671833985649e-06
r/o/m/Romance.html 7.212458922358907e-06
n/u/m/Category~Number_theory_ef8a.html 7.212418608408332e-06
g/o/u/Gouda_(cheese).html 7.211214163939466e-06
n/o/r/North_Vietnam_5079.html 7.20974018716356e-06
e/q/u/Equidae.html 7.209176805867576e-06
o/l/i/Olive_(disambiguation).html 7.2080067720903994e-06
f/i/g/Fight.html 7.204310289509347e-06
t/o/r/Toronto.html 7.200586063655346e-06
o/r/g/Organic_compound.html 7.199338814278135e-06
t/w/e/Twelve_Apostles_56fc.html 7.198480146482604e-06
a/r/t/Article_(grammar).html 7.197347779968936e-06
s/h/i/Shirt.html 7.195822826016991e-06
s/t/o/Stopwatch.html 7.1932482549708166e-06
g/o/t/Gotthold_Ephraim_Lessing_9e89.html 7.192325790190029e-06
v/i/c/Vicente_Piccio,_Jr._b73f.ht

i/n/t/Interpretation.html 5.3494164659385285e-06
r/o/m/Category~Roman_gods_and_goddesses_9158.html 5.349317933864827e-06
q/u/o/Quotient.html 5.3482347406967914e-06
c/r/a/Crater.html 5.3477528035935175e-06
x/e/n/Xenarthra.html 5.3475584948989195e-06
h/s/i/Hsinchu_County_01cf.html 5.347400519101352e-06
1/3/4/1340.html 5.347201305469196e-06
s/u/r/Surface_area.html 5.347019062834088e-06
o/d/d/Odd_number.html 5.346550457962783e-06
n/e/o/Neoptolemus.html 5.346507454024901e-06
b/r/o/Brown_versus_board_of_education.html 5.3465035277186864e-06
w/r/i/Category~Writing_1185.html 5.346347939044356e-06
2/0/0/Category~2003.html 5.345649823382018e-06
a/l/a/Alan_Alda_35e3.html 5.345461752708304e-06
c/o/s/Cost_of_living.html 5.344847448105382e-06
g/a/n/Gangtok.html 5.3447191178949475e-06
s/l/e/Sleeve.html 5.344500085669821e-06
t/u/r/Category~Turkish_football_clubs_9a35.html 5.344453765221771e-06
n/o/r/North_Africa_6a9e.html 5.34246361305701e-06
p/s/y/Psychiatry.html 5.341881462392129e-06
m/u/g/Mughal_Em

p/o/l/Poll_tax.html 4.652414277111089e-06
n/o/r/Nord-Pas-de-Calais_c7b5.html 4.651984053186709e-06
1/5/4/1540.html 4.651717008450781e-06
1/7/1/1717.html 4.651646687802112e-06
n/e/a/Near.html 4.6513266594348244e-06
b/a/n/Banana_republic.html 4.651106181280736e-06
c/a/p/Talk~Captain_Lou_Albano_ab9e.html 4.650415224786056e-06
c/u/m/Cumming,_Georgia_f04b.html 4.649775589878987e-06
p/i/r/Piri_Reis_map_cb44.html 4.649772910151349e-06
s/i/l/Silly_Putty_93f6.html 4.649323673118219e-06
s/k/i/Skink.html 4.6482120105380395e-06
b/i/g/Big_Ben_0dc3.html 4.648063614771612e-06
a/u/s/Category~Austrian_people_1321.html 4.647455103053392e-06
i/n/f/Template~Infobox_musical_artist_a77d.html 4.647175149348953e-06
m/o/v/Category~Movie_studios_5d8e.html 4.647127615037164e-06
b/r/e/Breakfast.html 4.646725423148181e-06
u/n/e/Unemployment.html 4.646682192270707e-06
g/a/m/Talk~Game_designer_421a.html 4.646309933977666e-06
m/o/n/Monty_Hall_Problem_3599.html 4.646107621670599e-06
h/e/r/Heresy.html 4.645850879778921

w/i/t/Witchcraft.html 4.00878485576756e-06
1/5/7/Category~1570s.html 4.008708628473566e-06
r/e/p/Category~Reproductive_system_2eab.html 4.008561950067956e-06
p/u/m/Pumpkin_Studios_2355.html 4.00853454467349e-06
e/l/i/Elijah_Wood_09e0.html 4.0080576217440084e-06
g/e/r/Germanic_language.html 4.007367476554598e-06
p/a/r/Category~Paris_4589.html 4.007245276465875e-06
l/a/b/Laboratory_techniques.html 4.007012070649304e-06
a/n/t/Anton_Bruckner_e2ed.html 4.006943468451921e-06
j/a/z/Category~Jazz_musicians_48f7.html 4.006309211385783e-06
c/a/u/Category~Caucasus_5830.html 4.0060785821989436e-06
f/r/e/Fresco.html 4.005895403506284e-06
r/e/a/Rear_Window_f8a8.html 4.005799363655268e-06
1/5/6/1569.html 4.005352318624452e-06
w/i/n/Windows_Media_Player_acd2.html 4.003757663851531e-06
i/t/a/Category~Italian_composers_91a0.html 4.003738181491262e-06
s/p/r/Sprint.html 4.003733251982299e-06
1/6/7/Category~1670s.html 4.00366468346777e-06
c/a/n/Canadian_dollar.html 4.0033033353627726e-06
b/o/b/Talk~Bob_Dyl

e/a/r/Earthquake-proof.html 3.50344412384583e-06
1/9/4/Category~1947_deaths.html 3.5032364372434836e-06
1/6/2/1629.html 3.5028434916626636e-06
c/i/t/Category~Cities_in_Missouri_b67f.html 3.502770624704748e-06
f/r/a/Frankfurt_U-Bahn_6c24.html 3.5023474477249473e-06
s/o/l/Solomonid_dynasty.html 3.5023215223105856e-06
a/m/e/Category~American_sports_0b26.html 3.5017713361999598e-06
i/c/e/Ice_skating.html 3.501570572708743e-06
n/o/b/Noble_gas.html 3.5014515351870095e-06
v/a/n/Vantaa.html 3.5013099848827824e-06
1/3/2/1324.html 3.5012430752406947e-06
1/0/7/1078.html 3.501148278243198e-06
1/9/5/Category~1959_movies.html 3.5009420088294114e-06
t/r/i/Triceratops.html 3.5007858585280198e-06
j/a/n/Jana_Gana_Mana_84d3.html 3.500718482994798e-06
h/o/l/Holi.html 3.5005928321609687e-06
b/o/l/Category~Bolivia_f7b5.html 3.5005451775468323e-06
1/6/0/Category~160s_BC_6be7.html 3.5003377570463634e-06
s/e/i/Seine.html 3.500007916061729e-06
1/3/1/1314.html 3.499827058186096e-06
f/a/r/Fark.html 3.499724961198

c/o/n/Condensation.html 3.078280736622671e-06
m/a/y/Maya_Lin_c3cd.html 3.0779359984235526e-06
d/y/a/Dyadic.html 3.0777135442287605e-06
i/n/o/Inorganic_compound.html 3.0776574605219566e-06
b/a/t/Battle_of_Bosworth_Field_0ad8.html 3.0775845025556175e-06
c/a/b/Cabrillo_beach.html 3.0775246888337823e-06
f/a/c/Factories.html 3.0774134145260944e-06
c/h/l/Chloride.html 3.077269430024291e-06
p/r/o/Project_Apollo_b4b9.html 3.0771832081744405e-06
s/c/o/Category~Scottish_writers_bde8.html 3.0771593204027995e-06
c/e/n/Central_Bank_Bombing_4e0d.html 3.0768158704872685e-06
b/i/g/Big_10.html 3.076767343111107e-06
r/e/a/Real_Salt_Lake_3d0c.html 3.0762184074141752e-06
k/h/y/Khyber_Pass_793b.html 3.076126195489269e-06
a/d/o/Adolf_von_Henselt_2b29.html 3.0760670466365027e-06
s/w/e/Template~SWE_9550.html 3.07598513739476e-06
1/3/_/13_July_1538.html 3.075929778210787e-06
j/a/c/Jackson_Pollack_d207.html 3.075662960606419e-06
h/a/m/Ham.html 3.0753500581233103e-06
m/o/n/Monounsaturated_fat.html 3.075345925344

1/8/8/Category~1888_deaths.html 2.8119711025983796e-06
m/a/g/Magnum_P.I._b835.html 2.8118675509665564e-06
i/g/u/Iguanodon.html 2.8118206033095467e-06
1/_/o/1_October_de66.html 2.8116713198263293e-06
1/8/6/Category~1866_births.html 2.811584977247503e-06
o/w/e/Owen_Spencer-Thomas_ac0a.html 2.8113398068432724e-06
1/7/4/Category~1744_births.html 2.811337137736591e-06
1/2/8/Category~1282.html 2.8113203465718993e-06
h/i/s/Category~History_of_Native_Americans_510a.html 2.811189872107408e-06
l/i/s/Lisa_Kudrow_1be1.html 2.810610885872025e-06
1/4/6/Category~1460s_deaths.html 2.8105817819833452e-06
c/l/o/Clone_Wars_7a1c.html 2.81056480948607e-06
b/r/u/Bruntal.html 2.8105555191745243e-06
s/e/x/Talk~Sex_e542.html 2.810509587483045e-06
g/e/o/Georges_Brassens_d272.html 2.8104503387358235e-06
c/h/e/Template~Chess_diagram_34a5.html 2.8101579047844774e-06
h/i/r/Talk~Hiragana_7739.html 2.810005681122557e-06
y/a/n/Yankee_Doodle_Dandy_9e0b.html 2.809837553310488e-06
1/8/7/Category~1879.html 2.8096602872564

e/d/e/Ede.html 2.6080890993493007e-06
o/l/i/Oligochaeta.html 2.60798912273471e-06
r/i/c/Richard_Roper_e974.html 2.6076514468866198e-06
g/d/p/GDP_35c9.html 2.6075696542374924e-06
s/a/u/Sausages.html 2.60754525160629e-06
p/o/r/Porto_Velho_5bef.html 2.6072490061351686e-06
1/2/3/Category~1236.html 2.606988238139291e-06
h/e/a/Headstreams.html 2.606976900717766e-06
5/_/a/5_April_e120.html 2.606944329080383e-06
b/i/x/Bixente_Lizarazu_e323.html 2.60681612619421e-06
1/3/_/13_October_2850.html 2.606779929683862e-06
s/e/a/Talk~Sea_star_e67e.html 2.6067254333899123e-06
1/6/6/Category~1668.html 2.606582823814165e-06
t/h/e/The_Warriors_dd79.html 2.606567856377564e-06
p/r/o/Template~Provinces_of_Cuba_map_b2a9.html 2.6064102859424864e-06
l/i/p/Lip-syncing.html 2.606227959680545e-06
d/i/m/Dimetrodon.html 2.6062107341317614e-06
l/i/s/List_of_Prime_Ministers_of_Greenland_34c0.html 2.6061921926436034e-06
1/8/1/Category~1810.html 2.6059563246768633e-06
f/a/i/Faith_No_More_71ac.html 2.6058847893085695e-06
r

l/i/b/Libertarian.html 2.493475427691515e-06
e/n/d/Template~Endspoiler_7e79.html 2.493406104334304e-06
b/i/r/Template~Birthyr_bac5.html 2.493406104332811e-06
d/e/a/Template~Deathyr_c277.html 2.4934061043313778e-06
d/a/b/Template~Dablink_fdf2.html 2.4934061043266115e-06
l/i/s/Talk~List_of_cat_breeds_8800.html 2.493339661612466e-06
7/1/0/Category~710s.html 2.4932928345681787e-06
f/r/a/Talk~Francis_Bacon_bba5.html 2.4931947444043406e-06
c/o/l/Collingwood.html 2.493188402336836e-06
a/f/l/AFL_01f4.html 2.493188402336836e-06
c/o/m/Category~Compositions_by_Dmitri_Shostakovich_3d41.html 2.492990946446036e-06
k/c/_/KC_&_the_Sunshine_Band_2f76.html 2.4929562415272476e-06
j/u/r/Category~Jurassic_cfb8.html 2.4929408823178417e-06
b/l/o/Blouse.html 2.4925340929159707e-06
c/a/r/Carbohydrates.html 2.4924486423021795e-06
e/t/h/Ethanol_fuel.html 2.49242389501762e-06
b/u/b/Talk~Bubonic_plague_26f7.html 2.492332750265482e-06
u/2E/s/U.S._President_8806.html 2.492209749496729e-06
1/1/7/Category~1175.html 2.

p/l/e/Pleiades.html 2.3719534065390286e-06
p/h/o/Phoebe.html 2.371953406539028e-06
a/b/i/Abiword.html 2.3716579651622906e-06
3/_/d/3_December_dad1.html 2.3715857463043047e-06
m/a/g/Magnesia.html 2.3714633590773543e-06
1/6/5/Category~1651.html 2.3714136239300166e-06
m/a/u/Talk~Mauna_Loa_c9e5.html 2.371412871526505e-06
n/i/h/Nihilism.html 2.3712396683794675e-06
f/o/r/Template~Former_F1_driver_f620.html 2.371215396024175e-06
1/3/2/Category~1326.html 2.371181821233365e-06
t/r/a/Transcaucasus.html 2.37117662730974e-06
a/b/s/Absolute_monarchy.html 2.3711437004137144e-06
a/p/a/Talk~Apartheid_4806.html 2.371033631781841e-06
v/a/g/Vaginal_secretion.html 2.3708698324707117e-06
t/h/e/The_Good_Life_1ce8.html 2.3708217548069253e-06
h/e/t/Heterodontosaurus.html 2.370742488186011e-06
p/s/y/Psychedelic_Pop_b5d9.html 2.3707309631217913e-06
s/o/n/Sonic_boom.html 2.370633524780155e-06
1/1/9/Category~1198.html 2.3700216074938375e-06
l/e/v/Levant.html 2.3698510841379495e-06
e/d/u/Category~Educators_39b2.ht

f/e/n/Fenerbahçe.html 2.120392694944058e-06
h/u/m/Human_Death_dd4e.html 2.119910819006692e-06
2/9/_/29_February_e894.html 2.1195020621872445e-06
8/_/n/8_November_88da.html 2.1194247814417006e-06
f/a/l/Falun_Dafa_3572.html 2.119193418716581e-06
j/u/n/Junctions.html 2.1191286914716218e-06
i/n/t/Intrusive_vulcanicity.html 2.1188508572982544e-06
a/m/e/American_Constitution_f91f.html 2.1188508572982544e-06
e/m/i/Emit.html 2.1184862533092476e-06
r/o/l/Roller_coasters.html 2.118234939584047e-06
a/s/s/Assault_Rifle_0e07.html 2.1179569889755348e-06
s/a/v/Savannah.html 2.117943943369354e-06
s/e/p/September_11,_2001.html 2.117341382884925e-06
e/m/p/Emperor_Akihito_of_Japan_0a00.html 2.1170727322393145e-06
o/r/g/Organization_for_Economic_Co-operation_and_Development_ffad.html 2.1167388900290353e-06
d/i/p/Dipavali.html 2.1167388900290353e-06
z/a/r/Zarathushtra.html 2.1167388900290353e-06
s/h/a/Shankaracharya.html 2.1167388900290353e-06
s/ã/o/São_Paulo_(city)_cfca.html 2.115944736761765e-06
m/a/j/Ma

c/o/r/Cornwallis.html 2.0349192137072157e-06
c/o/r/Coral_Sea_Islands_Territory_a1cc.html 2.0349192137072157e-06
c/o/r/Coruña.html 2.0349192137072157e-06
c/o/r/Coral_mushrooms.html 2.0349192137072157e-06
c/o/r/Cormoros_Islands_3fbb.html 2.0349192137072157e-06
c/o/r/Corse.html 2.0349192137072157e-06
c/o/r/Cordoba,_Argentina_3591.html 2.0349192137072157e-06
c/o/t/Cototient.html 2.0349192137072157e-06
c/o/s/Cost_Of_Living_1dac.html 2.0349192137072157e-06
c/o/n/Confucian.html 2.0349192137072157e-06
c/o/n/Convection_rain.html 2.0349192137072157e-06
c/o/n/Concert-master.html 2.0349192137072157e-06
c/o/n/Control_Paradigm_3680.html 2.0349192137072157e-06
c/o/n/Conceptual_Metaphor_f79d.html 2.0349192137072157e-06
c/o/n/Containers.html 2.0349192137072157e-06
c/o/n/Concertmistress.html 2.0349192137072157e-06
c/o/n/Consistancy.html 2.0349192137072157e-06
c/o/n/Contaminated.html 2.0349192137072157e-06
c/o/n/Contaminates.html 2.0349192137072157e-06
c/o/n/Template~Continent_5cd0.html 2.034919213707215

m/o/t/Mother_Tongue_2adb.html 2.0349192137072157e-06
m/o/t/Mothers.html 2.0349192137072157e-06
m/o/t/Motel.html 2.0349192137072157e-06
m/o/e/Moebius_strip.html 2.0349192137072157e-06
m/o/e/Moebius_band.html 2.0349192137072157e-06
m/o/s/Mosovce.html 2.0349192137072157e-06
m/o/s/Moselle_river.html 2.0349192137072157e-06
m/o/s/Mosel.html 2.0349192137072157e-06
m/o/s/Moschus_fuscus.html 2.0349192137072157e-06
m/o/s/Moschus_berezovskii.html 2.0349192137072157e-06
m/o/s/Moses_(Michelangelo_50da.html 2.0349192137072157e-06
m/o/s/Mosses.html 2.0349192137072157e-06
m/o/s/Moschus_moschiferus.html 2.0349192137072157e-06
m/o/s/Moschidae.html 2.0349192137072157e-06
m/o/s/Moss_city.html 2.0349192137072157e-06
m/o/s/Moschus_chrysogaster.html 2.0349192137072157e-06
m/o/n/Monica_Lewinsky_scandal_628b.html 2.0349192137072157e-06
m/o/n/Monotremes.html 2.0349192137072157e-06
m/o/n/Monkeys.html 2.0349192137072157e-06
m/o/n/Monicagate.html 2.0349192137072157e-06
m/o/n/Mongolian_Death_Worm_9efe.html 2.034919

In [52]:
def generate_seed(topic, g, contents):    

    n = len(contents.keys())
    key_to_pos = dict(zip(g.keys(), range(0,n)))
    seeds = np.zeros(n)

    for key in contents.keys():
        lower_key_content = [x.lower() for x in contents[key]]
        mask = [topic in x for x in lower_key_content]
        #if topic in lower_key_content:
        if any(mask):
            print(lower_key_content)
            seeds[key_to_pos[key]] = 1
    if (sum(seeds) != 0):
        seeds = seeds/sum(seeds)
        return seeds
    else: 
        return topic+' is not present'

In [48]:
obj = generate_seed('football', g, contents)

isinstance(obj, str)

  seeds = seeds/sum(seeds)


True

In [56]:
def compute_PersonalizedPageRank(graph, topic, contents, alpha, epsilon):
    n = len(graph.keys())
    # We compute the transition matrix without the teleportation
    R = compute_R(graph)
    # The jump vector is imply a vector of ones divided by its length
    J_s = generate_seed(topic, graph, contents)
    #J = np.ones(n)/n
    # The starting point can be a uniform distribution across all nodes
    # x = np.ones(n)/n
    # ...or a random stochastic vector
    x = np.random.rand(n)
    x = x/x.sum()
    # We can now iterate until the norm one of the changes in the
    # last iteration goes below epsilon
    err = np.inf # initially infinity
    while (err > epsilon):
        x_new = PageRank_iteration(x, R, J_s, alpha)
        err = (abs(x_new - x)).sum()
        print(err)
        x = x_new
    #print("PageRank scores:")
    #print(x.shape)
    dictionary = {}
    for i, k in enumerate(graph.keys()):
        #print(f"{k}: {x[0,i]}")
        dictionary[k] = x[i]
        
    
    return x, dictionary

In [58]:
topic = input('Insert the topic: ')

x, d = compute_PersonalizedPageRank(g, topic, contents, 0.15, 0.0001)

Insert the topic: rome
['iapetos', 'atlas (titan)', 'greek mythology', 'kronos', 'oceanid', 'prometheus', 'tartaros', 'titan (mythology)', 'twelve olympians', 'menoitios', 'epimetheus']
['iquitos', 'amazon river', 'loreto region', 'port', 'macromedia flash']
['iberian peninsula', 'country alias spain', 'country alias portugal', 'country alias andorra', 'country alias gibraltar', 'country alias uk', 'ancient greece', 'ancient rome', 'andorra', 'atlantic ocean', 'europe']
['user talk:uranusx2006', 'ancient rome', 'blockinblox', 'vector', 'vector', 'administrators', 'core article', 'how to write simple english articles', 'list of articles all languages should have', 'policies and guidelines', 'requested articles']
['image:unkeler rheinpromenade.jpg']
['wind', 'air', 'atmosphere', 'barometric pressure', 'building', 'death', 'dust', 'hemisphere', 'house', 'hurricane', 'pressure']
['water wheel', '1st century', '1st century bc', 'ancient china', 'ancient rome', 'aragon', 'axle', 'blade', 'bu

In [59]:
x

array([0.00000000e+00, 1.66321818e-08, 3.27149467e-07, ...,
       3.41805474e-07, 2.13529960e-08, 1.80392918e-06])

In [61]:
s = generate_seed('rome', g, contents)

sum(s)

['iapetos', 'atlas (titan)', 'greek mythology', 'kronos', 'oceanid', 'prometheus', 'tartaros', 'titan (mythology)', 'twelve olympians', 'menoitios', 'epimetheus']
['iquitos', 'amazon river', 'loreto region', 'port', 'macromedia flash']
['iberian peninsula', 'country alias spain', 'country alias portugal', 'country alias andorra', 'country alias gibraltar', 'country alias uk', 'ancient greece', 'ancient rome', 'andorra', 'atlantic ocean', 'europe']
['user talk:uranusx2006', 'ancient rome', 'blockinblox', 'vector', 'vector', 'administrators', 'core article', 'how to write simple english articles', 'list of articles all languages should have', 'policies and guidelines', 'requested articles']
['image:unkeler rheinpromenade.jpg']
['wind', 'air', 'atmosphere', 'barometric pressure', 'building', 'death', 'dust', 'hemisphere', 'house', 'hurricane', 'pressure']
['water wheel', '1st century', '1st century bc', 'ancient china', 'ancient rome', 'aragon', 'axle', 'blade', 'bucket', 'cast iron', 'en

0.9999999999999976

In [62]:
sum(x)

0.9945575401685459

In [44]:
for w in sorted(d, key=d.get, reverse=True):
    not_useful_pages= ['User', 'Category', 'Wiki','Help','Image','Special','License', 'GFDL', 'language']
    mask = [x not in w for x in not_useful_pages]
    if all(mask):
        print(w, d[w])


m/i/l/Mily_Balakirev_bdf2.html 0.0775321034265031
d/o/m/Dominique_de_Villepin_ebd9.html 0.07731145357987616
2/0/0/2005.html 0.004044306158037692
2/9/_/29_May_1fa8.html 0.0036067469002368626
m/a/y/May_29.html 0.0031508636929368995
p/i/a/Piano.html 0.002475718513343929
o/r/c/Orchestra.html 0.002351628203712983
s/o/n/Song.html 0.0023358164323535905
f/r/a/France.html 0.002321766579207325
m/u/s/Music.html 0.0022507998164125204
r/u/s/Russia.html 0.002248267125365224
p/r/e/President.html 0.0021429982186196572
n/i/k/Nikolai_Rimsky-Korsakov_f9a5.html 0.0020910844813390047
m/o/d/Modest_Mussorgsky_fa84.html 0.0020906094222166425
c/h/o/Choir.html 0.002049919931504199
a/l/e/Alexander_Borodin_6e52.html 0.0020389220465229026
1/8/3/1837.html 0.002020699437150513
1/9/5/1953.html 0.002002568318419267
1/9/1/1910.html 0.0019945216150150766
c/o/m/Composer.html 0.0019858464744563187
i/n/f/Influence.html 0.0019823117956257164
c/é/s/César_Cui_482b.html 0.0019822365193174116
s/t/_/St_Petersburg_a87b.html 0.001

j/e/s/Jesus_Christ_21ac.html 1.3505593221087948e-05
1/5/t/15th_century.html 1.3481158525906701e-05
p/u/b/Talk~Public_domain_5611.html 1.3470296156977637e-05
a/l/p/Alps.html 1.3447672255277546e-05
1/0/0/1003.html 1.3446890709433152e-05
h/o/m/Home_page.html 1.3426580996464067e-05
l/o/r/Lorraine.html 1.3416770042000244e-05
b/r/i/British.html 1.340797929149962e-05
a/d/v/Advertising.html 1.3398411711652605e-05
g/a/y/Gay-bashing.html 1.3392827248412827e-05
1/8/2/1820.html 1.3392754354576846e-05
s/i/z/Size.html 1.3392582055924768e-05
m/i/l/Milk.html 1.3390688702806434e-05
m/s/n/MSN_9caf.html 1.3383278572914369e-05
j/a/m/James_II_of_England_fb06.html 1.338280738611854e-05
1/8/6/1866.html 1.3357603801500134e-05
h/o/w/WP~HOW_8408.html 1.3345625064891606e-05
s/o/l/Solar_system.html 1.333528367617585e-05
c/o/m/Template~Commons_44b0.html 1.3317521237054457e-05
m/o/n/Monk.html 1.3298006071597855e-05
w/a/r/War_of_1812.html 1.3294182909580492e-05
k/i/l/Kilometre.html 1.3242550148610502e-05
v/i/l/Villa

c/r/e/Cream.html 7.043582115146142e-06
r/e/l/Relative.html 7.0379715941639105e-06
e/x/p/Experience.html 7.0325270628146095e-06
b/u/t/Butter.html 7.030349872787632e-06
f/o/r/Formula_One_17d2.html 7.029741328118736e-06
m/y/t/Mythology.html 7.027221041591803e-06
o/w/l/Owl.html 7.0218031032162214e-06
h/o/u/Hour.html 7.0143614178594575e-06
a/n/g/Angela_Merkel_e07c.html 7.014091477231634e-06
a/n/t/Ant.html 7.012894415512689e-06
m/a/r/Marseille.html 7.011070301775057e-06
t/e/s/Template~Test_d929.html 7.0094689726171695e-06
1/9/1/Template~1911.html 7.008395680078631e-06
a/u/v/Auvergne.html 7.002202208360591e-06
l/a/k/Lakshmi.html 6.995928167360598e-06
m/i/d/Middle-earth_characters.html 6.99298298596752e-06
b/o/t/Botswana.html 6.992967189701301e-06
1/7/5/1759.html 6.980673157125667e-06
a/r/a/Aramaic.html 6.97747471265272e-06
d/i/n/Dinosaur.html 6.97692219918766e-06
t/h/e/The_Clash_8395.html 6.976856117867636e-06
1/9/1/1911_Encyclopaedia_Britannica_243b.html 6.976201280493057e-06
b/u/d/Buddy_Hol

h/e/l/Helmut_Schmidt_1de3.html 2.5924084551214313e-06
o/r/g/Organic_chemistry.html 2.5909747514005676e-06
m/p/e/MPEG-4_f86d.html 2.589323908258586e-06
h/o/_/Ho_Chi_Minh_43ed.html 2.5891189160589886e-06
2/0/0/2006_FIFA_World_Cup_25dc.html 2.588119191582768e-06
a/r/a/Aragon.html 2.58477976090426e-06
v/å/l/Vålerenga_I.F._Fotball_e885.html 2.584517819358671e-06
k/a/r/Karachi.html 2.5844870217637614e-06
p/a/u/Paul_of_Tarsus_aa07.html 2.5843542861044097e-06
i/l/i/Iliad.html 2.5841067993732054e-06
5/_/d/5_December_0a75.html 2.5835481724641314e-06
g/a/l/Galois_module.html 2.581582520877839e-06
t/h/u/Thumb.html 2.581226260059105e-06
v/i/c/Victoria,_Australia_096e.html 2.580939324772596e-06
s/a/i/Saint_Paul,_Minnesota_76e0.html 2.5800482689436354e-06
c/o/m/Computer_recycling.html 2.579257859937887e-06
e/u/l/Euler's_Identity_abbc.html 2.578891808982935e-06
m/o/n/Montpelier,_Vermont_2f8c.html 2.5776955326483726e-06
d/e/m/Democrat.html 2.5763447224293445e-06
g/o/a/Goa.html 2.575602232155127e-06
h/i

1/5/8/1588.html 1.3586960669644056e-06
g/i/g/Gigabyte.html 1.358520588783486e-06
p/l/a/Plant_pathology.html 1.3577919165849578e-06
w/i/n/Windsor,_Ontario_57c6.html 1.3570655789187599e-06
b/o/e/Boeing_747.html 1.3567522655160093e-06
t/h/r/Throat.html 1.3561469031401691e-06
h/a/b/Habitat.html 1.3560819065141376e-06
c/r/e/Creative_network.html 1.355873067285942e-06
f/o/x/Fox_Broadcasting_Company_86ee.html 1.3557442008667047e-06
p/h/y/Phycology.html 1.3551122533596985e-06
j/o/e/Joe_DiMaggio_29be.html 1.3547415895793507e-06
t/a/i/Tailor.html 1.3546482676261625e-06
s/u/r/Surrender.html 1.3543229016101687e-06
p/o/m/Pompey.html 1.3542418500013522e-06
b/a/l/Balearic_Islands_424a.html 1.3538733733493362e-06
d/e/s/Descendant.html 1.3532284843808436e-06
y/e/s/Yes.html 1.3531239957250692e-06
a/e/g/Aegean.html 1.3528564853091647e-06
h/e/l/Helicopter_rotor.html 1.352389673801314e-06
s/a/v/Savanna.html 1.3520225768828987e-06
t/u/p/Tupac_Shakur_8f89.html 1.351945783203189e-06
c/a/t/Catalyst.html 1.3510

s/c/o/Talk~Scotland_19c8.html 8.3544284054937e-07
v/a/n/Vancouver_Canucks_ad4e.html 8.352240713921908e-07
f/a/l/Falcon.html 8.34943223296403e-07
b/a/n/Bandages.html 8.348562416501442e-07
m/i/c/Michelle_Trachtenberg_35c4.html 8.34663127554419e-07
k/u/r/Kuru.html 8.345113740711682e-07
1/4/_/14_November_889e.html 8.34472371116087e-07
w/e/s/West_Africa_e6b4.html 8.343082914385732e-07
c/r/o/Crocodilia.html 8.334365766435641e-07
i/n/f/Template~Infobox_Celebrity_91fa.html 8.329641333121063e-07
s/m/i/Smiley.html 8.325549129209764e-07
b/o/r/Bornholm.html 8.322309168638295e-07
f/a/r/Far.html 8.321452898556904e-07
i/n/f/Template~Infobox_Band_6ca2.html 8.320890249403762e-07
m/i/l/Milton_Keynes_e083.html 8.319567394104954e-07
1/0/5/1056.html 8.317876537288484e-07
p/a/r/Template~Particles_4c6c.html 8.316728514124001e-07
h/y/p/Hypnosis.html 8.315152904878861e-07
h/a/a/Haarlem.html 8.31197921368315e-07
t/i/d/Tidal_energy.html 8.311342843372033e-07
a/n/t/Anthrax.html 8.310978422029963e-07
i/n/d/Templat

1/4/4/1442.html 5.662679263336466e-07
p/h/i/Philippine_Sea_2e6d.html 5.662074999936631e-07
m/a/m/Mammalogy.html 5.661779553733541e-07
s/a/i/Saint_Martin_(Netherlands)_a103.html 5.661705498849192e-07
1/0/4/1046.html 5.660626587703422e-07
r/o/l/Roller_coaster.html 5.656736549712757e-07
p/r/o/Protons.html 5.65375250609785e-07
b/o/o/Boom.html 5.648738591945025e-07
h/a/r/Harbour.html 5.648398416671172e-07
p/i/n/Pinar_del_Río_province_32a3.html 5.647050283824544e-07
a/r/i/Arid.html 5.646745766127001e-07
c/h/e/Chelsea_F.C._c6c0.html 5.646713607840782e-07
f/o/r/For_Dummies_4a03.html 5.64560246083075e-07
r/o/g/Roger_Ebert_1ae7.html 5.64503676023308e-07
b/l/u/Blues_(disambiguation).html 5.644987421115256e-07
m/u/r/Murderer.html 5.644620789791246e-07
c/h/o/Talk~Chopstick_e7df.html 5.641615311420403e-07
a/s/h/Ashton_Kutcher_4ef3.html 5.640918394825051e-07
a/s/h/Ash.html 5.640730733402077e-07
1/3/2/1322.html 5.639195229101542e-07
h/a/n/Handcuffs.html 5.636730571796915e-07
c/u/p/Cupboard.html 5.6365

w/o/r/World_Heritage_Site_8177.html 3.6900162441773155e-07
l/i/s/List_of_Hindu_goddesses_917b.html 3.6884030864046905e-07
i/b/n/Ibn_Khaldun_a947.html 3.688153733730958e-07
j/e/r/Jermaine_Jackson_fcb2.html 3.687007006077122e-07
l/o/c/Locust.html 3.686536342028329e-07
b/l/o/Blogs.html 3.6851261983368506e-07
p/h/i/Phish.html 3.684560571041366e-07
b/u/n/Bundesautobahn_45.html 3.6845405206610015e-07
c/a/m/Camouflage_(disambiguation).html 3.684499682953744e-07
b/r/i/Britcom.html 3.6824239283745895e-07
u/n/i/Universal_House_of_Justice_dedd.html 3.682400746233554e-07
p/o/l/Polycarbonate.html 3.682138604345589e-07
t/e/n/Tenerife.html 3.6812088050132487e-07
c/o/t/Cottage.html 3.681126867589939e-07
h/m/s/HMS_Victory_0c3b.html 3.6807459254885625e-07
c/a/b/Cabbage.html 3.6788620929947713e-07
t/e/s/Template~Test4im_97f8.html 3.6784863299013166e-07
b/a/y/Bayern_Munich_a6fb.html 3.678464759118537e-07
t/h/e/The_Searchers_b8ca.html 3.6737282891524893e-07
f/i/n/Talk~Finland_ac58.html 3.6734746227014445e-

i/n/d/Indoors.html 2.0541459818515504e-07
r/e/p/Reptiles.html 2.0535147117843854e-07
g/i/l/Gilbert_and_Ellice_Islands_8a4e.html 2.0534774357499862e-07
l/a/k/Lake_District_79ed.html 2.0530577237712852e-07
c/a/t/Template~Catmore_5ac2.html 2.0525787542750573e-07
m/o/r/Template~More_7bbc.html 2.0525787542750573e-07
m/o/n/Monounsaturated_fat.html 2.051902619863729e-07
p/e/a/Peacock.html 2.0513460237536488e-07
r/u/t/Ruthenium.html 2.051232207180461e-07
t/h/e/The_Blues_Brothers_(movie)_119f.html 2.05106071845571e-07
s/i/t/Sith.html 2.0503832851441966e-07
s/h/e/Sheffield_F.C._9c7f.html 2.0503270498173077e-07
f/l/a/Flammable.html 2.0494494349638502e-07
c/e/s/Cessna.html 2.0475612106731995e-07
s/e/e/Seep_tube_worm.html 2.046511643174066e-07
q/w/e/QWERTY_c398.html 2.0452733015441177e-07
s/u/n/Sun_Tzu_(mathematician)_e244.html 2.0451333519461204e-07
h/i/b/Hibiscus_syriacus.html 2.0441532366104736e-07
v/y/a/Vyacheslav_Molotov_9754.html 2.0437826815841745e-07
t/o/u/Tourette_syndrome.html 2.043710066

s/a/h/Sahel.html 9.506239479223969e-08
n/e/w/News_agency.html 9.506215154244989e-08
t/h/e/The_Cult_673b.html 9.504689951604841e-08
z/i/r/Zirconium.html 9.50233238220735e-08
l/a/s/Laserdisc.html 9.502302067485501e-08
s/w/a/Swamp_cooler.html 9.496377493925385e-08
t/h/u/Thurso.html 9.489651232408575e-08
p/a/c/Pac-Man_7bbe.html 9.486962102680942e-08
e/n/g/Talk~English_people_2265.html 9.482995598098677e-08
r/a/p/Rapa.html 9.482070860139604e-08
o/r/d/Order_of_magnitude.html 9.476708783131778e-08
s/q/u/Squaring_the_circle.html 9.473160655714463e-08
t/o/o/Toolbox.html 9.46869086025104e-08
l/i/s/List_of_riots.html 9.466883754510009e-08
f/l/e/Flensburg.html 9.457100781594486e-08
f/u/n/Fungicide.html 9.454770374118558e-08
p/e/n/Pentomino.html 9.453402152571177e-08
j/ö/t/Jötunheimr.html 9.451597753814779e-08
a/p/p/Apple_pie.html 9.448319673921033e-08
a/p/p/Apple_sauce.html 9.448319673921033e-08
f/a/i/Fairuz.html 9.445827060477215e-08
g/e/s/Gesualdo.html 9.445716615813898e-08
r/e/d/Redhill.html 9.

c/h/r/Christlich_Soziale_Union_6236.html 3.820010086381143e-08
f/u/l/Full_cut_panties.html 3.816455920065022e-08
h/e/m/Hemera.html 3.8153955596753373e-08
a/r/m/Armens.html 3.8149770673396104e-08
1/4/_/14_January_78aa.html 3.814507433950456e-08
v/a/c/Vaccinium.html 3.813515320353617e-08
b/l/u/Blueberries.html 3.812504271550678e-08
a/-/z/Template~A-Z_e656.html 3.8122746888751763e-08
d/i/a/Diablo_II_0e9e.html 3.810353890863996e-08
s/e/x/Sexuality.html 3.810262591757331e-08
1/0/_/10_December_3cf0.html 3.809160898896063e-08
o/p/e/Opel_Corsa_1d28.html 3.805051830587985e-08
a/c/r/Acrocorinth.html 3.80065267694509e-08
o/r/a/Orange,_New_South_Wales_abae.html 3.7994898240357164e-08
p/r/o/Talk~Prometheus_c22a.html 3.797577407949515e-08
w/i/m/Wimbledon,_London_b385.html 3.7951162360467456e-08
s/c/o/Scone,_Perth_and_Kinross_8317.html 3.7904377700471926e-08
f/l/a/Flatfish.html 3.788370455429366e-08
b/a/n/Bank_of_China_Tower_7997.html 3.782426357321993e-08
e/u/r/European_plaice.html 3.780845109164631

f/i/a/Talk~Fiat_Ulysse_1ed4.html 1.6956947360558045e-09
g/u/é/Guéra.html 1.688614439640844e-09
k/e/n/Talk~Kenpo_33eb.html 1.676230203077677e-09
u/n/c/Talk~Uncle_John's_Bathroom_Reader_5207.html 1.629388707882142e-09
c/r/y/Cryptid.html 1.6059968418923078e-09
b/r/a/Bracelet.html 1.589708767461154e-09
p/s/y/Talk~Psychonauts_2027.html 1.5343542884595447e-09
l/a/s/Talk~Last_call_bell_9efc.html 1.5258245488877366e-09
b/o/x/Boxer_short.html 1.5126760922251036e-09
y/o/s/Talk~Yoshi_8aca.html 1.5125156254334405e-09
k/e/a/Template~Keane_ab84.html 1.5026694185209174e-09
c/h/e/Talk~Chessmaster_9acc.html 1.4843991055253728e-09
f/r/i/Fridge.html 1.4719289083232505e-09
t/h/e/Template~The_Beatles_af0c.html 1.4711972013024436e-09
s/o/n/Sony_PSP_ce81.html 1.4613193800330668e-09
i/n/f/Template_talk~Infobox_Movie_e83f.html 1.4477288838968374e-09
r/e/g/Regions_(Peru)_43b5.html 1.426279616137158e-09
i/s/s/Issac_Newton_e50b.html 1.4252558351958365e-09
g/w/i/Gwinnett_County,_Georgia_020f.html 1.411658671011569

r/e/p/Republic_of_Namibia_0092.html 0.0
r/e/p/Republic_of_Malawi_9c2a.html 0.0
r/e/p/Republic_of_Zimbabwe_f568.html 0.0
r/e/p/Republiek_Suriname_a3de.html 0.0
r/e/p/Republic_of_Liberia_5d45.html 0.0
r/e/p/Republic_of_Sudan_941e.html 0.0
r/e/p/Republic_of_Singapore_c1c5.html 0.0
r/s/a/Rsa.html 0.0
r/f/d/Template~RfD2_6cd3.html 0.0
r/f/d/Template~Rfd_23cc.html 0.0
r/f/d/Template~Rfd2_d76d.html 0.0
r/f/c/RFC_2822_773a.html 0.0
r/f/c/RFC822_7358.html 0.0
r/f/c/RFC_822_71fb.html 0.0
r/h/i/Rhinocerotidae.html 0.0
r/h/i/Rhine_river.html 0.0
r/h/i/Rhino.html 0.0
r/h/a/Rhamhorhynychus.html 0.0
r/h/a/Rhamhorhynchus.html 0.0
r/h/e/Rhea_Sylvia_4446.html 0.0
r/h/e/Rheinland-Pfalz_1e48.html 0.0
r/h/ö/Rhön_Mountains_a6f1.html 0.0
r/h/y/Rhythm_&_blues.html 0.0
r/b/_/WP~RB_b5c0.html 0.0
t/o/o/Tooth_floss.html 0.0
t/o/o/Toowoomba.html 0.0
t/o/w/Towomba.html 0.0
t/o/w/Tower_of_Pisa_cc47.html 0.0
t/o/l/Toledo,_OH_6929.html 0.0
t/o/c/Template~TOC_a775.html 0.0
t/o/k/Tokyo,_Japan_fec7.html 0.0
t/o/k/Tokio.h

p/e/a/Talk~Pearl_Harbour_Incident_6de3.html 0.0
p/e/r/Periodic_chart.html 0.0
p/e/r/Percussion_instruments.html 0.0
p/e/r/Persian_Leopard_87ad.html 0.0
p/e/t/Peter_Rubens_56b0.html 0.0
p/e/t/Peter_cushing.html 0.0
p/e/t/Peter_Paul_Reubens_c913.html 0.0
p/e/t/Peter_the_great.html 0.0
p/e/t/Petrol_bomb.html 0.0
p/e/t/Pete_Ham_5df6.html 0.0
p/e/t/Pete_doherty.html 0.0
p/e/t/Peter_I_f224.html 0.0
p/e/t/Petrus_Paulus_Rubens_748a.html 0.0
p/e/t/Peter_Tchaikovsky_d35a.html 0.0
p/e/t/Peter_tchaikovsky.html 0.0
p/e/e/Pee-Wee_Herman_e55f.html 0.0
p/e/e/Pee-wee_Herman_f2d7.html 0.0
p/e/e/Pee_Wee_Herman_fb11.html 0.0
p/e/s/Pestle.html 0.0
p/e/n/Penguins.html 0.0
p/e/n/Penile_erection.html 0.0
p/e/n/Talk~Penelope_Taynt_993f.html 0.0
p/e/n/Pennies.html 0.0
p/e/p/Pepperoncini.html 0.0
p/s/1/PS1_a14e.html 0.0
p/s/2/PS2_0df1.html 0.0
p/s/y/Psychedelic_mushrooms.html 0.0
p/y/g/Pygmy_Chimpanzee_2eb5.html 0.0
p/y/t/Pythagoras'_Theorem_7a9c.html 0.0
p/h/o/Pholcus_phalangioides.html 0.0
p/h/o/Phol.html 0.0


In [67]:
%%time

lista = []
for key in contents.keys():
        lower_key_content = [x.lower() for x in contents[key]]
        mask = ['milan' in x for x in lower_key_content]
        #if topic in lower_key_content:
        if any(mask):
            lista.append(lower_key_content)
            #print(lower_key_content)

CPU times: user 85.1 ms, sys: 0 ns, total: 85.1 ms
Wall time: 83.5 ms
