# Assignement : PageRank Algorithm

In [None]:
import numpy as np
from numpy import matrix
import pandas as pd
from scipy.sparse import coo_matrix

In [4]:



T = matrix([
    [0, 0, 1, 1/2],
    [1/3, 0, 0, 0],
    [1/3, 1/2, 0, 1/2],
    [1/3, 1/2, 0, 0],
])
p = matrix([[1/4], [1/4], [1/4], [1/4]])
for i in range(100):
   p = T*p
p



matrix([[0.38709677],
        [0.12903226],
        [0.29032258],
        [0.19354839]])

In [5]:
# la matrice T pour le snapshot de wikipedia sera gigantesque mais avec plein de 0
# Donc on va juste récupérer les points d'intéret du style ((1,4,1/2), (2,1,1/3))
# utiliser pandas pour lire le csv.
# faire la boucle for avec numpy parce qu'il utilise du c++ (plus rapide) et les fichiers csv sont gros.
# proposition : probabilité après k+1 = stochastic 



# --- LOAD DATA ---

names = pd.read_csv("wikidata/names.csv")
edges = pd.read_csv("wikidata/edges.csv")

# Récupération de tout les id, entrant comme sortant.
all_ids = pd.concat([edges["FromNode"], edges["ToNode"]]).unique()
id_to_idx = {node_id: i for i, node_id in enumerate(all_ids)}
N = len(all_ids)


print("Nombre de noeuds :", N)


Nombre de noeuds : 199903


In [6]:


# --- CONSTRUCTION DE LA MATRICE DE TRANSITION SPARSE ---

# On calcule le nombre de liens sortants pour normaliser les appels vers une page
out_degree = edges["FromNode"].value_counts()

rows = []   # lignes = destinations (ToNode)
cols = []   # colonnes = sources (FromNode)
data = []   # valeurs = 1/out_degree

for f, t in zip(edges["FromNode"], edges["ToNode"]):
    cols.append(id_to_idx[f])
    rows.append(id_to_idx[t])
    data.append(1.0 / out_degree[f])
    if len(cols) < 10:
        print("f :", f, " t :", t, " rows :", rows, " cols :", cols, " data :", data )

# Construction de la matrice T en format sparse CSR
T = coo_matrix((data, (rows, cols)), shape=(N, N)).tocsr()




f : 175973  t : 1  rows : [38619]  cols : [0]  data : [np.float64(0.025)]
f : 130880  t : 2  rows : [38619, 52918]  cols : [0, 1]  data : [np.float64(0.025), np.float64(0.007936507936507936)]
f : 145856  t : 2  rows : [38619, 52918, 52918]  cols : [0, 1, 2]  data : [np.float64(0.025), np.float64(0.007936507936507936), np.float64(0.012658227848101266)]
f : 159190  t : 2  rows : [38619, 52918, 52918, 52918]  cols : [0, 1, 2, 3]  data : [np.float64(0.025), np.float64(0.007936507936507936), np.float64(0.012658227848101266), np.float64(0.0625)]
f : 159200  t : 2  rows : [38619, 52918, 52918, 52918, 52918]  cols : [0, 1, 2, 3, 4]  data : [np.float64(0.025), np.float64(0.007936507936507936), np.float64(0.012658227848101266), np.float64(0.0625), np.float64(0.017543859649122806)]
f : 159207  t : 2  rows : [38619, 52918, 52918, 52918, 52918, 52918]  cols : [0, 1, 2, 3, 4, 5]  data : [np.float64(0.025), np.float64(0.007936507936507936), np.float64(0.012658227848101266), np.float64(0.0625), np.flo

In [7]:
# --- PAGE RANK ---

alpha = 0.85
p = np.ones(N) / N
teleport = (1 - alpha) / N

for _ in range(110):  # converge généralement avant
    p = alpha * (T @ p) + teleport

# --- EXTRACTION DES TOP PAGES ---

top_k = 100
top_idx = np.argsort(-p)[:top_k]

for rank, idx in enumerate(top_idx, 1):
    original_id = all_ids[idx]
    if original_id < len(names):
        name = names.iloc[original_id-1]["Name"]
    else:
        name = "(unknown)"
    print(f"{rank}. {name} — score {p[idx]:.6f}")

1. United States — score 0.002348
2. United Kingdom — score 0.001282
3. World War II — score 0.001051
4. France — score 0.000990
5. Latin — score 0.000853
6. Germany — score 0.000847
7. Canada — score 0.000729
8. English language — score 0.000695
9. China — score 0.000694
10. India — score 0.000679
11. Italy — score 0.000657
12. Catholic Church — score 0.000625
13. Australia — score 0.000623
14. England — score 0.000621
15. World War I — score 0.000579
16. London — score 0.000572
17. Europe — score 0.000568
18. Mathematics — score 0.000553
19. Russia — score 0.000521
20. Greek language — score 0.000509
21. Japan — score 0.000491
22. New York City — score 0.000485
23. Soviet Union — score 0.000468
24. French language — score 0.000452
25. Spain — score 0.000448
26. Netherlands — score 0.000429
27. Paris — score 0.000418
28. Middle Ages — score 0.000405
29. European Union — score 0.000390
30. Washington, D.C. — score 0.000385
31. Scotland — score 0.000369
32. New York — score 0.000346
33.

In [None]:
# la matrice T pour le snapshot de wikipedia sera gigantesque mais avec plein de 0
# Donc on va juste récupérer les points d'intéret du style ((1,4,1/2), (2,1,1/3))
# utiliser pandas pour lire le csv.
# faire la boucle for avec numpy parce qu'il utilise du c++ (plus rapide) et les fichiers csv sont gros.
# proposition : probabilité après k+1 = stochastic 

# ==== CODE FINAL

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

# --- LOAD DATA ---

names = pd.read_csv("wikidata/names.csv")
edges = pd.read_csv("wikidata/edges.csv")

# Récupération de tout les id, entrant comme sortant.
all_ids = pd.concat([edges["FromNode"], edges["ToNode"]]).unique()
id_to_idx = {node_id: i for i, node_id in enumerate(all_ids)}
N = len(all_ids)


print("Nombre de noeuds :", N)


# --- CONSTRUCTION DE LA MATRICE DE TRANSITION SPARSE ---

# On calcule le nombre de liens sortants pour normaliser les appels vers une page
out_degree = edges["FromNode"].value_counts()

rows = []   # lignes = destinations (ToNode)
cols = []   # colonnes = sources (FromNode)
data = []   # valeurs = 1/out_degree

for f, t in zip(edges["FromNode"], edges["ToNode"]):
    cols.append(id_to_idx[f])
    rows.append(id_to_idx[t])
    data.append(1.0 / out_degree[f])
    if len(cols) < 10:
        print("f :", f, " t :", t, " rows :", rows, " cols :", cols, " data :", data )

# Construction de la matrice T en format sparse CSR
T = coo_matrix((data, (rows, cols)), shape=(N, N)).tocsr()



# --- PAGE RANK ---

alpha = 0.85
p = np.ones(N) / N
teleport = (1 - alpha) / N

for _ in range(110):  # converge généralement avant
    p = alpha * (T @ p) + teleport

# --- EXTRACTION DES TOP PAGES ---

top_k = 100
top_idx = np.argsort(-p)[:top_k]

for rank, idx in enumerate(top_idx, 1):
    original_id = all_ids[idx]
    if original_id < len(names):
        name = names.iloc[original_id-1]["Name"]
    else:
        name = "(unknown)"
    print(f"{rank}. {name} — score {p[idx]:.6f}")

Nombre de noeuds : 199903
1. Hollywood Bowl — score 0.002348
2. The Independent — score 0.001282
3. Phoney War — score 0.001051
4. INSEE code — score 0.000990
5. Unclean animal — score 0.000853
6. Brandenburg an der Havel — score 0.000847
7. George A. Drew — score 0.000729
8. United Nations in popular culture — score 0.000695
9. Sichuan Basin — score 0.000694
10. Non-resident Indian and person of Indian origin — score 0.000679
11. Paolo Rossi — score 0.000657
12. Roman Catholic Archdiocese of Berlin — score 0.000625
13. Aboriginal Australians — score 0.000623
14. Terry Venables — score 0.000621
15. Northwood Headquarters — score 0.000579
16. Hyde Park, London — score 0.000572
17. Codling moth — score 0.000568
18. Function (mathematics) — score 0.000553
19. Volga Federal District — score 0.000521
20. Lacuna (manuscripts) — score 0.000509
