<a href="https://colab.research.google.com/github/Robprogram2002/PageRank_implementacion_Python/blob/main/web_crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# importmos las librerias necesarias
import requests  # libreria para acceder a paginas web
from bs4 import BeautifulSoup  # libreria para manipular el contenido de una pagina web
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# creamos las variables donde guardaremos la información

# grafo directo que representará la red de paginas
G = nx.DiGraph()
# diccionario para almacenar cada url con su codigo numerico (hash) 
pages = {}
# diccionario para almacenar las relaciones entre las paginas
relations = {}

In [None]:
# función que asigna a cada cadena de texto un valor numerico único (hash)
def hash(s):
    h = 0;
    for i in s:
        h = 131 * h + ord(i)
    return h

In [None]:
# pagina web de la Wikipedia que será usada como base para obtener el resto de 
# paginas y sus relaciones
root = 'https://en.wikipedia.org/wiki/Machine_learning'
wiki_hash = hash(root)
wiki_hash

6439427782865357410

In [None]:
def crawler(url):
  """
  Dada la url de una pagina web de la Wikipedia, se extraen todos los enlaces 
  en la pagina a otras del mismo sitio y se almacenan.
  """

  try:
    # intentamos acceder a la pagina con la url recivida
    response = requests.get(url)
  except requests.exceptions.ConnectionError:
    print('Given URL: %s is not available'% url)
    return
  
  # creamos el hash asociado a esta url
  page_hash = hash(url)
  content = BeautifulSoup(response.text, 'html')
  main = content.find("div", {"id": "mw-content-text"})
  try:
    # obtnemos todos los links en el contenido central de la página
    links = main.findAll('a', href= True)
    base = 'https://en.wikipedia.org'

    # iteramos sobre cada uno de estos enlaces
    for i in links[:int(len(links)/4)]:
      # se almacena la url a la pagina de destino
      child_url = i['href']

      # se filtran y corrigen las urls para quedarse solo con las que
      # referencian a otras paginas de la wikipedia
      if child_url[0] == '#':
        continue
      elif child_url[0] == '/':
        child_url = base + child_url
      if '/wiki/' not in child_url:
        continue
      
      # se guarda el hash de la url
      child_hash = hash(child_url)
      if child_hash not in pages:
        pages[child_hash] = child_url

      # se guarda la relación entre las paginas
      if page_hash in relations:
        relations[page_hash].append(child_hash)
      else:
        relations[page_hash] = [child_hash]

  except:
    return

In [None]:
# ejecutamos la función anterior sobre la pagina tomada como base
pages[wiki_hash] = root
crawler(root)

In [None]:
# se muestra el número de paginas recorridas
len(list(pages.keys()))

7935

In [None]:
# y las relaciones encontradas
len(relations[wiki_hash])

286

In [None]:
# ahora se repite el proceso para cada una de las paginas a las que esta conectada
# la pagina base
for x in relations[wiki_hash]:
  crawler(pages[x])

In [None]:
# con esta información se crea el grafo
G = nx.DiGraph()
keys = list(relations.keys())
for key in keys:
  for link in relations[key]:
    G.add_edge(key, link)

key 1 of 226
key 2 of 226
key 3 of 226
key 4 of 226
key 5 of 226
key 6 of 226
key 7 of 226
key 8 of 226
key 9 of 226
key 10 of 226
key 11 of 226
key 12 of 226
key 13 of 226
key 14 of 226
key 15 of 226
key 16 of 226
key 17 of 226
key 18 of 226
key 19 of 226
key 20 of 226
key 21 of 226
key 22 of 226
key 23 of 226
key 24 of 226
key 25 of 226
key 26 of 226
key 27 of 226
key 28 of 226
key 29 of 226
key 30 of 226
key 31 of 226
key 32 of 226
key 33 of 226
key 34 of 226
key 35 of 226
key 36 of 226
key 37 of 226
key 38 of 226
key 39 of 226
key 40 of 226
key 41 of 226
key 42 of 226
key 43 of 226
key 44 of 226
key 45 of 226
key 46 of 226
key 47 of 226
key 48 of 226
key 49 of 226
key 50 of 226
key 51 of 226
key 52 of 226
key 53 of 226
key 54 of 226
key 55 of 226
key 56 of 226
key 57 of 226
key 58 of 226
key 59 of 226
key 60 of 226
key 61 of 226
key 62 of 226
key 63 of 226
key 64 of 226
key 65 of 226
key 66 of 226
key 67 of 226
key 68 of 226
key 69 of 226
key 70 of 226
key 71 of 226
key 72 of 226
k

In [None]:
G.number_of_nodes()

7935

In [None]:
G.number_of_edges()

19680

In [None]:
def getRank(tr_mat,N, treshold):
  pi_0 = np.array([1/N for i in range(N)])
  pi_last = pi_0
  pi_next = np.matmul(pi_0, tr_mat)
  while np.linalg.norm(pi_next-pi_last) > treshold:
    pi_last = pi_next
    pi_next = np.matmul(pi_last, tr_mat)
  return pi_next

def pageRank(links, graph = None, threshold = 0.1, alpha = 0.15): 
  if graph is None:
    G = nx.DiGraph()
    G.add_edges_from(links)
  else:
    G = graph

  nodes = list(G.nodes)
  N = len(nodes)
  P = np.zeros((N,N))

  
  for i in range(N):
    nbrs = list(G.neighbors(nodes[i]))
    n = len(nbrs)
    for j in range(N):
      if nodes[j] in nbrs:
        P[i][j] = 1 / n
      else:
        P[i][j] = 0

  P = (1-alpha)*P + (alpha / N)*np.ones((N,N))
   
  stationary = getRank(P, N, threshold)
  return {
      'ranking': pd.Series(stationary, index = nodes).sort_values(ascending=False),
      'matrix': pd.DataFrame(P, nodes, nodes),
      'graph': G
  }

In [None]:
# Finalmente se aplica el algoritmo desarrollado a el grafo creado con la información
# recuperada
result = pageRank(None, graph = G)

In [None]:
# se muestra el top 10 del ranking 
rank = result['ranking']
top_10 = rank[:10]
index = [pages[i] for i in list(top_10.index)]
top_10.index = index
top_10

https://en.wikipedia.org/wiki/Machine_learning              0.000267
https://en.wikipedia.org/wiki/Artificial_neural_network     0.000178
https://commons.wikimedia.org/wiki/Main_Page                0.000163
https://en.wikipedia.org/wiki/Statistical_classification    0.000162
https://en.wikipedia.org/wiki/Perceptron                    0.000156
https://en.wikipedia.org/wiki/Supervised_learning           0.000155
https://en.wikipedia.org/wiki/Data_mining                   0.000153
https://en.wikipedia.org/wiki/Regression_analysis           0.000151
https://en.wikipedia.org/wiki/Unsupervised_learning         0.000145
https://en.wikipedia.org/wiki/Cluster_analysis              0.000143
dtype: float64