In [1]:
import numpy as np
import pandas as pd

## PageRank Problem

In [2]:
# Read the link table from CSV (change the path to wherever your copy lives)
# and convert it to a plain NumPy array, which the rest of the notebook
# indexes by row/column position.
links = pd.read_csv('LinkData.csv').to_numpy(copy=True)

In [3]:
# One row of `links` per page, so the row count is the number of pages.
npages = len(links)
# Square npages x npages matrix, zero-filled for now; the entries are
# assigned in a later step.
P = np.zeros(shape=(npages, npages))

In [4]:
# Fill P: a True in links[row, col] means page `row` links to page `col`.
# If page `row` has k outgoing links, each linked page gets probability 1/k
# and every other entry in that row stays 0.
link_mask = links.astype(bool)  # force a boolean mask: 0/1 integers used as an index would select columns 0 and 1 instead
out_degree = link_mask.sum(axis=1, keepdims=True)  # outgoing links per page, shape (npages, 1)
P[:] = 0.0  # start from a clean matrix so re-running this cell gives the same result
# Divide row-wise in place; rows with no outgoing links are skipped (they stay
# all zero) instead of triggering a divide-by-zero warning.
np.divide(link_mask, out_degree, out=P, where=out_degree > 0)


In [5]:
# Look up a single entry of P: row = page we start on, column = page clicked.
start_page, click_page = 16, 17
P[start_page][click_page]

0.025

In [6]:
# Add up one column of P (all entries leading into page `col_to_sum`).
col_to_sum = 92
P[:, col_to_sum].sum()

0.24447055137844612

In [7]:
# Starting distribution as a 1 x npages row vector: every page is equally likely.
v = np.full((1, npages), 1.0 / npages)
v

array([[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
        0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
        0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
        0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
        0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
        0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
        0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
        0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
        0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
        0.01]])

In [8]:
# One step: multiply the row vector v by P.
v2 = np.matmul(v, P)
v2

array([[0.00573833, 0.00769424, 0.00555043, 0.0055146 , 0.00453227,
        0.00557325, 0.0051904 , 0.00556341, 0.00457945, 0.00554611,
        0.00753453, 0.00781997, 0.00757092, 0.00771049, 0.01407555,
        0.01193625, 0.01130167, 0.00886274, 0.00583722, 0.01185877,
        0.00841383, 0.00609158, 0.00880036, 0.00779918, 0.00968227,
        0.0125245 , 0.01052719, 0.0096278 , 0.01128387, 0.01170446,
        0.01019642, 0.00963237, 0.00916069, 0.01089168, 0.01576276,
        0.01121872, 0.01460209, 0.01054923, 0.01640059, 0.01002743,
        0.011999  , 0.01023118, 0.01506729, 0.01101342, 0.01178936,
        0.01551068, 0.01027617, 0.01456252, 0.01014159, 0.01872343,
        0.01162508, 0.01434055, 0.01478432, 0.01454741, 0.01396555,
        0.0092492 , 0.01396915, 0.01273325, 0.01607843, 0.01515518,
        0.0145669 , 0.01379209, 0.00982474, 0.00944187, 0.00992358,
        0.01467456, 0.0158174 , 0.00667514, 0.01539148, 0.01079153,
        0.01045996, 0.00945565, 0.01121931, 0.01

In [9]:
# Sum of all entries of v2.
v2.sum()

1.0

In [10]:
# Sum of the entries of v2 for pages 0 through 24.
v2[0, :25].sum()

0.1907778199377474

In [11]:
# Sum of the entries of v2 for pages 75 through the last page.
v2[0, 75:].sum()

0.19714336664103238

In [12]:
# Repeatedly multiply the uniform starting vector by P.
nloop = 10
vold = np.full((1, npages), 1.0 / npages)
for i in range(nloop):
    # `@` returns a brand-new array each time, so rebinding vold is safe
    # without an explicit copy.
    vold = vold @ P

# One more step, then measure the largest entry-wise change it caused;
# a tiny value means nloop iterations were enough.
vnew = vold @ P
np.abs(vnew - vold).max()

3.811623702429379e-09

In [13]:
# Display vnew: the uniform start vector after the nloop multiplications by P plus one more.
vnew

array([[0.00429773, 0.00750547, 0.0049219 , 0.00391867, 0.00459899,
        0.00490018, 0.00402286, 0.00490787, 0.0053511 , 0.00580511,
        0.00790014, 0.00915911, 0.00793979, 0.00779602, 0.01460387,
        0.0124883 , 0.01121931, 0.01017101, 0.00593845, 0.01163902,
        0.0082568 , 0.00682729, 0.00934004, 0.00882597, 0.01008305,
        0.01202228, 0.01071045, 0.01091075, 0.00934851, 0.0112114 ,
        0.01117286, 0.00913975, 0.00925464, 0.00963831, 0.01431424,
        0.0095833 , 0.01571117, 0.01124787, 0.01544025, 0.01063508,
        0.01113467, 0.01038905, 0.01418032, 0.01168492, 0.01250504,
        0.01437684, 0.00971705, 0.0141186 , 0.01075935, 0.01829176,
        0.01168584, 0.01546038, 0.01449708, 0.01420542, 0.01426497,
        0.01057476, 0.01249518, 0.01241953, 0.01554644, 0.01548465,
        0.01380967, 0.01348518, 0.01058995, 0.01057582, 0.00933189,
        0.01462574, 0.0158777 , 0.00685918, 0.0144265 , 0.01085781,
        0.01120941, 0.00860189, 0.01127566, 0.01

In [14]:
# Value of vnew for one chosen page.
which_page = 60
vnew[0][which_page]

0.013809671904806699

In [15]:
# Index of the page with the largest value in vnew.
vnew.argmax()

49

In [16]:
# Index of the page with the smallest value in vnew.
vnew.argmin()

92

In [17]:
# Page indices ordered from the largest vnew value down to the smallest
# (argsort is ascending, so the result is reversed).
np.argsort(vnew[0])[::-1]

array([49, 66, 36, 58, 59, 51, 38, 65, 14, 52, 68, 45, 34, 54, 53, 42, 47,
       60, 61, 77, 76, 73, 44, 56, 15, 57, 78, 25, 50, 43, 19, 83, 81, 72,
       37, 16, 29, 70, 30, 88, 40, 27, 69, 48, 26, 39, 62, 63, 55, 79, 41,
       89, 75, 17, 24, 46, 33, 35, 74, 28, 22, 64, 32, 11, 31, 23, 80, 96,
       71, 84, 20, 12, 10, 85, 13, 86, 82, 90,  1, 95, 87, 67, 21, 18,  9,
        8,  2,  7,  5, 91, 98,  4, 97,  0,  6,  3, 94, 93, 99, 92],
      dtype=int64)