# PageRank

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import linalg as la
from scipy import sparse as spa

### Problem 1

In [2]:
# Adjacency matrix of the 8-node example graph. A[i, j] = 1 encodes a link
# from node i to node j (the same orientation p_1 builds from the
# 'From Node' / 'To Node' columns). Row 2 is all zeros: node 2 has no
# outgoing links.
A = np.array([[ 0, 0, 0, 0, 0, 0, 0, 1],
              [ 1, 0, 0, 0, 0, 0, 0, 0],
              [ 0, 0, 0, 0, 0, 0, 0, 0],
              [ 1, 0, 1, 0, 0, 0, 1, 0],
              [ 1, 0, 0, 0, 0, 1, 1, 0],
              [ 1, 0, 0, 0, 0, 0, 1, 0],
              [ 1, 0, 0, 0, 0, 0, 0, 0],
              [ 1, 0, 0, 0, 0, 0, 0, 0]])

In [3]:
def p_1(filename, N):
    """Build an N x N adjacency (edge-count) matrix from an edge-list file.

    Parameters
    ----------
    filename : str or path-like
        Tab-delimited file with a header row containing the columns
        'From Node' and 'To Node'.
    N : int
        Number of nodes; only edges whose endpoints are integers in
        0..N-1 are counted.

    Returns
    -------
    numpy.ndarray
        Float array of shape (N, N) where entry [i, j] is the number of
        rows in the file with From Node == i and To Node == j.
    """
    rawdata = pd.read_csv(filename, delimiter='\t')

    # Entries that are missing or cannot be parsed as numbers become NaN and
    # are dropped by the mask below.
    src = pd.to_numeric(rawdata['From Node'], errors='coerce').to_numpy(dtype=float)
    dst = pd.to_numeric(rawdata['To Node'], errors='coerce').to_numpy(dtype=float)

    # Keep only edges whose endpoints are integer-valued and inside 0..N-1.
    with np.errstate(invalid='ignore'):
        keep = ((src >= 0) & (src < N) & (src == np.floor(src))
                & (dst >= 0) & (dst < N) & (dst == np.floor(dst)))

    Adj = np.zeros((N, N))
    # np.add.at accumulates repeated (i, j) pairs, so duplicate edges are
    # counted; a single pass replaces N*N scans of the whole table.
    np.add.at(Adj, (src[keep].astype(int), dst[keep].astype(int)), 1)
    return Adj

In [4]:
# Build the 8-node adjacency matrix from the tab-delimited edge list and
# display it (last expression of the cell).
adj_1 = p_1('matrix.txt', N=8)
adj_1

array([[0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.]])

### Problem 2

In [5]:
def p_2(A):
    """Turn an adjacency matrix into a column-stochastic transition matrix.

    Rows of ``A`` that sum to zero (nodes without outgoing links) are
    treated as linking to every node. Each row is then divided by its sum
    and the result is transposed, so every column of ``K`` sums to one.

    Parameters
    ----------
    A : array_like, shape (n, n)
        Adjacency matrix; ``A[i, j]`` is the weight of the link i -> j.
        The input is not modified.

    Returns
    -------
    K : numpy.ndarray, shape (n, n)
        Transposed row-normalised matrix.
    D : numpy.ndarray, shape (n, 1)
        Column vector of row sums used for the normalisation (after
        zero-sum rows were replaced by ones).

    Raises
    ------
    ValueError
        If ``A`` is not a square 2-D array.
    """
    # Work on a float copy so the caller's matrix is never overwritten.
    M = np.array(A, dtype=float)
    if M.ndim != 2 or M.shape[0] != M.shape[1]:
        raise ValueError("A must be a square 2-D array")
    n = M.shape[0]

    # Rows with zero total out-weight are replaced by a row of ones.
    M[M.sum(axis=1) == 0, :] = 1.0

    D = M.sum(axis=1).reshape(n, 1)  # note D is a column vector
    K = (M / D).T
    return K, D

In [6]:
# p_2 returns the pair (K, D), so despite its name `K` is bound to that whole
# tuple here; print therefore shows both the matrix and the row-sum vector.
K = p_2(A)
print(K)

(array([[0.        , 1.        , 0.125     , 0.33333333, 0.33333333,
        0.5       , 1.        , 1.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.33333333, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.33333333,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.33333333, 0.33333333,
        0.5       , 0.        , 0.        ],
       [1.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ]]), array([[1.],
       [1.],
       [8.],
       [3.],
       [3.],
       [2.],
       [1.],
  

### Problem 3

In [7]:
def p_3(A, N=None, d=0.85, tol=1e-5, maxiters=500):
    """Approximate the PageRank vector by fixed-point iteration.

    Iterates ``p <- d * K @ p + (1 - d) / n`` from a random probability
    vector until the Euclidean distance between successive iterates is at
    most ``tol`` or ``maxiters`` iterations have run.

    Parameters
    ----------
    A : numpy.ndarray, shape (m, m)
        Adjacency matrix, in the orientation expected by ``p_2``.
    N : int, optional
        Use only the leading N x N block of ``A``. Defaults to all of ``A``.
    d : float
        Damping factor.
    tol : float
        Convergence tolerance on the 2-norm of the update.
    maxiters : int
        Upper bound on the number of iterations.

    Returns
    -------
    numpy.ndarray, shape (n,)
        The final iterate. The starting vector is drawn with
        ``np.random.rand``, so results vary slightly between runs unless
        the global NumPy seed is fixed.
    """
    n = A.shape[0] if N is None else N

    # Pass a copy of the leading block: a plain slice is a view, and handing
    # a view to a routine that rewrites rows would alter the caller's matrix.
    K, _ = p_2(np.array(A[0:n, 0:n], dtype=float))

    p = np.random.rand(n)
    p = p / p.sum()  # make probabilities sum to one for initial guess

    for _ in range(maxiters):
        p_next = d * (K @ p) + (1 - d) / n
        diff = np.linalg.norm(p_next - p)
        p = p_next
        if diff <= tol:
            break

    return p

In [8]:
# Iterative PageRank estimate for the 8-node example graph (default d=0.85).
p_3(A, N=8)

array([0.43869566, 0.02171029, 0.02786154, 0.02171029, 0.02171029,
       0.02786154, 0.04585394, 0.39459646])

### Problem 4

In [11]:
def p_4(A, N=None, d=0.85):
    """Compute the PageRank vector as the dominant eigenvector of B.

    Builds ``B = d * K + ((1 - d) / n) * E`` (``E`` the all-ones matrix,
    ``K`` from ``p_2``), takes the eigenvector belonging to the eigenvalue
    with the largest real part, and scales it so its entries sum to one.

    Parameters
    ----------
    A : numpy.ndarray, shape (m, m)
        Adjacency matrix, in the orientation expected by ``p_2``.
    N : int, optional
        Use only the leading N x N block of ``A``. Defaults to all of ``A``.
    d : float
        Damping factor.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Real-valued eigenvector normalised to sum to one.
    """
    n = A.shape[0] if N is None else N

    # Restrict to the leading block so K and the all-ones term have matching
    # shapes when N is given; copy so the caller's matrix is left untouched.
    K, _ = p_2(np.array(A[0:n, 0:n], dtype=float))
    E = np.ones((n, n))
    B = d * K + ((1 - d) / n) * E

    eigs, vecs = np.linalg.eig(B)

    # eig may return complex arrays for a non-symmetric matrix; rank the
    # eigenvalues by real part and keep the real part of the chosen vector.
    dominant = np.argmax(eigs.real)
    evec = np.real(vecs[:, dominant])
    return evec / np.sum(evec)
    
    

In [13]:
# Eigenvector-based PageRank for the same graph, for comparison with p_3.
p_4(A)

array([0.43869288, 0.02171029, 0.02786154, 0.02171029, 0.02171029,
       0.02786154, 0.04585394, 0.39459924])

### Problem 5

In [236]:
from numba import vectorize, jit, njit, float64, prange
# Load the Problem 5 data; header=0 takes the column names from the first row.
rawdata = pd.read_csv('ncaa2013.csv',header=0)
rawdatanp = rawdata.values #convert pandas to numpy array so we can use jit

In [288]:
q, m = rawdatanp.shape #q is length, m is width
# Sorted unique labels appearing in either of the first two columns; a
# label's position in `names` is its row/column index in Adj.
names = np.unique(np.concatenate((rawdatanp[:,0], rawdatanp[:,1])))
n = len(names)
# Empty n x n adjacency matrix, filled in by filladj_1.
Adj = np.zeros((n,n))

In [304]:
def filladj_1(Adj, q, names, rawdatanp):
    """Mark an edge in ``Adj`` for each of the first ``q`` rows of the data.

    For row ``i`` the entry ``Adj[idx(rawdatanp[i, 1]), idx(rawdatanp[i, 0])]``
    is set to 1, where ``idx`` is the position of a label in ``names``.
    Repeated pairs leave the entry at 1.

    Parameters
    ----------
    Adj : numpy.ndarray, shape (n, n)
        Matrix to fill; it is modified in place.
    q : int
        Number of leading rows of ``rawdatanp`` to process.
    names : numpy.ndarray
        1-D array of distinct labels defining the node order.
    rawdatanp : numpy.ndarray
        2-D array whose columns 0 and 1 hold labels found in ``names``.

    Returns
    -------
    numpy.ndarray
        The same ``Adj`` object that was passed in.

    Raises
    ------
    KeyError
        If a label in the processed rows does not occur in ``names``.
    """
    # One dictionary lookup per label instead of scanning `names` per row.
    index_of = {label: k for k, label in enumerate(np.asarray(names).tolist())}

    # Explicit integer dtype keeps the index arrays valid when q == 0.
    cols = np.array([index_of[rawdatanp[i, 0]] for i in range(q)], dtype=np.intp)
    rows = np.array([index_of[rawdatanp[i, 1]] for i in range(q)], dtype=np.intp)

    Adj[rows, cols] = 1
    return Adj

In [307]:
# filladj_1 writes into Adj and returns it, so `filledad` and `Adj` refer to
# the same array.
filledad = filladj_1(Adj, q, names, rawdatanp)

In [308]:
# Spot check: number of nonzero entries in the row belonging to names[0].
filledad[0].sum()

11.0

In [311]:
# Iterative PageRank on the full matrix with damping factor d=0.7.
probs = p_3(filledad, d=0.7)
# Sanity check: the iterate is a probability vector, so this should be ~1.
probs.sum()

1.0

In [312]:
# argsort is ascending, so take the last five indices and reverse them to get
# the five largest scores in descending order.
rank_id = probs.argsort()[-5:][::-1]  # Top 5 teams

list(names[rank_id])

['Duke', 'Butler', 'Louisville', 'Illinois', 'Indiana']