## Location Authority

This section is to calculate location authority by SALSA Algorithm

In [3]:
import pandas as pd
import numpy as np

dir = 'E:\\Sebnewrepo\\Data\\checkin_data\\dataset_tsmc2014/'
checkin_file = 'dataset_TSMC2014_NYC.txt'

In [4]:
col = ['user_id',
       'poi_id',
       'poi_category_id',
       'poi_category_name',
       'latitude', 
       'longitude',
       'time_offset',
       'UTC_time']
df = pd.read_csv(dir + checkin_file, delimiter = "\t", names = col)
df = df[['user_id','poi_id']]
df.head()

Unnamed: 0,user_id,poi_id
0,470,49bbd6c0f964a520f4531fe3
1,979,4a43c0aef964a520c6a61fe3
2,69,4c5cc7b485a1e21e00d35711
3,395,4bc7086715a7ef3bef9878da
4,87,4cf2c5321d18a143951b5cec


In [5]:
# remove infrequent items and users
from copy import deepcopy
def rm_infrequent_items(data, min_counts):
    df = deepcopy(data)
    counts = df['poi_id'].value_counts()
    df = df[df['poi_id'].isin(counts[counts >= min_counts].index)]
    print("POIs with < {} interactoins are removed".format(min_counts))
    return df
def rm_infrequent_users(data, min_counts):
    df = deepcopy(data)
    counts = df['user_id'].value_counts()
    df = df[df["user_id"].isin(counts[counts >= min_counts].index)]
    print("users with < {} interactoins are removed".format(min_counts))
    return df
          
df = rm_infrequent_users(df, 5)
df = rm_infrequent_items(df, 5)
print('num of users:{}, num of POIs:{}'.format(len(df['user_id'].unique()), len(df['poi_id'].unique())))

users with < 5 interactoins are removed
POIs with < 5 interactoins are removed
num of users:1083, num of POIs:9989


In [6]:
# POIs encode, and generate encode mapping
poi_cat = pd.Categorical(df['poi_id'])
poi_encode = poi_cat.codes
#generate poi mapping table
poi_mapping = pd.DataFrame({
    'poi_encode': poi_encode,
    'poi_id': df['poi_id']
    })
#drop duplicate
poi_mapping_output = poi_mapping.drop_duplicates()
df['poi_encode'] = poi_encode
df.drop(['poi_id'], axis = 1, inplace = True)
df.head(5)

Unnamed: 0,user_id,poi_encode
0,470,1230
1,979,1879
2,69,6161
4,87,6859
5,484,4017


In [7]:
df.poi_encode.nunique()

9989

In [8]:
df[df.poi_encode == 0]

Unnamed: 0,user_id,poi_encode
25842,746,0
49239,421,0
49340,417,0
62387,665,0
99727,908,0
136952,903,0
167351,268,0
168246,700,0
182171,260,0
186461,674,0


In [9]:
poi_mapping_output.head()

Unnamed: 0,poi_encode,poi_id
0,1230,49bbd6c0f964a520f4531fe3
1,1879,4a43c0aef964a520c6a61fe3
2,6161,4c5cc7b485a1e21e00d35711
4,6859,4cf2c5321d18a143951b5cec
5,4017,4b5b981bf964a520900929e3


In [10]:
# try a test dataset:
test = pd.DataFrame({
    'user_id': [1,1,1,2,2,3,3,3,4,5,6,6,7,7],
    'poi_encode': [0,1,2,0,4,3,2,2,1,0,4,3,3,1]
})
test

Unnamed: 0,user_id,poi_encode
0,1,0
1,1,1
2,1,2
3,2,0
4,2,4
5,3,3
6,3,2
7,3,2
8,4,1
9,5,0


#### 0. Create user-location adjacent matrix

In [199]:
# Adjacent matrix
def adjacent_mat(df):
    mat_row = df['user_id'].nunique()
    mat_col = df['poi_encode'].nunique()
    adj_mat = np.zeros((mat_row, mat_col))
    for index, row in df.iterrows():
        #print(row['user_id'])
        adj_mat[row['user_id'] - 1][row['poi_encode']] += 1
    return adj_mat

In [212]:
adj_mat = adjacent_mat(df)

adj_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

#### 1. Create user_location transition prob matrix G_ul 


In [213]:
# array of total check-in number of each location
num_loc = adj_mat.sum(axis = 0, dtype = 'float')
num_loc

array([16., 17.,  5., ..., 24., 13.,  9.])

In [214]:
G_ul = adj_mat / num_loc
G_ul

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.11111111],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.07692308,
        0.        ]])

In [215]:
# G_ul with random walk prob
e = 0.85
M = adj_mat.shape[1]
G_ul_random = e * G_ul + (1-e) * 1/M
# column normalize
G_ul_random_normalized = G_ul_random / G_ul_random.sum(axis = 0, dtype = 'float')
G_ul_random_normalized

array([[1.73348280e-05, 1.73348280e-05, 1.73348280e-05, ...,
        1.73348280e-05, 1.73348280e-05, 1.73348280e-05],
       [1.73348280e-05, 1.73348280e-05, 1.73348280e-05, ...,
        1.73348280e-05, 1.73348280e-05, 1.73348280e-05],
       [1.73348280e-05, 1.73348280e-05, 1.73348280e-05, ...,
        1.73348280e-05, 1.73348280e-05, 1.73348280e-05],
       ...,
       [1.73348280e-05, 1.73348280e-05, 1.73348280e-05, ...,
        1.73348280e-05, 1.73348280e-05, 1.09042488e-01],
       [1.73348280e-05, 1.73348280e-05, 1.73348280e-05, ...,
        1.73348280e-05, 1.73348280e-05, 1.73348280e-05],
       [1.73348280e-05, 1.73348280e-05, 1.73348280e-05, ...,
        1.73348280e-05, 7.54962872e-02, 1.73348280e-05]])

#### 2. Create Location_user transition prob matrix G_lu

In [216]:
base = adj_mat.T

In [217]:
# array of total check-in number of each user
num_user = base.sum(axis = 0, dtype = 'float')
num_user

array([ 70., 100.,  85., ...,  77., 231., 240.])

In [218]:
G_lu = base / num_user
G_lu

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00416667],
       [0.        , 0.        , 0.        , ..., 0.01298701, 0.        ,
        0.        ]])

In [219]:
e = 0.85
M = base.shape[1]
G_lu_random = e * G_lu + (1-e) * 1/M
# column normalize
G_lu_random_normalized = G_lu_random / G_lu_random.sum(axis = 0, dtype = 'float')
G_lu_random_normalized

array([[6.20116582e-05, 6.20116582e-05, 6.20116582e-05, ...,
        6.20116582e-05, 6.20116582e-05, 6.20116582e-05],
       [6.20116582e-05, 6.20116582e-05, 6.20116582e-05, ...,
        6.20116582e-05, 6.20116582e-05, 6.20116582e-05],
       [6.20116582e-05, 6.20116582e-05, 6.20116582e-05, ...,
        6.20116582e-05, 6.20116582e-05, 6.20116582e-05],
       ...,
       [6.20116582e-05, 6.20116582e-05, 6.20116582e-05, ...,
        6.20116582e-05, 6.20116582e-05, 6.20116582e-05],
       [6.20116582e-05, 6.20116582e-05, 6.20116582e-05, ...,
        6.20116582e-05, 6.20116582e-05, 1.64770143e-03],
       [6.20116582e-05, 6.20116582e-05, 6.20116582e-05, ...,
        5.00442135e-03, 6.20116582e-05, 6.20116582e-05]])

#### 3. Calculate location authority

In [220]:
G_ul_random_normalized.shape

(1083, 9989)

In [221]:
G_lu_random_normalized.shape

(9989, 1083)

In [225]:
def cosine_similarity(x,y):
    num = x.dot(y.T)
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    return num / denom

def mutual_rein(G_ul_random, G_lu_random):
    
    #initialize the location score vector and user score vector:
    amount_location = G_lu_random.shape[0]
    amount_user = G_ul_random.shape[0]
    w_location = (np.ones(amount_location)/amount_location).T
    w_user = (np.ones(amount_user)/amount_user).T
    
    i = 0
    while True:
        w_location_check = w_location
        w_user_check = w_user
        # mutual enforcement
        #print(w_location)
        #print(sum(w_location))
        #print(w_user)
        w_location = np.matmul(G_lu_random, w_user)
        w_user = np.matmul(G_ul_random, w_location)
        # print(G_lu_random.shape)
        # print(w_user.shape)
        i = i + 1
        #print(w_location)
        if 1 - cosine_similarity(w_location_check, w_location) < 0.0001:
            print('number of iteration: ', i)
            break
        
    return w_location

In [226]:
t = mutual_rein(G_ul_random_normalized, G_lu_random_normalized)

number of iteration:  4


In [237]:
location_authority = pd.DataFrame({'score': t})
location_authority = location_authority.reset_index()
location_authority = location_authority.rename(columns = {'index':'poi_encode'})
location_authority = location_authority.merge(poi_mapping_output, on = 'poi_encode', how = 'left')
location_authority

Unnamed: 0,poi_encode,score,poi_id
0,0,0.000102,3fd66200f964a52000e71ee3
1,1,0.000103,3fd66200f964a52001e81ee3
2,2,0.000076,3fd66200f964a52003e51ee3
3,3,0.000101,3fd66200f964a52003e71ee3
4,4,0.000106,3fd66200f964a52004e41ee3
...,...,...,...
9984,9984,0.000099,510188aae4b055abaa8e5ca7
9985,9985,0.000081,51073fcae4b047c0870d2216
9986,9986,0.000116,510c9f9de4b0ec5cc005fc87
9987,9987,0.000089,51140198e4b0874a568cde81


In [256]:
location_authority.sort_values(by = 'score', ascending = False).head(10)

Unnamed: 0,poi_encode,score,poi_id
672,672,0.002309,42911d00f964a520f5231fe3
659,659,0.001638,42829c80f964a5206a221fe3
773,773,0.001523,43a52546f964a520532c1fe3
2858,2858,0.001431,4ace6c89f964a52078d020e3
1151,1151,0.001183,4840fe6bf964a52030501fe3
1215,1215,0.000913,49b7ed6df964a52030531fe3
1906,1906,0.000869,4a4821f5f964a52095aa1fe3
2211,2211,0.000847,4a737bf8f964a52091dc1fe3
387,387,0.000764,3fd66200f964a520def11ee3
465,465,0.000745,40b68100f964a5207d001fe3


In [257]:
test1 = df.groupby('poi_encode')['user_id'].count().reset_index()

In [258]:
test1.sort_values(by = 'user_id', ascending = False).head(10)

Unnamed: 0,poi_encode,user_id
672,672,1147
659,659,874
773,773,687
1151,1151,562
2858,2858,561
2211,2211,486
1215,1215,472
465,465,442
1906,1906,407
1945,1945,371
