## Gowalla Data Transformation
This notebook is a proof of concept (POC) for transforming the Gowalla geo check-in data into the input tables used for matrix factorization.
The target output tables include:

1. user, item encoding mapping table
2. user - item counting table
3. location ids with their latitude and longitude
4. users' last-n activity location table

In [1]:
import numpy as np, pandas as pd

# Raw check-in log: tab-separated text without a header row, so the column
# names are supplied explicitly.
Gowalla = pd.read_csv(
    'Gowalla_totalCheckins.txt',
    sep='\t',
    header=None,
    names=['user', 'time', 'latitude', 'longitude', 'location_id'],
)


In [2]:
# Preview the first five raw check-in rows (5 is the default for head()).
Gowalla.head()

Unnamed: 0,user,time,latitude,longitude,location_id
0,0,2010-10-19T23:55:27Z,30.235909,-97.79514,22847
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878


### 01. User Item counting table
This table is going to be the input training data for Poisson MF

In [3]:
# Count how many check-ins each user made at each location; the per-pair
# count is stored in a column named 'count'.
Go_input = (
    Gowalla
    .groupby(['user', 'location_id'])['time']
    .count()
    .reset_index(name='count')
)
Go_input.head()

Unnamed: 0,user,location_id,count
0,0,8938,1
1,0,8964,1
2,0,8977,1
3,0,9073,1
4,0,9153,1


#### Get rid of inactive users
Analyze the per-user check-in count first.

In [4]:
# Total number of check-ins per user; the count ends up in the 'time' column.
checkin_count = (
    Gowalla
    .groupby('user')['time']
    .count()
    .reset_index()
)
checkin_count.head()

Unnamed: 0,user,time
0,0,225
1,1,12
2,2,2100
3,4,225
4,5,50


In [5]:
# Distribution of the per-user check-in counts.
checkin_count['time'].describe()

count    107092.000000
mean         60.162216
std         136.188949
min           1.000000
25%           7.000000
50%          25.000000
75%          56.000000
max        2175.000000
Name: time, dtype: float64

In [6]:
# number of users (checkin_count holds one row per user)
len(checkin_count)

107092

In [7]:
# filtered number of users
# Keep users with strictly more than 20 and strictly fewer than 100 check-ins,
# then draw a random sample of 5000 of them.
target_users = checkin_count[(checkin_count['time'] > 20) & (checkin_count['time'] < 100)]
# random_state pins the sample so that Restart & Run All selects the same users
# every time; without it the encodings and saved tables change on each run.
target_users = target_users.sample(n = 5000, random_state = 42)
target_users.shape[0]

5000

In [8]:
# Restrict the counting table to the sampled target users.
Go_input = Go_input.loc[Go_input['user'].isin(target_users['user'])]
# check the user number
Go_input['user'].nunique()

5000

In [9]:
# this is the training/test table: one row per (user, location_id) pair of the
# selected users, with the number of check-ins in the 'count' column
Go_input

Unnamed: 0,user,location_id,count
6542,44,9314,1
6543,44,11591,2
6544,44,12505,1
6545,44,13172,1
6546,44,16907,1
...,...,...,...
3981179,196528,4188115,5
3981180,196528,4238034,1
3981181,196528,4662083,2
3981182,196528,5175610,1


### 02. Item & User encoding 
This section encodes the location ids and the user ids as integer codes, and creates a mapping table for each of them.

In [10]:
# Substitute the raw location_id / user columns with integer category codes.
# The Categorical objects are kept because the mapping tables are built from them.
location_cat = pd.Categorical(Go_input['location_id'])
user_cat = pd.Categorical(Go_input['user'])
location_encode = location_cat.codes
user_encode = user_cat.codes
# Go_input is a filtered slice of the counting table, so assigning columns onto
# it directly raises SettingWithCopyWarning; assign/drop return a new frame
# instead and avoid in-place mutation.
Go_input = (
    Go_input
    .assign(location_encode=location_encode, user_encode=user_encode)
    .drop(columns=['location_id', 'user'])
)
Go_input.head(5)

Unnamed: 0,count,location_encode,user_encode
6542,1,151,0
6543,2,871,0
6544,1,1117,0
6545,1,1279,0
6546,1,2150,0


In [11]:
# Create location mapping table: one row per distinct location_id together
# with its integer code, ordered by location_id.
location_mapping = (
    pd.DataFrame({
        'location_id': location_cat,
        'location_mapping': location_encode,
    })
    .drop_duplicates()
    .sort_values(by='location_id')
    .reset_index(drop=True)
)
location_mapping

Unnamed: 0,location_id,location_mapping
0,8932,0
1,8947,1
2,8956,2
3,8958,3
4,8961,4
...,...,...
115913,5957832,115913
115914,5960592,115914
115915,5961246,115915
115916,5963700,115916


In [12]:
# Create user mapping table: one row per distinct user together with its
# integer code, ordered by user.
user_mapping = (
    pd.DataFrame({
        'user': user_cat,
        'user_mapping': user_encode,
    })
    .drop_duplicates()
    .sort_values(by='user')
    .reset_index(drop=True)
)
user_mapping

Unnamed: 0,user,user_mapping
0,44,0
1,67,1
2,69,2
3,99,3
4,102,4
...,...,...
4995,196169,4995
4996,196183,4996
4997,196273,4997
4998,196494,4998


### 03. Check-in location latitude and longitude table

In [13]:
# keep only the raw check-ins that belong to the selected target users
filtered_Go = Gowalla.loc[Gowalla['user'].isin(target_users['user'])]


In [14]:
# attach the integer codes from both mapping tables, then drop the raw ids
filtered_Go = (
    filtered_Go
    .merge(user_mapping, how='left', on='user')
    .merge(location_mapping, how='left', on='location_id')
    .drop(columns=['user', 'location_id'])
)
filtered_Go

Unnamed: 0,time,latitude,longitude,user_mapping,location_mapping
0,2010-10-14T23:43:02Z,30.423712,-97.702081,0,4864
1,2010-10-11T23:33:29Z,30.507986,-97.772360,0,50114
2,2010-09-21T23:52:54Z,30.338269,-97.807192,0,43848
3,2010-09-18T21:35:16Z,30.401769,-97.726307,0,3954
4,2010-09-08T16:37:37Z,30.556206,-97.687802,0,8914
...,...,...,...,...,...
217549,2010-09-29T13:10:03Z,13.757143,100.565801,4999,93035
217550,2010-09-29T07:41:25Z,13.823481,100.558398,4999,110502
217551,2010-09-27T15:42:43Z,13.762287,100.616450,4999,96706
217552,2010-09-23T11:53:47Z,13.821627,100.558956,4999,80171


In [15]:

# Build one (latitude, longitude) pair per encoded location.
# Some locations appear with slightly different coordinates, so the distinct
# coordinate pairs of each location are averaged.
# Columns are selected with a list ([[...]]): the tuple form ['latitude','longitude']
# is deprecated for groupby selection and fails on recent pandas versions.
checkin_location = (
    filtered_Go
    .drop(columns=['time', 'user_mapping'])
    .drop_duplicates()
    .groupby('location_mapping')[['latitude', 'longitude']]
    .mean()
    .reset_index()
)
checkin_location

  after removing the cwd from sys.path.


Unnamed: 0,location_mapping,latitude,longitude
0,0,32.927662,-97.254356
1,1,37.331880,-122.029631
2,2,32.942655,-97.131200
3,3,32.882931,-97.260221
4,4,32.940099,-97.115691
...,...,...,...
115913,115913,47.421945,-120.310798
115914,115914,37.485057,-122.205536
115915,115915,37.483356,-122.201271
115916,115916,35.840735,14.547491


In [16]:
from math import radians, cos, sin, asin, sqrt
def geodistance(lng1, lat1, lng2, lat2):
    """Great-circle distance between two points, in kilometres.

    Uses the haversine formula on a sphere with a radius of 6371 km.

    Parameters
    ----------
    lng1, lat1 : longitude and latitude of the first point, in degrees
        (anything accepted by ``float()``).
    lng2, lat2 : longitude and latitude of the second point, in degrees.

    Returns
    -------
    float
        Distance in kilometres, rounded to 3 decimal places.
    """
    earth_radius_km = 6371
    lng1, lat1, lng2, lat2 = map(radians, [float(lng1), float(lat1), float(lng2), float(lat2)])
    dlon = lng2 - lng1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make asin() raise a domain error.
    a = min(1.0, max(0.0, a))
    distance = 2 * asin(sqrt(a)) * earth_radius_km * 1000  # metres
    distance = round(distance / 1000, 3)  # back to kilometres
    return distance



In [17]:
%%time
checkin_location[['latitude','longitude']]\
.apply(lambda x: geodistance(x['longitude'],x['latitude'],10,-10),axis=1)

Wall time: 1.35 s


0         12214.870
1         14346.220
2         12203.355
3         12215.485
4         12201.911
            ...    
115913    13786.139
115914    14355.061
115915    14354.772
115916     5119.538
115917    10964.787
Length: 115918, dtype: float64

### 04. users' last-n activity location table
This table is made by calculating the center of the last-n check-in locations of each user.

In [18]:
import math
# Geographical center of a set of points on the map
def centroid(df):
    """Return the geographic midpoint of the points in ``df``.

    Each (latitude, longitude) pair is converted to a 3-D unit vector, the
    vectors are averaged, and the mean vector is converted back to latitude
    and longitude.

    Parameters
    ----------
    df : DataFrame with ``latitude`` and ``longitude`` columns, in degrees.

    Returns
    -------
    pd.Series
        Entries ``latitude`` and ``longitude`` in degrees. Both are NaN when
        ``df`` has no rows.
    """
    total = len(df)
    if total == 0:
        # No points: there is no midpoint, and dividing by zero below would raise.
        return pd.Series({'latitude': float('nan'), 'longitude': float('nan')})

    x = 0.0
    y = 0.0
    z = 0.0

    # Iterate the two columns directly instead of iterrows(), which builds a
    # Series per row; the summation order is unchanged.
    for lat_deg, lon_deg in zip(df['latitude'], df['longitude']):
        latitude = math.radians(lat_deg)
        longitude = math.radians(lon_deg)

        x += math.cos(latitude) * math.cos(longitude)
        y += math.cos(latitude) * math.sin(longitude)
        z += math.sin(latitude)

    x = x / total
    y = y / total
    z = z / total

    central_longitude = math.atan2(y, x)
    central_square_root = math.sqrt(x * x + y * y)
    central_latitude = math.atan2(z, central_square_root)

    mean_location = pd.Series({
        'latitude': math.degrees(central_latitude),
        'longitude': math.degrees(central_longitude)
        })
    return mean_location

In [19]:
# Parse the timestamps, then rank each user's check-ins from newest (rank 1)
# to oldest.
filtered_Go['time'] = pd.to_datetime(filtered_Go['time'])
filtered_Go['rank'] = (
    filtered_Go
    .groupby('user_mapping')['time']
    .rank(ascending=False)
)

In [20]:
# Check-ins with identical timestamps share a fractional rank (e.g. 10.5 or
# 10.333), which is why the cut-off is 10.5 rather than 10.
rank_df = filtered_Go.loc[filtered_Go['rank'] <= 10.5]
rank_df

Unnamed: 0,time,latitude,longitude,user_mapping,location_mapping,rank
0,2010-10-14 23:43:02+00:00,30.423712,-97.702081,0,4864,1.0
1,2010-10-11 23:33:29+00:00,30.507986,-97.772360,0,50114,2.0
2,2010-09-21 23:52:54+00:00,30.338269,-97.807192,0,43848,3.0
3,2010-09-18 21:35:16+00:00,30.401769,-97.726307,0,3954,4.0
4,2010-09-08 16:37:37+00:00,30.556206,-97.687802,0,8914,5.0
...,...,...,...,...,...,...
217529,2010-10-13 11:39:24+00:00,13.823481,100.558398,4999,110502,6.0
217530,2010-10-12 12:19:43+00:00,13.813639,100.560426,4999,64654,7.0
217531,2010-10-11 16:42:02+00:00,13.762287,100.616450,4999,96706,8.0
217532,2010-10-11 01:42:02+00:00,13.766876,100.570672,4999,66454,9.0


In [21]:
# Per-user groups of the latest check-in coordinates.
# A list ([[...]]) selects the columns: the tuple form ['latitude','longitude']
# is deprecated for groupby selection and fails on recent pandas versions.
ff = rank_df.groupby('user_mapping')[['latitude', 'longitude']]

  """Entry point for launching an IPython kernel.


In [22]:
# Apply centroid() to each user's group of latest check-in coordinates,
# giving one (latitude, longitude) center per user_mapping.
last_n_center = ff.apply(centroid)
last_n_center

Unnamed: 0_level_0,latitude,longitude
user_mapping,Unnamed: 1_level_1,Unnamed: 2_level_1
0,30.376921,-97.756796
1,47.823526,9.887863
2,28.373947,-81.536761
3,51.097721,-0.936662
4,30.315782,-97.732966
...,...,...
4995,-12.087679,-77.014849
4996,33.225680,-117.035439
4997,35.018004,-78.949578
4998,50.182699,3.765209


In [23]:
# Plain arithmetic mean of latitude / longitude for the same groups.
# NOTE(review): presumably shown as a sanity check against the centroid result - confirm.
ff.mean()

Unnamed: 0_level_0,latitude,longitude
user_mapping,Unnamed: 1_level_1,Unnamed: 2_level_1
0,30.376914,-97.756781
1,47.821283,9.879647
2,28.373945,-81.536760
3,51.090498,-0.933139
4,30.315779,-97.732969
...,...,...
4995,-12.087675,-77.014847
4996,33.224137,-117.028267
4997,35.017993,-78.949607
4998,50.180038,3.775228


### 05. Transfer to Pickle File and Store

In [24]:
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``<name>.pkl``.

    Parameters
    ----------
    obj : any picklable object.
    name : str
        Target path without the ``.pkl`` extension, which is appended here.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(obj, out_file)

In [25]:
# Persist every output table as <name>.pkl, in the same order as before.
for _table, _stem in (
    (location_mapping, 'location_mapping'),
    (user_mapping, 'user_mapping'),
    (Go_input, 'user_item_count'),
    (checkin_location, 'checkin_location'),
    (last_n_center, 'last_n_center'),
):
    save_obj(_table, _stem)