## Location Clustering 
This notebook groups the locations (POIs) in the check-in dataset into clusters.

In [33]:
import os

import numpy as np
import pandas as pd

# Folder holding the check-in data. Set the CHECKIN_DATA_DIR environment
# variable to point somewhere else; without it the original location is used.
# os.path.join(..., '') appends a trailing separator only when one is missing,
# because the reader cell concatenates `dir + checkin_file` directly.
# NOTE: `dir` shadows the built-in of the same name; the name is kept because
# the cell that loads the file refers to it.
dir = os.path.join(
    os.environ.get('CHECKIN_DATA_DIR',
                   'E:\\Sebnewrepo\\Data\\checkin_data\\dataset_tsmc2014/'),
    '',
)
checkin_file = 'dataset_TSMC2014_NYC.txt'

In [34]:
# Names applied to the columns of the tab-separated check-in file
# (passing `names` without `header` makes pandas treat every line as data).
col = [
    'user_id',
    'poi_id',
    'poi_category_id',
    'poi_category_name',
    'latitude',
    'longitude',
    'time_offset',
    'UTC_time',
]
# `sep` is the primary spelling of read_csv's `delimiter` argument.
df = pd.read_csv(dir + checkin_file, sep='\t', names=col)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012


In [35]:
# Size of the loaded check-in table as (rows, columns)
df.shape

(227428, 8)

In [36]:
# Distinct users and distinct POIs in the check-in table.
# dropna=False keeps a missing id counted as its own value, as len(unique()) does.
print('Amount of Users: ', df['user_id'].nunique(dropna=False))
print('Amount of Items: ', df['poi_id'].nunique(dropna=False))

Amount of Users:  1083
Amount of Items:  38333


### Extract datetime info from column UTC_time
Use a regular expression to pull the day of the week out of `UTC_time`, then parse the remaining fields into a timestamp.

In [37]:
# datetime converting
# use regex to extract time info

# extract day of week
import re

def extract_wkd(data):
    """Return the day-of-week token that starts a timestamp string.

    Parameters
    ----------
    data : str
        Text beginning with a day name followed by whitespace,
        e.g. ``'Tue Apr 03 18:00:09 +0000 2012'``.

    Returns
    -------
    str
        The leading day token (``'Tue'`` for the example above), without
        the whitespace that follows it.

    Raises
    ------
    ValueError
        If ``data`` does not start with such a token.
    """
    # Capture the letters only, so the whitespace after the token never leaks
    # into the result, whichever whitespace character it happens to be.
    re_wkd = r'([A-Z]?[a-z]+)\s'
    found = re.match(re_wkd, data)
    if found is None:
        # re.match returns None on failure; report the offending input
        # instead of failing later with an unrelated-looking error.
        raise ValueError('no leading day-of-week token in %r' % (data,))
    return found.group(1)
# NOTE(review): the weekday is read from the UTC timestamp, whereas
# hour_in_the_day (derived further down) uses the offset-adjusted time, so
# check-ins close to midnight may carry a weekday that differs from the
# local one - confirm this is intended.
df['day_of_week'] = df['UTC_time'].map(extract_wkd)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,day_of_week
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,Tue
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,Tue
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,Tue
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012,Tue
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,Tue


In [38]:
# Check-in count for each day-of-week label
df.groupby('day_of_week')['day_of_week'].agg('count')

day_of_week
Fri    33824
Mon    32487
Sat    34401
Sun    32636
Thu    30558
Tue    32129
Wed    31393
Name: day_of_week, dtype: int64

In [39]:
# parse the UTC_time string into a datetime (the day-name and UTC-offset fields are skipped)
from datetime import datetime
from datetime import timedelta
def extract_mth(data):
    """Parse a space-separated timestamp string into a naive ``datetime``.

    Only the month, day, clock time and year fields (positions 1, 2, 3 and 5
    after splitting on single spaces) are used; the fields at positions 0
    and 4 are ignored.

    Parameters
    ----------
    data : str
        e.g. ``'Tue Apr 03 18:00:09 +0000 2012'``.

    Returns
    -------
    datetime.datetime
        The parsed timestamp, without timezone information.
    """
    fields = data.split(' ')
    stamp = ' '.join((fields[1], fields[2], fields[3], fields[5]))
    return datetime.strptime(stamp, '%b %d %H:%M:%S %Y')

# Parse every UTC_time string into a timestamp column
df['datetime'] = df['UTC_time'].map(extract_mth)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,day_of_week,datetime
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,Tue,2012-04-03 18:00:09
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,Tue,2012-04-03 18:00:25
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,Tue,2012-04-03 18:02:24
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012,Tue,2012-04-03 18:02:41
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,Tue,2012-04-03 18:03:00


In [40]:
# Shift each parsed timestamp by its time_offset, read as a number of minutes

df['datetime_real'] = df['datetime'].add(pd.to_timedelta(df['time_offset'], unit='m'))
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,day_of_week,datetime,datetime_real
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,Tue,2012-04-03 18:00:09,2012-04-03 14:00:09
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,Tue,2012-04-03 18:00:25,2012-04-03 14:00:25
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,Tue,2012-04-03 18:02:24,2012-04-03 14:02:24
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012,Tue,2012-04-03 18:02:41,2012-04-03 14:02:41
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,Tue,2012-04-03 18:03:00,2012-04-03 14:03:00


In [41]:
# Keep only the hour of the offset-adjusted timestamp.
# The .dt accessor is vectorised and returns a Series that carries df's own
# index, so every hour stays on its row even if df's index is not the
# default 0..n-1 range (a freshly built pd.Series would be re-aligned by label).
df['hour_in_the_day'] = df['datetime_real'].dt.hour
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,day_of_week,datetime,datetime_real,hour_in_the_day
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,Tue,2012-04-03 18:00:09,2012-04-03 14:00:09,14
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,Tue,2012-04-03 18:00:25,2012-04-03 14:00:25,14
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,Tue,2012-04-03 18:02:24,2012-04-03 14:02:24,14
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012,Tue,2012-04-03 18:02:41,2012-04-03 14:02:41,14
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,Tue,2012-04-03 18:03:00,2012-04-03 14:03:00,14


### Feature Engineering
The features are per-location statistics derived from users' check-in behavior:
1. Number of visits
2. Average visits per user (reflects returning users)
3. Number of visits in each one-hour slice of the day
4. Number of visits on each day of the week (one-day slices, covering weekdays and weekend)

In [42]:
# Total check-ins per POI: count the user_id entries in each poi_id group and
# name the resulting value column directly while moving poi_id out of the index.
df_f = df.groupby('poi_id')['user_id'].count().reset_index(name='num_visits')
df_f.head()

Unnamed: 0,poi_id,num_visits
0,3fd66200f964a52000e71ee3,16
1,3fd66200f964a52000e81ee3,2
2,3fd66200f964a52000f11ee3,1
3,3fd66200f964a52001e51ee3,1
4,3fd66200f964a52001e81ee3,17


In [43]:
# avg visits per users
# Count the distinct users of each POI, then attach the counts to df_f by
# poi_id instead of by row position, so the values stay on the right POI even
# if df_f has been reordered or filtered since it was built.
df_2 = df.groupby('poi_id')['user_id'].nunique()
df_2 = df_2.reset_index()
df_f['unique_user'] = df_f['poi_id'].map(df_2.set_index('poi_id')['user_id'])


In [44]:
# Visits divided by distinct visitors; the helper column is dropped afterwards,
# so this cell needs the preceding cell to have been run again before a re-run.
df_f['avg_visits_per_user'] = df_f['num_visits'].div(df_f['unique_user'])
df_f = df_f.drop(columns='unique_user')
df_f.head()

Unnamed: 0,poi_id,num_visits,avg_visits_per_user
0,3fd66200f964a52000e71ee3,16,1.0
1,3fd66200f964a52000e81ee3,2,1.0
2,3fd66200f964a52000f11ee3,1,1.0
3,3fd66200f964a52001e51ee3,1,1.0
4,3fd66200f964a52001e81ee3,17,1.0625


In [45]:
# Check-ins per POI for every day-of-week label
m = df.groupby(['poi_id', 'day_of_week'])['time_offset'].count().reset_index()
# One row per POI, one column per day label; combinations that never occur become 0
m_pivot = (
    m.pivot_table(values='time_offset', index='poi_id', columns=['day_of_week'])
    .fillna(0)
    .reset_index()
)
m_pivot.head()

day_of_week,poi_id,Fri,Mon,Sat,Sun,Thu,Tue,Wed
0,3fd66200f964a52000e71ee3,3.0,0.0,6.0,5.0,1.0,0.0,1.0
1,3fd66200f964a52000e81ee3,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,3fd66200f964a52000f11ee3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3fd66200f964a52001e51ee3,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3fd66200f964a52001e81ee3,4.0,0.0,5.0,4.0,2.0,1.0,1.0


In [53]:
# Check-ins per POI for every hour value found in hour_in_the_day
n = df.groupby(['poi_id', 'hour_in_the_day'])['time_offset'].count().reset_index()
# One row per POI, one column per hour; combinations that never occur become 0
n_pivot = (
    n.pivot_table(values='time_offset', index='poi_id', columns=['hour_in_the_day'])
    .fillna(0)
    .reset_index()
)
n_pivot.head()

hour_in_the_day,poi_id,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,3fd66200f964a52000e71ee3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,4.0,2.0,3.0,0.0,1.0
1,3fd66200f964a52000e81ee3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3fd66200f964a52000f11ee3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3fd66200f964a52001e51ee3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3fd66200f964a52001e81ee3,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,2.0,1.0,1.0,1.0


In [55]:
## Assemble the feature table: visit statistics, then the per-day and
## per-hour counts, each joined onto the POI list by poi_id
feature = (
    df_f
    .merge(m_pivot, how='left', on='poi_id')
    .merge(n_pivot, how='left', on='poi_id')
)
feature

Unnamed: 0,poi_id,num_visits,avg_visits_per_user,Fri,Mon,Sat,Sun,Thu,Tue,Wed,...,14,15,16,17,18,19,20,21,22,23
0,3fd66200f964a52000e71ee3,16,1.0000,3.0,0.0,6.0,5.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,4.0,2.0,3.0,0.0,1.0
1,3fd66200f964a52000e81ee3,2,1.0000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3fd66200f964a52000f11ee3,1,1.0000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3fd66200f964a52001e51ee3,1,1.0000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3fd66200f964a52001e81ee3,17,1.0625,4.0,0.0,5.0,4.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,3.0,2.0,2.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38328,51190f52e4b0f83278d5b9ba,1,1.0000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38329,51194e17e4b0665cf3280ce3,1,1.0000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38330,511aa583e4b03acfa105f50b,1,1.0000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
38331,511ae28ce4b00516bc52ccae,1,1.0000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Clustering
Hierarchical clustering is used here.