In [1]:
import pandas as pd
import numpy as np

dir = 'E:\\Sebnewrepo\\Data\\checkin_data\\dataset_tsmc2014/'
checkin_file = 'dataset_TSMC2014_NYC.txt'

In [2]:
col = ['user_id',
       'poi_id',
       'poi_category_id',
       'poi_category_name',
       'latitude', 
       'longitude',
       'time_offset',
       'UTC_time']
df = pd.read_csv(dir + checkin_file, delimiter = "\t", names = col)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012


In [6]:
# number of users and number of POIs
print('Amount of Users: ', len(df['user_id'].unique()))
print('Amount of Items: ', len(df['poi_id'].unique()))

Amount of Users:  1083
Amount of Items:  38333


In [8]:
# remove infrequent items and users
from copy import deepcopy
def rm_infrequent_items(data, min_counts):
    df = deepcopy(data)
    counts = df['poi_id'].value_counts()
    df = df[df['poi_id'].isin(counts[counts >= min_counts].index)]
    print("POIs with < {} interactoins are removed".format(min_counts))
    return df
def rm_infrequent_users(data, min_counts):
    df = deepcopy(data)
    counts = df['user_id'].value_counts()
    df = df[df["user_id"].isin(counts[counts >= min_counts].index)]
    print("users with < {} interactoins are removed".format(min_counts))
    return df
          
filtered_df = rm_infrequent_users(df, 5)
filtered_df = rm_infrequent_items(filtered_df, 5)
print('num of users:{}, num of POIs:{}'.format(len(filtered_df['user_id'].unique()), len(filtered_df['poi_id'].unique())))

users with < 5 interactoins are removed
POIs with < 5 interactoins are removed
num of users:1083, num of POIs:9989


In [14]:
# POIs encode, and generate encode mapping
poi_cat = pd.Categorical(filtered_df['poi_id'])
poi_encode = poi_cat.codes
#generate poi mapping table
poi_mapping = pd.DataFrame({
    'poi_encode': poi_encode,
    'poi_id': filtered_df['poi_id']
    })
#drop duplicate
poi_mapping_output = poi_mapping.drop_duplicates()
filtered_df['poi_encode'] = poi_encode
filtered_df.drop(['poi_id'], axis = 1, inplace = True)
filtered_df.head(5)

Unnamed: 0,user_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,poi_encode
0,470,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,1230
1,979,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,1879
2,69,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,6161
4,87,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,6859
5,484,4bf58dd8d48988d118951735,Food & Drink Shop,40.690427,-73.954687,-240,Tue Apr 03 18:04:00 +0000 2012,4017


In [17]:
# convert to sequential data per user
def convert_data(data):
    df = deepcopy(data)
    df_ordered = df.sort_values(['UTC_time'], ascending=True)
    data = df_ordered.groupby('user_id')['poi_encode'].apply(list)
    unique_data = df_ordered.groupby('user_id')['poi_encode'].nunique()
    # delete users whose items are less than 5
    data = data[unique_data[unique_data >= 5].index]
    print(data[:10])
    print(len(data))
    return data

In [18]:
seq_data = convert_data(filtered_df)

user_id
1     [9066, 405, 773, 678, 499, 3032, 1344, 9066, 1...
2     [1928, 825, 6889, 8153, 5302, 2903, 3272, 7089...
3     [727, 2033, 7513, 18, 817, 194, 4435, 6547, 26...
4     [8077, 8077, 2505, 1422, 2237, 1180, 1912, 249...
5     [3090, 2467, 3090, 2940, 9244, 9175, 3090, 246...
6     [3871, 6362, 3574, 2858, 2652, 6104, 9603, 636...
7     [2768, 8552, 5425, 6659, 8836, 8274, 6659, 807...
8     [2946, 9009, 7217, 2946, 8188, 3152, 3152, 830...
9     [6539, 863, 9110, 508, 2603, 982, 905, 1431, 2...
10    [5863, 8100, 7953, 2034, 8076, 492, 8931, 66, ...
Name: poi_encode, dtype: object
1083


In [19]:
# sequential data to dict
user_item_dict = seq_data.to_dict()

data_records = []
# generate data_record
for user_id in user_item_dict:
    data_records.append(user_item_dict[user_id])

In [20]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

In [21]:
save_obj(data_records, 'checkin_sequences')
save_obj(poi_mapping_output, 'poi_mapping')