In [2]:
import pandas as pd
import numpy as np

# Location of the raw check-in dump: file name first, then its folder.
# NOTE(review): the folder is an absolute, machine-specific Windows path and
# the name `dir` shadows the built-in; both names are kept unchanged because
# the loading cell concatenates `dir + checkin_file`.
checkin_file = 'dataset_TSMC2014_NYC.txt'
dir = 'E:\\Sebnewrepo\\Data\\checkin_data\\dataset_tsmc2014/'

In [3]:
# Column names for the check-in file; the file itself carries no header row,
# so the names are supplied explicitly when reading it.
col = [
    'user_id', 'poi_id',
    'poi_category_id', 'poi_category_name',
    'latitude', 'longitude',
    'time_offset', 'UTC_time',
]
# Tab-separated text file -> DataFrame, then preview the first rows.
checkin_path = dir + checkin_file
df = pd.read_csv(checkin_path, sep="\t", names=col)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012


In [4]:
# (rows, columns) of the raw check-in table
df.shape

(227428, 8)

In [5]:
# distinct users and distinct POIs present in the raw check-ins
n_users = df['user_id'].nunique(dropna=False)
n_items = df['poi_id'].nunique(dropna=False)
print('Amount of Users: ', n_users)
print('Amount of Items: ', n_items)

Amount of Users:  1083
Amount of Items:  38333


In [7]:
# remove infrequent items and users
from copy import deepcopy
def rm_infrequent_items(data, min_counts):
    """Return the rows of ``data`` whose POI occurs often enough.

    Parameters
    ----------
    data : pandas.DataFrame
        Check-in table; must contain a ``poi_id`` column. It is not modified.
    min_counts : int
        Minimum number of rows a ``poi_id`` needs in ``data`` to be kept.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding only the rows whose ``poi_id`` appears
        at least ``min_counts`` times. Original index labels are preserved.
    """
    counts = data['poi_id'].value_counts()
    frequent_pois = counts[counts >= min_counts].index
    # Boolean indexing never touches ``data``, so deep-copying the input up
    # front is wasted work. Copying the *result* instead hands the caller a
    # standalone frame that can receive new columns without pandas raising
    # SettingWithCopyWarning.
    df = data[data['poi_id'].isin(frequent_pois)].copy()
    print("POIs with < {} interactoins are removed".format(min_counts))
    return df
def rm_infrequent_users(data, min_counts):
    """Return the rows of ``data`` whose user occurs often enough.

    Parameters
    ----------
    data : pandas.DataFrame
        Check-in table; must contain a ``user_id`` column. It is not modified.
    min_counts : int
        Minimum number of rows a ``user_id`` needs in ``data`` to be kept.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding only the rows whose ``user_id`` appears
        at least ``min_counts`` times. Original index labels are preserved.
    """
    counts = data['user_id'].value_counts()
    frequent_users = counts[counts >= min_counts].index
    # Boolean indexing never touches ``data``, so deep-copying the input up
    # front is wasted work. Copying the *result* instead hands the caller a
    # standalone frame that can receive new columns without pandas raising
    # SettingWithCopyWarning.
    df = data[data["user_id"].isin(frequent_users)].copy()
    print("users with < {} interactoins are removed".format(min_counts))
    return df
          
# Drop sparse users first, then sparse POIs (one pass each, threshold 5).
# NOTE(review): removing POIs can leave a user with fewer than 5 remaining
# check-ins; no second pass over users is made here - confirm that is intended.
filtered_df = rm_infrequent_items(rm_infrequent_users(df, 5), 5)
user_total = filtered_df['user_id'].nunique(dropna=False)
poi_total = filtered_df['poi_id'].nunique(dropna=False)
print('num of users:{}, num of POIs:{}'.format(user_total, poi_total))

users with < 5 interactoins are removed
POIs with < 5 interactoins are removed
num of users:1083, num of POIs:9989


In [8]:
# (rows, columns) remaining after the frequency filters
filtered_df.shape

(179468, 8)

In [10]:
# parse the UTC_time column, which is stored as a string, into datetime objects

from datetime import datetime
from datetime import timedelta
def convert_to_datetime(data):
    """Parse a ``'Tue Apr 03 18:00:09 +0000 2012'``-style timestamp string.

    The string is expected to hold six whitespace-separated fields:
    weekday, month abbreviation, day, ``HH:MM:SS``, UTC offset and year.
    The weekday and the offset are discarded, so the result is a naive
    ``datetime`` carrying the date and clock time exactly as written.

    Parameters
    ----------
    data : str
        Timestamp text in the layout described above.

    Returns
    -------
    datetime.datetime
        Naive (``tzinfo is None``) datetime built from month, day, time
        and year.

    Raises
    ------
    IndexError
        If ``data`` has fewer than six fields.
    ValueError
        If the selected fields do not match ``'%b %d %H:%M:%S %Y'``.
    """
    # Split once, on any run of whitespace: the original re-split the string
    # for every field and, by splitting on a single space, produced empty
    # fields for inputs such as a space-padded day ("Apr  3").
    fields = data.split()
    result = ' '.join((fields[1], fields[2], fields[3], fields[5]))
    # NOTE: %b matches month abbreviations of the current locale.
    return datetime.strptime(result, '%b %d %H:%M:%S %Y')

# Attach the parsed timestamps with ``assign`` so that a new frame is built
# instead of writing a column into the row-filtered frame in place; the
# in-place write is what makes pandas raise SettingWithCopyWarning here.
filtered_df = filtered_df.assign(
    datetime=filtered_df['UTC_time'].apply(convert_to_datetime)
)
filtered_df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,datetime
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,2012-04-03 18:00:09
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,2012-04-03 18:00:25
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,2012-04-03 18:02:24
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,2012-04-03 18:03:00
5,484,4b5b981bf964a520900929e3,4bf58dd8d48988d118951735,Food & Drink Shop,40.690427,-73.954687,-240,Tue Apr 03 18:04:00 +0000 2012,2012-04-03 18:04:00


In [16]:
# convert to sequential data per user: rows ordered by user, then by time.
# One two-key sort replaces groupby('user_id').apply(sort_values): the apply
# form runs a separate Python-level sort for every user and depends on the
# grouping column being handed to the applied function, which pandas 2.2
# deprecates.
# NOTE(review): groupby skips rows with a missing user_id while sort_values
# keeps them; none are expected in this data - confirm if that ever changes.
df_ordered = filtered_df.sort_values(['user_id', 'datetime'])
df_ordered = df_ordered.reset_index(drop = True)


In [17]:
# export: write the per-user, time-ordered check-ins to ny_ordered.csv
# (path is relative to the current working directory); index = False leaves
# the row index out of the file
df_ordered.to_csv('ny_ordered.csv', index = False)