In [1]:
import pandas as pd
import numpy as np

# Directory containing the check-in dump, and the name of the file to load from it.
# NOTE(review): `dir` shadows the Python builtin of the same name and holds an absolute,
# machine-specific Windows path; it is left as-is because a later cell reads
# `dir + checkin_file`.
# NOTE(review): the file name says NYC, but the recorded `df.head()` output shows
# latitude ~35.7, longitude ~139.6 and time_offset 540, which does not look like
# New York -- confirm which file the saved outputs were produced from.
dir = 'E:\\Sebnewrepo\\Data\\checkin_data\\dataset_tsmc2014/'
checkin_file = 'dataset_TSMC2014_NYC.txt'

In [2]:
# Column labels for the tab-separated check-in file; they are passed explicitly
# via `names`, so no row of the file is used as a header.
col = [
    'user_id', 'poi_id',
    'poi_category_id', 'poi_category_name',
    'latitude', 'longitude',
    'time_offset', 'UTC_time',
]
# `dir` and `checkin_file` are defined in the first cell.
df = pd.read_csv(dir + checkin_file, sep="\t", names=col)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time
0,1541,4f0fd5a8e4b03856eeb6c8cb,4bf58dd8d48988d10c951735,Cosmetics Shop,35.705101,139.61959,540,Tue Apr 03 18:17:18 +0000 2012
1,868,4b7b884ff964a5207d662fe3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,35.715581,139.800317,540,Tue Apr 03 18:22:04 +0000 2012
2,114,4c16fdda96040f477cc473a5,4d954b0ea243a5684a65b473,Convenience Store,35.714542,139.480065,540,Tue Apr 03 19:12:07 +0000 2012
3,868,4c178638c2dfc928651ea869,4bf58dd8d48988d118951735,Food & Drink Shop,35.725592,139.776633,540,Tue Apr 03 19:12:13 +0000 2012
4,1458,4f568309e4b071452e447afe,4f2a210c4b9023bd5841ed28,Housing Development,35.656083,139.734045,540,Tue Apr 03 19:18:23 +0000 2012


In [3]:
# (rows, columns) of the raw check-in table loaded above
df.shape

(573703, 8)

In [4]:
# How many distinct users and distinct POIs appear in the raw check-ins.
print('Amount of Users: ', df['user_id'].unique().size)
print('Amount of Items: ', df['poi_id'].unique().size)

Amount of Users:  2293
Amount of Items:  61858


In [5]:
# remove infrequent items and users
from copy import deepcopy
def rm_infrequent_items(data, min_counts):
    """Return the rows of ``data`` whose POI occurs at least ``min_counts`` times.

    Parameters
    ----------
    data : pandas.DataFrame
        Check-in table containing a ``poi_id`` column. It is not modified.
    min_counts : int
        Minimum number of rows a ``poi_id`` must have in ``data`` to be kept.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows, with their original index.

    Also prints a one-line message stating the threshold that was applied.
    """
    counts = data['poi_id'].value_counts()
    frequent_pois = counts[counts >= min_counts].index
    # Boolean-mask selection never writes to `data`, so deep-copying the whole
    # input up front is unnecessary. Only the (smaller) filtered result is
    # copied, so callers can add columns to it without touching `data`.
    kept = data[data['poi_id'].isin(frequent_pois)].copy()
    print("POIs with < {} interactions are removed".format(min_counts))
    return kept
def rm_infrequent_users(data, min_counts):
    """Return the rows of ``data`` whose user occurs at least ``min_counts`` times.

    Parameters
    ----------
    data : pandas.DataFrame
        Check-in table containing a ``user_id`` column. It is not modified.
    min_counts : int
        Minimum number of rows a ``user_id`` must have in ``data`` to be kept.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows, with their original index.

    Also prints a one-line message stating the threshold that was applied.
    """
    counts = data['user_id'].value_counts()
    frequent_users = counts[counts >= min_counts].index
    # Boolean-mask selection never writes to `data`, so deep-copying the whole
    # input up front is unnecessary. Only the (smaller) filtered result is
    # copied, so callers can add columns to it without touching `data`.
    kept = data[data['user_id'].isin(frequent_users)].copy()
    print("users with < {} interactions are removed".format(min_counts))
    return kept
          
# Drop sparse users first, then sparse POIs (threshold of 5 rows for each).
# NOTE(review): this is a single pass -- removing POIs afterwards may leave some
# users with fewer than 5 rows again; nothing re-checks the user counts.
filtered_df = rm_infrequent_items(rm_infrequent_users(df, 5), 5)
print('num of users:{}, num of POIs:{}'.format(
    filtered_df['user_id'].unique().size,
    filtered_df['poi_id'].unique().size,
))

users with < 5 interactoins are removed
POIs with < 5 interactoins are removed
num of users:2293, num of POIs:15177


In [6]:
# (rows, columns) remaining after the infrequent-user / infrequent-POI filtering
filtered_df.shape

(494807, 8)

In [7]:
# parse the UTC_time column, which is stored as a string, into datetime objects

from datetime import datetime
from datetime import timedelta
def convert_to_datetime(data):
    """Parse a check-in timestamp string into a naive ``datetime`` in UTC.

    Parameters
    ----------
    data : str
        Space-separated timestamp whose tokens are weekday, month abbreviation,
        day, ``HH:MM:SS``, UTC offset and year, e.g.
        ``'Tue Apr 03 18:17:18 +0000 2012'``.

    Returns
    -------
    datetime.datetime
        Timezone-naive value expressed in UTC. For a ``+0000`` offset this is
        simply the wall-clock time in the string.

    Raises
    ------
    IndexError
        If the string has fewer than six space-separated tokens.
    ValueError
        If a token does not match the expected format (including an offset
        token that is not a valid ``%z`` value).
    """
    # Split once and reuse the tokens; the weekday (token 0) is redundant with
    # the date itself, so it is not parsed.
    tokens = data.split(' ')
    stamp = ' '.join((tokens[1], tokens[2], tokens[3], tokens[4], tokens[5]))
    aware = datetime.strptime(stamp, '%b %d %H:%M:%S %z %Y')
    # Honour the offset token instead of silently discarding it: shift the
    # wall-clock time to UTC, then drop tzinfo so the result stays naive.
    return (aware - aware.utcoffset()).replace(tzinfo=None)

# Parse every raw timestamp string and attach the result to the filtered table
# as a new `datetime` column, then preview the first rows.
utc_strings = filtered_df['UTC_time']
filtered_df['datetime'] = utc_strings.apply(convert_to_datetime)
filtered_df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,datetime
0,1541,4f0fd5a8e4b03856eeb6c8cb,4bf58dd8d48988d10c951735,Cosmetics Shop,35.705101,139.61959,540,Tue Apr 03 18:17:18 +0000 2012,2012-04-03 18:17:18
1,868,4b7b884ff964a5207d662fe3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,35.715581,139.800317,540,Tue Apr 03 18:22:04 +0000 2012,2012-04-03 18:22:04
2,114,4c16fdda96040f477cc473a5,4d954b0ea243a5684a65b473,Convenience Store,35.714542,139.480065,540,Tue Apr 03 19:12:07 +0000 2012,2012-04-03 19:12:07
3,868,4c178638c2dfc928651ea869,4bf58dd8d48988d118951735,Food & Drink Shop,35.725592,139.776633,540,Tue Apr 03 19:12:13 +0000 2012,2012-04-03 19:12:13
7,114,4b3eae5cf964a520b4a025e3,4bf58dd8d48988d129951735,Train Station,35.700253,139.480255,540,Tue Apr 03 19:35:36 +0000 2012,2012-04-03 19:35:36


In [8]:
# convert to sequential data per user: rows grouped by user_id (ascending) and,
# within each user, ordered chronologically by `datetime`.
# A single two-key sort is used rather than groupby('user_id').apply(sort_values):
# recent pandas versions deprecate passing the grouping column into `apply`, and
# once it is excluded, reset_index(drop=True) would discard `user_id` entirely.
# NOTE(review): rows sharing the same (user_id, datetime) may appear in a different
# relative order than a per-group sort would give.
df_ordered = (
    filtered_df
    .sort_values(['user_id', 'datetime'])
    .reset_index(drop=True)
)


In [9]:
# extract data: write the per-user, time-ordered check-ins to 'ny_ordered.csv'
# (a relative path, so it lands in the current working directory);
# index=False keeps the row index out of the file.
df_ordered.to_csv('ny_ordered.csv', index = False)