In [102]:
import pandas as pd
import numpy as np
from haversine import haversine_vector, Unit
from sklearn.cluster import DBSCAN
from datetime import timedelta
import networkx as nx

In [3]:
def correct_latitude(lat):
    """
    This function corrects for out of range latitude.
    
    Input: 
    -- lat: latitude coordinates in °
    Output: 
    -- lat: latitude coordinates put between -90 and 90°
    """
    while lat>90 or lat<-90:
        if lat>90:
            lat = -(lat-180)
        elif lat<-90:
            lat = -(lat+180)
    return lat

In [4]:
def correct_longitude(long):
    """
    This function corrects for out of range longitude.
    
    Input: 
    -- long: longitude coordiantes in °
    Output: 
    -- long: longitude coordinates put between -180 and 180°
    """
    while long>180 or long<-180:
        if long>180:
            long = long - 360
        elif long<-180:
            long = long +360
    return long

In [103]:
def compute_distance(df,columns):
    '''
    This function computes the distance between two geographic coordinates for a given dataframe.
    
    Input: 
        - df: Dataframe containing 4 columns latitude1, longitude1, latitude2 and longitude2
        - columns: list of columns [latitude1, longitude1, latitude2 and longitude2]
        
    Output: 
        - numpy array containing the distance between geographic coordinates of each row
    '''
    points1 = list(zip(df[columns[0]],df[columns[1]]))
    points2 = list(zip(df[columns[2]],df[columns[3]]))
    # Use harvesine_vector to compute the distance between points
    return np.round(haversine_vector(points1,points2,Unit.KILOMETERS)).astype(int)

In [6]:
def select_relevant_homes(df_homes):
    
    # Grouping df_homes according to the user id and compute std and mean for lat and lon
    df_homes = df_homes.groupby('User ID').agg({'lat':('std','mean'),'lon':('std','mean')})
    
    # Filling nan values with 0 (std return 0 if there is only one sample)
    df_homes.fillna(0,inplace = True)
    
    # Compute distance from mean
    # Preparing dataframe
    df_tmp = pd.DataFrame()
    df_tmp['lat1'] = df_homes.lat['mean']-df_homes.lat['std']
    df_tmp['lat2'] = df_homes.lat['mean']+df_homes.lat['std']
    df_tmp['lon1'] = df_homes.lon['mean']-df_homes.lon['std']
    df_tmp['lon2'] = df_homes.lon['mean']+df_homes.lon['std']
    # Compute distance
    df_tmp['home_radius'] = compute_distance(df_tmp,['lat1','lon1','lat2','lon2'])
    # Filter home and keep relevant home (estimated distance between homes checkins < 100m )
    df_homes = df_homes[df_tmp['home_radius']<0.1][[('lat','mean'),('lon','mean')]].copy()
    
    df_homes.columns = df_homes.columns.get_level_values(0)
    
    return df_homes

In [7]:
def construct_training_df(path,sample_frac = 1):
    df_tmp = pd.read_csv(path).sample(frac=sample_frac).drop(columns=['Venue ID','day'])
    
    # Latitude and Longitude correction
    df_tmp.lat = df_tmp.lat.apply(correct_latitude)
    df_tmp.lon = df_tmp.lon.apply(correct_longitude)
    # ---------------------------------------------------------------------------------------#
    # -------------------------------Build Homes dataframe-----------------------------------#
    # ---------------------------------------------------------------------------------------#
    df_homes = df_tmp.loc[df_tmp.place.str.lower().str.contains('home' and 'private')].copy()
    df_homes.drop(columns=['place','local time'],inplace=True)
    # Select relevant homes
    df_homes = select_relevant_homes(df_homes)
    
    # ---------------------------------------------------------------------------------------#
    # ------------------------------Build Checkin dataframe----------------------------------#
    # ---------------------------------------------------------------------------------------#
    
    users = set(df_homes.index.values)
    df_checkins = df_tmp.loc[df_tmp['User ID'].isin(users)].copy()
    df_checkins['local time'] = pd.to_datetime(df_checkins['local time'])
    df_checkins['Is_home'] = df_checkins.place.str.lower().str.contains('home' and 'private')
    
    return df_homes, df_checkins.sort_values(by=['User ID','local time']).reset_index(drop=True)

In [8]:
df_homes, df_checkins  = construct_training_df('data/processed_dataset-003.csv')

In [113]:
def build_clusters_labels(df_user,clustering_method):
    
    df_user['cluster_label'] = clustering_method.fit(np.deg2rad(df_user[['lat','lon']])).labels_
    
    return df_user['cluster_label']

In [248]:
def cleaning_user(df_user):
    
    df_tmp = df_user.merge(df_user.iloc[1:,:].reset_index(drop=True),right_index=True,left_index=True,
                           how='inner')
    df_tmp['dt'] = df_tmp['local time_y'] - df_tmp['local time_x']
    columns = ['lat_x','lon_x','lat_y','lon_y']
    df_tmp['distance'] = compute_distance(df_tmp,columns)
    mask = (np.abs(df_tmp['dt'])>timedelta(minutes=60))&(df_tmp['distance']>0.1)
    
    return df_user.iloc[df_tmp[mask].index,:]
    

In [53]:
def compute_checkin_during_midnight(df_user):
    df_tmp = (df_user['local time'].dt.hour>=0) & (df_user['local time'].dt.hour<7)
    return df_tmp

In [51]:
def compute_last_checkin(df_user):
    # We consider checkin until last
    tmp_date = (df_user['local time']-timedelta(hours=3)).dt.date
    last_checkin = []
    
    # tmp_date is ordered and we can detect the last checkin whenever we detect a change 
    for i in range(len(tmp_date)-1):
        if tmp_date[i]<tmp_date[i+1]:
            last_checkin.append(True)
        else:
            last_checkin.append(False)
    # The last checkin is always True by definition 
    last_checkin.append(True)
    return last_checkin

In [85]:
def compute_last_checkin_with_inactive_midnight(df_user):
    tmp_date = (df_user['local time']-timedelta(hours=7)).dt.date
    tmp_hour = (df_user['local time']).dt.hour
    
    last_checkin_with_inactive_midnight = []
    
    for i in range(len(tmp_date)-1):
        if (tmp_date[i]<tmp_date[i+1]) and (tmp_hour[i]<=23):
            last_checkin_with_inactive_midnight.append(True)
        else:
            last_checkin_with_inactive_midnight.append(False)
    if tmp_hour.iloc[-1]<17:
        last_checkin_with_inactive_midnight.append(True)
    else:
        last_checkin_with_inactive_midnight.append(False)
        
    return last_checkin_with_inactive_midnight

In [98]:
def compute_distance_to_next_checkin(df_user):
    
    df_tmp = df_user.iloc[:-1].merge(df_user.iloc[1:].reset_index(),right_index=True,left_index=True)
    columns = ['lat_x','lon_x','lat_y','lon_y']
    distance_to_next_checkin = compute_distance(df_tmp,columns)
    distance_to_next_checkin = np.append(distance_to_next_checkin,0)
    is_an_edge = df_tmp['cluster_label_x'].values!=df_tmp['cluster_label_y'].values
    
    return distance_to_next_checkin,is_an_edge

In [121]:
def compute_ratio(column):
    return np.sum(column)/len(column)

In [249]:
def build_features(df_checkins):
    
    df_tmp = pd.DataFrame(columns = ['user','cluster_id','CR','MR','EDR','EIDR','PR','RPR'])
    users_id = df_checkins['User ID'].unique()
    grouped_checkins = df_checkins.groupby('User ID')
    KMS_PER_RADIAN = 6371.0088
    clustering_method = DBSCAN(eps=0.1/KMS_PER_RADIAN,metric='haversine')
    
    for user in users_id:
        df_user = cleaning_df_user(df_user).copy()
        df_user = grouped_checkins.get_group(user)
        # Compute cluster_label
        df_user['cluster_label'] = build_clusters_features(df_user,clustering_method)
        # Compute Checkin during midnight
        df_user['checkin_during_midnight'] = compute_checkin_during_midnight(df_user)
        # Compute last checkin
        df_user['last_checkin'] = compute_last_checkin(df_user)
        # Compute last checkin with inactive midnight
        df_user['last_checkin_with_inactive_midnight'] = compute_last_checkin_with_inactive_midnight(df_user)
        # Compute distance to next_checkin and classify edges
        df_user[['distance_to_next_checkin','is_edge']] = compute_distance_to_next_checkin(df_user)
        
        agg_dic = {'cluster_label':compute_ratio,'checkin_during_midnight':compute_ratio,
                    'last_checkin':compute_ratio,'last_checkin_with_inactive_midnight': compute_ratio}
        
        grouped_clusters = df_user.groupby('cluster_label')
        
        features1 = grouped_clusters.agg(agg_dic)
        
        features2 = grouped_clusters.apply(my_fun)
        
        
    return df_tmp

In [70]:
iterator = (sample_user['local time']-timedelta(hours=3)).dt.hour

In [114]:
sample_user['cluster_label'] = build_clusters_labels(sample_user,DBSCAN(eps=0.1/6371.0088,metric='haversine'))

In [120]:
sample_user['cluster_label'].iloc[:-1].values ==  sample_user['cluster_label'].iloc[1:].values

array([ True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False,  True, False, False,  True, False,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
       False, False, False, False,  True,  True,  True,  True,  True,
        True, False, False, False,  True,  True, False, False, False,
        True, False, False, False, False, False, False,  True, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
       False,  True,  True, False, False, False, False, False,  True,
        True,  True,

In [None]:
compute_distance_to_next_checkin(sample_user)

In [54]:
sample_user['last_checkin'] = compute_last_checkin(sample_user)
sample_user['checkin_during_midnight'] = compute_checkin_during_midnight(sample_user)

In [86]:
sample_user['last_checkin_with_inactive_midnight'] = compute_last_checkin_with_inactive_midnight(sample_user)

In [127]:
sample_user.groupby('cluster_label').agg({('last_checkin','checkin_during_midnight'): my_fun})

SpecificationError: Column(s) [('last_checkin', 'checkin_during_midnight')] do not exist

In [123]:
def compute(df_cluster):
    

In [26]:
for i, time in enumerate(iterator):
    print(i,time)

0 14
1 14
2 14
3 14
4 14
5 14
6 14
7 15
8 15
9 15
10 16
11 16
12 16
13 16
14 16
15 16
16 16
17 16
18 16
19 16
20 16
21 16
22 16
23 16
24 17
25 17
26 17
27 17
28 17
29 17
30 17
31 17
32 17
33 17
34 17
35 17
36 17
37 17
38 17
39 18
40 18
41 18
42 18
43 18
44 18
45 18
46 18
47 18
48 18
49 18
50 19
51 19
52 19
53 19
54 19
55 19
56 19
57 19
58 19
59 19
60 19
61 19
62 19
63 20
64 20
65 20
66 20
67 20
68 20
69 20
70 20
71 21
72 21
73 21
74 21
75 21
76 21
77 22
78 22
79 22
80 22
81 22
82 22
83 22
84 22
85 22
86 23
87 23
88 23
89 23
90 23
91 23
92 23
93 23
94 23
95 23
96 23
97 23
98 24
99 24
100 24
101 24
102 24
103 24
104 24
105 24
106 25
107 26
108 26
109 26
110 26
111 27
112 27
113 27
114 27
115 27
116 27
117 27
118 27
119 27
120 27
121 28
122 28
123 28
124 28
125 28
126 28
127 28
128 28
129 28
130 28
131 28
132 29
133 29
134 29
135 30
136 30
137 31
138 32
139 32
140 32
141 33
142 33
143 33
144 33
145 33
146 33
147 33
148 36
149 36
150 36
151 36
152 37
153 37
154 41
155 41
156 41
157 42
158 

In [247]:
sample_user

Unnamed: 0,User ID,local time,lat,lon,place,country,Is_home,lat_rad,lon_rad
1,19,2012-04-03 18:12:28+00:00,40.720787,-74.001975,Shoe Store,US,False,0.710712,-1.291578
2,19,2012-04-03 21:26:31+00:00,40.726452,-74.002260,American Restaurant,US,False,0.710811,-1.291583
3,19,2012-04-04 19:50:42+00:00,40.721941,-73.996259,Bar,US,False,0.710732,-1.291478
4,19,2012-04-07 11:49:00+00:00,42.152078,-71.412678,Cafï¿½,US,False,0.735693,-1.246386
5,19,2012-04-07 21:45:45+00:00,42.232172,-71.179776,American Restaurant,US,False,0.737090,-1.242321
...,...,...,...,...,...,...,...,...,...
405,19,2013-12-05 21:03:21+00:00,40.722761,-73.982839,Mexican Restaurant,US,False,0.710746,-1.291244
406,19,2013-12-21 14:33:26+00:00,41.953211,-74.048901,Sporting Goods Shop,US,False,0.732222,-1.292397
407,19,2013-12-21 20:51:25+00:00,41.950235,-74.040170,Steakhouse,US,False,0.732170,-1.292245
409,19,2014-01-10 10:05:40+00:00,40.725292,-73.992146,Coffee Shop,US,False,0.710790,-1.291407


In [1]:
sample_user

NameError: name 'sample_user' is not defined

In [9]:
sample_user = df_checkins.groupby('User ID').get_group(df_homes.index[0]).copy()

In [234]:
test = sample_user.merge(sample_user.iloc[1:,:].reset_index(drop=True),right_index=True,left_index=True)

In [222]:
sample_user.iloc[:-1,:].reset_index()

Unnamed: 0,index,User ID,local time,lat,lon,place,country,Is_home,lat_rad,lon_rad
0,0,19,2012-04-03 18:08:02+00:00,40.720582,-74.002051,Clothing Store,US,False,0.710708,-1.291579
1,1,19,2012-04-03 18:12:28+00:00,40.720787,-74.001975,Shoe Store,US,False,0.710712,-1.291578
2,2,19,2012-04-03 21:26:31+00:00,40.726452,-74.002260,American Restaurant,US,False,0.710811,-1.291583
3,3,19,2012-04-04 19:50:42+00:00,40.721941,-73.996259,Bar,US,False,0.710732,-1.291478
4,4,19,2012-04-07 11:49:00+00:00,42.152078,-71.412678,Cafï¿½,US,False,0.735693,-1.246386
...,...,...,...,...,...,...,...,...,...,...
406,406,19,2013-12-21 14:33:26+00:00,41.953211,-74.048901,Sporting Goods Shop,US,False,0.732222,-1.292397
407,407,19,2013-12-21 20:51:25+00:00,41.950235,-74.040170,Steakhouse,US,False,0.732170,-1.292245
408,408,19,2014-01-09 09:54:13+00:00,40.725292,-73.992146,Coffee Shop,US,False,0.710790,-1.291407
409,409,19,2014-01-10 10:05:40+00:00,40.725292,-73.992146,Coffee Shop,US,False,0.710790,-1.291407


In [224]:
test.head()

Unnamed: 0,User ID_x,local time_x,lat_x,lon_x,place_x,country_x,Is_home_x,lat_rad_x,lon_rad_x,User ID_y,local time_y,lat_y,lon_y,place_y,country_y,Is_home_y,lat_rad_y,lon_rad_y
0,19,2012-04-03 18:08:02+00:00,40.720582,-74.002051,Clothing Store,US,False,0.710708,-1.291579,19,2012-04-03 18:12:28+00:00,40.720787,-74.001975,Shoe Store,US,False,0.710712,-1.291578
1,19,2012-04-03 18:12:28+00:00,40.720787,-74.001975,Shoe Store,US,False,0.710712,-1.291578,19,2012-04-03 21:26:31+00:00,40.726452,-74.00226,American Restaurant,US,False,0.710811,-1.291583
2,19,2012-04-03 21:26:31+00:00,40.726452,-74.00226,American Restaurant,US,False,0.710811,-1.291583,19,2012-04-04 19:50:42+00:00,40.721941,-73.996259,Bar,US,False,0.710732,-1.291478
3,19,2012-04-04 19:50:42+00:00,40.721941,-73.996259,Bar,US,False,0.710732,-1.291478,19,2012-04-07 11:49:00+00:00,42.152078,-71.412678,Cafï¿½,US,False,0.735693,-1.246386
4,19,2012-04-07 11:49:00+00:00,42.152078,-71.412678,Cafï¿½,US,False,0.735693,-1.246386,19,2012-04-07 21:45:45+00:00,42.232172,-71.179776,American Restaurant,US,False,0.73709,-1.242321


In [230]:
np.abs(test['local time_y']-test['local time_x'])<timedelta(minutes=60)

0       True
1      False
2      False
3      False
4      False
       ...  
406    False
407    False
408    False
409    False
410    False
Length: 411, dtype: bool

In [205]:
test = build_features(df_checkins)

In [207]:
test[test.Is_home ==True]

Unnamed: 0,User ID,local time,lat,lon,place,country,Is_home,lat_rad,lon_rad,cluster_label,CR,MR
76,19,2012-05-25 21:10:01+00:00,40.980988,-72.123841,Home (private),US,True,0.715253,-1.258798,-1,0.759709,0.070288
393,19,2013-09-02 08:55:48+00:00,40.980988,-72.123841,Home (private),US,True,0.715253,-1.258798,-1,0.759709,0.070288
466,58,2012-11-26 18:54:03+00:00,29.933542,-90.098043,Home (private),US,True,0.522439,-1.572507,-1,0.951049,0.044118
597,190,2012-10-31 21:03:47+00:00,-23.535157,-46.665706,Home (private),BR,True,-0.410766,-0.814470,-1,0.903614,0.146667
1999,256,2013-03-26 05:38:51+00:00,36.958127,-121.987122,Home (private),US,True,0.645041,-2.129077,-1,0.192391,0.041045
...,...,...,...,...,...,...,...,...,...,...,...,...
2695,277,2013-03-30 18:21:30+00:00,45.425242,-122.553632,Home (private),US,True,0.792820,-2.138964,2,0.086294,0.205882
2730,277,2013-08-02 00:53:46+00:00,45.425242,-122.553632,Home (private),US,True,0.792820,-2.138964,2,0.086294,0.205882
2757,277,2013-09-03 23:36:09+00:00,45.425242,-122.553632,Home (private),US,True,0.792820,-2.138964,2,0.086294,0.205882
2784,277,2013-10-09 23:59:15+00:00,45.425242,-122.553632,Home (private),US,True,0.792820,-2.138964,2,0.086294,0.205882


In [124]:
sample_user = df_checkins.groupby('User ID').get_group(df_homes.index[0]).copy()

In [125]:
classifier = DBSCAN(eps=0.1,metric='haversine').fit(sample_user[['lat','lon']])

In [123]:
compute_MR(sample_user)[compute_MR(sample_user).Is_home == True]

KeyError: 'cluster_label'

In [41]:
sample_user.groupby('cluster_label').agg({'cluster_label':'count'})

Unnamed: 0_level_0,cluster_label
cluster_label,Unnamed: 1_level_1
-1,47
0,221
1,29
2,9
3,9
4,24
5,10
6,8
7,17
8,8


In [28]:
len(classifier.labels_)

412

In [107]:
sample_user.lat/sample_user.lon

0     -0.550263
1     -0.550266
2     -0.550341
3     -0.550324
4     -0.590260
         ...   
407   -0.566588
408   -0.550400
409   -0.550400
410   -0.364233
411   -0.550353
Length: 412, dtype: float64

In [15]:
df_homes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28199 entries, 19 to 2186144
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   lat     28199 non-null  float64
 1   lon     28199 non-null  float64
dtypes: float64(2)
memory usage: 660.9 KB


In [16]:
df_homes.head()

Unnamed: 0_level_0,lat,lon
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1
19,40.980988,-72.123841
58,29.933542,-90.098043
190,-23.535157,-46.665706
256,36.958127,-121.987122
277,45.425242,-122.553632


In [17]:
df_checkins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4936492 entries, 0 to 4936491
Data columns (total 7 columns):
 #   Column      Dtype              
---  ------      -----              
 0   User ID     int64              
 1   local time  datetime64[ns, UTC]
 2   lat         float64            
 3   lon         float64            
 4   place       object             
 5   country     object             
 6   Is_home     bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(2), int64(1), object(2)
memory usage: 230.7+ MB


In [18]:
df_checkins.head()

Unnamed: 0,User ID,local time,lat,lon,place,country,Is_home
0,19,2012-04-03 18:08:02+00:00,40.720582,-74.002051,Clothing Store,US,False
1,19,2012-04-03 18:12:28+00:00,40.720787,-74.001975,Shoe Store,US,False
2,19,2012-04-03 21:26:31+00:00,40.726452,-74.00226,American Restaurant,US,False
3,19,2012-04-04 19:50:42+00:00,40.721941,-73.996259,Bar,US,False
4,19,2012-04-07 11:49:00+00:00,42.152078,-71.412678,Cafï¿½,US,False


In [133]:
df_checkins['User ID'].unique()

array([     19,      58,     190, ..., 2184709, 2185485, 2186144])

In [19]:
tmp = df_checkins.groupby('User ID').agg({'Is_home':('count','sum')})

In [20]:
tmp.describe()

Unnamed: 0_level_0,Is_home,Is_home
Unnamed: 0_level_1,count,sum
count,28199.0,28199.0
mean,175.059116,10.407426
std,158.296487,21.381624
min,74.0,1.0
25%,99.0,1.0
50%,131.0,3.0
75%,196.0,11.0
max,5337.0,829.0


In [None]:
df_homes

In [None]:
df_checkins

In [None]:
cols=['User ID','Venue ID','Venue category ID','Venue category name','Latitude','Longitude','Timezone offset','UTC time']

In [None]:
df_tsmc

In [None]:
grouped = df_tsmc.groupby('Venue category name').count()

In [None]:
grouped.loc[grouped.index.str.contains('Home')]

In [None]:
grouped.index