In [1]:
import pandas as pd
import numpy as np
from haversine import haversine_vector, Unit
from sklearn.cluster import DBSCAN
from datetime import timedelta
import networkx as nx

In [2]:
def correct_latitude(lat):
    """
    This function corrects for out of range latitude.
    
    Input: 
    -- lat: latitude coordinates in °
    Output: 
    -- lat: latitude coordinates put between -90 and 90°
    """
    while lat>90 or lat<-90:
        if lat>90:
            lat = -(lat-180)
        elif lat<-90:
            lat = -(lat+180)
    return lat

In [3]:
def correct_longitude(long):
    """
    This function corrects for out of range longitude.
    
    Input: 
    -- long: longitude coordiantes in °
    Output: 
    -- long: longitude coordinates put between -180 and 180°
    """
    while long>180 or long<-180:
        if long>180:
            long = long - 360
        elif long<-180:
            long = long +360
    return long

In [182]:
def compute_distance(df,columns):
    '''
    This function computes the distance between two geographic coordinates for a given dataframe.
    
    Input: 
        - df: Dataframe containing 4 columns latitude1, longitude1, latitude2 and longitude2
        - columns: list of columns [latitude1, longitude1, latitude2 and longitude2]
        
    Output: 
        - numpy array containing the distance between geographic coordinates of each row
    '''
    points1 = list(zip(df[columns[0]],df[columns[1]]))
    points2 = list(zip(df[columns[2]],df[columns[3]]))
    # Use harvesine_vector to compute the distance between points
    return np.round(haversine_vector(points1,points2,Unit.KILOMETERS),decimals=3)

In [183]:
def select_relevant_homes(df_homes):
    '''
    This function selects relevant homes. We consider a home location as relevant if it's latitude and
    longitude doesn't "vary much". To measure this variation, we simply compute the mean and the std of
    the latitude and longitude of homes for every user. Then we construct 4 points as follow:
        - by adding and substracting the standard deviation of the latitude and longitude from their
        respective mean
        - Measure the diagonal in KM
        - If the diagonal is less than 100m we can assume with confidence that the mean is indeed the
        home location
    Input:
        - df_homes: A dataframe containing all checkins labled as Home
    Output:
        - df_homes: Home location for each user
    '''
    
    # Grouping df_homes according to the user id and compute std and mean for lat and lon
    df_homes = df_homes.groupby('User ID').agg({'lat':('std','mean'),'lon':('std','mean')})
    
    # Filling nan values with 0 (std return 0 if there is only one sample)
    df_homes.fillna(0,inplace = True)
    
    # Construct the diagonal points
    df_tmp = pd.DataFrame()
    df_tmp['lat1'] = df_homes.lat['mean']-df_homes.lat['std']
    df_tmp['lat2'] = df_homes.lat['mean']+df_homes.lat['std']
    df_tmp['lon1'] = df_homes.lon['mean']-df_homes.lon['std']
    df_tmp['lon2'] = df_homes.lon['mean']+df_homes.lon['std']
    
    # Compute diagonal length
    df_tmp['home_radius'] = compute_distance(df_tmp,['lat1','lon1','lat2','lon2'])
    
    # Filter home and keep relevant home (estimated distance between homes checkins < 100m )
    df_homes = df_homes[df_tmp['home_radius']<0.1][[('lat','mean'),('lon','mean')]].copy()
    
    # Flatten df_homes columns
    df_homes.columns = df_homes.columns.get_level_values(0)
    
    return df_homes

In [286]:
def construct_df_checkins(path,sample_frac = 1):
    '''
    This function takes the path of the raw data, import it and construct a checkin dataframe where
    all users have at least 5 checkins and 1 home location
    
    Input:
        - Path: the Path of the file containing the data
        - sample_frac: sample fraction from the raw dataframe
    Output:
        - df_checkins: Checkin dataframe where all users have at least 5 checkins and 1 home location
    '''
    
    # Read data from the file and drop unnecessary columns
    df_tmp = pd.read_csv(path).sample(frac=sample_frac).drop(columns=['Venue ID','day'])
    
    # Latitude and Longitude correction
    df_tmp.lat = df_tmp.lat.apply(correct_latitude)
    df_tmp.lon = df_tmp.lon.apply(correct_longitude)
    
    # Construct df_homes and select only relevant homes
    df_homes = df_tmp.loc[df_tmp.place.str.lower().str.contains('home' and 'private')].copy()
    df_homes = select_relevant_homes(df_homes)
    
    # Select users with relevant homes from the raw data
    df_tmp = df_tmp.loc[df_tmp['User ID'].isin(df_homes.index)].copy()
    
    # Count the number of checkins for each user
    df_tmp_grouped = df_tmp.groupby('User ID').agg({'User ID':'count'})
    
    # Define a set containing users with at least 5 checkins
    users = set(df_tmp_grouped[df_tmp_grouped['User ID']>5])
    
    # Construct df_checkins
    df_checkins = df_tmp.loc[df_tmp['User ID'].isin(users)].copy()
    
    # Convert 'local time' attribute to a pandas datetime
    df_checkins['local time'] = pd.to_datetime(df_checkins['local time'])
    
    # Label Homes
    df_checkins['Is_home'] = df_checkins.place.str.lower().str.contains('home' and 'private')
    
    # Drop unnecessary column
    df_checkins.drop(columns = ['place'],inplace = True)
    
    return df_checkins.sort_values(by=['User ID','local time']).reset_index(drop=True)

In [288]:
if False:
    df_checkins  = construct_df_checkins('data/processed_dataset-003.csv')

In [186]:
df_checkins.head()

Unnamed: 0,User ID,local time,lat,lon,country,Is_home
0,15,2012-04-25 14:12:50+00:00,36.292377,-119.325095,US,False
1,15,2012-04-25 14:56:17+00:00,36.323795,-119.348035,US,False
2,15,2012-04-25 16:56:45+00:00,36.312187,-119.313529,US,False
3,15,2012-04-27 08:16:23+00:00,36.292377,-119.325095,US,False
4,15,2012-04-27 08:16:54+00:00,36.292787,-119.325866,US,False


In [187]:
def build_clusters_labels(df_user,clustering_method):
    '''
    This function clusters the checkins for a single user.
    Input:
        - df_user: a dataframe containing the latitude and longitude for each checkin
        - clustering_method: DBSCAN, we define this parameter to avoid unnecessary initialisations
        when calling this funcrion
    Output:
        - clusters_labels: cluster label assigned to each checkin
    '''
    cluster_lables = clustering_method.fit(np.deg2rad(df_user[['lat','lon']])).labels_
    
    return cluster_lables

In [519]:
def cleaning_user(df_user):
    '''
    To avoid biasing the dataset with multiple checkins in a small period of time or small distance traveled, 
    we drop checkins within 
    
    '''
    df_tmp = df_user.reset_index().merge(df_user.iloc[1:].reset_index(drop=True),right_index=True,
                                         left_index=True,how='inner')
    df_tmp['dt'] = df_tmp['local time_y'] - df_tmp['local time_x']
    
    columns = ['lat_x','lon_x','lat_y','lon_y']
    df_tmp['distance'] = compute_distance(df_tmp,columns)
    mask = (df_tmp['dt']!=timedelta(0))&((df_tmp['dt']>timedelta(hours=1))|(df_tmp['distance']>0.1))
    
    #print(df_tmp[['dt','distance']],mask)
    return df_user.reset_index().iloc[df_tmp[mask].index]
    

In [520]:
def compute_checkin_during_midnight(df_user):
    df_tmp = (df_user['local time'].dt.hour>=0) & (df_user['local time'].dt.hour<7)
    return df_tmp

In [521]:
def compute_last_checkin(df_user):
    # We consider checkin until last
    tmp_date = (df_user['local time']-timedelta(hours=3)).dt.date.values
    last_checkin = []
    # tmp_date is ordered and we can detect the last checkin whenever we detect a change 
    for i in range(len(tmp_date)-1):
        if tmp_date[i]<tmp_date[i+1]:
            last_checkin.append(True)
        else:
            last_checkin.append(False)
    # The last checkin is always True by definition 
    last_checkin.append(True)
    return last_checkin

In [522]:
def compute_last_checkin_with_inactive_midnight(df_user):
    tmp_date = (df_user['local time']-timedelta(hours=7)).dt.date.values
    tmp_hour = (df_user['local time']).dt.hour.values
    
    last_checkin_with_inactive_midnight = []
    
    for i in range(len(tmp_date)-1):
        if (tmp_date[i]<tmp_date[i+1]) and (tmp_hour[i]<=23):
            last_checkin_with_inactive_midnight.append(True)
        else:
            last_checkin_with_inactive_midnight.append(False)
    if tmp_hour[-1]<17:
        last_checkin_with_inactive_midnight.append(True)
    else:
        last_checkin_with_inactive_midnight.append(False)
        
    return last_checkin_with_inactive_midnight

In [523]:
def compute_dt_to_next_checkin(df_user):
    
    checkin_time = df_user['local time'].values
    delta_time = checkin_time[1:]-checkin_time[:-1]
    delta_time = delta_time.astype(float)/(1e9*3600)

    return np.append(delta_time,None)

In [524]:
def compute_ratio(column):
    return np.sum(column)/len(column)

In [525]:
def build_features(df_checkins, sample_frac = 1):
    
    df_checkins  = construct_df_checkins('data/processed_dataset-003.csv',sample_frac=sample_frac)
    
    df_tmp = pd.DataFrame(columns = ['user','cluster_id','CR','MR','EDR','EIDR','PR','RPR','Is_home'])
    
    users_id, counts = np.unique(df_checkins['User ID'].values,return_counts=True)
    grouped_checkins = df_checkins.groupby('User ID')
    KMS_PER_RADIAN = 6371.0088
    PRECISION = 0.1
    clustering_method = DBSCAN(eps=PRECISION/KMS_PER_RADIAN,metric='haversine')
    
    for user in users_id:
        #print('******************************',counts[i])
        df_user = grouped_checkins.get_group(user)
        df_user = cleaning_user(df_user).copy()
        
        if len(df_user)>1:
            # Compute cluster_label
            df_user['cluster_label'] = build_clusters_labels(df_user,clustering_method)
            # Compute Checkin during midnight
            df_user['checkin_during_midnight'] = compute_checkin_during_midnight(df_user)
            # Compute last checkin
            df_user['last_checkin'] = compute_last_checkin(df_user)
            # Compute last checkin with inactive midnight
            df_user['last_checkin_with_inactive_midnight'] = compute_last_checkin_with_inactive_midnight(df_user)
            # Compute distance to next_checkin and classify edges
            df_user['dt_to_next_checkin'] = compute_dt_to_next_checkin(df_user)

            agg_dic = {'cluster_label':'count','checkin_during_midnight':compute_ratio,
                        'last_checkin':compute_ratio,'last_checkin_with_inactive_midnight': compute_ratio,
                        'Is_home': 'sum'}
            rename_dic = {'cluster_label':'CR','checkin_during_midnight':'MR','last_checkin':'EDR',
                          'last_checkin_with_inactive_midnight':'EIDR'}

            grouped_clusters = df_user.groupby('cluster_label')

            features = grouped_clusters.agg(agg_dic).rename(columns = rename_dic)
            features['user'] = user
            features['CR'] = features['CR']/features['CR'].sum()
            features['Is_home'] = features['Is_home'] == features['Is_home'].max()
            features['PR'],features['RPR'] = compute_PR_RPR(df_user)
            
            df_tmp = df_tmp.append(features)
        
    df_tmp['cluster_id'] = df_tmp.index 
    return df_tmp.reset_index(drop=True)

In [None]:
def compute_PR_RPR(df_user):
    df_tmp = df_user.reset_index().iloc[:-1].merge(df_user.iloc[1:].reset_index(),
                                                    right_index=True,left_index=True)
    df_tmp['inverse_time'] = 1/df_tmp['dt_to_next_checkin_x']
    df_graph = df_tmp.groupby(['cluster_label_x','cluster_label_y'],as_index = False).agg({'inverse_time':'sum'})
    
    G = nx.DiGraph()
    RG = nx.DiGraph()
    for i, row in df_graph.iterrows():
        G.add_edge(int(row['cluster_label_x']),int(row['cluster_label_y']),weight=row['inverse_time'])
        RG.add_edge(int(row['cluster_label_y']),int(row['cluster_label_x']),weight=row['inverse_time'])

    return list(nx.pagerank(G, weight='weight').values()), list(nx.pagerank(RG, weight='weight').values())

In [526]:
df_training = build_features(df_checkins)

In [527]:
df_training

Unnamed: 0,user,cluster_id,CR,MR,EDR,EIDR,PR,RPR,Is_home
0,15,-1,0.473373,0.012500,0.150000,0.150000,0.147836,0.394836,False
1,15,0,0.035503,0.000000,0.000000,0.000000,0.039053,0.036291,False
2,15,1,0.053254,0.000000,0.333333,0.333333,0.036535,0.041733,False
3,15,2,0.177515,0.300000,0.300000,0.333333,0.380939,0.246896,False
4,15,3,0.130178,0.318182,0.272727,0.272727,0.341692,0.203941,True
...,...,...,...,...,...,...,...,...,...
600331,2199190,-1,0.762887,0.027027,0.567568,0.554054,0.765123,0.472843,True
600332,2199190,0,0.061856,0.166667,0.833333,0.833333,0.052372,0.030383,False
600333,2199190,1,0.061856,0.166667,0.666667,0.666667,0.065619,0.323668,False
600334,2199190,2,0.051546,0.000000,0.200000,0.200000,0.057655,0.096595,False


In [528]:
df_training.to_csv('training_dataset.csv')