# Stay Point Extraction

======================================================
##### This code filters raw call detail record (CDR) data and removes noise from it in order to identify people's stop locations. It also implements an algorithm that finds each individual's home and work locations based on how frequently their stop points occur at different times of the day.
=======================================================

In [None]:
from sklearn.metrics.pairwise import haversine_distances         
from geopy.distance import geodesic                              
import numpy as np                                               
from sklearn.cluster import AgglomerativeClustering
import dask.dataframe as dd
import geopandas as gpd
import time
import pandas as pd 

from dask.distributed import Client, LocalCluster
# Start a local Dask cluster with 6 workers, 2 threads each, and a '2GB'
# memory_limit (Dask applies this limit per worker).
cluster = LocalCluster(n_workers=6, 
                       threads_per_worker=2,
                       memory_limit='2GB')
import warnings
# Suppress all warnings from this point on (the cluster above was created
# before the filter was installed, so its start-up warnings are still shown).
warnings.filterwarnings('ignore')
# Attach a client to the cluster created above.
client = Client(cluster)

In [None]:
client

## Reading the data

In [None]:
# Load the raw CDR table as a Dask dataframe; Date_Time is parsed as a
# timestamp and Lat/Long/ID get explicit dtypes.
callsDataset = dd.read_csv(
    r'E:\***\raw_data.csv',
    parse_dates=['Date_Time'],
    dtype={'Lat': 'float', 'Long': 'float', 'ID': 'int'},
)
# Index by Date_Time and immediately turn it back into a column.
# NOTE(review): presumably done for the time-sorting side effect of Dask's
# set_index -- confirm.
callsDataset = callsDataset.set_index('Date_Time').reset_index()

# Zone polygons used by agg_cluster for the spatial join; the 'Area'
# attribute is dropped.
shapeFile = gpd.read_file(r'E:\***\tehran.shp').drop('Area', axis=1)

# One group of records per ID.
callsDatasetgp = callsDataset.groupby('ID')

## Functions definition

In [None]:
def medoid(candidateSet=None):
    """
    Return the medoid record of a candidate set, extended with stay timing.

    A candidate set is a time-ordered list of records of one person that are
    close to each other. Each record is an array-like whose element 0 is the
    timestamp and whose elements 2:4 are (Lat, Long) in degrees.

    The medoid is the record whose summed haversine distance to all other
    records of the set is smallest.

    Returns:
        The medoid record with two values appended: the timestamp of the
        first record of the set and the time difference between the last and
        the first record.

    Raises:
        ValueError: if candidateSet is missing or empty.
    """
    if candidateSet is None or len(candidateSet) == 0:
        raise ValueError("candidateSet must contain at least one point")

    # haversine_distances expects (lat, lon) pairs in radians.
    coords_rad = np.radians(
        np.array([point[2:4] for point in candidateSet], dtype=float)
    )
    # Scaling by the Earth radius is not needed here: it does not change
    # which row has the smallest distance sum.
    medIndex = np.argmin(haversine_distances(coords_rad).sum(axis=1))

    stay_start = candidateSet[0][0]
    stay_duration = candidateSet[-1][0] - stay_start
    return np.append(candidateSet[medIndex], [stay_start, stay_duration])


def agg_cluster(stopPoints=None):
    """
    Cluster one person's stop points into stay points and attach zone data.

    Stop points that lie close together (average-linkage hierarchical
    clustering on great-circle distances, cut at 1000 m) are treated as one
    stay location: the Lat/Long of every member is replaced by the Lat/Long
    of the cluster medoid. For example, a home location seen in the morning
    and again in the evening ends up with a single coordinate pair.

    Args:
        stopPoints: list of records with the fields
            (Date_Time, ID, Lat, Long, StayStart, Duration); Lat/Long in
            degrees.

    Returns:
        A DataFrame of the stop points. With more than one stop point it also
        carries a 'cluster' label column and the attribute columns of the
        module-level `shapeFile` polygon containing each point (missing
        values are filled with 0). With zero or one stop point the plain
        six-column frame is returned without clustering or spatial join.
    """
    # Mean Earth radius in metres (the original 6731000 had two digits swapped).
    earth_radius_m = 6371000

    if stopPoints is None:
        stopPoints = []
    stopPointsModifiedTemp = pd.DataFrame(stopPoints, columns=['Date_Time','ID','Lat','Long','StayStart','Duration'])

    if len(stopPoints) <= 1:
        return stopPointsModifiedTemp

    # haversine_distances needs (lat, lon) in radians and returns angular
    # distances; the conversion must happen on the inputs, not on the result.
    coords_deg = stopPointsModifiedTemp[['Lat', 'Long']].to_numpy(dtype=float)
    dist = haversine_distances(np.radians(coords_deg)) * earth_radius_m

    # NOTE(review): newer scikit-learn releases rename `affinity` to `metric`,
    # and newer geopandas releases rename sjoin's `op` to `predicate`; the
    # original keyword names are kept -- confirm against installed versions.
    Agg = AgglomerativeClustering(n_clusters=None, affinity='precomputed', linkage='average', distance_threshold=1000).fit(dist)
    labels = Agg.labels_
    stopPointsModifiedTemp['cluster'] = labels

    # Snap every member of a cluster onto the cluster medoid so that all
    # visits to the same place share one exact coordinate pair.
    for clusterNo in np.unique(labels):
        members = np.flatnonzero(labels == clusterNo)
        within_cluster = dist[np.ix_(members, members)]
        medoid_pos = members[np.argmin(within_cluster.sum(axis=1))]
        medoid_lat, medoid_long = coords_deg[medoid_pos]
        stopPointsModifiedTemp.loc[labels == clusterNo, 'Lat'] = medoid_lat
        stopPointsModifiedTemp.loc[labels == clusterNo, 'Long'] = medoid_long

    # Spatial join: attach the attributes of the polygon each point falls in.
    stopPointsModifiedTemp = gpd.GeoDataFrame(stopPointsModifiedTemp, geometry=gpd.points_from_xy(x=stopPointsModifiedTemp.Long, y=stopPointsModifiedTemp.Lat), crs={'init': 'epsg:4326'})
    stopPointsModifiedTemp = gpd.sjoin(stopPointsModifiedTemp, shapeFile, how='left', op='within')
    stopPointsModifiedTemp.drop(columns=['index_right', 'geometry'], inplace=True)
    # Points outside every polygon of the shapefile have no match; mark with 0.
    stopPointsModifiedTemp.fillna(float(0), inplace=True)

    return stopPointsModifiedTemp


def _most_frequent_location(points):
    """Return the most frequent (Lat, Long) pair in `points`, or None if there is none."""
    if points.empty:
        return None
    visits = points.groupby(['Lat', 'Long']).size()
    if visits.empty:
        return None
    # idxmax on the (Lat, Long) MultiIndex yields the pair itself, so Lat and
    # Long always come from the same real location.
    return visits.idxmax()


def HWO_finder(notLabeledStopPoints):
    """
    Label each stay point of one person as 'Home', 'Work' or 'Other'.

    Rules (Fridays are ignored when choosing locations -- presumably the
    local weekend, confirm):
      * Home: the (Lat, Long) pair visited most often between 19:00 and 7:00.
      * Work: the pair visited most often between 7:00 and 19:00 among the
        points that are not Home. The label is withdrawn if Work was seen
        less than once per distinct week number on average.
      * Other: everything else.

    Args:
        notLabeledStopPoints: DataFrame with at least the columns
            'StayStart' (datetime), 'Lat' and 'Long'. It is not modified.

    Returns:
        A copy of the input indexed by 'StayStart', with the added columns
        'Day' (weekday name), 'Week' (ISO week number) and 'Purpose'.
    """
    labeled = notLabeledStopPoints.copy()
    stay_start = labeled['StayStart']
    labeled['Day'] = stay_start.dt.day_name()
    # `.dt.week` was removed in pandas 2; prefer isocalendar() when the
    # installed pandas provides it and fall back otherwise.
    isocalendar = getattr(stay_start.dt, 'isocalendar', None)
    labeled['Week'] = isocalendar().week if isocalendar is not None else stay_start.dt.week
    labeled.set_index('StayStart', inplace=True)
    labeled['Purpose'] = 'Other'

    not_friday = labeled['Day'] != 'Friday'

    home = _most_frequent_location(
        labeled.loc[not_friday, ['Lat', 'Long']].between_time('19:00', '7:00')
    )
    if home is not None:
        home_lat, home_long = home
        labeled.loc[(labeled['Lat'] == home_lat) & (labeled['Long'] == home_long), 'Purpose'] = 'Home'

    not_home = labeled['Purpose'] != 'Home'
    work = _most_frequent_location(
        labeled.loc[not_friday & not_home, ['Lat', 'Long']].between_time('7:00', '19:00')
    )
    if work is not None:
        work_lat, work_long = work
        is_work = (labeled['Lat'] == work_lat) & (labeled['Long'] == work_long)
        # Keep the label only if the place was seen at least once per week on
        # average; comparing counts avoids dividing by the number of weeks.
        if is_work.sum() >= labeled['Week'].nunique():
            labeled.loc[is_work, 'Purpose'] = 'Work'

    return labeled



def dataclust(uniqIdCalls, max_gap_m=500, min_stay_s=600):
    """
    Extract the labelled stay points of one person from their call records.

    Consecutive records are grouped into a candidate set while each record is
    within `max_gap_m` metres (geodesic) of the previous one. A candidate set
    whose first and last records are more than `min_stay_s` seconds apart
    becomes a stop point (its medoid). The stop points are then clustered
    (agg_cluster) and labelled Home/Work/Other (HWO_finder).

    Args:
        uniqIdCalls: DataFrame with the records of a single ID; column 0 must
            be the timestamp and columns 2:4 must be (Lat, Long). The rows
            are processed in the given order.
        max_gap_m: maximum distance in metres between consecutive records of
            the same candidate set.
        min_stay_s: minimum time span in seconds a candidate set must cover
            to count as a stop.

    Returns:
        The labelled stay points, or an empty DataFrame when the person has
        no stop points or Home was observed at most once per distinct week
        number on average.
    """
    empty_result = pd.DataFrame(columns=['Date_Time','ID','Lat','Long','Duration','cluster', 'manategh','Day','Week','Purpose'])

    records = uniqIdCalls.to_numpy()
    if len(records) == 0:
        return empty_result

    stopPoints = []

    def close_candidate(candidate):
        # total_seconds() covers the whole span; `.seconds` would silently
        # drop whole days (a stay of 1 day + 5 min would read as 300 s).
        dwell = candidate[-1][0] - candidate[0][0]
        if dwell.total_seconds() > min_stay_s:
            stopPoints.append(medoid(candidate))

    candidateSet = [records[0]]
    for previous, current in zip(records, records[1:]):
        if geodesic(previous[2:4], current[2:4]).meters <= max_gap_m:
            candidateSet.append(current)
        else:
            close_candidate(candidateSet)
            candidateSet = [current]
    # The last candidate set is still open when the loop ends.
    close_candidate(candidateSet)

    if not stopPoints:
        return empty_result

    stopPointsLabeled = HWO_finder(agg_cluster(stopPoints))

    # Keep the person only if Home shows up more than once per week on average.
    home_visits = (stopPointsLabeled.Purpose == 'Home').sum()
    if home_visits > len(stopPointsLabeled.Week.unique()):
        return stopPointsLabeled
    return empty_result



def CDR(x):
    """
    Count one person towards the zone of their Home stay point.

    Args:
        x: the stay points of a single person (one group of
            `stay_points.groupby('ID')`), with the columns 'Purpose' and
            'manategh' (zone id).

    Side effect:
        Increments the module-level `usersCDRTracts` counter at the zone id
        of the person's first Home row. People without a Home row are skipped.
    """
    # Use the group that was passed in; reading the global `stay_points`
    # table here would count every person in the same zone.
    home_rows = x.loc[x['Purpose'] == 'Home']
    if home_rows.empty:
        return
    usersCDRTracts[int(home_rows['manategh'].iloc[0])] += 1

## Running Using Dask

In [None]:
# Run dataclust once per ID on the Dask workers and collect the result as a
# single pandas DataFrame.
# NOTE(review): several meta dtypes ('f8' for Date_Time, Day and Purpose) do
# not look like what dataclust returns -- confirm whether meta should be fixed.
stay_points = callsDatasetgp.apply(dataclust,meta={'Date_Time':'f8','ID':'int','Lat':'float','Long':'float','Duration':'f8','cluster':'int', 'manategh':'int','Day':'f8','Week':'int','Purpose':'f8'}).compute()
stay_points.to_csv('E:stay_points.csv',index = True)

# Drop the 'ID' index level (ID is also a regular column) and turn the
# remaining index into a column, then regroup per person.
stay_points = stay_points.droplevel('ID').reset_index()
# Group the stay-point table itself; numpy has no groupby.
stay_points_gp = stay_points.groupby('ID')

# One counter per zone id, filled in place by CDR.
usersCDRTracts=np.zeros((23,1))
stay_points_gp.apply(CDR)

CDR_tract_pop = pd.DataFrame(usersCDRTracts,columns=['Pop'])
CDR_tract_pop.to_csv('E:CDR_tract_pop.csv' , index = False)