In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import time
from geopandas import GeoDataFrame
from utils.basefuncs import preProcessDataFrame
from utils.trajectoryClass import Trajectory
from utils.uuid import UUIDCollection
from utils.stopCollection import stopCollection,ExtractAndOrganizeData
from utils.gravityModel import POIgdf,activityMapper,gravityModel
from datetime import datetime,timedelta
import movingpandas as mpd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import folium

In [None]:
warnings.filterwarnings("ignore")

In [None]:
# Project and data locations. The defaults are the original machine-specific
# directories; set POI_PROJECT_DIR / POI_DATA_DIR in the environment to run
# the notebook on another machine without editing this cell.
path = os.environ.get("POI_PROJECT_DIR", r"D:\MSc\MSBD 5014\FindingPointsOfInterestFromGPS")
data_dir = os.environ.get("POI_DATA_DIR", r"D:\MSc\MSBD 5014\Data")
# Excel workbook holding the preprocessing configuration.
compiled_path = Path(path,"Configs.xlsx").resolve()
# Raw CSV export read by the next cell.
raw_path = Path(data_dir,"rawdata_202212.csv").resolve()

In [None]:
# Read the raw CSV export into a DataFrame.
# NOTE(review): `df_raw` is not referenced by any later cell visible in this notebook —
# confirm whether this (potentially expensive) load is still needed.
df_raw = pd.read_csv(raw_path)

In [None]:
# Build the project's preprocessing helper from the 'preProcessDataFrame' sheet of the
# configuration workbook, then produce `df`, the frame every later cell works on.
# NOTE(review): the helper receives only the workbook path, not `df_raw`; presumably
# processDF() locates its own input via the workbook — confirm.
preprocess = preProcessDataFrame(compiled_path,sheetname='preProcessDataFrame')
df = preprocess.processDF()

In [None]:
def to_datetime(x):
    """Assemble a ``datetime`` from the split ``ts_*`` fields of one record.

    ``x`` is any mapping-like object (e.g. a DataFrame row) exposing the keys
    'ts_Year', 'ts_Month', 'ts_Day', 'ts_Hour', 'ts_Minute' and 'ts_Second'.
    """
    return datetime(
        x['ts_Year'], x['ts_Month'], x['ts_Day'],
        x['ts_Hour'], x['ts_Minute'], x['ts_Second'],
    )

In [None]:
# Combine the split ts_* fields of every row into a single datetime column. 'ts_Index'
# is the column named as `index_col` in the UUID_Collection settings.
# NOTE(review): row-wise apply is slow on large frames; pd.to_datetime over the renamed
# component columns is a vectorised alternative — confirm the ts_* dtypes before switching.
df['ts_Index'] = df.apply(to_datetime,axis = 1)

In [None]:
# Notebook-wide settings, grouped by the component that consumes them.
configs = {
    # Columns that may contain gaps ...
    'col_missing_vals': [
        'gpstime_Hour', 'gpstime_Minute', 'gpstime_Second',
        'gpstime_Day', 'gpstime_Month', 'gpstime_Year',
        'gpstime',
    ],
    # ... and, position by position, the column each gap is filled from.
    'fill_missing_vals': [
        'ts_Hour', 'ts_Minute', 'ts_Second',
        'ts_Day', 'ts_Month', 'ts_Year',
        'ts',
    ],
    # Keyword arguments passed to UUIDCollection.
    'UUID_Collection': dict(
        unique_identifier_col='uuid',
        index_col='ts_Index',
        sort_values_col='ts',
        lat_col='latitude',
        long_col='longitude',
        required_cols=['gpsacc', 'lost'],
        min_duration=timedelta(minutes=30),
        max_diameter=25,
        min_points=200,
        query_amount=450,
        plot_map=False,
    ),
    # Keyword arguments passed to stopCollection.
    'STOP_Collection': dict(
        min_distance=200,
        stop_point_num=200,
    ),
    # Column names passed to POIgdf.
    'POIgdf': dict(
        id_col='UUID',
        poi_type_col_name='POI Type',
        activity_type_col_name='Activity Types',
        poi_point_col_name='POI Point',
        stop_point_lat='Stop Point Latitude',
        stop_point_long='Stop Point Longitude',
    ),
    # Keyword arguments passed to activityMapper.
    # NOTE(review): this absolute path repeats the project directory already held in
    # `path`; consider deriving it from that variable.
    'activityMapper': dict(
        file_path=r"D:\MSc\MSBD 5014\FindingPointsOfInterestFromGPS\helper\POITypes.csv",
    ),
}

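Fill in the missing values: gaps in the `gpstime*` columns are copied from the matching `ts*` columns, and missing `gpsacc` values are replaced with the median of the observed ones.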

In [None]:
# For each column listed in 'col_missing_vals', copy the value of its paired
# 'fill_missing_vals' column into the rows where it is missing.
for target_col, fallback_col in zip(configs['col_missing_vals'], configs['fill_missing_vals']):
    missing_rows = df[target_col].isna()
    df.loc[missing_rows, target_col] = df.loc[missing_rows, fallback_col]

# Missing 'gpsacc' values take the median of the rows where it is present.
gpsacc_missing = df['gpsacc'].isna()
df.loc[gpsacc_missing, 'gpsacc'] = df.loc[~gpsacc_missing, 'gpsacc'].median()

In [None]:
# How many distinct uuids have more than 200 rows.
rows_per_uuid = df.groupby('uuid').size()
int((rows_per_uuid > 200).sum())

In [None]:
# Stop-detection pipeline, built from the project's utils classes:
#   1. UUIDCollection is constructed from `df` with the 'UUID_Collection' settings.
#   2. stopCollection is constructed from that collection with the 'STOP_Collection' settings.
#   3. ExtractAndOrganizeData receives the collection's `filtered_stops` and radius=200
#      (presumably metres — TODO confirm) and its extractAndorganizeData() returns `poigdf`.
uuid_collection_obj = UUIDCollection(df,**configs['UUID_Collection'])
stop_collection_obj = stopCollection(uuid_collection_obj,**configs['STOP_Collection'])
queryObj = ExtractAndOrganizeData(stops_of_interest=stop_collection_obj.filtered_stops,radius = 200)
poigdf = queryObj.extractAndorganizeData()

In [None]:
# Activity-inference pipeline, built from the project's gravityModel module:
#   1. Wrap `poigdf` in a POIgdf object using the column names from the 'POIgdf' settings.
#   2. Create the activityMapper from the 'activityMapper' settings (a POI-types CSV path).
#   3. Pass the POIgdf object through reset_POI_types() and then add_activity_types().
#   4. Build the gravityModel from both objects; calculate_probability() returns `results`,
#      which later cells treat as a DataFrame with a 'Stop Point' column plus one column
#      per activity.
poigdf_obj = POIgdf(poigdf,**configs['POIgdf'])
activitymapper_obj = activityMapper(**configs['activityMapper'])
poigdf_obj = activitymapper_obj.reset_POI_types(poigdf_obj)
poigdf_obj = activitymapper_obj.add_activity_types(poigdf_obj)
gravitymodel = gravityModel(activitymapper_obj,poigdf_obj)
results = gravitymodel.calculate_probability()

In [None]:
# Show the collection's `complete_traj_stop_df` attribute as the cell output.
# NOTE(review): if this is a large frame, displaying a small slice would keep the notebook light.
uuid_collection_obj.complete_traj_stop_df

In [None]:
# Show the output of gravitymodel.calculate_probability().
results

In [None]:
# Stray undefined name `rr` removed: it raised NameError and aborted "Restart & Run All" here.

In [None]:
# import pickle

# with open('poigdf_obj.pickle','wb') as handle:
#     pickle.dump(poigdf_obj,handle,protocol=pickle.HIGHEST_PROTOCOL)

# with open('activitymapper_obj.pickle','wb') as handle:
#     pickle.dump(activitymapper_obj,handle,protocol=pickle.HIGHEST_PROTOCOL)

# with open('gravitymodel.pickle','wb') as handle:
#     pickle.dump(gravitymodel,handle,protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Stray undefined name `rrr` removed: it raised NameError and aborted "Restart & Run All" here.

In [None]:
def find_activity_for_each_point(data:pd.DataFrame):
    """Label each stop point with its highest-scoring activity.

    Parameters
    ----------
    data : pd.DataFrame
        One row per stop point. Must contain a 'Stop Point' column whose
        values expose ``.x`` and ``.y`` attributes; every other column is
        treated as a numeric score for the activity named by that column.

    Returns
    -------
    pd.DataFrame
        A new frame (``data`` is not modified) with the columns
        'Stop Point', 'Activity at Point', 'Stop Point Latitude' (``.y``)
        and 'Stop Point Longitude' (``.x``).
    """
    # Explicit copy so the column assignments below act on an independent frame
    # rather than on a subset of `data`.
    activity_df = data[['Stop Point']].copy()
    activity_probs = data[[i for i in data.columns if i != 'Stop Point']]
    # idxmax yields the first column holding the row maximum, which is the same
    # winner the previous stable descending sort selected, without sorting each row.
    # The float cast lets object-typed numeric columns through.
    activity_df['Activity at Point'] = activity_probs.astype(float).idxmax(axis = 1)
    activity_df['Stop Point Latitude'] = activity_df['Stop Point'].map(lambda point: point.y)
    activity_df['Stop Point Longitude'] = activity_df['Stop Point'].map(lambda point: point.x)
    return activity_df

# Attach the time spent at each stop to its inferred activity. The join key is the
# stop-point coordinate pair; columns duplicated by the merge get the '_y' suffix and
# are dropped, then one row per coordinate pair is kept.
stop_keys = ['Stop Point Latitude','Stop Point Longitude']
activity_df = find_activity_for_each_point(results).merge(
    poigdf[['Stop Point','Time Spent at Stop Point'] + stop_keys],
    how = 'left',
    on = stop_keys,
    suffixes=('','_y'),
)
duplicated_cols = activity_df.filter(regex=r"_y$").columns
activity_df = activity_df.drop(labels = duplicated_cols,axis = 1)
activity_df = activity_df.drop_duplicates(subset=stop_keys)

In [None]:
# Derive a minutes column by dividing the stored duration by 60
# (assumes 'Time Spent at Stop Point' is in seconds — TODO confirm).
activity_df['Time Spent at Stop Point (minutes)'] = activity_df['Time Spent at Stop Point'].div(60)

In [None]:
def calculate_counts_for_each_activity(df):
    """Count how many rows of ``df`` carry each 'Activity at Point' label.

    Returns a frame with one row per distinct label, in order of first
    appearance, and the columns 'Activity Type' and 'Counts'. 'Counts' is
    float-typed; a label that forms no group (e.g. a missing value) keeps NaN.
    """
    group_sizes = df.groupby('Activity at Point').size()
    counts_df = pd.DataFrame(data = {'Activity Type':df['Activity at Point'].unique().tolist()})
    # Look each label up in the per-group sizes instead of masking once per group.
    counts_df['Counts'] = counts_df['Activity Type'].map(group_sizes).astype(float)
    return counts_df

# Number of stop points assigned to each inferred activity.
counts_df = calculate_counts_for_each_activity(activity_df)

Boxplot for time spent at an activity

In [None]:
# Show the per-activity stop-point counts.
counts_df

In [None]:
# Top panel: distribution of minutes spent at stop points, per inferred activity.
# Bottom panel: number of stop points per inferred activity.
fig,ax = plt.subplots(2,1,figsize = (20,15))
sns.boxplot(data=activity_df,x = 'Activity at Point',y = 'Time Spent at Stop Point (minutes)',ax = ax[0])
sns.lineplot(data = counts_df,x = 'Activity Type',y = 'Counts',ax = ax[1])
for panel in ax:
    # Rotate the tick labels the plots generated themselves. Overwriting them with a
    # separately computed list mislabels categories (or raises on a length mismatch)
    # whenever that list's order or size differs from what was actually drawn.
    panel.tick_params(axis = 'x',labelrotation = 90)
ax[0].yaxis.set_ticks(np.arange(0,activity_df['Time Spent at Stop Point (minutes)'].max(),step = 50))
ax[1].yaxis.set_ticks(np.arange(0,counts_df['Counts'].max(),step = 5))
fig.tight_layout()

In [None]:
# Pull the stop-labelled trajectory of one uuid for manual inspection, indexed by
# '2023-01-14' (presumably a date selection on a datetime index — TODO confirm).
# NOTE(review): the uuid 112102 and the date are hard-coded, and the raw file name
# suggests December 2022 data — confirm this date exists in the dataset.
df_uuid = uuid_collection_obj.uuid_collection[112102].parent_trajectory_with_stop_labels['2023-01-14']
# The reset_index() result is only displayed; it is not assigned back to df_uuid.
df_uuid.reset_index()

In [None]:
# Show the stops that were passed to ExtractAndOrganizeData as `stops_of_interest`.
stop_collection_obj.filtered_stops

In [None]:
# Show the frame returned by extractAndorganizeData().
poigdf

In [None]:
def make_boxplot(data):
    """Draw one box per activity showing the spread of its values across stop points.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a 'Stop Point' column; every other column is treated as
        one activity's values and becomes one box on the x axis.
    """
    fig,ax = plt.subplots(1,1,figsize = (20,10))
    # Use the frame that was passed in. The body previously read the global `results`,
    # so the argument was silently ignored.
    activity_cols = [i for i in data.columns if i != 'Stop Point']
    melt_df = pd.melt(data,id_vars=['Stop Point'],value_vars=activity_cols,var_name = 'Activity Type',value_name='Activity Probability')
    sns.boxplot(data = melt_df,x = 'Activity Type',y = 'Activity Probability',ax = ax)
    # Rotate the labels seaborn produced instead of overwriting them with a separate list.
    ax.tick_params(axis = 'x',labelrotation = 90)
make_boxplot(results)
    

In [None]:
# Show `results` again; at this point its values have not yet been normalised.
results

Check the number of places for each activity type

In [None]:
# Turn every non-zero entry of the activity columns into 1 (zeros stay 0), then sum
# each column to get the number of rows with a non-zero value for that activity.
activity_only = results[[i for i in results.columns if i != 'Stop Point']]
results_counts = activity_only.where(activity_only == 0,other=1)
counts_of_activity_places = results_counts.sum(axis = 0).to_dict()

In [None]:
# Per-activity counts as an array, in the dictionary's (column) order.
place_counts = np.array(list(counts_of_activity_places.values()))

In [None]:
activity_cols = [i for i in results.columns if i != 'Stop Point']
# A count of 0 means the whole column is zero; dividing by it would turn that column
# into NaN (0/0). Use 1 as the divisor there so the column simply stays at 0.
safe_place_counts = np.where(place_counts == 0,1,place_counts)
# Divide each activity column by its count (broadcast across rows).
results_normalized =  results[activity_cols].to_numpy()/safe_place_counts
# NOTE: this overwrites `results` in place, so re-running the cell divides again.
results[activity_cols] = results_normalized

In [None]:
# Redraw the boxplot now that `results` holds the normalised values.
make_boxplot(results)

In [None]:
# Stray expression removed: the module-level `ax` is the array of Axes from the
# two-panel figure, which has no `get_legend_handles` attribute, so this cell always
# raised AttributeError. On a single Axes the matplotlib method is
# `get_legend_handles_labels()`.

In [None]:
# from ast import literal_eval
# poi_type_list = pd.read_csv(r"D:\MSc\MSBD 5014\FindingPointsOfInterestFromGPS\helper\POITypes.csv")['POI Type'].tolist()
# activity_types = pd.read_csv(r"D:\MSc\MSBD 5014\FindingPointsOfInterestFromGPS\helper\POITypes.csv")['Activity'].tolist()
# poi_type_activity_dict = dict(zip(poi_type_list,activity_types))
# reset_types = lambda x: str([i for i in literal_eval(x) if i in poi_type_list])
# POIgdf['POI Type'] = POIgdf['POI Type'].apply(reset_types)
# POIgdf = POIgdf.loc[POIgdf['POI Type'] != '[]']
# # literal_eval(POIgdf['POI Type'][0])

In [None]:
# find_activity = lambda x : str(list(set(poi_type_activity_dict[i] for i in literal_eval(x))))

# POIgdf['Activity Types'] = np.nan
# POIgdf['Activity Types'] = POIgdf['POI Type'].apply(find_activity)

In [None]:
# POIgdf['Stop Point Latitude'] = POIgdf['Stop Point'].apply(lambda x: x.y)
# POIgdf['Stop Point Longitude'] = POIgdf['Stop Point'].apply(lambda x: x.x)

In [None]:
# from collections import Counter
# cols = ['Stop Point']
# cols.extend(list(set(activity_types)))
# activity_df_dict = {k:[] for k in cols}
# for k,grp in POIgdf.groupby(['Stop Point Latitude','Stop Point Longitude']):
#     activity_df_dict['Stop Point'].append(k)
#     activity_list = map(lambda x: literal_eval(x),grp['Activity Types'])
#     flat_activity_list = [activity for sublist in activity_list for activity in sublist]
#     count_dict = dict(Counter(flat_activity_list))
#     for activity in list(set(activity_types)):
#         if activity in count_dict.keys():
#             activity_df_dict[activity].append(count_dict[activity])
#         else:
#             activity_df_dict[activity].append(0)