In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List
import warnings
from functools import reduce
import json
warnings.filterwarnings("ignore")

In [None]:
# COLS_TRAVEL = ['travelEndLocationId','purpose','checkInTime']
# Venue type labels. This list is not referenced elsewhere in the visible notebook.
VENUE_TYPES = ['Apartment', 'Pub', 'Restaurant', 'Workplace']

# Module-level template; the analysis classes read globalVariableStore.ACTIVITIES_DICT instead.
ACTIVITIES_DICT = dict(NumActivities=None)

# strptime format for each datetime column that processDateTime can parse.
# Every column shares the same "YYYY-MM-DD HH:MM:SS" layout.
PARSING_MAPS = dict.fromkeys(
    (
        'travelStartTime',
        'travelEndTime',
        'timestamp',
        'checkInTime',
        'checkOutTime',
    ),
    "%Y-%m-%d %H:%M:%S",
)

def processDateTime(df,cols):
    """
    Parse the listed columns of ``df`` into pandas datetimes.

    :params df: dataframe; the listed columns are overwritten on this object
    :params cols: iterable of column names, each of which must be a key of PARSING_MAPS
    :returns: the same dataframe object that was passed in
    """
    for column_name in cols:
        # Look the format up first so an unknown column fails before parsing.
        column_format = PARSING_MAPS[column_name]
        df[column_name] = pd.to_datetime(df[column_name], format=column_format)
    return df

In [None]:
from decimal import DivisionByZero
class globalVariableStore:
  """
  Notebook-wide settings, exposed as class attributes so they can be read
  (and overridden through changeFilePaths) without creating an instance.
  """
  # Absolute locations of the two journal CSVs.
  CHECKIN_PATH = Path(r"/content/drive/MyDrive/Datasets/VAST_Challenge_2022/Journals/CheckinJournal.csv").resolve()
  TRAVEL_PATH = Path(r"/content/drive/MyDrive/Datasets/VAST_Challenge_2022/Journals/TravelJournal.csv").resolve()

  # Counter names and a zero-filled template; consumers take a .copy() before mutating it.
  ACTIVITY_KEYS = ['NumActivities','Workplace','Restaurant','Apartment','Pub']
  ACTIVITIES_DICT = dict.fromkeys(ACTIVITY_KEYS, 0)

  # `purpose` values whose duration is averaged per participant.
  ACTIVITIES_OF_INTEREST = ['Eating','Recreation (Social Gathering)']

  # venueType values kept by collectDataofAllParticipants.calculateTimeInWorkPlace.
  # NOTE(review): 'Apartment' looks inconsistent with the "workplace" naming - confirm
  # whether 'Workplace' was intended.
  WORKPLACE_LIST = ['Apartment']

  # Not referenced in the visible notebook; meaning and unit are not shown here.
  OBSERVATION_TIME = 450

  @classmethod
  def changeFilePaths(cls,pathdict:dict):
    """
    Overwrite class attributes from ``pathdict`` (attribute name -> new value).
    Names are not validated; any attribute can be set or created this way.
    """
    for attribute_name, new_value in pathdict.items():
      setattr(cls, attribute_name, new_value)

class analyseActivityCountsofParticipant:
  """
  Counts one participant's check-ins per daily bin (overall and per venue type)
  and averages those counts over the bins.
  """
  def __init__(self):
    pass

  #lowest level to get daily activities
  def findDailyActivities(self,df:pd.DataFrame):
    """
    Yield ``(key, counts)`` for every daily group of ``df``.

    :params df: rows of a single participant with 'timestamp' (datetime) and 'venueType' columns
    :returns: generator of (group key, dict); the dict starts as a copy of
              globalVariableStore.ACTIVITIES_DICT, 'NumActivities' holds the number of
              non-null venueType values and every venue type seen that day holds its count
    """
    for key,grp in df.sort_values(by = 'timestamp').groupby([pd.Grouper(key = 'timestamp',freq = 'D')]):
      activity_template = globalVariableStore.ACTIVITIES_DICT.copy()
      activity_template['NumActivities'] = grp['venueType'].count()
      activityTypeCounts = dict(grp['venueType'].value_counts())
      for k in activityTypeCounts:
        activity_template[k] = activityTypeCounts[k]
      yield (key,activity_template)

  def obtainAverageActivities(self,resgen):
    """
    Average the per-day count dicts produced by findDailyActivities.

    :params resgen: iterable of (key, counts-dict) pairs
    :returns: dict mapping each counter name to its mean per yielded day, rounded to
              2 decimals; all zeros when ``resgen`` yields nothing
    """
    totals = globalVariableStore.ACTIVITIES_DICT.copy()
    length = 0
    for _day,counts in resgen:
      for key,val in counts.items():
        # .get keeps a venue type that is missing from ACTIVITY_KEYS from raising KeyError
        totals[key] = totals.get(key,0) + val
      length += 1
    if length == 0:
      # No days at all: report zero averages instead of dividing by zero
      return {k:0.0 for k in totals}
    averages = {k:np.round(totals[k]/length,2) for k in totals}
    return averages

  def calculateActivitiesParticipant(self,key,grp):
    """
    Compute the average daily activity counts of one participant.

    :params key: participant id, used only for the progress message printed every 50th id
    :params grp: that participant's rows
    :returns: dict of averaged counters (see obtainAverageActivities)
    """
    if int(key) %50 == 0:
      print(f"Doing Activities. ID {key}")
    allActivitiesGenerator = self.findDailyActivities(grp)
    activityAverages = self.obtainAverageActivities(allActivitiesGenerator)
    return activityAverages

class analyseActivityMinutesofParticipant:
  """
  Averages the minutes one participant spends per day on each activity of interest.
  """
  def __init__(self,activity_list : List[str]):
    """
    :params activity_list: `purpose` values to measure
    """
    self.activities = activity_list

  def calculateSumOfTimeSpent(self,grp:pd.DataFrame):
    """
    Total the minutes spent on each activity and count the daily groups seen.

    :params grp: rows of one participant with 'purpose', 'timestamp', 'checkInTime'
                 and 'checkOutTime' columns (the last three as datetimes)
    :returns: dict activity -> [total minutes, number of daily groups]; it always holds
              the entries of globalVariableStore.ACTIVITIES_OF_INTEREST plus every
              activity given to the constructor

    NOTE(review): the day counter is incremented once per group yielded by the daily
    Grouper. If pandas also yields empty bins for days without the activity, those
    days are part of the denominator too - confirm this is intended.
    """
    #activity minutes dictionary for that day
    activity_dict = {k:[0,0] for k in globalVariableStore.ACTIVITIES_OF_INTEREST}
    for activity in self.activities:
      # setdefault registers activities missing from the global list, which used to raise KeyError
      totals = activity_dict.setdefault(activity,[0,0])
      grp_activity = grp.loc[grp['purpose'] == activity]
      if grp_activity.shape[0] == 0:
        continue
      for _day,grp_day in grp_activity.groupby([pd.Grouper(key = 'timestamp',freq = 'D')]):
        #Add the total minutes for given day to collect all the mins
        totals[0] += np.sum((grp_day['checkOutTime'] - grp_day['checkInTime']).dt.total_seconds()/60)
        totals[1] += 1
    return activity_dict

  def obtainAverageMinutes(self,totalMinutesDict):
    """
    Turn the [total minutes, day count] pairs into per-day averages.

    :params totalMinutesDict: dict as returned by calculateSumOfTimeSpent
    :returns: dict activity -> average minutes rounded to 2 decimals, or 0 when the
              day count is 0
    """
    average_dict = {k:0 for k in globalVariableStore.ACTIVITIES_OF_INTEREST}
    for activity,(minutes,days) in totalMinutesDict.items():
      if days == 0:
        average_dict[activity] = 0
      else:
        average_dict[activity] = np.round(minutes/days,2)
    return average_dict

  def calculateAverageMinutesParticipant(self,grp):
    """
    Run both steps for one participant and return the averages dict.
    """
    totalMinutesDict = self.calculateSumOfTimeSpent(grp)
    averageMinutesDict = self.obtainAverageMinutes(totalMinutesDict)
    return averageMinutesDict

class findMinutesAtWorkplace:
  """
  Averages the minutes per day between a start column and an end column.
  The same class is used for travelling time as well.
  """
  def __init__(self):
    pass

  def calculateTotalMinutesPerDay(self,grp,startTime,endTime,grouper):
    """
    Sum the elapsed minutes of every daily group.

    startTime : column name holding the start time of the activity
    endTime : column name holding the end time of the activity
    grouper : name of the datetime column used to form the daily groups
    Returns a dict: group key -> total minutes in that group.
    """
    daily_groups = grp.groupby([pd.Grouper(key = grouper,freq = 'D')])
    minutes_per_day = {}
    for day,rows in daily_groups:
      elapsed_seconds = (rows[endTime] - rows[startTime]).dt.total_seconds()
      minutes_per_day[day] = np.sum(elapsed_seconds)/60
    return minutes_per_day

  def calculateAverageMinutes(self,minute_dict):
    """
    Mean of the values of ``minute_dict`` (total divided by number of entries).
    """
    running_total = 0
    for day in minute_dict:
      running_total = running_total + minute_dict[day]
    return running_total/len(minute_dict)

  def calculateAverageMinutesParticipant(self,grp,startTime,endTime,grouper):
    """
    Daily totals followed by their mean, for one participant's rows.
    """
    per_day = self.calculateTotalMinutesPerDay(grp,startTime,endTime,grouper)
    return self.calculateAverageMinutes(per_day)


class timeLeaveBack:
  """
  Average leave-home and back-home clock times (minutes after midnight) of one participant.
  """
  def __init__(self):
    """
    To calculate the average leaving home time and arriving time, we query the hours at which people leave
    and come back on a daily basis. Then for each day find the difference from 00:00:00 time of that day.
    This is then converted to minutes. Then the average is calculated across all days
    """
    pass

  def calculate(self,grp):
    """
    Collect the first departure and the last arrival of every day that has trips.

    :params grp: rows of one participant with datetime columns 'travelStartTime'
                 and 'travelEndTime'
    :returns: dict day (Timestamp at 00:00:00) -> {'Leave Time': Timestamp,
              'Back Home Time': Timestamp}; days without rows are skipped
    """
    participant_dict = {}
    # Sort explicitly so "first" and "last" do not depend on the caller's row order
    # (mergesort is stable, so already-sorted input is left untouched).
    ordered = grp.sort_values('travelStartTime',kind = 'mergesort')
    # Pass the Grouper itself, not a one-element list: with a list pandas >= 2 yields
    # 1-tuple keys, which breaks the `Timestamp - key` subtraction in convertTimetoMinutes.
    for key,grp_sub in ordered.groupby(pd.Grouper(key = 'travelStartTime',freq = 'D')):
      if grp_sub.shape[0] > 0:
        participant_dict[key] = {
          'Leave Time': grp_sub['travelStartTime'].iloc[0],
          'Back Home Time': grp_sub['travelEndTime'].iloc[-1],
        }
    return participant_dict

  def convertTimetoMinutes(self,timeDict):
    """
    Replace each Timestamp by the minutes elapsed since 12 midnight of its day key.

    :params timeDict: dict as returned by calculate; it is modified in place
    :returns: the same dict object, now holding floats

    NOTE(review): `.seconds` ignores whole days, so an arrival after the following
    midnight wraps around to a small value - confirm that is the intended meaning.
    """
    for key in timeDict:
      timeDict[key]['Leave Time'] = (timeDict[key]['Leave Time'] - key).seconds/60
      timeDict[key]['Back Home Time'] = (timeDict[key]['Back Home Time'] - key).seconds/60
    return timeDict

  def calculateAverage(self,minuteDict):
    """
    Mean leave and back-home minutes over all days.

    :params minuteDict: dict as returned by convertTimetoMinutes
    :returns: tuple (average leave minutes, average back-home minutes)
    :raises ZeroDivisionError: when ``minuteDict`` is empty
    """
    averageLeave = reduce(lambda x,key:minuteDict[key]['Leave Time'] + x,minuteDict,0)/len(minuteDict)
    averageBack = reduce(lambda x,key:minuteDict[key]['Back Home Time'] + x,minuteDict,0)/len(minuteDict)
    return averageLeave,averageBack

class collectDataofAllParticipants:
  """
  Runs the per-participant analyses over a whole journal and stores each result
  in a dict keyed by participantId.
  """
  def __init__(self):
    # One result dict per analysis, filled by the matching calculate* method.
    self.completeActivities = {}
    self.averageActivityMinutes = {}
    self.averageWorkPlaceMinutes = {}
    self.averageTravellingMinutes = {}
    self.averageLeaveBack = {}

  def calculateActivitiesOfAllParticipants(self,df:pd.DataFrame):
    """
    Fill completeActivities with the average daily activity counts of each participant.
    """
    counter = analyseActivityCountsofParticipant()
    for pid,rows in df.groupby('participantId',sort = True):
      self.completeActivities[pid] = counter.calculateActivitiesParticipant(pid,rows)

  def calculateTimeInSocialAcitivities(self,df:pd.DataFrame):
    """
    Fill averageActivityMinutes with the average minutes per activity of interest.
    Prints a progress line for every participant id divisible by 50.
    """
    analyser = analyseActivityMinutesofParticipant(globalVariableStore.ACTIVITIES_OF_INTEREST)
    for pid,rows in df.groupby('participantId',sort = True):
      if pid%50 == 0:
        print(f"Doing ID : {pid}")
      self.averageActivityMinutes[pid] = analyser.calculateAverageMinutesParticipant(rows)

  def calculateTimeInWorkPlace(self,df:pd.DataFrame):
    """
    Fill averageWorkPlaceMinutes from the rows whose venueType is in
    globalVariableStore.WORKPLACE_LIST, measuring checkInTime -> checkOutTime
    per day of 'timestamp'.
    """
    venue_rows = df.loc[df['venueType'].isin(globalVariableStore.WORKPLACE_LIST)]
    timer = findMinutesAtWorkplace()
    for pid,rows in venue_rows.groupby('participantId',sort = True):
      self.averageWorkPlaceMinutes[pid] = timer.calculateAverageMinutesParticipant(
        rows,'checkInTime','checkOutTime','timestamp')

  def calculateTimeTravelling(self,df:pd.DataFrame):
    """
    Fill averageTravellingMinutes, measuring travelStartTime -> travelEndTime
    per day of 'travelEndTime'.
    """
    timer = findMinutesAtWorkplace()
    for pid,rows in df.groupby('participantId',sort = True):
      self.averageTravellingMinutes[pid] = timer.calculateAverageMinutesParticipant(
        rows,'travelStartTime','travelEndTime','travelEndTime')

  def calculateTimeLeaveAndBack(self,df:pd.DataFrame):
    """
    Fill averageLeaveBack with (average leave, average back) tuples computed from
    the 'Work/Home Commute' rows. Prints a progress line for every participant id
    divisible by 50.
    """
    commute_rows = df.loc[df['purpose'] == 'Work/Home Commute']
    for pid,rows in commute_rows.groupby('participantId',sort = True):
      if pid%50 == 0:
        print(f"Doing ID : {pid}")
      helper = timeLeaveBack()
      daily_times = helper.calculate(rows)
      daily_minutes = helper.convertTimetoMinutes(daily_times)
      self.averageLeaveBack[pid] = helper.calculateAverage(daily_minutes)

In [None]:
# Read each journal and parse its datetime columns in one step.
df_checkIn = processDateTime(
    pd.read_csv(globalVariableStore.CHECKIN_PATH),
    ["timestamp"],
)
df_travel = processDateTime(
    pd.read_csv(globalVariableStore.TRAVEL_PATH),
    ["travelStartTime","travelEndTime","checkInTime","checkOutTime"],
)

In [None]:
# Attach purpose and check-in/check-out times to each check-in row: inner join on
# participant and on check-in timestamp == travel checkInTime.
df_checkInTravel = df_checkIn.merge(
    df_travel[["participantId","travelEndLocationId","purpose","checkInTime","checkOutTime"]],
    how = 'inner',
    left_on = ['participantId','timestamp'],
    right_on = ['participantId','checkInTime'],
)

In [None]:
# Build a fresh collector and compute the average leave / back-home times for
# every participant; the other analyses are disabled for this run.
activityobj = collectDataofAllParticipants()
# activity_counts = activityobj.calculateActivitiesOfAllParticipants(df_checkIn)
# activityobj.calculateTimeTravelling(df_travel)
activityobj.calculateTimeLeaveAndBack(df_travel)

Doing ID : 0
Doing ID : 50
Doing ID : 100
Doing ID : 150
Doing ID : 200
Doing ID : 250
Doing ID : 300
Doing ID : 350
Doing ID : 400
Doing ID : 450
Doing ID : 500
Doing ID : 550
Doing ID : 600
Doing ID : 650
Doing ID : 700
Doing ID : 750
Doing ID : 800
Doing ID : 850
Doing ID : 900
Doing ID : 950
Doing ID : 1000


In [None]:
# Show participantId -> (average leave minutes, average back-home minutes).
# NOTE(review): this prints one entry per participant; consider showing a sample only.
activityobj.averageLeaveBack

{0: (380.0, 1128.136645962733),
 1: (455.0, 1005.4968944099379),
 2: (360.0, 987.888198757764),
 3: (385.0, 1023.3229813664597),
 4: (420.27950310559004, 978.4782608695652),
 5: (475.0, 1025.0931677018634),
 6: (405.0, 925.0934579439253),
 7: (460.0, 1010.0),
 8: (425.0, 945.0),
 9: (405.0, 1013.3540372670808),
 10: (400.0, 1068.167701863354),
 11: (350.0, 1027.329192546584),
 12: (430.0, 1050.0),
 13: (455.0, 995.0),
 14: (525.0, 1043.4006211180124),
 15: (390.0, 988.2398753894081),
 16: (430.0, 1007.0807453416149),
 17: (485.0, 1083.276397515528),
 18: (450.0, 1138.1211180124224),
 19: (515.0, 1061.4906832298136),
 20: (405.0, 1055.1552795031057),
 21: (470.0, 1014.1614906832298),
 22: (495.0, 1025.0),
 23: (315.0, 1043.0685358255453),
 24: (430.0, 1058.8354037267081),
 25: (475.0, 1063.2919254658384),
 26: (485.0, 1057.5310559006211),
 27: (470.0, 1088.245341614907),
 28: (460.0, 1075.0),
 29: (410.0, 1043.2298136645963),
 30: (484.984520123839, 1114.2414860681115),
 31: (390.0, 109

In [None]:
# Persist the per-participant (leave, back) averages as JSON.
leave_back_json = Path('/content/drive/MyDrive/Datasets/VAST_Challenge_2022/LeaveBackTime.json')
with leave_back_json.open('w') as out_file:
  json.dump(activityobj.averageLeaveBack,out_file)

In [None]:
# NOTE: this cell used to evaluate `workplace_minute`, a name that is never defined
# in the notebook (the dict built in a later cell is called `workplace_minutes`),
# so it raised NameError when the notebook was run top to bottom.
# Inspect `workplace_minutes` after the cell that builds it instead.

In [None]:
# Keep only the merged rows of participant 0 for manual inspection.
is_participant_0 = df_checkInTravel["participantId"].eq(0)
df_checkInTravel_0 = df_checkInTravel[is_participant_0]

In [None]:
# Narrow participant 0's rows down to check-ins at a 'Workplace' venue.
is_workplace = df_checkInTravel_0['venueType'].eq("Workplace")
df_checkInTravel_0_WP = df_checkInTravel_0[is_workplace]

In [None]:
# Display participant 0's workplace rows.
df_checkInTravel_0_WP

In [None]:
# Manual version of findMinutesAtWorkplace.calculateTotalMinutesPerDay for participant 0:
# total minutes between checkInTime and checkOutTime for every daily group of 'timestamp'.
# The loop variable `grp` stays bound after the loop and is read again by a later cell.
workplace_minutes = {}
for key,grp in df_checkInTravel_0_WP.groupby([pd.Grouper(key = 'timestamp',freq = 'D')]):
  workplace_minutes[key] = np.sum((grp['checkOutTime'] - grp['checkInTime']).dt.total_seconds())/60

In [None]:
# Mean of the daily totals in `workplace_minutes` (sum of values / number of days).
# `reduce` is already imported in the first cell; the import here is redundant.
from functools import reduce
reduce(lambda x,key:workplace_minutes[key] + x,workplace_minutes,0)/len(workplace_minutes)

In [None]:
# Per-row duration in seconds for `grp`.
# NOTE(review): `grp` is only bound as the leftover loop variable of the
# `workplace_minutes` loop (its last daily group), so this cell depends on that
# cell having been run first.
(grp['checkOutTime'] - grp['checkInTime']).dt.total_seconds()

In [None]:
# activity_dict = {k:[0,0] for k in globalVariableStore.ACTIVITIES_OF_INTEREST}
# df_checkInTravel_0 = df_checkInTravel.loc[df_checkInTravel['participantId'] == 0]
# df_checkInTravel_0 = df_checkInTravel_0.loc[df_checkInTravel_0['purpose'].isin(['Eating','Recreation (Social Gathering)'])]

In [None]:
# Rebind `activityobj` to a fresh collector (the earlier leave/back results are no
# longer reachable through this name) and compute the average minutes per activity
# of interest from the merged check-in/travel frame.
activityobj = collectDataofAllParticipants()
activityobj.calculateTimeInSocialAcitivities(df_checkInTravel)

In [None]:

# Persist the per-participant average activity minutes as JSON.
eating_social_json = Path('/content/drive/MyDrive/Datasets/VAST_Challenge_2022/eatingSocial.json')
with eating_social_json.open('w') as out_file:
  json.dump(activityobj.averageActivityMinutes,out_file)

In [None]:
# Read the saved averages back. The `with` block closes the file handle, which the
# bare open() call left open. JSON object keys are strings, so participant ids come
# back as str rather than int.
with open('/content/drive/MyDrive/Datasets/VAST_Challenge_2022/eatingSocial.json') as f:
  data = json.load(f)