In this notebook, we take the original transaction-level tutor log data and add columns indicating whether the teacher was detected stopping beside the student at the moment of each transaction.

In [9]:
# python setup 
import pandas as pd 
import numpy as np 
import tutorDataAPI as tutorAPI
import stop_detection as sDetec

# 1. Loading data from files

Load teacher position data from corresponding file: 

In [10]:
# Teacher position samples, read without promoting any column to the index.
positionDF = pd.read_csv(
    "output_files/teacher_position_sprint1_shou.csv",
    index_col=False,
)

# Earliest and latest position timestamps.
startTime, endTime = min(positionDF["time_stamp"]), max(positionDF["time_stamp"])

Read in the transaction level tutor log data: 

In [11]:
# Load the transaction-level tutor log, passing the time span of the position
# data as the start/end timestamps.
tutorLogDF = tutorAPI.getAnnotatedTutorLogDF(
    "raw data/tutor_log.tsv",
    startTimestamp=startTime,
    endTimestamp=endTime,
)

# Start/end of every class period as datetime strings (fed to
# tutorAPI.EDTDatetime2epoch): one row per day, one (start, end) pair per period.
_PERIOD_SCHEDULE_EDT = (
    (  # day1
        ("2022-05-23 08:26:00", "2022-05-23 08:53:00"),  # period1
        ("2022-05-23 10:13:00", "2022-05-23 10:41:00"),  # period2
        ("2022-05-23 11:05:00", "2022-05-23 11:30:00"),  # period3
        ("2022-05-23 12:36:00", "2022-05-23 13:00:00"),  # period4
        ("2022-05-23 14:17:00", "2022-05-23 14:40:00"),  # period5
    ),
    (  # day2
        ("2022-05-24 08:21:00", "2022-05-24 08:43:00"),  # period1
        ("2022-05-24 10:02:00", "2022-05-24 10:27:00"),  # period2
        ("2022-05-24 10:57:00", "2022-05-24 11:15:00"),  # period3
        ("2022-05-24 12:24:00", "2022-05-24 12:48:00"),  # period4
        ("2022-05-24 14:07:00", "2022-05-24 14:30:00"),  # period5
    ),
    (  # day3
        ("2022-05-25 08:21:00", "2022-05-25 08:43:00"),  # period1
        ("2022-05-25 10:01:00", "2022-05-25 10:24:00"),  # period2
        ("2022-05-25 10:55:00", "2022-05-25 11:18:00"),  # period3
        ("2022-05-25 12:24:00", "2022-05-25 12:47:00"),  # period4
        ("2022-05-25 14:07:00", "2022-05-25 14:30:00"),  # period5
    ),
)

# Epoch version of _PERIOD_SCHEDULE_EDT; filled in lazily on first lookup.
_periodBoundsEpoch = None


def _getPeriodBoundsEpoch():
    """Converts the schedule to epoch (start, end) pairs once and reuses the result."""
    global _periodBoundsEpoch
    if _periodBoundsEpoch is None:
        # NOTE(review): caching assumes tutorAPI.EDTDatetime2epoch always returns
        # the same value for the same string -- confirm against its implementation.
        _periodBoundsEpoch = [
            [
                (tutorAPI.EDTDatetime2epoch(start), tutorAPI.EDTDatetime2epoch(end))
                for start, end in periods
            ]
            for periods in _PERIOD_SCHEDULE_EDT
        ]
    return _periodBoundsEpoch


def getDayPeriod(timestamp):
    """
    Finds the dayID and periodID given a timestamp, if not fitted into any day/period, returns (nan, nan)

    Both IDs are 1-based and period bounds are inclusive on both ends. The
    schedule is converted to epoch values only on the first call instead of on
    every lookup, since this function is applied to every transaction row.
    """
    for dayID, periods in enumerate(_getPeriodBoundsEpoch(), start=1):
        for periodID, (periodStart, periodEnd) in enumerate(periods, start=1):
            if periodStart <= timestamp <= periodEnd:
                return (dayID, periodID)

    return (np.nan, np.nan)

def getDayID(timestamp):
    """Returns only the day component of getDayPeriod(timestamp)."""
    dayID, _periodID = getDayPeriod(timestamp)
    return dayID

def getPeriodID(timestamp):
    """Returns only the period component of getDayPeriod(timestamp)."""
    _dayID, periodID = getDayPeriod(timestamp)
    return periodID

# mutate two new columns indicating dayID and periodID
# getDayPeriod yields both values, so look each timestamp up once and split
# the pairs instead of running the lookup separately for each column.
dayPeriodPairs = [getDayPeriod(ts) for ts in tutorLogDF["timestamp"]]
tutorLogDF["periodID"] = [periodID for _, periodID in dayPeriodPairs]
tutorLogDF["dayID"] = [dayID for dayID, _ in dayPeriodPairs]

# keep only entries that fall within some day/period, then sort by timestamp.
# mergesort is stable, so transactions sharing a timestamp keep their original
# relative order and the result is reproducible across runs.
tutorLogDF = (
    tutorLogDF
    .dropna(subset=["dayID", "periodID"])
    .sort_values(by="timestamp", kind="mergesort")
    .reset_index(drop=True)  # fresh 0..n-1 index
)


  tutorLogDF = pd.read_csv(tutorLogFilePath, delimiter=delimiter, index_col=False)


# 2. Calculating stops and corresponding centroids from position data

In [12]:
# Detect stops and their centroids from the teacher position data using the
# parameter setting below.
stopsAndCentroids = sDetec.getStopsAndCentroids(
    positionDF["chosen_X"],
    positionDF["chosen_Y"],
    positionDF["time_stamp"],
    positionDF["periodID"],
    positionDF["dayID"],
    duration=10,
    radius=500,
)
# Split every record: entries 0-1 form the stop tuple, entries 2-3 the centroid tuple.
stops = [(record[0], record[1]) for record in stopsAndCentroids]
centroids = [(record[2], record[3]) for record in stopsAndCentroids]

# 3. Determine if each transaction takes place within `rng` distance of a stop

We first have to know each student's position during each period/day. 

In [13]:
# grab the mapping file
positionMappingDF = pd.read_csv("output_files/student_position_sprint1_shou.csv", index_col=False)

# (dayID, periodID, studentID) -> (X, Y) coordinate of that student.
# Walking the columns in lockstep avoids a label-based .loc lookup per cell;
# if a key repeats, the later row wins.
positionMapping = {
    (dayID, periodID, studentID): (x, y)
    for dayID, periodID, studentID, x, y in zip(
        positionMappingDF["dayID"].to_numpy(),
        positionMappingDF["periodID"].to_numpy(),
        positionMappingDF["anon_user_id"].to_numpy(),
        positionMappingDF["X"].to_numpy(),
        positionMappingDF["Y"].to_numpy(),
    )
}

# construct a list of coordinates that has the same length as
# tutorLogDF["Anon Student Id"], but with value of coordinates corresponding
# to these students; students with no documented position get (nan, nan).
# Iterating the columns positionally (instead of tutorLogDF.loc[i, col]) does
# not depend on the index labels being unique, and avoids one label lookup
# per cell.
studentPos = [
    positionMapping.get((dayID, periodID, studentID), (np.nan, np.nan))
    for dayID, periodID, studentID in zip(
        tutorLogDF["dayID"].to_numpy(),
        tutorLogDF["periodID"].to_numpy(),
        tutorLogDF["Anon Student Id"].to_numpy(),
    )
]


In [14]:
from typing import Iterable
# Every stop is checked for every transaction, so the timestamps do not need
# to be sorted.
def isBesideStop(studentPos: Iterable, timestamps: Iterable, stops: Iterable, centroids: Iterable, rng):
    """
    Flags, for each transaction, whether it happened during a stop near the student.

    A transaction is flagged True when its timestamp lies inside some stop's
    closed [start, end] interval AND sDetec.getDist(student position, that
    stop's centroid) is at most `rng`. Transactions whose student position has
    a NaN coordinate (position undocumented) are flagged False.

    Parameters:
        studentPos: one (x, y) pair per transaction.
        timestamps: one timestamp per transaction, aligned with studentPos.
        stops: (start, end) pairs, each with end > start.
        centroids: one centroid per stop, aligned with stops.
        rng: maximum distance for a stop to count as beside the student.

    Returns:
        A list of bool with one entry per transaction.

    Raises:
        AssertionError: if studentPos/timestamps or stops/centroids differ in
            length, or if a stop does not end after it starts.
    """
    # Materialise the inputs so generators and pandas Series work too.
    studentPos = list(studentPos)
    timestamps = list(timestamps)
    stops = list(stops)
    centroids = list(centroids)

    assert len(studentPos) == len(timestamps)
    assert len(stops) == len(centroids)

    # Validate the stops once up front rather than on every transaction.
    for stopStart, stopEnd in stops:
        assert stopEnd > stopStart

    # result, to be returned
    res = list()
    for position, timestamp in zip(studentPos, timestamps):
        # this means that the student's position is undocumented
        if np.isnan(position[0]) or np.isnan(position[1]):
            res.append(False)
            continue

        # any() stops scanning at the first stop that matches in time and distance
        res.append(any(
            stopStart <= timestamp <= stopEnd
            and sDetec.getDist(position, centroid) <= rng
            for (stopStart, stopEnd), centroid in zip(stops, centroids)
        ))

    return res

# Tag every transaction at three distance thresholds, one boolean column each.
transactionTimes = tutorLogDF["timestamp"].tolist()
for rng in (500, 1000, 1500):
    vals = isBesideStop(studentPos, transactionTimes, stops, centroids, rng=rng)
    tutorLogDF[f"isBesideStop_rng{rng}"] = vals



In [15]:
# Set exportCSV to False to skip writing the tagged tutor log to disk.
exportCSV = True
if exportCSV:
    tutorLogDF.to_csv(
        "output_files/tutor_log_tagged_with_stop.csv",
        index=False,
    )