We are going to merge all the event-actor-subject data from three modalities into one master file in this notebook. 

In [6]:
import stop_detection as sd 
import pandas as pd 
import numpy as np
import import_ipynb 
import triangulation

In [7]:
def getDistList(centroid, objPoints): 
    """
    Input is a centroid point (X,Y), and a iterable of classroom object coordinates. 
    Returns a list of distances between the centroid point and the objects. 

    Args:
        centroid ((float, float))): coordinates of centroid points 
        objPoints (Iterable( (int/float, int/float) )): an iterable of int/float tuples 

    Returns:
        List[int]: distance list 
    """    
    
    distanceList = list()
    for objPoint in objPoints: 
        distanceList.append( sd.getDist(centroid, objPoint) ) 

    assert( len(objPoints) == len(distanceList) )
    return distanceList 

def iterable2Str(itrb, sep=";"): 

    """
    Input an iterable, returned a string with all the items in the iterable 
    casted to strings separated by separator 

    Args:
        itrb (iterable): input iterable
        sep (str, optional): item seperator. Defaults to ";".

    Returns:
        str: items in itrb casted to string separated by separator 
    """

    assert isinstance(sep, str)
    res = "" # to be returned 

    for item in itrb: 
        res += str(item) + sep

    res = res[:len(res) - len(sep)] # get rid of the last separator 

    return res 

def getClosestObj(centroids, objDF, rng=float("inf")): 
    """
    Returns a list of closest object to the centroid points; object will only be 
    return if it is within the distance specified by range (rng), or NaN will appear 
    in the returned list 

    Args:
        centroids (Iterable[(float, float)]): an iterable with centroid points represented by tuples 
        objDF (pd.DataFrame): a dataframe holding the position of all classroom objects of interests 
        rng (float/int, optional): range parameter, object must be within this distance from centroid to be returned. 
            Defaults to float("inf").

    Returns:
        List[str]: a list of object names 
    """    

    seen = dict() # note the centroid we already calculated closest objects for 
    closestObjects = list() # list of closest objects; to be returned; should be of the same length as centroids
    objPoints = sd.cols2tuples(objDF.X, objDF.Y) # coordinates for classroom objects 
    preceedingLettersLen = len("seat")

    for centroid in centroids: 
        
        # if centroid is NaN, just append NaN to object list 
        if(not isinstance(centroid, tuple)): 
            closestObjects.append(np.nan) 

        else: 

            if centroid in seen: # if we already calculated the centroid's closest object 
                closestObjects.append(seen[centroid]) # just append the already-calculated result 
            else: # not seen 
                distanceList = getDistList(centroid, objPoints) 
                minDist = min(distanceList) # distance between the centroid to the closest object 
                ind = distanceList.index(minDist) # index of the closest object 
                obj = objDF.iloc[ind]["object"] # full object name, e.g., seat12
                seatNum = int(obj[preceedingLettersLen:]) # get the seat number 

                # only append the object within range 
                if minDist <= rng: 
                    closestObjects.append(seatNum) 
                    seen[centroid] = seatNum # add calculation result to seen distionary 
                else: 
                    closestObjects.append(np.nan)  
                    seen[centroid] = np.nan # tell seen mapping that this has no result 

    assert(len(closestObjects) == len(centroids)) 
    return closestObjects

def getObjsInRange(centroids, objDF, rng=float("inf")): 
    """
    Returns a list of in-range objects to the centroid points, meaning that a 
    centroid point may correspond to 0 to many objects; object will only be 
    return if it is within the distance specified by range (rng), or NaN will appear 
    in the returned list 

    Args:
        centroids (Iterable[(float, float)]): an iterable with centroid points represented by tuples 
        objDF (pd.DataFrame): a dataframe holding the position of all classroom objects of interests 
        rng (float/int, optional): range parameter, object must be within this distance from centroid to be returned. 
            Defaults to float("inf").

    Returns:
        List[str]: a list of object names 
    """    

    seen = dict() # note the centroid we already calculated closest objects for 
    closestObjects = list() # list of closest objects; to be returned; should be of the same length as centroids
    objPoints = sd.cols2tuples(objDF.X, objDF.Y) # coordinates for classroom objects 
    preceedingLettersLen = len("seat")

    for centroid in centroids: 
        
        # if centroid is NaN, just append NaN to object list 
        if(not isinstance(centroid, tuple)): 
            closestObjects.append(np.nan) 

        else: 

            if centroid in seen: # if we already calculated the centroid's closest object 
                closestObjects.append(seen[centroid]) # just append the already-calculated result 
            else: # not seen 
                distanceList = getDistList(centroid, objPoints) 
                whetherInRange = pd.Series(distanceList) <= rng # see if each object in objDF is in-range
                objs = objDF.loc[whetherInRange, "object"] # full object name, e.g., seat12

                if len(objs) > 0: # there is at least one object in-range
                    seatNums = objs.apply(lambda s: s[preceedingLettersLen:]) # only extract the seat number, i.e., seat12 -> 12
                    seatNumsStr = iterable2Str(seatNums, sep=";") # convert to a string with seat numbers separated by ;
                    closestObjects.append(seatNumsStr) # append the string
                    seen[centroid] = seatNumsStr # memorize findings to mapping 
                else: # no object found within range
                    closestObjects.append(np.nan)  
                    seen[centroid] = np.nan # tell seen mapping that this has no result 

    assert(len(closestObjects) == len(centroids)) 
    return closestObjects

def getFirstWord(s): 

    word = "" 
    for letter in s: 
        if letter == " ": return word 
        else: word += letter 

    return word


In [8]:
# This code chunk processes teacher's position data by given duration and radius parameter. 
# Use positionEvents dataframe as the event-actor-subject format data. 

if __name__ == "__main__": 

    # TODO: change the parameters here: 
    duration = 10
    radius = 500
    rng = 1000

    positionRawDF = pd.read_csv("output_files/teacher_position_sprint1_shou.csv", index_col=False) 
    objPos = pd.read_csv("raw data/seating_chart_x_y_seat_only_sprint1_shou.csv", index_col=False) 

    stops = sd.getStops(positionRawDF.chosen_X, positionRawDF.chosen_Y, positionRawDF.time_stamp, 
                        positionRawDF.periodID, positionRawDF.dayID, duration, radius) 
    events, centroids = triangulation.getStopEvent(positionRawDF, stops) 
    positionRawDF["content"] = events
    positionRawDF["event"] = [getFirstWord(event) for event in events] 
    positionRawDF["subject"] = getObjsInRange(centroids, objPos, rng=rng) 
    positionRawDF["actor"] = "teacher" 

    # transcribe useful columns from raw dataframe to events dataframe 
    positionEventsDF = pd.DataFrame() 
    positionEventsDF["timestamp"] = positionRawDF["time_stamp"]
    for col in ["dayID", "periodID", "event", "actor", "subject", "content"]: 
        positionEventsDF[col] = positionRawDF[col] 

    # add tag for modality origin 
    positionEventsDF["modality"] = "position"

# Now the position events data are ready 


In the following code chunk(s), we are going to clean up observation data and extract strict event-actor-subject format data: 

In [9]:
if __name__ == "__main__": 

    obsEventsDF = pd.read_csv("output_files/observation_events.tsv", delimiter="\t", index_col=False) 

    # do not want this signaling event, i.e., period begins 
    obsEventsDF = obsEventsDF.loc[ obsEventsDF["event"] != "Period begins"] 

    # combine `keyword` and `note` column into one single column called `content` 
    content = obsEventsDF["keyword"] + "; " + obsEventsDF["note"] 

    # drop a few unnecessary columns 
    obsEventsDF = obsEventsDF.drop(["time", "note", "where", "keyword"], axis=1) 

    # add tag for modality origin 
    obsEventsDF["modality"] = "observation"

# Now observation events data are ready 


In the following code chunk(s), we are going to get events data from tutor log: 

In [10]:
if __name__ == "__main__": 

    tutorEventsDF = pd.read_csv("output_files/tutor_events.csv") 
    tutorEventsDF["modality"] = "tutor" # add tag for modality origin 

By simply concatenating data from the three modalities, we can obtain a master event data file: 

In [11]:
if __name__ == "__main__": 

    eventMasterDF = pd.concat([tutorEventsDF, positionEventsDF, obsEventsDF], 
                              ignore_index=True) 
    eventMasterDF = eventMasterDF.sort_values(by=['timestamp'], ascending=True) 
    eventMasterDF.index = np.arange(len(eventMasterDF))

Since the event data from position and observation modality identifies students with their seat number during given `periodID` and `dayID`, we will have to may the seat number to `anon_student_id` to reach agreement with tutor data. 

In [12]:
def seatNum2AnonStudID(seatNum, dayID, periodID, mappingDF): 
    """
    Given a seat number, a period ID, and a day ID, identifies the anonymous user ID for the student seated in the seat number at that time. 
    Returns anon user ID as a string; returns an empty string if no student seated. 

    Args:
        seatNum (int): seat number in the classroom 
        dayID (int): 1, 2, or 3
        periodID (int): 1, 2, 3, 4, or 5
        mappingDF (pd.DataFrame): pandas data frame with student seat mapping information 

    Returns:
        str: anon user ID of the student identified; empty string if no student seated 
    """    

    # this row should obtain the information on the student seated in the seat 
    # number during the given period and day
    selectedRow = mappingDF.copy()
    selectedRow = selectedRow.loc[selectedRow["seatNum"] == seatNum] \
                                 [selectedRow["periodID"] == periodID] \
                                 [selectedRow["dayID"] == dayID] 

    assert len(selectedRow) <= 1, "Multiple students found. "
    
    # returns empty string if no student found
    if(len(selectedRow) == 0): return ""  
    if(str(selectedRow.iloc[0]["anon_user_id"]) == "nan"): return ""
    return selectedRow.iloc[0]["anon_user_id"] # returns anon ID

if __name__ == "__main__": 

    # reading in mapping data file 
    mappingDF = pd.read_csv("output_files/student_position_sprint1_shou.csv", index_col=False)

    # loop through all the rows in event master dataframe 
    for i in range(len(eventMasterDF)): 
        currRow = eventMasterDF.iloc[i]

        # only `actor` and `subject` have occurrences of seat numbers 

        if str(currRow["actor"]).isdigit(): # seat number found 
            periodID, dayID = currRow["periodID"], currRow["dayID"] 
            seatNum = int( currRow["actor"] ) 

            # find the corresponding anon user id and replace 
            anonID = seatNum2AnonStudID(seatNum, dayID, periodID, mappingDF) # THIS IS SLOW, consider optimization!!!
            if anonID == "": # no student is sitting in the seat during the day and period 
                eventMasterDF.at[i, "actor"] += ", but no student seated"
            else: 
                eventMasterDF.at[i, "actor"] = anonID

        # subject column involves multiple seat number separated by `;`, so is more complicated 
        if not str(currRow["subject"]).isalpha(): # seat number found 
            periodID, dayID = currRow["periodID"], currRow["dayID"] 
            seatNums = str(currRow["subject"]).split(';')

            res = '' # result value to assign to eventMasterDF.at[i, "subject"] 
            for seatNum in seatNums: 
                
                # get typing straight, since seatNum maybe '3.0', which will trigger error when feed into int()
                try: 
                    seatNum = int(seatNum)
                except ValueError: 
                    seatNum = int(float(seatNum))

                # find the corresponding anon user id and replace 
                anonID = seatNum2AnonStudID(seatNum, dayID, periodID, mappingDF) # THIS IS SLOW, consider optimization!!!
                if anonID == "": # no student is sitting in the seat during the day and period 
                    res += str(seatNum) + ", but no student seated; "
                else: 
                    res += anonID + "; " 

            # assign value back to dataframe
            eventMasterDF.at[i, "subject"] = res[0:len(res)-2] # get rid of the semicolon and space in the tail 

# this chunk should take a few minutes 

  selectedRow = selectedRow.loc[selectedRow["seatNum"] == seatNum] \


In [13]:
if __name__ == "__main__":
    
    # final touch: indicate whether the ATTEMPT is CORRECT or INCORRECT in the 
    # event name, since it is important info
    for i in eventMasterDF.index: 

        if not isinstance(eventMasterDF.loc[i, "content"], str): pass

        elif ("INCORRECT" in eventMasterDF.loc[i, "content"]): # an incorrect attempt
            # safety check
            assert eventMasterDF.loc[i, "event"] == "ATTEMPT" and \
                   eventMasterDF.loc[i, "modality"] == "tutor" 

            eventMasterDF.loc[i, "event"] = "Incorrect attempt" 

        elif ("CORRECT" in eventMasterDF.loc[i, "content"]): # a correct attempt
            # safety check
            assert eventMasterDF.loc[i, "event"] == "ATTEMPT" and \
                   eventMasterDF.loc[i, "modality"] == "tutor" 

            eventMasterDF.loc[i, "event"] = "Correct attempt"
        
        # replace HINT_REQUEST with Hint request event 
        if eventMasterDF.loc[i, "event"] == "HINT_REQUEST": 
            eventMasterDF.loc[i, "event"] = "Hint request" 


In [15]:
# final touch: split event talking to small group into multiple rows, where
# each row only has one subject 
if __name__ == "__main__":

    newRowsDF = pd.DataFrame()
    eventMasterDF = originalEventMasterDF.copy()
    for i in eventMasterDF.index: 

        if ("Talking to small group" in eventMasterDF.loc[i, "event"] or \
            "Talking to student" in eventMasterDF.loc[i, "event"] or \
            "Stopping" in eventMasterDF.loc[i, "event"]) and \
            isinstance(eventMasterDF.loc[i, "subject"], str): 

            subjects = eventMasterDF.loc[i, "subject"].split(";")

            # create a new row for each subject in the small group 
            for subject in subjects: 
                # all the other values in the new row are the same with the 
                # previous row in the dataframe but subject 
                newRow = dict()
                for col in eventMasterDF.columns: newRow[col] = [eventMasterDF.loc[i, col]]
                newRow["subject"] = subject 
                newRowsDF = pd.concat([newRowsDF, pd.DataFrame(newRow)]) 

            eventMasterDF = eventMasterDF.drop(i, axis=0) # drop the previous row with multiple subjects separated by semicolon 

    eventMasterDF = pd.concat([eventMasterDF, newRowsDF]) # concatenate the new rows 
    eventMasterDF = eventMasterDF.sort_values("timestamp") # sort by time stamp 
    eventMasterDF.index = np.arange(len(eventMasterDF)) 

# this chunk should take about a minute 

In [22]:
# including detector results into this event master file 
import detectorDataAPI as detectorAPI 

if __name__ == "__main__":

    # get detector events 
    detectorDF = detectorAPI.getDetectorResultsDF() 
    detectorEventsDF = detectorAPI.getDetectorEvents(detectorDF, ["struggle", "idle", "misuse", "gaming"]) 
    # concat with event master dataframe and sort by timestamp 
    eventMasterDF = pd.concat([eventMasterDF, detectorEventsDF], ignore_index=True) 
    eventMasterDF = eventMasterDF.sort_values(by="timestamp") 
    eventMasterDF.index = np.arange(len(eventMasterDF)) 


In [23]:
# save the merged event data file
if __name__ == "__main__":
    # TODO: change file name as needed
    outputFilePath = f"output_files/event_master_file_D{duration}_R{radius}_RNG{rng}_sprint2_shou.csv"
    eventMasterDF.to_csv(outputFilePath, index=False)