In this notebook, we distill the tutor log data into event-actor-subject format. Here are some basic assumptions behind the distilling process:

- Students can only conduct two actions in this dataset: `ATTEMPT` and `HINT_REQUEST`
- Subject will be `tutor` for all actions
- Time will be synchronized using epoch timestamps

In [15]:
import functools

import numpy as np
import pandas as pd
import import_ipynb
from tutor_log_summary_stat import *

# bounds of the experiment period, which is between 05/23/2022 and 05/25/2022
experimentStartTimestamp = EDTDatetime2epoch("2022-05-23 08:00:00")
experimentEndTimestamp = EDTDatetime2epoch("2022-05-25 16:00:00")

# data-loading: read the tab-separated tutor log and attach the time columns
tutorLogDF = pd.read_csv("raw data/tutor_log.tsv", sep="\t", index_col=False).assign(
    # the logs are entered in UTC time zone
    **{"Time Zone": "UTC"},
    # new column with unix time stamps derived from the "Time" column
    timestamp=lambda logDF: logDF["Time"].apply(UTCDatetime2epoch),
    # new column with EDT time information, which is more intuitive to read
    EDT_time=lambda logDF: logDF["timestamp"].apply(epoch2datetimeInEDT),
)

# only accepting data within the experiment period
tutorLogDF = filterWithTime(tutorLogDF, experimentStartTimestamp, experimentEndTimestamp)

  tutorLogDF = pd.read_csv("raw data/tutor_log.tsv", delimiter="\t", index_col=False)


In [19]:
# helper function definitions 

@functools.lru_cache(maxsize=None)
def _dayPeriodWindows():
    """
    Build the flat table of class-period windows, converted to epoch time.

    The schedule is fixed, so the table is built on the first call and cached;
    later calls do not repeat the EDTDatetime2epoch conversions.

    Returns:
        tuple of (dayID, periodID, startEpoch, endEpoch) entries, ordered by day
        and then by period, with dayID and periodID counted from 1
    """

    # (start, end) of every period; one row per day, one entry per period
    schedule = (
        # day1
        (("2022-05-23 08:26:00", "2022-05-23 08:53:00"),   # period1
         ("2022-05-23 10:13:00", "2022-05-23 10:41:00"),   # period2
         ("2022-05-23 11:05:00", "2022-05-23 11:30:00"),   # period3
         ("2022-05-23 12:36:00", "2022-05-23 13:00:00"),   # period4
         ("2022-05-23 14:17:00", "2022-05-23 14:40:00")),  # period5
        # day2
        (("2022-05-24 08:21:00", "2022-05-24 08:43:00"),   # period1
         ("2022-05-24 10:02:00", "2022-05-24 10:27:00"),   # period2
         ("2022-05-24 10:57:00", "2022-05-24 11:15:00"),   # period3
         ("2022-05-24 12:24:00", "2022-05-24 12:48:00"),   # period4
         ("2022-05-24 14:07:00", "2022-05-24 14:30:00")),  # period5
        # day3
        (("2022-05-25 08:21:00", "2022-05-25 08:43:00"),   # period1
         ("2022-05-25 10:01:00", "2022-05-25 10:24:00"),   # period2
         ("2022-05-25 10:55:00", "2022-05-25 11:18:00"),   # period3
         ("2022-05-25 12:24:00", "2022-05-25 12:47:00"),   # period4
         ("2022-05-25 14:07:00", "2022-05-25 14:30:00")),  # period5
    )

    # enumerate from 1 since day and period IDs are counted from 1
    return tuple(
        (dayID, periodID, EDTDatetime2epoch(start), EDTDatetime2epoch(end))
        for dayID, dayPeriods in enumerate(schedule, start=1)
        for periodID, (start, end) in enumerate(dayPeriods, start=1)
    )


def getDayPeriod(timestamp):
    """
    Determine which day/period the given time stamp is in; if not in any day/period, returns None

    Both the start and the end of a period are inclusive.

    Args:
        timestamp (int): epoch time stamp

    Returns:
        (int, int) or None: (dayID, periodID) tuple if the time stamp is within any day/period; None if not within any
    """

    for dayID, periodID, startTime, endTime in _dayPeriodWindows():
        # see if the time stamp is within this day/period's start and end time
        if startTime <= timestamp <= endTime:
            return (dayID, periodID)

    # return None if this time stamp is not within any day/period
    return None


In [22]:
# distill chunk 

if __name__ == "__main__":

    outputColumns = ["dayID", "periodID", "timestamp", "event", "actor", "subject", "content"]

    # Collect one dict per event and build the DataFrame once after the loop;
    # concatenating a one-row frame per event copies the growing frame every time.
    outputRows = []

    # go through tutorLogDF to distill event rows
    for _, currTutorRow in tutorLogDF.iterrows():

        studentResponse = currTutorRow["Student Response Type"]
        tutorResponse = currTutorRow["Tutor Response Type"]

        # a student attempt event / a student hint request event
        isAttempt = studentResponse == "ATTEMPT" and tutorResponse == "RESULT"
        isHintRequest = studentResponse == "HINT_REQUEST" and tutorResponse == "HINT_MSG"

        # this row of tutor log does not constitute a legal event
        if not (isAttempt or isHintRequest):
            continue

        # make sure that this tutor data row is in some class period with respect to timestamp
        timestamp = currTutorRow["timestamp"]
        dayPeriod = getDayPeriod(timestamp)  # looked up once and reused below
        if dayPeriod is None:
            continue
        dayID, periodID = dayPeriod

        # Safety check on the outcome runs before the outcome is used in the content.
        # The content is built with plain string concatenation on purpose: a missing
        # or non-string field raises instead of being silently written out.
        if isAttempt:
            assert currTutorRow["Outcome"] in ("CORRECT", "INCORRECT")
            content = "Outcome is " + currTutorRow["Outcome"] + "; "
            content += "student input is " + currTutorRow["Input"] + "; "
        else:
            assert currTutorRow["Outcome"] == "HINT"
            content = "Hint message is " + currTutorRow["Feedback Text"] + "; "
        content += "problem level is " + currTutorRow["Level (ProblemSet)"]

        # populate this output dataset row
        outputRows.append({
            "dayID": dayID,
            "periodID": periodID,
            "timestamp": timestamp,
            "event": studentResponse,  # ATTEMPT or HINT_REQUEST
            "actor": currTutorRow["Anon Student Id"],
            "subject": "tutor",
            "content": content,
        })

    # passing the columns explicitly keeps the header even when no event was found
    outputDF = pd.DataFrame(outputRows, columns=outputColumns)


In [23]:
if __name__ == "__main__":
    # write the distilled events to disk, leaving out the DataFrame's row index
    eventsCsvPath = "output_files/tutor_events.csv"
    outputDF.to_csv(eventsCsvPath, index=False)