# SRFREW-EDA

In [1]:
import os 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt

# Route chart data through Altair's "vegafusion" data transformer instead of the default one.
# NOTE(review): presumably enabled so charts built from the full navigation_events frame
# are not rejected by the default transformer's row limit — confirm.
# The trailing ";" suppresses the cell's output.
alt.data_transformers.enable("vegafusion");

In [238]:
# Read every course export (one CSV per table) from the local "data" directory.
data_dir = "data"

# Course content tables.
assignments = pd.read_csv(os.path.join(data_dir, "assignments.csv"))
discussion_topics = pd.read_csv(os.path.join(data_dir, "discussion_topics.csv"))
discussions = pd.read_csv(os.path.join(data_dir, "discussions.csv"))
files = pd.read_csv(os.path.join(data_dir, "files.csv"))
module_items = pd.read_csv(os.path.join(data_dir, "module_items.csv"))
pages = pd.read_csv(os.path.join(data_dir, "pages.csv"))

# Learner-related tables.
enrollments = pd.read_csv(os.path.join(data_dir, "enrollments.csv"))
gradebook = pd.read_csv(os.path.join(data_dir, "gradebook.csv"))
navigation_events = pd.read_csv(os.path.join(data_dir, "navigation_events.csv"))

In [240]:
# --- gradebook ---
# Discard the rows labelled 0 and 1 before converting the remaining cells to numbers.
# NOTE(review): presumably these are non-student header rows of the export — confirm.
gradebook = gradebook.drop(index=[0, 1])

# Convert every column except "Student" to a numeric dtype.
# The columns are replaced wholesale instead of being written through
# `gradebook.loc[:, cols] = ...`: with pandas >= 2.0 that form assigns in place and
# keeps the original object dtype, so the conversion would silently not stick.
score_columns = gradebook.columns[gradebook.columns != "Student"]
gradebook[score_columns] = gradebook[score_columns].apply(pd.to_numeric)

# --- navigation events ---
# drop strange/na columns
navigation_events = navigation_events.drop(
    columns=[
        "ed_app",
        "type",
        "action",
        "course_offering_id",
        "statement_type",
        "statement_version",
        "event__object_type",
        "event__object_extensions_asset_subtype",
        "event__object_extensions_entity_id",
        "event__referrer",
        "event__extensions_request_url",
        "event__attachment_type",
    ]
)

# drop learners that are not enrolled in the course
# (`Series.isin` replaces `np.in1d`, which is deprecated in NumPy 2.0).
is_enrolled = navigation_events["actor_id"].isin(enrollments["user_id"])
navigation_events = navigation_events.loc[is_enrolled]

# LEARNER_48 is excluded explicitly.
# NOTE(review): the reason for this exclusion is not recorded in the notebook — confirm.
navigation_events = navigation_events.query("actor_id != 'LEARNER_48'")

# Parse the ISO 8601 timestamps and truncate them to midnight (day resolution).
# `.assign` returns a new frame, so the filtered frame above is not written to in
# place (which can emit SettingWithCopyWarning).
navigation_events = navigation_events.assign(
    event_time=pd.to_datetime(navigation_events["event_time"], format="ISO8601").dt.normalize()
)

ID MAP: 
`object_id` maps to: 
- `discussion_topics.id`
- `discussions.discussion_topic_id`
- `assignments.id`
- `files.id`
- `pages.id`


Does NOT map to: 
- anything in `module_items`
- anything in `gradebook`

In [129]:
# Build a single id -> title lookup from every table that `object_id` can refer to.
# `rename` needs `columns=` (a bare mapping targets index labels) and must not use
# `inplace=True`: the in-place form returns None, and `pd.concat` silently drops None
# entries, which would leave assignments and files out of the map.
object_id_map = pd.concat(
    [
        discussion_topics.loc[:, ["id", "title"]],
        assignments.loc[:, ["id", "name"]].rename(columns={"name": "title"}),
        files.loc[:, ["id", "filename_masked"]].rename(columns={"filename_masked": "title"}),
        pages.loc[:, ["id", "title"]],
    ]
)

Unnamed: 0,id,title
0,132f3fab56d60839d727b966a76c1b1e,Assignment 1 Discussion forum
1,e1f90c16c123e0f96b2af7d94a1c335c,Introduce yourself
2,7c9e39571ad00fe24282e5b5366d7563,Discussion 2: Reflect on rationales for eLearning
3,c916612763c04832d8e7b9e89deb7d2a,Discussion 1: The meaning of eLearning
4,bb62cd396b9a8a94ecef6b5d1c4f8ad1,Discussion 3: Funding issues
...,...,...
87,972196dd128aeb62a66f5c02b897e7dd,[wikipage] Assignment 2 Group sign up
88,9ee9a2cacefb8be58dde2a381617e0ad,Wikipage Assignment 2 Group Sign Up (Optional)
89,0790ff605875f7393f9fde5c50c3eae4,[wikipage] Assignment 3 Group sign up (optional)
90,47cc5ac28d341bcc8e175c649974a314,Your Instructor


In [140]:
# Stacked bar chart: number of navigation events per `event_time` value,
# coloured by the asset type that was accessed.
(
    alt.Chart(navigation_events)
    .mark_bar()
    .encode(
        x="event_time:T",
        y=alt.Y("count()"),
        color=alt.Color("event__object_extensions_asset_type"),
    )
)

In [141]:
# Stacked bar chart: number of navigation events per learner (`actor_id`),
# coloured by the asset type that was accessed.
(
    alt.Chart(navigation_events)
    .mark_bar()
    .encode(
        x="actor_id",
        y=alt.Y("count()"),
        color=alt.Color("event__object_extensions_asset_type"),
    )
)

In [267]:
# Count events per learner and asset type, then pivot so that each row is one
# learner and each column is one asset type (NaN where a learner has no events
# of that type).
a = (
    navigation_events
    .groupby(["actor_id", "event__object_extensions_asset_type"])["id"]
    .count()
    .unstack()
)

# Row-wise total across all asset types.
a["total_clicks"] = a.sum(axis=1)

# Attach the gradebook row whose "Student" value equals the learner id in the index of `a`.
b = a.merge(gradebook, how="left", left_index=True, right_on="Student")

In [273]:
# One scatter plot per click-count column: clicks on the x axis against
# "Current Score" on the y axis (the y axis is not forced to start at zero).
(
    alt.Chart(b)
    .mark_point()
    .encode(
        x=alt.X(alt.repeat(), type="quantitative"),
        y=alt.Y("Current Score").scale(zero=False),
    )
    .properties(width=150, height=150)
    .repeat(
        [
            'assignment',
            'attachment',
            'discussion_topic',
            'wiki_page',
            'total_clicks',
        ]
    )
)

In [278]:
# Assignments that have submitted submissions, without the scheduling/grading
# metadata columns, and with `due_at` parsed into datetimes.
# The conversion is chained with `.assign` so the filtered frame returned by
# `.query` is never written to in place (which can emit SettingWithCopyWarning).
df_assignments = (
    assignments
    .drop(columns=["unlock_at", "lock_at", "grading_type", "position"])
    .query("has_submitted_submissions == True")
    .assign(due_at=lambda frame: pd.to_datetime(frame["due_at"]))
)
df_assignments

Unnamed: 0,id,due_at,points_possible,name,submission_types,has_submitted_submissions,workflow_state,published
0,c81f04547a95da2a7b88054ef491b7c4,2033-02-05 20:06:09+00:00,100,Assignment 1,['online_upload'],True,published,True
2,a4dc11e7e79361fc5886a9078aac66b8,2033-03-05 20:06:09+00:00,100,Assignment 2,['online_upload'],True,published,True
3,811d93ea379b5cdd5a19f1b5dbab88cd,2033-04-09 19:06:09+00:00,100,Assignment 3 (option A),['online_upload'],True,published,True
4,df8d1f1ff3f48fdc24a278b40c5f45cc,2033-04-09 19:06:09+00:00,100,Assignment 3 (option B),['online_upload'],True,published,True
5,bb971b36c1578cede00150acda89aa99,2033-04-09 19:06:09+00:00,100,Assignment 3 (option C),['online_upload'],True,published,True
6,f1ad954cd2cddda6e17f6fc225d1aa3e,2033-04-09 19:06:09+00:00,100,Assignment 3 (option D),['online_upload'],True,published,True


In [11]:
assignment_name = "Assignment 1"

def plot_cumulative_assignment_view(assignment_name, data_dir="data"):
    """Plot the cumulative share of learners who have opened an assignment over time.

    The function is self-contained: it reads the CSV exports itself, keeps only
    navigation events of enrolled learners (LEARNER_48 is excluded), finds each
    learner's first view of the assignment, and draws the empirical cumulative
    distribution of those first-view dates together with a vertical rule at the
    assignment's due date.

    Parameters
    ----------
    assignment_name : str
        Value of the `name` column in assignments.csv. Only assignments with
        `has_submitted_submissions == True` are considered.
    data_dir : str, optional
        Directory containing assignments.csv, navigation_events.csv and
        enrollments.csv. Defaults to "data".

    Returns
    -------
    alt.LayerChart
        The 400x200 layered chart (first-view ECDF line plus due-date rule).

    Raises
    ------
    IndexError
        If no assignment with submissions has that name, or if no retained
        learner has a recorded view of it.
    """
    ## Setup
    assignments = pd.read_csv(os.path.join(data_dir, "assignments.csv"))
    navigation_events = pd.read_csv(os.path.join(data_dir, "navigation_events.csv"))
    enrollments = pd.read_csv(os.path.join(data_dir, "enrollments.csv"))

    # drop strange/na columns
    navigation_events = navigation_events.drop(
        columns=[
            "ed_app",
            "type",
            "action",
            "course_offering_id",
            "statement_type",
            "statement_version",
            "event__object_type",
            "event__object_extensions_asset_subtype",
            "event__object_extensions_entity_id",
            "event__referrer",
            "event__extensions_request_url",
            "event__attachment_type",
        ]
    )

    # drop learners that are not enrolled in the course
    # (`Series.isin` replaces `np.in1d`, which is deprecated in NumPy 2.0).
    is_enrolled = navigation_events["actor_id"].isin(enrollments["user_id"])
    navigation_events = (
        navigation_events.loc[is_enrolled]
        .query("actor_id != 'LEARNER_48'")
        # Replace the column through `.assign`: writing via `.loc[:, "event_time"] = ...`
        # is done in place by pandas >= 2.0 and would leave the column with object dtype.
        .assign(
            event_time=lambda events: pd.to_datetime(
                events["event_time"], format="ISO8601"
            ).dt.normalize()
        )
    )

    df_assignments = (
        assignments
        .drop(columns=["unlock_at", "lock_at", "grading_type", "position"])
        .query("has_submitted_submissions == True")
        .assign(due_at=lambda frame: pd.to_datetime(frame["due_at"]))
    )

    ## Drawing
    tmp_assignment = df_assignments.query("name == @assignment_name")
    if tmp_assignment.empty:
        raise IndexError(f"No assignment with submissions is named {assignment_name!r}.")
    # If several rows share the name, the first one is used.
    assignment_id = tmp_assignment["id"].iloc[0]
    due_date = tmp_assignment["due_at"].iloc[0]

    # Earliest view per learner, ordered chronologically.
    viewed_assignment = navigation_events["object_id"] == assignment_id
    tmp_first_accessed = (
        navigation_events.loc[viewed_assignment, ["actor_id", "event_time"]]
        .groupby("actor_id", as_index=False)["event_time"]
        .min()
        .sort_values("event_time")
        .reset_index(drop=True)
    )
    if tmp_first_accessed.empty:
        raise IndexError(f"No recorded views of assignment {assignment_name!r}.")

    # Pad the time axis by one week before the first view and after the due date.
    week = pd.Timedelta(days=7)
    time_domain = [tmp_first_accessed["event_time"].iloc[0] - week, due_date + week]

    # ECDF of first-view dates; `mark_line(interpolate="step-after")` would draw it stepped.
    view_curve = (
        alt.Chart(tmp_first_accessed)
        .transform_window(
            ecdf="cume_dist()",
            sort=[{"field": "event_time"}],
        )
        .mark_line()
        .encode(
            x=alt.X("event_time:T").scale(domain=time_domain),
            y=alt.Y("ecdf:Q").scale(domain=[0, 1.1]).title("Assignment View %"),
        )
    )

    # Vertical rule marking the due date.
    due_rule = alt.Chart(tmp_assignment).mark_rule().encode(x=alt.X("due_at").title(None))

    return (view_curve + due_rule).properties(width=400, height=200)

plot_cumulative_assignment_view(assignment_name)