In [9]:
import pandas as pd
# Notebook display settings: never truncate columns of wide frames, and
# render up to 1000 rows before pandas elides the output.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000

### Importing Navigation Data

In [10]:
# Load the raw navigation-event log.
nf = pd.read_csv('../data/navigation_events.csv')
# Only the last expression of a cell is rendered, so a bare `nf.shape` here
# would be evaluated and silently discarded -- print it explicitly instead.
print(nf.shape)
nf.head(1)

Unnamed: 0,type,action,event_time,session_id,membership_role,membership_type,event__id,event__type,event__actor_type,event__action,event__object_type,event__object_name,event__object_extensions_asset_type,event__object_extensions_http_method,event__eventTime,event__edApp_type,event__session_type,object_id_type,event__object_id_type,event__attachment_type,object_id,event__object_extensions_asset_name,actor_id
0,NavigationEvent,navigatedto,2033-03-27 03:08:48.947000+00:00,f8eac5633f79de90a77b33338b14dd8e,"[""Learner""]",Membership,a3bd94baaa440f4793ad08bb64ed3a8f,NavigationEvent,Person,NavigatedTo,Entity,attachment,attachment,GET,2033-03-27 03:08:48.947000+00:00,SoftwareApplication,Session,attachment,attachment,pdf,0878d2f171deae54c6a81acc6ffeba20,file003.pdf,LEARNER_1


#### Additional data files

In [11]:
# Supplementary course tables, read in file order. The target names on the
# left line up positionally with the CSV paths on the right.
(
    assignments,
    discussion_topics,
    discussions,
    enrollments,
    files,
    gradebook,
    module_items,
    pages,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/additional/assignments.csv',
        '../data/additional/discussion_topics.csv',
        '../data/additional/discussions.csv',
        '../data/additional/enrollments.csv',
        '../data/additional/files.csv',
        '../data/additional/gradebook.csv',
        '../data/additional/module_items.csv',
        '../data/additional/pages.csv',
    )
)

## EDA

In [12]:
# Number of distinct non-null values in each column of the navigation log.
nf.nunique(axis=0, dropna=True)

type                                        1
action                                      1
event_time                              61904
session_id                               2259
membership_role                             3
membership_type                             1
event__id                               62839
event__type                                 1
event__actor_type                           1
event__action                               1
event__object_type                          4
event__object_name                          1
event__object_extensions_asset_type         4
event__object_extensions_http_method        3
event__eventTime                        61904
event__edApp_type                           1
event__session_type                         1
object_id_type                              4
event__object_id_type                       4
event__attachment_type                      4
object_id                                 172
event__object_extensions_asset_nam

In [13]:
# Event counts per (HTTP method, membership role, object type) combination.
# Only the four grouping columns are cast to str -- stringifying the whole
# frame is wasted work since size() ignores every other column. The cast
# itself is kept so that missing values show up as a 'nan' group rather than
# being dropped by groupby.
group_cols = [
    'event__object_extensions_http_method',
    'membership_role',
    'object_id_type',
    'event__object_id_type',
]
nf[group_cols].astype(str).groupby(group_cols).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0
event__object_extensions_http_method,membership_role,object_id_type,event__object_id_type,Unnamed: 4_level_1
GET,Null,assignment,assignment,24
GET,Null,attachment,attachment,162
GET,Null,discussion,discussion,54
GET,Null,wikiPage,wikiPage,4443
GET,"[""Instructor""]",assignment,assignment,93
GET,"[""Instructor""]",attachment,attachment,1842
GET,"[""Instructor""]",discussion,discussion,614
GET,"[""Instructor""]",wikiPage,wikiPage,485
GET,"[""Learner""]",assignment,assignment,2757
GET,"[""Learner""]",attachment,attachment,34087


In [14]:
# Earliest and latest event timestamp in the log, as a (min, max) pair.
event_times = nf['event_time']
(event_times.min(), event_times.max())

('2033-01-02 16:47:19.035000+00:00', '2033-06-15 19:38:36.185000+00:00')

In [15]:
# Frequency of each HTTP method; dropna=False keeps missing values as their
# own row instead of hiding them.
http_methods = nf['event__object_extensions_http_method']
http_methods.value_counts(dropna=False)

GET     62422
POST      234
PUT       183
Name: event__object_extensions_http_method, dtype: int64

In [16]:
# Number of events recorded for each membership role.
membership_roles = nf['membership_role']
membership_roles.value_counts()

["Learner"]       54941
Null               4683
["Instructor"]     3215
Name: membership_role, dtype: int64

In [17]:
# Number of items of each type within every module.
(
    module_items
    .groupby(['module_name', 'type'])
    .size()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
module_name,type,Unnamed: 2_level_1
Course information,Page,15
Module 1: Understanding eLearning (two weeks long),Assignment,1
Module 1: Understanding eLearning (two weeks long),Discussion,2
Module 1: Understanding eLearning (two weeks long),Page,11
Module 2: eLearning readiness (two weeks long),Assignment,1
Module 2: eLearning readiness (two weeks long),Discussion,3
Module 2: eLearning readiness (two weeks long),Page,12
Module 3: Institutional organization and support (one week long),Assignment,1
Module 3: Institutional organization and support (one week long),Discussion,1
Module 3: Institutional organization and support (one week long),Page,8


In [18]:
# Overall number of module items per item type.
item_types = module_items['type']
item_types.value_counts()

Page          79
Discussion    13
Assignment     4
Name: type, dtype: int64

In [19]:
# Remove the rows labelled 0 and 1, which hold non-numeric data (per the
# original note). errors='ignore' keeps this cell safe to re-run: on a second
# execution those labels are already gone and a plain drop() would raise
# KeyError.
gradebook = gradebook.drop(index=[0, 1], errors='ignore')

### Preparing Discussion level aggregated data (Which discussions were popular?)

In [20]:
# Per-topic engagement, restricted to posts whose actor_id contains 'LEARNER'.
# na=False treats a missing actor_id as "not a learner"; without it a NaN in
# the mask makes the boolean indexing raise. regex=False makes the match a
# plain substring test.
is_learner_post = discussions['actor_id'].str.contains('LEARNER', na=False, regex=False)

d_level = (
    discussions[is_learner_post]
    .groupby(['discussion_topic_id', 'discussion_topic_title', 'discussion_topic_message_length'])
    .agg({
        'post_message_length': 'sum',   # total characters posted in the topic
        'actor_id': 'nunique',          # distinct learners who posted
        'post_id': 'nunique',           # distinct posts
        'count_of_likes': 'sum',
    })
    .reset_index()
    .rename(columns={
        'actor_id': 'Student Participation',
        'post_message_length': 'Content Volume',
        'post_id': 'Total Posts',
        'count_of_likes': 'Total Likes',
    })
)

In [21]:
# Persist the per-topic table; the positional index is not written.
d_level.to_csv(
    path_or_buf='../data/final data/discussion_level.csv',
    index=False,
)

### Preparing Learner level aggregated data (Student Grades, Participation and Engagement Metrics)

In [22]:
# Per-learner discussion activity, restricted to posts whose actor_id contains
# 'LEARNER'. na=False treats a missing actor_id as "not a learner"; without it
# a NaN in the mask makes the boolean indexing raise. regex=False makes the
# match a plain substring test.
learner_post_mask = discussions['actor_id'].str.contains('LEARNER', na=False, regex=False)

l_level = (
    discussions[learner_post_mask]
    .groupby('actor_id')
    .agg({
        'discussion_topic_id': 'nunique',
        'post_id': 'nunique',
        'post_message_length': 'sum',
        'count_of_likes': 'sum',
    })
    .reset_index()
    # Rename by source column rather than by position, so the labels cannot
    # drift out of sync with the aggregation above.
    .rename(columns={
        'actor_id': 'Student',
        'discussion_topic_id': 'Discussions Participated In',
        'post_id': 'Total Posts',
        'post_message_length': 'Content Volume',
        'count_of_likes': 'Total Likes',
    })
)
l_level.head(1)

Unnamed: 0,Student,Discussions Participated In,Total Posts,Content Volume,Total Likes
0,LEARNER_1,10,14,33617,3


In [23]:
# Collapse enrollments to one row per user: latest activity timestamp and
# summed activity time, relabelled to match the learner-level tables.
enrollments_ = (
    enrollments
    .groupby(['user_id'])
    .agg({'last_activity_at': 'max', 'total_activity_time': 'sum'})
    .reset_index()
    .rename(columns={
        'user_id': 'Student',
        'last_activity_at': 'Last Activity Time',
        'total_activity_time': 'Total Activity Time',
    })
)

In [24]:
# Start from the gradebook and attach discussion and enrollment metrics.
# Left joins keep every gradebook row even when a student has no match.
learner_level_combined = (
    gradebook
    .merge(l_level, how='left', on='Student')
    .merge(enrollments_, how='left', on='Student')
)

In [25]:
# Divide the activity time by 3600 and round to the nearest whole number
# (presumably seconds -> hours, given the column label).
activity_hours = learner_level_combined['Total Activity Time'] / 3600.0
learner_level_combined['Total Activity Time (Hrs)'] = activity_hours.round()

In [26]:
# Persist the combined learner-level table; the positional index is not written.
learner_level_combined.to_csv(
    path_or_buf='../data/final data/learner_level_data.csv',
    index=False,
)

### Preparing Page level aggregated data (Which Pages were most popular?)

In [27]:
# Keep only GET requests made by learners against wiki pages ...
learner_page_views = nf.loc[
    nf['event__object_extensions_http_method'].eq('GET')
    & nf['membership_role'].eq('["Learner"]')
    & nf['object_id_type'].eq('wikiPage')
]

# ... then count, per page name, the distinct visitors and the total events.
wikiPage_level = (
    learner_page_views
    .groupby(['event__object_extensions_asset_name'])
    .agg({'actor_id': 'nunique', 'event__id': 'count'})
    .reset_index()
)

In [28]:
# Presentation labels, assigned by position.
wikiPage_level.columns = [
    'Page Name',                 # first column
    'No. of Students Visited',   # second column
    'Page Views',                # third column
]

In [29]:
# Persist the page-level table; the positional index is not written.
wikiPage_level.to_csv(
    path_or_buf='../data/final data/wikiPage_level.csv',
    index=False,
)