# Learning Analytics v2

## Learning Resources Schema Fact Tables


In [1]:
%run OEA_py

StatementMeta(, 44, -1, Finished, Available)

2023-07-05 15:52:47,541 - OEA - INFO - Now using workspace: dev
2023-07-05 15:52:47,542 - OEA - INFO - OEA initialized.


In [2]:
# Point the OEA helper at the 'dev' workspace (the cell log shows "Now using workspace: dev").
workspace = 'dev'
oea.set_workspace(workspace)

StatementMeta(spark3p3sm, 44, 3, Finished, Available)

2023-07-05 15:52:48,143 - OEA - INFO - Now using workspace: dev


In [3]:
from pyspark.sql.functions import col, lit, split
from pyspark.sql import functions as f
import os
import uuid

# helper functions
def _publish_to_stage2(df, destination, pk):
    """Upsert `df` into `destination` by delegating to `oea.upsert`, passing `pk` as the key argument."""
    oea.upsert(df, destination, pk)

def publish(df, stage2_destination, stage3_destination, primary_key='id'):
    """Upsert `df` into stage2, then stream the stage2 Delta table into stage3.

    Args:
        df: dataframe to publish.
        stage2_destination: lake path of the stage2 Delta table that `df` is upserted into.
        stage3_destination: lake path the streaming query writes to.
        primary_key: key column name handed to the stage2 upsert (default 'id').

    Returns:
        The `numInputRows` value of the streaming query's last progress update,
        or 0 when the query terminated without reporting any progress.

    Raises:
        StreamingQueryException: propagated from `awaitTermination()` if the query fails.
    """
    _publish_to_stage2(df, stage2_destination, primary_key)

    stage2_url = oea.to_url(stage2_destination)

    spark.sql("set spark.sql.streaming.schemaInference=true")
    streaming_df = spark.readStream.format('delta').load(stage2_url)
    # for more info on append vs complete vs update modes for structured streaming: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#basic-concepts
    query = (
        streaming_df.writeStream.format('delta')
        .outputMode('append')
        .trigger(once=True)  # process what is currently available, then stop
        .option('checkpointLocation', stage2_url + '/_checkpoints')
        .start(oea.to_url(stage3_destination))
    )
    query.awaitTermination()   # block until query is terminated, with stop() or with error; A StreamingQueryException will be thrown if an exception occurs.

    # lastProgress is None when the query produced no progress updates; treat that as zero rows
    # instead of failing with a TypeError on the subscript.
    last_progress = query.lastProgress
    number_of_new_inbound_rows = last_progress["numInputRows"] if last_progress else 0
    logger.info(f'Number of new inbound rows processed: {number_of_new_inbound_rows}')
    logger.debug(last_progress)
    return number_of_new_inbound_rows

def format_to_schema(df, column_mapping, schema, source_directory):
    """Format a dataframe to match the column layout of a target schema.

    Every column of `df` is renamed through `column_mapping`, the literal columns
    `data_source` and `source_directory` are added, any schema column that is still
    missing is added as a null cast to the schema's data type, and finally the
    columns are selected in schema order.

    Args:
        df: source dataframe; each of its columns needs an entry in `column_mapping`.
        column_mapping: dict of source column name -> target column name.
        schema: target schema (used via `schema.names` and `schema[name].dataType`).
        source_directory: '/'-separated lake path; its third segment is stored as `data_source`.

    Returns:
        A dataframe whose columns are exactly `schema.names`, in that order.

    Raises:
        KeyError: if `df` contains a column that has no entry in `column_mapping`.
    """
    unmapped = [name for name in df.columns if name not in column_mapping]
    if unmapped:
        raise KeyError(f'No column mapping provided for source column(s): {unmapped}')

    # rename columns
    dfSource = df.select([col(name).alias(column_mapping[name]) for name in df.columns])

    # Lake paths are written with '/' regardless of the host OS, so split on '/' rather than os.path.sep.
    data_source = source_directory.split('/')[2]
    dfSource = dfSource.withColumn("data_source", lit(data_source))
    dfSource = dfSource.withColumn("source_directory", lit(source_directory))

    # create missing columns with needed data type
    present = set(dfSource.columns)
    for column in schema.names:
        if column not in present:
            dfSource = dfSource.withColumn(column, lit(None).cast(schema[column].dataType))

    # ensure column order matches
    return dfSource.select(schema.names)

StatementMeta(spark3p3sm, 44, 4, Finished, Available)

In [4]:
# Column layout of the fact_resource_activity table; every column is declared as a nullable string.
column_names = [
    'id',
    'source_resource_activity_id',
    'resource_id',
    'user_id_pseudonym',
    'attempt_number',
    'attempt_grade',
    'attempt_state',
    'time_start',
    'time_finish',
    'data_source',
    'source_directory',
]
schema = StructType([StructField(column_name, StringType(), True) for column_name in column_names])

# Start from an empty dataframe; the per-source sections below union their rows onto it.
dfFactResourceActivity = spark.createDataFrame([], schema=schema)

StatementMeta(spark3p3sm, 44, 5, Finished, Available)

In [5]:
# Load the resource dimension table; it is used below to look up resource ids.
# Its `id` column is exposed as `resource_id` so it does not collide with the `id` of the activity tables.
source_directory = 'stage2/Enriched/learning_analytics/v2.0/general/dim_resource'
dfDimResource = (
    oea.load(source_directory)
    .select('id', 'source_resource_id', 'resource_type', 'data_source')
    .withColumnRenamed("id", "resource_id")
)

StatementMeta(spark3p3sm, 44, 6, Finished, Available)

In [6]:
# moodle quiz_attempts learning resource activity data
source_directory = 'stage2/Refined/moodle/v4.1/general/quiz_attempts'
dfQuizAttempts = oea.load(source_directory).select(
    'id', 'quiz', 'userid_pseudonym', 'attempt',
    'sumgrades', 'state', 'timestart', 'timefinish',
)

# look up the resource id in dim_resource (moodle quizzes only)
dfResourceLookup = dfDimResource.filter(
    (col("resource_type") == "quiz") & (col("data_source") == "moodle")
)
dfQuizAttempts = (
    dfResourceLookup
    .join(dfQuizAttempts, dfResourceLookup["source_resource_id"] == dfQuizAttempts["quiz"])
    # the lookup-only columns are not part of the fact table
    .drop("source_resource_id", "quiz", "resource_type", "data_source")
)

# source column -> fact table column
column_mapping = {
    'id': 'source_resource_activity_id',
    'resource_id': 'resource_id',
    'userid_pseudonym': 'user_id_pseudonym',
    'attempt': 'attempt_number',
    'sumgrades': 'attempt_grade',
    'state': 'attempt_state',
    'timestart': 'time_start',
    'timefinish': 'time_finish',
}

dfSource = format_to_schema(
    dfQuizAttempts, column_mapping, dfFactResourceActivity.schema, source_directory
)

# append this source's rows to the fact table
dfFactResourceActivity = dfFactResourceActivity.union(dfSource)

StatementMeta(spark3p3sm, 44, 7, Finished, Available)

In [7]:
# moodle assign_submission learning resource activity data
source_directory = 'stage2/Refined/moodle/v4.1/general/assign_submission'
dfAssignmentSubmissions = oea.load(source_directory).select(
    'id', 'assignment', 'userid_pseudonym', 'attemptnumber',
    'status', 'timestarted', 'timecreated',
)

# look up the resource id in dim_resource (moodle assignments only)
dfResourceLookup = dfDimResource.filter(
    (col("resource_type") == "assignment") & (col("data_source") == "moodle")
)
dfAssignmentSubmissions = (
    dfResourceLookup
    .join(
        dfAssignmentSubmissions,
        dfResourceLookup["source_resource_id"] == dfAssignmentSubmissions["assignment"],
    )
    # the lookup-only columns are not part of the fact table
    .drop("source_resource_id", "assignment", "resource_type", "data_source")
)

# source column -> fact table column (this source has no grade column, so attempt_grade is filled with nulls)
column_mapping = {
    'id': 'source_resource_activity_id',
    'resource_id': 'resource_id',
    'userid_pseudonym': 'user_id_pseudonym',
    'attemptnumber': 'attempt_number',
    'status': 'attempt_state',
    'timestarted': 'time_start',
    'timecreated': 'time_finish',
}
dfSource = format_to_schema(
    dfAssignmentSubmissions, column_mapping, dfFactResourceActivity.schema, source_directory
)

# append this source's rows to the fact table
dfFactResourceActivity = dfFactResourceActivity.union(dfSource)

StatementMeta(spark3p3sm, 44, 8, Finished, Available)

In [8]:
# moodle lesson_attempts learning resource activity data
# Attempt rows from lesson_attempts; `correct` is carried forward as attempt_grade.
df1 = (
    oea.load('stage2/Refined/moodle/v4.1/general/lesson_attempts')
    .select('id', 'lessonid', 'userid_pseudonym', 'correct')
    .withColumnRenamed('lessonid', 'lesson_id')
    .withColumnRenamed('userid_pseudonym', 'user_id_pseudonym')
    .withColumnRenamed('correct', 'attempt_grade')
)

# Timing rows from lesson_timer.
# NOTE: attempt_state is renamed here but is not part of the aggregation below,
# so it does not reach dfLessonAttempts.
df2 = (
    oea.load('stage2/Refined/moodle/v4.1/general/lesson_timer')
    .select('lessonid', 'userid_pseudonym', 'completed', 'starttime', 'lessontime')
    .withColumnRenamed('lessonid', 'lesson_id')
    .withColumnRenamed('userid_pseudonym', 'user_id_pseudonym')
    .withColumnRenamed('completed', 'attempt_state')
    .withColumnRenamed('starttime', 'time_start')
    .withColumnRenamed('lessontime', 'time_finish')
)

# Earliest start and latest finish for each user ID and lesson ID, computed in a single
# aggregation instead of two groupBy passes that are joined back together on the same keys.
# (Groups with a null key are kept here, but they can never match in the left join below,
# so dfLessonAttempts is the same either way.)
dfResult = df2.groupBy("user_id_pseudonym", "lesson_id").agg(
    f.min("time_start").alias("time_start"),
    f.max("time_finish").alias("time_finish"),
)

# Left join keeps every attempt row, with null times when no timer rows exist for it.
dfLessonAttempts = df1.join(dfResult, ["user_id_pseudonym", "lesson_id"], how='left')

StatementMeta(spark3p3sm, 44, 9, Finished, Available)

In [9]:
# moodle lesson_attempts learning resource activity data

# The lesson rows originate from lesson_attempts, so set source_directory explicitly.
# Without this assignment, whatever value an earlier cell left in source_directory would be
# stamped into the source_directory / data_source columns of the lesson rows.
source_directory = 'stage2/Refined/moodle/v4.1/general/lesson_attempts'

# find resource id via dim_resource DeltaTable
dfResourceLookup = dfDimResource.filter((col("resource_type") == "lesson") & 
                                        (col("data_source") == "moodle"))
dfLessonAttempts = dfResourceLookup.join(dfLessonAttempts, 
                dfResourceLookup["source_resource_id"] == dfLessonAttempts["lesson_id"])
dfLessonAttempts = dfLessonAttempts.drop("source_resource_id", "lesson_id", 
                                    "resource_type", "data_source")

# source column -> fact table column; columns absent here (attempt_number, attempt_state) are filled with nulls
column_mapping = {'id': 'source_resource_activity_id', 'resource_id': 'resource_id', 'user_id_pseudonym': 'user_id_pseudonym', 
            'attempt_grade': 'attempt_grade', 'time_start': 'time_start','time_finish': 'time_finish'}
dfSource = format_to_schema(dfLessonAttempts, column_mapping, dfFactResourceActivity.schema, source_directory)

# append this source's rows to the fact table
dfFactResourceActivity = dfFactResourceActivity.union(dfSource)

StatementMeta(spark3p3sm, 44, 10, Finished, Available)

In [11]:
# canvas submissions learning resource activity data
source_directory = 'stage2/Refined/canvas/v2.0/general/submissions'
dfAssignmentSubmissions = oea.load(source_directory).select(
    'id', 'user_id_pseudonym', 'assignment_id', 'attempt',
    'published_score', 'workflow_state', 'created_at', 'submitted_at',
)

# look up the resource id in dim_resource (canvas assignments only)
dfResourceLookup = dfDimResource.filter(
    (col("resource_type") == "assignment") & (col("data_source") == "canvas")
)
dfAssignmentSubmissions = (
    dfResourceLookup
    .join(
        dfAssignmentSubmissions,
        dfResourceLookup["source_resource_id"] == dfAssignmentSubmissions["assignment_id"],
    )
    # the lookup-only columns are not part of the fact table
    .drop("source_resource_id", "assignment_id", "resource_type", "data_source")
)

# source column -> fact table column
column_mapping = {
    'id': 'source_resource_activity_id',
    'resource_id': 'resource_id',
    'user_id_pseudonym': 'user_id_pseudonym',
    'attempt': 'attempt_number',
    'published_score': 'attempt_grade',
    'workflow_state': 'attempt_state',
    'created_at': 'time_start',
    'submitted_at': 'time_finish',
}
dfSource = format_to_schema(
    dfAssignmentSubmissions, column_mapping, dfFactResourceActivity.schema, source_directory
)

# append this source's rows to the fact table
dfFactResourceActivity = dfFactResourceActivity.union(dfSource)

StatementMeta(spark3p3sm, 44, 12, Finished, Available)

In [13]:
# canvas quiz_submissions learning resource activity data
source_directory = 'stage2/Refined/canvas/v2.0/general/quiz_submissions'
dfQuizAttempts = oea.load(source_directory).select(
    'id', 'user_id_pseudonym', 'quiz_id', 'attempt',
    'kept_score', 'workflow_state', 'started_at', 'finished_at',
)

# look up the resource id in dim_resource (canvas quizzes only)
dfResourceLookup = dfDimResource.filter(
    (col("resource_type") == "quiz") & (col("data_source") == "canvas")
)
dfQuizAttempts = (
    dfResourceLookup
    .join(dfQuizAttempts, dfResourceLookup["source_resource_id"] == dfQuizAttempts["quiz_id"])
    # the lookup-only columns are not part of the fact table
    .drop("source_resource_id", "quiz_id", "resource_type", "data_source")
)

# source column -> fact table column
column_mapping = {
    'id': 'source_resource_activity_id',
    'resource_id': 'resource_id',
    'user_id_pseudonym': 'user_id_pseudonym',
    'attempt': 'attempt_number',
    'kept_score': 'attempt_grade',
    'workflow_state': 'attempt_state',
    'started_at': 'time_start',
    'finished_at': 'time_finish',
}

dfSource = format_to_schema(
    dfQuizAttempts, column_mapping, dfFactResourceActivity.schema, source_directory
)

# append this source's rows to the fact table
dfFactResourceActivity = dfFactResourceActivity.union(dfSource)

StatementMeta(spark3p3sm, 44, 14, Finished, Available)

In [14]:
# generate uuid
# uuid4() is random, so the UDF is marked non-deterministic: Spark treats UDFs as deterministic
# by default and may otherwise eliminate duplicate invocations or invoke the function more often
# than it appears in the query.
# NOTE(review): a fresh random id is assigned to every row on every run, and this `id` is the key
# later passed to publish(); confirm re-running the notebook does not duplicate rows downstream.
uuid_udf = f.udf(lambda: uuid.uuid4().hex, StringType()).asNondeterministic()
dfFactResourceActivity = dfFactResourceActivity.withColumn('id', uuid_udf())

StatementMeta(spark3p3sm, 44, 15, Finished, Available)

In [15]:
# Publish the fact table: upsert into stage2 keyed on 'id', then stream into stage3.
# Left as a bare expression so the returned row count is displayed as the cell output.
publish(
    dfFactResourceActivity,
    'stage2/Enriched/learning_analytics/v2.0/general/fact_resource_activity',
    'stage3/Published/learning_analytics/v2.0/general/fact_resource_activity',
    primary_key='id',
)

StatementMeta(spark3p3sm, 44, 16, Finished, Available)

2023-07-05 15:59:28,476 - OEA - INFO - Number of new inbound rows processed: 78905


78905

In [16]:
# Register the published stage3 fact table with the lake database.
# (Plain string literal: the previous f-string had no placeholders.)
oea.add_to_lake_db('stage3/Published/learning_analytics/v2.0/general/fact_resource_activity')

StatementMeta(spark3p3sm, 44, 17, Finished, Available)

In [None]:
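# Optional reset, intentionally left disabled: uncomment to call oea.rm_if_exists on the
# stage2 and stage3 learning_analytics paths before rebuilding the tables from scratch.
#oea.rm_if_exists('stage2/Enriched/learning_analytics')
#oea.rm_if_exists('stage3/Published/learning_analytics')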

StatementMeta(, , , Cancelled, )