# Learning Analytics v2

## Learning Resources Schema Dimension Tables


In [1]:
%run OEA_py

StatementMeta(, 43, -1, Finished, Available)

2023-06-29 21:51:03,937 - OEA - INFO - Now using workspace: dev
2023-06-29 21:51:03,939 - OEA - INFO - OEA initialized.


In [2]:
# Point the OEA helper at the 'dev' workspace before any data is loaded or written.
workspace = 'dev'
oea.set_workspace(workspace)

StatementMeta(spark3p3sm, 43, 3, Finished, Available)

2023-06-29 21:51:04,249 - OEA - INFO - Now using workspace: dev


In [3]:
from pyspark.sql.functions import col, lit, split
from pyspark.sql import functions as f
import os
import uuid

# helper functions
def _publish_to_stage2(df, destination, pk):
    """Upsert `df` into `destination` by delegating to `oea.upsert`, forwarding `pk` as its third argument."""
    oea.upsert(df, destination, pk)

def publish(df, stage2_destination, stage3_destination, primary_key='id'):
    """Upsert `df` into stage2, then stream the stage2 delta table into stage3.

    Args:
        df: dataframe to publish.
        stage2_destination: lake path of the stage2 delta table that `df` is upserted into.
        stage3_destination: lake path the stage2 table is streamed (append mode) into.
        primary_key: key column name forwarded to the stage2 upsert.

    Returns:
        The `numInputRows` value of the streaming query's last progress update,
        or 0 when the query finished without reporting any progress.
    """
    _publish_to_stage2(df, stage2_destination, primary_key)

    spark.sql("set spark.sql.streaming.schemaInference=true")
    stage2_url = oea.to_url(stage2_destination)
    streaming_df = spark.readStream.format('delta').load(stage2_url)
    # for more info on append vs complete vs update modes for structured streaming: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#basic-concepts
    query = (
        streaming_df.writeStream.format('delta')
        .outputMode('append')
        .trigger(once=True)
        .option('checkpointLocation', stage2_url + '/_checkpoints')
    )
    query = query.start(oea.to_url(stage3_destination))
    query.awaitTermination()   # block until query is terminated, with stop() or with error; A StreamingQueryException will be thrown if an exception occurs.

    # lastProgress is None when the query never reported a progress update;
    # treat that as "no new rows" instead of failing on None["numInputRows"].
    last_progress = query.lastProgress
    number_of_new_inbound_rows = last_progress["numInputRows"] if last_progress is not None else 0
    logger.info(f'Number of new inbound rows processed: {number_of_new_inbound_rows}')
    logger.debug(last_progress)
    return number_of_new_inbound_rows

def format_to_schema(df, column_mapping, schema, source_directory):
    """Format a dataframe to match the column layout of a target schema.

    Every column of `df` is renamed through `column_mapping`, `data_source` and
    `source_directory` columns are added, and any schema column that is still
    missing is filled with nulls cast to the schema's data type.

    Args:
        df: source dataframe; each of its columns must be a key of `column_mapping`.
        column_mapping: dict mapping existing column names to schema column names.
        schema: target schema, indexable by column name and exposing `.names`.
        source_directory: '/'-separated lake path; its third segment becomes the
            `data_source` value (e.g. 'stage2/Refined/moodle/...' -> 'moodle').

    Returns:
        A dataframe containing exactly the columns in `schema.names`, in that order.

    Raises:
        KeyError: if a column of `df` has no entry in `column_mapping`.
        IndexError: if `source_directory` has fewer than three path segments.
    """
    # rename columns
    renamed_columns = [col(existing_col).alias(column_mapping[existing_col]) for existing_col in df.columns]
    dfSource = df.select(renamed_columns)

    # The lake paths passed in by this notebook use forward slashes, so split on '/'
    # explicitly; os.path.sep is '\\' on Windows and would leave the path unsplit.
    data_source = source_directory.split('/')[2]
    dfSource = dfSource.withColumn("data_source", lit(data_source))
    dfSource = dfSource.withColumn("source_directory", lit(source_directory))

    # create missing columns with needed data type
    # (loop variable deliberately not named `col`, to avoid shadowing pyspark's col)
    present_columns = set(dfSource.columns)
    missing_columns = [name for name in schema.names if name not in present_columns]
    for column in missing_columns:
        dfSource = dfSource.withColumn(column, lit(None).cast(schema[column].dataType))

    return dfSource.select(schema.names) # ensure column order matches

StatementMeta(spark3p3sm, 43, 4, Finished, Available)

In [4]:
# Target layout of the resource dimension: every column is a nullable string.
column_names = [
    'id', 'course_id', 'source_resource_id', 'resource_type', 'name',
    'time_open', 'time_close', 'data_source', 'source_directory',
]
string_fields = [StructField(column_name, StringType(), nullable=True) for column_name in column_names]
schema = StructType(string_fields)

# Start from an empty dataframe; the source cells below union their rows onto it.
dfDimResource = spark.createDataFrame([], schema)

StatementMeta(spark3p3sm, 43, 5, Finished, Available)

In [5]:
# moodle quiz learning resource data
source_directory = 'stage2/Refined/moodle/v4.1/general/quiz'
dfQuiz = oea.load(source_directory)
dfQuiz = dfQuiz.select(['id','course','name','timeopen','timeclose'])

# map course id to SIS course id
# The course lookup path is kept in its own variable so that `source_directory`
# still names the quiz table when it is recorded on each row below.
# NOTE(review): `category` is what ends up as course_id - confirm it is the intended SIS course key.
course_directory = 'stage2/Refined/moodle/v4.1/general/course'
dfCourse = oea.load(course_directory)
dfCourse = dfCourse.select(['id','category']).withColumnRenamed("id", "course")
dfQuiz = dfQuiz.join(dfCourse, "course").drop("course")

# map to schema columns
column_mapping = {'id': 'source_resource_id', 'category': 'course_id', 'name': 'name', 'timeopen': 'time_open',
                'timeclose': 'time_close'}
dfSource = format_to_schema(dfQuiz, column_mapping, dfDimResource.schema, source_directory)

dfSource = dfSource.withColumn("resource_type", lit("quiz"))
dfSource = dfSource.select(dfDimResource.columns) # ensure column order matches

dfDimResource = dfDimResource.union(dfSource)

StatementMeta(spark3p3sm, 43, 6, Finished, Available)

In [6]:
# moodle assignment learning resource data
source_directory = 'stage2/Refined/moodle/v4.1/general/assign'
dfAssgn = oea.load(source_directory)
dfAssgn = dfAssgn.select(['id','course','name','allowsubmissionsfromdate','duedate'])

# map course id to SIS course id
# The course lookup path is kept in its own variable so that `source_directory`
# still names the assign table when it is recorded on each row below.
# NOTE(review): `category` is what ends up as course_id - confirm it is the intended SIS course key.
course_directory = 'stage2/Refined/moodle/v4.1/general/course'
dfCourse = oea.load(course_directory)
dfCourse = dfCourse.select(['id','category']).withColumnRenamed("id", "course")
dfAssgn = dfAssgn.join(dfCourse, "course").drop("course")

# map to schema columns
column_mapping = {'id': 'source_resource_id', 'category': 'course_id', 'name': 'name', 'allowsubmissionsfromdate': 'time_open',
                'duedate': 'time_close'}
dfSource = format_to_schema(dfAssgn, column_mapping, dfDimResource.schema, source_directory)

dfSource = dfSource.withColumn("resource_type", lit("assignment"))
dfSource = dfSource.select(dfDimResource.columns) # ensure column order matches

dfDimResource = dfDimResource.union(dfSource)

StatementMeta(spark3p3sm, 43, 7, Finished, Available)

In [7]:
# moodle lesson learning resource data
source_directory = 'stage2/Refined/moodle/v4.1/general/lesson'
dfLesson = oea.load(source_directory)
dfLesson = dfLesson.select(['id','course','name','available','deadline'])

# map course id to SIS course id
# The course lookup path is kept in its own variable so that `source_directory`
# still names the lesson table when it is recorded on each row below.
# NOTE(review): `category` is what ends up as course_id - confirm it is the intended SIS course key.
course_directory = 'stage2/Refined/moodle/v4.1/general/course'
dfCourse = oea.load(course_directory)
dfCourse = dfCourse.select(['id','category']).withColumnRenamed("id", "course")
dfLesson = dfLesson.join(dfCourse, "course").drop("course")

# map to schema columns
column_mapping = {'id': 'source_resource_id', 'category': 'course_id', 'name': 'name', 'available': 'time_open',
                'deadline': 'time_close'}
dfSource = format_to_schema(dfLesson, column_mapping, dfDimResource.schema, source_directory)

dfSource = dfSource.withColumn("resource_type", lit("lesson"))
dfSource = dfSource.select(dfDimResource.columns) # ensure column order matches

dfDimResource = dfDimResource.union(dfSource)

StatementMeta(spark3p3sm, 43, 8, Finished, Available)

In [20]:
# canvas assignment learning resource data
source_directory = 'stage2/Refined/canvas/v2.0/general/assignments'
dfAssgn = oea.load(source_directory)
dfAssgn = dfAssgn.select(['id', 'title', 'context_id', 'unlock_at', 'lock_at'])

# map course id to SIS course id
# The course lookup path is kept in its own variable so that `source_directory`
# still names the assignments table when it is recorded on each row below.
course_directory = 'stage2/Refined/canvas/v2.0/general/courses'
dfCourse = oea.load(course_directory)
dfCourse = dfCourse.select(['id','sis_source_id']).withColumnRenamed("id", "context_id")
dfAssgn = dfAssgn.join(dfCourse, "context_id").drop("context_id")

# map to schema columns
column_mapping = {'id': 'source_resource_id', 'sis_source_id': 'course_id', 'title': 'name', 'unlock_at': 'time_open',
                'lock_at': 'time_close'}
dfSource = format_to_schema(dfAssgn, column_mapping, dfDimResource.schema, source_directory)

dfSource = dfSource.withColumn("resource_type", lit("assignment"))
dfSource = dfSource.select(dfDimResource.columns) # ensure column order matches

dfDimResource = dfDimResource.union(dfSource)

StatementMeta(spark3p3sm, 43, 21, Finished, Available)

In [21]:
# canvas quiz learning resource data
source_directory = 'stage2/Refined/canvas/v2.0/general/quizzes'
dfQuiz = oea.load(source_directory)
dfQuiz = dfQuiz.select(['id', 'title', 'context_id', 'unlock_at', 'lock_at'])

# map course id to SIS course id
# The course lookup path is kept in its own variable so that `source_directory`
# still names the quizzes table when it is recorded on each row below.
course_directory = 'stage2/Refined/canvas/v2.0/general/courses'
dfCourse = oea.load(course_directory)
dfCourse = dfCourse.select(['id','sis_source_id']).withColumnRenamed("id", "context_id")
dfQuiz = dfQuiz.join(dfCourse, "context_id").drop("context_id")

# map to schema columns
column_mapping = {'id': 'source_resource_id', 'sis_source_id': 'course_id', 'title': 'name', 'unlock_at': 'time_open',
                'lock_at': 'time_close'}
dfSource = format_to_schema(dfQuiz, column_mapping, dfDimResource.schema, source_directory)

dfSource = dfSource.withColumn("resource_type", lit("quiz"))
dfSource = dfSource.select(dfDimResource.columns) # ensure column order matches

dfDimResource = dfDimResource.union(dfSource)

StatementMeta(spark3p3sm, 43, 22, Finished, Available)

In [26]:
# generate uuid
# uuid4() yields a new value on every call, so the UDF is flagged as non-deterministic;
# Spark treats UDFs as deterministic by default and may otherwise eliminate or repeat
# invocations during optimization.
# NOTE(review): ids are regenerated on every run, so an upsert keyed on 'id' cannot
# match rows written by earlier runs - confirm that is intended.
uuid_udf = f.udf(lambda: uuid.uuid4().hex, StringType()).asNondeterministic()
dfDimResource = dfDimResource.withColumn('id', uuid_udf())

StatementMeta(spark3p3sm, 43, 27, Finished, Available)

In [28]:
# Upsert the dimension into stage2 and stream it on to stage3; the returned row count is the cell output.
publish(
    dfDimResource,
    'stage2/Enriched/learning_analytics/v2.0/general/dim_resource',
    'stage3/Published/learning_analytics/v2.0/general/dim_resource',
    primary_key='id',
)

StatementMeta(spark3p3sm, 43, 29, Finished, Available)

2023-06-29 22:12:39,016 - OEA - INFO - Number of new inbound rows processed: 1517


1517

In [29]:
# Register the published stage3 table with the lake database (plain string: there is nothing to interpolate).
oea.add_to_lake_db('stage3/Published/learning_analytics/v2.0/general/dim_resource')

StatementMeta(spark3p3sm, 43, 30, Finished, Available)

In [27]:
# oea.rm_if_exists('stage2/Enriched/learning_analytics')
# oea.rm_if_exists('stage3/Published/learning_analytics')

StatementMeta(spark3p3sm, 43, 28, Finished, Available)