In [1]:
import random
from tokenize import Ignore
from faker import Faker
import pandas as pd
import datetime as dt
import numpy as np

class CleverDataGen(BaseOEAModule):
    """Generate synthetic Clever test data (daily participation and resource usage).

    Relies on the notebook-level globals ``oea`` and ``spark`` and on the
    ``BaseOEAModule`` base class, none of which are defined in this cell.
    NOTE(review): ``self.schemas`` is assumed to be a dict created by
    ``BaseOEAModule.__init__`` -- confirm against the base class.
    """

    def __init__(self, source_folder='test_data'):
        """Set up the date range, table schemas, student roster and empty templates.

        Args:
            source_folder: Folder name forwarded to ``BaseOEAModule.__init__``.
        """
        BaseOEAModule.__init__(self, oea, source_folder)

        self.startdate = dt.date(2022, 1, 3)
        self.enddate = dt.date(2022, 1, 28)
        self.daterange = self._get_date_range(self.startdate, self.enddate)
        self.faker = Faker('en_US')
        # Timestamp string used as a folder name in the output paths, so each
        # instance writes to its own sub-directory.
        self.currentDateTime = dt.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')

        self.schemas['daily-participation'] = [
            ['date', 'date'],
            ['sis_id', 'string'],
            ['clever_user_id', 'string'],
            ['clever_school_id', 'string'],
            ['school_name', 'string'],
            ['active', 'boolean'],
            ['num_logins', 'integer'],
            ['num_resources_accessed', 'integer'],
        ]
        self.schemas['resource-usage'] = [
            ['date', 'date'],
            ['sis_id', 'string'],
            ['clever_user_id', 'string'],
            ['clever_school_id', 'string'],
            ['school_name', 'string'],
            ['resource_type', 'string'],
            ['resource_name', 'string'],
            ['resource_id', 'string'],
            ['num_access', 'integer'],
        ]

        # Student roster the generated rows are based on.
        dfStudents = oea.load_csv('stage1np/test_data/gen_base_truth_tables/students/*/*.csv')
        self.students = dfStudents.toPandas()

        # Empty frames that carry the column layout of each output table.
        self.participationfile = oea.create_empty_dataframe(self.schemas['daily-participation'])
        self.resourceusagefile = oea.create_empty_dataframe(self.schemas['resource-usage'])

    def gen_daily_participation(self):
        """Generate one participation row per student per day and write it as CSV.

        For every date in ``self.daterange`` and every student in
        ``self.students`` a row is produced with a randomly chosen ``active``
        flag; active rows get 1-9 logins and 1-3 resources accessed, inactive
        rows get zeros. The result is written under
        ``clever/gen_daily-participation/<timestamp>/Students`` in ``stage1np``.
        """
        columns = self.participationfile.toPandas().columns

        # Pull the three fields used per student once, instead of re-iterating
        # the DataFrame with iterrows() for every day.
        students = list(zip(self.students['StudentID'],
                            self.students['SchoolID'],
                            self.students['School']))

        rows = []
        for day in self.daterange:
            for student_id, school_id, school_name in students:
                # Kept as the strings 'True'/'False' (not booleans) because
                # gen_resource_usage compares this column against 'True'.
                active = random.choice(['False', 'True'])
                if active == 'True':
                    num_logins = random.randint(1, 9)
                    num_resources_accessed = random.randint(1, 3)
                else:
                    num_logins = 0
                    num_resources_accessed = 0
                rows.append({
                    'date': day,
                    'sis_id': student_id,
                    'clever_user_id': student_id,
                    'clever_school_id': school_id,
                    'school_name': school_name,
                    'active': active,
                    'num_logins': num_logins,
                    'num_resources_accessed': num_resources_accessed,
                })

        # Build the frame once: DataFrame.append per row is quadratic and was
        # removed in pandas 2.0.
        pdfParticipation = pd.DataFrame(rows, columns=columns)

        dfParticipation = spark.createDataFrame(pdfParticipation)
        dfParticipation.coalesce(1).write.save(
            oea.path('stage1np', directory_path='clever/gen_daily-participation/' + self.currentDateTime + '/Students'),
            format='csv', mode='append', mergeSchema='true', header=True)

    def gen_resource_usage(self):
        """Generate resource-usage rows from the generated participation data.

        Reads every previously written daily-participation CSV and the resource
        list, then for each row whose ``active`` value is ``'True'`` emits
        ``num_resources_accessed`` rows, each with a randomly picked resource
        and an access count of 1-9. The result is written under
        ``clever/gen_resource-usage/<timestamp>/Students`` in ``stage1np``.

        Raises:
            ValueError: If the resource list contains no rows.
        """
        columns = self.resourceusagefile.toPandas().columns
        gen_participation = oea.load_csv('stage1np/clever/gen_daily-participation/*/*/*.csv').toPandas()
        resource_list = oea.load_csv('stage1np/test_data/clever_resource_list/resources.csv').toPandas()

        if resource_list.empty:
            raise ValueError('resource list is empty; cannot generate resource usage')
        # Derive the upper bound from the data instead of assuming 28 resources.
        last_resource_index = len(resource_list) - 1

        rows = []
        for _, participation in gen_participation.iterrows():
            if participation['active'] != 'True':
                continue
            # The CSV value may come back as text; convert it to a number.
            resources_accessed = int(pd.to_numeric(participation['num_resources_accessed']))
            for _ in range(resources_accessed):
                res_num = random.randint(0, last_resource_index)
                rows.append({
                    'date': participation['date'],
                    'sis_id': participation['sis_id'],
                    'clever_user_id': participation['clever_user_id'],
                    'clever_school_id': participation['clever_school_id'],
                    'school_name': participation['school_name'],
                    'resource_type': 'App',
                    'resource_name': resource_list.at[res_num, 'resource'],
                    'resource_id': resource_list.at[res_num, 'resourceid'],
                    'num_access': random.randint(1, 9),
                })

        # Build the frame once: DataFrame.append per row is quadratic and was
        # removed in pandas 2.0.
        pdfResourceUsage = pd.DataFrame(rows, columns=columns)

        dfResourceUsage = spark.createDataFrame(pdfResourceUsage)
        dfResourceUsage.coalesce(1).write.save(
            oea.path('stage1np', directory_path='clever/gen_resource-usage/' + self.currentDateTime + '/Students'),
            format='csv', mode='append', mergeSchema='true', header=True)

    def _get_date_range(self, startdate, enddate):
        """Return consecutive days from ``startdate`` up to, but excluding, ``enddate``.

        Every calendar day is included (weekends are not skipped). An empty
        list is returned when ``startdate`` is not before ``enddate``.
        """
        daterange = []
        current = startdate
        while current < enddate:
            daterange.append(current)
            current = current + dt.timedelta(days=1)
        return daterange

    def reset_generation(self, reset_daily_participation=False, reset_resource_usage=False):
        """Remove previously generated data for either or both Clever tables.

        Args:
            reset_daily_participation: When true, remove
                ``stage1np/clever/gen_daily-participation``.
            reset_resource_usage: When true, remove
                ``stage1np/clever/gen_resource-usage``.
        """
        if reset_daily_participation:
            oea.rm_if_exists(path='stage1np/clever/gen_daily-participation')

        if reset_resource_usage:
            oea.rm_if_exists(path='stage1np/clever/gen_resource-usage')

