## Data Cleaning and Generation

This notebook walks through how we cleaned and generated the data for the LearnPlatform COVID-19 Impact on Digital Learning competition. We used the data provided by the competition organizers, who supplied a set of daily edtech engagement data from over 200 school districts in 2020, and we leveraged other publicly available data on school closures and shelter-in-place orders in our analysis. The analysis draws on four sets of files:

- The engagement_data folder is based on LearnPlatform’s Student Chrome Extension. The extension collects page load events of over 10K education technology products in our product library, including websites, apps, web apps, software programs, extensions, ebooks, hardware, and services used in educational institutions. The engagement data have been aggregated at school district level, and each file represents data from one school district.
- The products_info.csv file includes information about the characteristics of the top 372 products with most users in 2020.
- The districts_info.csv file includes information about the characteristics of school districts, including data from NCES and FCC.
- The policy.csv file includes information on the dates of school closures and shelter-in-place orders.



In [None]:
# importing packages and importing data
import datetime
import numpy as np
import pandas as pd
# Competition files share one input directory; build each path from it.
competition_dir = '/kaggle/input/learnplatform-covid19-impact-on-digital-learning'
products_info = pd.read_csv(f'{competition_dir}/products_info.csv')
districts_info = pd.read_csv(f'{competition_dir}/districts_info.csv')
# State-level policy dates (school closures and shelter-in-place orders),
# loaded from a separate input dataset.
policy = pd.read_csv('/kaggle/input/policy/policy.csv')

In [None]:
# Split the product catalogue by target sector:
#   pre_k  - products listed for PreK-12 only (platforms for kids)
#   higher - products whose sector list includes Higher Ed together with
#            PreK-12 and/or Corporate (platforms for teens and adults)
higher_sectors = ['PreK-12; Higher Ed; Corporate', 'PreK-12; Higher Ed', 'Higher Ed; Corporate']
product_columns = ['lp_id', 'URL', 'Product Name', 'Provider/Company Name', 'Sector(s)',
                   'Primary Essential Function']

higher = products_info[products_info['Sector(s)'].isin(higher_sectors)]
pre_k = products_info[products_info['Sector(s)'] == 'PreK-12']

# The header is overwritten positionally with these six names; the first one
# becomes 'lp_id', the key used later to join with the engagement files.
pre_k.columns = product_columns
higher.columns = product_columns

In [None]:
# Aggregate the engagement data to one row per district per day:
#   lp_id            -> count of non-null product ids recorded that day
#   pct_access       -> mean pct_access across products
#   engagement_index -> sum of engagement_index across products
# The per-district frames are collected in a list and concatenated once.
# Calling pd.concat inside the loop copies the whole accumulated frame on
# every iteration, which grows quadratically with the number of districts.
district_frames = []
for district_id in districts_info['district_id']:
    daily = pd.read_csv(f'/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data/{district_id}.csv')
    daily = daily.groupby(by='time').agg({'lp_id':'count', 'pct_access':'mean', 'engagement_index':'sum'}).reset_index()
    daily['district_id'] = district_id
    district_frames.append(daily)
df = pd.concat(district_frames)
# 'time' is read from CSV as text; convert it so it can be compared with the
# policy dates later on.
df['time'] = df['time'].astype('datetime64[ns]')

In [None]:
# Build the PreK-12 ("kids") dataset: for every district file, keep only the
# engagement rows whose lp_id appears in pre_k, then aggregate to one row per
# district per day (count of lp_id, mean pct_access, sum of engagement_index).
# The per-district frames are collected in a list and concatenated once.
# Calling pd.concat inside the loop copies the whole accumulated frame on
# every iteration, which grows quadratically with the number of districts.
pre_k_frames = []
for district_id in districts_info['district_id']:
    daily = pd.read_csv(f'/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data/{district_id}.csv')
    # Inner join acts as a filter: rows for products outside pre_k are dropped.
    daily = pd.merge(daily, pre_k['lp_id'], on = 'lp_id')
    daily = daily.groupby(by='time').agg({'lp_id':'count', 'pct_access':'mean', 'engagement_index':'sum'}).reset_index()
    daily['district_id'] = district_id
    pre_k_frames.append(daily)
pre_k_sample = pd.concat(pre_k_frames)
# 'time' is read from CSV as text; convert it so it can be compared with the
# policy dates later on.
pre_k_sample['time'] = pre_k_sample['time'].astype('datetime64[ns]')

In [None]:
# Build the higher-ed ("teens and adults") dataset: for every district file,
# keep only the engagement rows whose lp_id appears in higher, then aggregate
# to one row per district per day (count of lp_id, mean pct_access, sum of
# engagement_index).
# The per-district frames are collected in a list and concatenated once.
# Calling pd.concat inside the loop copies the whole accumulated frame on
# every iteration, which grows quadratically with the number of districts.
higher_frames = []
for district_id in districts_info['district_id']:
    daily = pd.read_csv(f'/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data/{district_id}.csv')
    # Inner join acts as a filter: rows for products outside higher are dropped.
    daily = pd.merge(daily, higher['lp_id'], on = 'lp_id')
    daily = daily.groupby(by='time').agg({'lp_id':'count', 'pct_access':'mean', 'engagement_index':'sum'}).reset_index()
    daily['district_id'] = district_id
    higher_frames.append(daily)
higher_sample = pd.concat(higher_frames)
# 'time' is read from CSV as text; convert it so it can be compared with the
# policy dates later on.
higher_sample['time'] = higher_sample['time'].astype('datetime64[ns]')

In [None]:
# Clean the policy table and patch known errors in the raw file.
# Columns are renamed positionally; SC / SIP presumably stand for the school
# closure and shelter-in-place dates -- confirm against policy.csv.
policy.columns = ['state', 'SC', 'SIP']

# Manual corrections addressed by row label, so they rely on the row order of
# the raw file staying the same.
policy.loc[8, 'state'] = 'District Of Columbia'
for row_label, sip_date in ((6, '3/23/20'), (43, '4/2/20')):
    policy.loc[row_label, 'SIP'] = sip_date

# The string '0' is treated as "no date" (presumably no such order was
# issued); blank it out, then parse the column as datetimes.
for date_col in ('SIP', 'SC'):
    policy.loc[policy[date_col] == '0', date_col] = np.nan
    policy[date_col] = policy[date_col].astype('datetime64[ns]')

In [None]:
# Attach district characteristics to each engagement dataset.
# Inner join on district_id: rows without a matching district are dropped.
df = df.merge(districts_info, how='inner', on='district_id')
higher_sample = higher_sample.merge(districts_info, how='inner', on='district_id')
pre_k_sample = pre_k_sample.merge(districts_info, how='inner', on='district_id')

In [None]:
# Attach the state policy dates to each dataset. The left join keeps every
# engagement row even when its state has no policy entry; rows missing a
# district_id or a state are then discarded.
required_keys = ['district_id', 'state']
df = df.merge(policy, how='left', on='state').dropna(subset=required_keys)
higher_sample = higher_sample.merge(policy, how='left', on='state').dropna(subset=required_keys)
pre_k_sample = pre_k_sample.merge(policy, how='left', on='state').dropna(subset=required_keys)

In [None]:
# Create treatment dummies: treat_SC / treat_SIP are 1 on days on or after the
# state's SC / SIP date and 0 otherwise. A missing policy date never compares
# as <= time, so those rows stay 0.
for frame in (df, higher_sample, pre_k_sample):
    for policy_col in ('SC', 'SIP'):
        in_effect = frame[policy_col] <= frame['time']
        frame[f'treat_{policy_col}'] = in_effect.astype('int64')

In [None]:
# Drop outlier district-days: fewer than 10 product records (lp_id count)
# combined with a mean pct_access above 5. The remaining rows are renumbered
# from 0.
df_outliers = (df['lp_id'] < 10) & (df['pct_access'] > 5)
df = df[~df_outliers].reset_index(drop=True)

higher_outliers = (higher_sample['lp_id'] < 10) & (higher_sample['pct_access'] > 5)
higher_sample = higher_sample[~higher_outliers].reset_index(drop=True)

pre_k_outliers = (pre_k_sample['lp_id'] < 10) & (pre_k_sample['pct_access'] > 5)
pre_k_sample = pre_k_sample[~pre_k_outliers].reset_index(drop=True)

In [None]:
# Write the three cleaned datasets to CSV in the working directory, without
# the row index.
for out_name, frame in (
    ('learning.csv', df),
    ('learning_subset_higher.csv', higher_sample),
    ('learning_subset_pre_k.csv', pre_k_sample),
):
    frame.to_csv(out_name, index=False)