In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Overview

The COVID-19 pandemic has disrupted learning for more than 56 million students in the United States. In the spring of 2020, most states and local governments across the U.S. closed educational institutions to stop the spread of the virus. In response, schools and teachers have attempted to reach students remotely through distance learning tools and digital platforms. To this day, concerns about the exacerbating digital divide and long-term learning loss among America’s most vulnerable learners continue to grow.

We are going to explore<br/>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;(1) the state of digital learning in 2020 and <br/>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;(2) how the engagement of digital learning relates to factors such as district demographics, broadband access, and state/national level policies and events.

We will try to answer some of the questions below which relate to our problem statement:

    What is the picture of digital connectivity and engagement in 2020?
    What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?
    How does student engagement with different types of education technology change over the course of the pandemic?
    How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?
    Do certain state interventions, practices or policies (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?


In [None]:
import os
import glob
import math 
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import plotly.express as px
from statsmodels.tsa.seasonal import seasonal_decompose

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the product and district metadata tables (read in this order).
products_info, districts_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv',
        '/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv',
    )
)

In [None]:
file_extension = '.csv'
source = '/kaggle/input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'

# glob.glob already returns a list, so the matches are used directly.
all_files = glob.glob(source + '*' + file_extension)

In [None]:
# Number of engagement CSV files matched by the glob pattern.
len(all_files)

In [None]:
# Read every engagement file and tag its rows with the file's stem
# (the part of the file name before the first '.').
list_of_all_dfs = []

for file in tqdm(all_files):
    district_stem = file.split('/')[-1].split('.')[0]
    df = pd.read_csv(file).assign(school_district=district_stem)
    list_of_all_dfs.append(df)

In [None]:
# Stack the per-file frames into a single table.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it, each
# file's own row labels are carried over, so the same labels repeat across
# files and any label-based lookup (.loc, idxmax, index alignment) on the
# combined frame becomes ambiguous.
engagement_data = pd.concat(list_of_all_dfs, ignore_index=True)

In [None]:
# Preview the first three rows of the product metadata.
products_info.head(3)

In [None]:
# Preview the first three rows of the district metadata.
districts_info.head(3)

In [None]:
# Preview the first three rows of the combined engagement table.
engagement_data.head(3)

In [None]:
# Report (rows, columns) for each of the three tables.
for shape_message, table in (
    ('The shape of products_info is', products_info),
    ('The shape of districts_info is', districts_info),
    ('The shape of engagement_data is', engagement_data),
):
    print(shape_message, table.shape)

# Analysis on products_info

The product file `products_info.csv` includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by our team. Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

| Name | Description |
| :--- | :----------- |
| LP ID| The unique identifier of the product |
| URL | Web Link to the specific product |
| Product Name | Name of the specific product |
| Provider/Company Name | Name of the product provider |
| Sector(s) | Sector of education where the product is used |
| Primary Essential Function | The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories has multiple sub-categories with which the products were labeled |

In [None]:
# Show the first rows of products_info.
products_info.head()

In [None]:
# (rows, columns) of products_info.
products_info.shape

# Are there any missing values in products_info? And how many?

In [None]:
# One boolean per column: True where the column has at least one missing value.
products_info.isnull().any() #check for missing values

We have missing values for 3 columns in the products_info dataframe and they are 'Provider/Company Name', 'Sector(s)' and 'Primary Essential Function'. So the next question is how many missing values for each feature?

In [None]:
# Count missing values per column with isna().sum().
# isna().value_counts()[1] is avoided on purpose: on pandas versions that fall
# back to positional indexing it returns the count of whichever of True/False
# is the second most frequent (i.e. the non-missing count once missing values
# are the majority), and it raises when a column has no missing values at all.
for column_name in ('Provider/Company Name', 'Sector(s)', 'Primary Essential Function'):
    print(f"Number of missing values in the feature '{column_name}': ", products_info[column_name].isna().sum())

In [None]:
plt.figure(figsize=(19,5))

# One panel per column: bar heights are the counts of isna() == False / True.
for panel_number, column_name in enumerate(
        ['Provider/Company Name', 'Sector(s)', 'Primary Essential Function'], start=1):
    missing_flag_counts = products_info[column_name].isna().value_counts()
    plt.subplot(1, 3, panel_number)
    sns.barplot(x=missing_flag_counts.index, y=missing_flag_counts)
    plt.ylabel('Count')
    plt.title('Count of missing values: ' + column_name)
plt.show()

So in the column 'Provider/Company Name', we have 1 missing value while the features 'Sector(s)' and 'Primary Essential Function' have 20 missing values. This number is quite low compared to the total number of data points in products_info. We will perform further analysis to decide on any method of imputation of missing values.

# How many unique primary essential functions along with sub-category?

In [None]:
#how many unique primary essential functions
primary_essential_function_count = pd.DataFrame(products_info['Primary Essential Function'].value_counts()).reset_index(level=0)
primary_essential_function_count.columns = ['Primary Essential Function', 'Count']
primary_essential_function_count.head()

In [None]:
# Horizontal bars: one per primary essential function, length = its count.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x='Count', y='Primary Essential Function', data=primary_essential_function_count, ax=ax)
ax.set_title('Count of unique primary essential functions in products_info')
plt.show()

In products_info, we see that 'LC - Digital Learning Platforms' occurs the highest number of times with a total of 74 out of 372 datapoints. The categories 'CM - Teacher Resources - Grading & Attendance', 'SDO - Environmental, Health & Safety (EHS) Com', 'SDO - Admissions, Enrollment & Rostering' and 'LC - Study Tools - Tutoring' all occur only once and are the lowest compared to any other category in the dataset.

# How many unique primary essential functions are there with respect to the main category (i.e, LC, CM, SDO or a mix of these)?

In [None]:
# Take the text before the first '-' of each distinct function label, strip
# its spaces, and tally how many distinct labels fall under each prefix.
all_main_categories = [
    function_label.split('-')[0].replace(" ", "")
    for function_label in primary_essential_function_count['Primary Essential Function']
]

count_of_main_categories = Counter(all_main_categories)

In [None]:
# Bar chart of the tallied main-category prefixes.
fig, ax = plt.subplots(figsize=(19,5))
main_category_names = list(count_of_main_categories.keys())
main_category_totals = list(count_of_main_categories.values())
sns.barplot(x=main_category_names, y=main_category_totals, ax=ax)
ax.set_title('Count of the main category of primary essential function')
ax.set_xlabel('Main Category')
ax.set_ylabel('Count')
plt.show()

We see that most of the unique primary essential functions belong to the LC and SDO main categories and a few belong to CM, while only 1 belongs to a mix of all categories.

# How many unique sectors are there in products_info?

In [None]:
#how many unique sectors
sectors_count = pd.DataFrame(products_info['Sector(s)'].value_counts()).reset_index(level=0)
sectors_count.columns = ['Sector(s)', 'Count']
sectors_count.head()

In [None]:
# Vertical bars: one per sector value, height = its count.
fig, ax = plt.subplots(figsize=(19,6))
sns.barplot(x='Sector(s)', y='Count', data=sectors_count, ax=ax)
ax.set_title('Count of unique sectors in products_info')
plt.show()

There are 5 unique sectors in the products_info dataset. Among them, 'PreK-12' has the highest count, while 'Corporate' and 'Higher Ed; Corporate' have the lowest.

# What is the relationship between sectors and primary essential function?

In [None]:
#in each sector what is the number of primary essential functions
sns.catplot(x='Sector(s)', y='Primary Essential Function', data=products_info, height=15)
plt.title("Catplot: Sector(s) vs Primary Essential Function")
plt.show()

From the above catplot we clearly see that first three sectors have the most distribution of categories. We have seen earlier that there is only one datapoint each for 'Corporate' and 'Higher Ed; Corporate'. In the above plot we can see that the one data point each of both the aforementioned sectors is from the category 'SDO - Data Analytics & Reporting'.

There are some exclusive functions for each sector. For example, 'SDO - Other', 'CM - Virtual Classroom - Video Conferencing' and 'LC - Sites, Resources & Reference - Thesaurus & Dictionary' are exclusive to the sector 'PreK-12; Higher Ed; Corporate'.

# What are the top provider/company names?

In [None]:
# Ten most frequent provider names mapped to their occurrence counts,
# ordered from most to least common.
provider_tally = Counter(products_info['Provider/Company Name'].tolist())
top_companies = dict(provider_tally.most_common(10))

In [None]:
# Display the provider -> count mapping built above.
top_companies

In [None]:
# Horizontal bars: one per provider in top_companies, length = its count.
fig, ax = plt.subplots(figsize=(8,8))
provider_names = list(top_companies.keys())
provider_totals = list(top_companies.values())
sns.barplot(x=provider_totals, y=provider_names, ax=ax)
ax.set_title('Top 10 most common provider/company names')
ax.set_xlabel('Count')
ax.set_ylabel('Provider/Company Name')
plt.show()

The most common provider/company name is 'Google LLC'. It occurs 30 times, far more often than the second most common provider, 'Houghton Mifflin Harcourt', with only 6 occurrences.

# Analysis on districts_info

The district file `districts_info.csv` includes information about the characteristics of school districts, including data from [NCES](https://nces.ed.gov/) (2018-19), [FCC](https://www.fcc.gov/) (Dec 2018), and [Edunomics Lab](https://edunomicslab.org/). In this data set, we removed the identifiable information about the school districts. We also used an open source tool [ARX](https://arx.deidentifier.org/) [(Prasser et al. 2020)](https://onlinelibrary.wiley.com/doi/full/10.1002/spe.2812) to transform several data fields and reduce the risks of re-identification. For data generalization purposes some data points are released with a range where the actual value falls under. Additionally, there are many missing data marked as 'NaN' indicating that the data was suppressed to maximize anonymization of the dataset. 

| Name | Description |
| :--- | :----------- |
| district_id | The unique identifier of the school district |
| state | The state where the district resides in |
| locale | NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See [Locale Boundaries User's Manual](https://eric.ed.gov/?id=ED577162) for more information. |
| pct_black/hispanic | Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data |
| pct_free/reduced | Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data |
| county_connections_ratio | `ratio` (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version). See [FCC data](https://www.fcc.gov/form-477-county-data-internet-access-services) for more information. |
| pp_total_raw | Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district. |

In [None]:
# Display districts_info as loaded, before the range strings are rewritten.
districts_info

In [None]:
#fixing all the percentages/ranges
for index, row in districts_info.iterrows():
    try:
        row = list(row['pct_black/hispanic'])
        row[-1] = ']'
        row = "".join(row)
        districts_info['pct_black/hispanic'][index] = row
    except TypeError:
        pass
    
for index, row in districts_info.iterrows():
    try:
        row = list(row['pct_free/reduced'])
        row[-1] = ']'
        row = "".join(row)
        districts_info['pct_free/reduced'][index] = row
    except TypeError:
        pass
    
for index, row in districts_info.iterrows():
    try:
        row = list(row['county_connections_ratio'])
        row[-1] = ']'
        row = "".join(row)
        districts_info['county_connections_ratio'][index] = row
    except TypeError:
        pass
    
for index, row in districts_info.iterrows():
    try:
        row = list(row['pp_total_raw'])
        row[-1] = ']'
        row = "".join(row)
        districts_info['pp_total_raw'][index] = row
    except TypeError:
        pass

In [None]:
# Display districts_info again to inspect the rewritten range strings.
districts_info

### Check for missing values and how many?

In [None]:
# One boolean per column: True where the column has at least one missing value.
districts_info.isna().any()

In [None]:
# Count missing values per column with isna().sum().
# isna().value_counts()[1] is avoided on purpose: on pandas versions that fall
# back to positional indexing it returns the count of whichever of True/False
# is the second most frequent (i.e. the non-missing count once missing values
# are the majority), and it raises when a column has no missing values at all.
for column_name in ('state', 'locale', 'pct_black/hispanic', 'pct_free/reduced',
                    'county_connections_ratio', 'pp_total_raw'):
    print(f"Number of missing values in the feature '{column_name}': ", districts_info[column_name].isna().sum())

In [None]:
plt.figure(figsize=(19,10))

# One panel per column on a 2x3 grid: bar heights are the counts of
# isna() == False / True for that column.
districts_columns_to_check = ['state', 'locale', 'pct_black/hispanic', 'pct_free/reduced',
                              'county_connections_ratio', 'pp_total_raw']
for panel_number, column_name in enumerate(districts_columns_to_check, start=1):
    missing_flag_counts = districts_info[column_name].isna().value_counts()
    plt.subplot(2, 3, panel_number)
    sns.barplot(x=missing_flag_counts.index, y=missing_flag_counts)
    plt.ylabel('Count')
    plt.title('Count of missing values: ' + column_name)

plt.show()

We see that almost all the features have missing values. But the feature 'pp_total_raw' has the highest number of missing values compared to the rest with a count of 115 out of 233.

# What are the unique state and locales in districts_info?

In [None]:
# Frequency table of 'state': one row per distinct state with its count.
state_count = (
    districts_info['state']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Count')
)
state_count.head()

In [None]:
# (rows, columns) of state_count; the row count is the number of distinct states.
state_count.shape

In [None]:
# Horizontal bars: one per state, length = number of districts_info rows.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x='Count', y='State', data=state_count, ax=ax)
ax.set_title('Count of unique states')
plt.show()

In [None]:
# Frequency table of 'locale': one row per distinct locale with its count.
locale_count = (
    districts_info['locale']
    .value_counts()
    .rename_axis('Locale')
    .reset_index(name='Count')
)
locale_count.head()

In [None]:
# Vertical bars: one per locale, height = number of districts_info rows.
fig, ax = plt.subplots(figsize=(19,5))
sns.barplot(x='Locale', y='Count', data=locale_count, ax=ax)
ax.set_title('Count of unique locales')
plt.show()

We see that Connecticut and Utah are the most common states and Minnesota, Florida, Arizona and North Dakota are the least common states in districts_info. In total we have 23 states in the dataframe. 

With respect to the locale, we see the Suburb is the most common locale while Town is the least common locale. In total we have 4 different locales.

# What is the most common range/percentage of Black or Hispanic students among all the states?

In [None]:
# Occurrences of each distinct 'pct_black/hispanic' value (missing values are not counted).
districts_info['pct_black/hispanic'].value_counts()

In [None]:
# Frequency table of 'pct_black/hispanic': one row per distinct range with its count.
pct_black_hispanic_count = (
    districts_info['pct_black/hispanic']
    .value_counts()
    .rename_axis('pct_black/hispanic')
    .reset_index(name='count')
)
pct_black_hispanic_count.head()

In [None]:
# Horizontal bars: one per 'pct_black/hispanic' range, length = its count.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x='count', y='pct_black/hispanic', data=pct_black_hispanic_count, ax=ax)
ax.set_title('Highest percentage range for pct_black/hispanic')
plt.show()

Many school districts have only 0 to 20% Black or Hispanic students, and very few school districts have more than 80% Black or Hispanic students.

# What is the most common range/percentage of students in the districts eligible for free or reduced-price lunch?

In [None]:
# Occurrences of each distinct 'pct_free/reduced' value (missing values are not counted).
districts_info['pct_free/reduced'].value_counts()

In [None]:
# Frequency table of 'pct_free/reduced': one row per distinct range with its count.
pct_free_reduced_count = (
    districts_info['pct_free/reduced']
    .value_counts()
    .rename_axis('pct_free/reduced')
    .reset_index(name='count')
)
pct_free_reduced_count.head()

In [None]:
# Horizontal bars: one per 'pct_free/reduced' range, length = its count.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x='count', y='pct_free/reduced', data=pct_free_reduced_count, ax=ax)
ax.set_title('Highest percentage range for pct_free/reduced')
plt.show()

We see that most of the datapoints are between 0 and 60% if we combine the first three categories in the barplot. This means that in most districts at most 60% of the students are eligible for free or reduced-price lunch.

# Analysis on engagement_data

The engagement data are aggregated at school district level, and each file in the folder `engagement_data` represents data from one school district. The 4-digit file name represents `district_id` which can be used to link to district information in `districts_info.csv`. The `lp_id` can be used to link to product information in `products_info.csv`.

| Name | Description |
| :--- | :----------- |
| time | date in "YYYY-MM-DD" |
| lp_id | The unique identifier of the product |
| pct_access | Percentage of students in the district that have at least one page-load event of a given product on a given day |
| engagement_index | Total page-load events per one thousand students of a given product and on a given day |

In [None]:
# Show the first rows of the combined engagement table.
engagement_data.head()

In [None]:
# Number of distinct lp_id values; dropna=False keeps a missing id in the
# count, exactly as len(Series.unique()) does.
engagement_data['lp_id'].nunique(dropna=False)

In [None]:
# Number of distinct 'LP ID' values; dropna=False keeps a missing id in the
# count, exactly as len(Series.unique()) does.
products_info['LP ID'].nunique(dropna=False)

We see that there are far more unique product ids in engagement_data compared to products_info

# Which product has the highest page load event (engagement_index) in all districts?

In [None]:
# Row holding the largest engagement_index, selected by integer position.
peak_row_position = engagement_data['engagement_index'].argmax()
engagement_data.iloc[peak_row_position]

In [None]:
# District metadata for the row with the highest engagement_index.
# The district id is derived from the data instead of being hard-coded, so
# this lookup stays consistent with the data on every re-run.
# 'school_district' holds the source file's stem as a string, so it is cast
# to int before comparing with the numeric district_id column.
top_engagement_row = engagement_data.iloc[engagement_data['engagement_index'].argmax()]
districts_info[districts_info['district_id'] == int(top_engagement_row['school_district'])]

In [None]:
# Row holding the smallest engagement_index, selected by integer position
# (the first such row when several share the minimum).
lowest_row_position = engagement_data['engagement_index'].argmin()
engagement_data.iloc[lowest_row_position]

In [None]:
# District metadata for the (first) row with the lowest engagement_index.
# The district id is derived from the data instead of being hard-coded: when
# several rows tie for the minimum, which one comes first depends on the order
# in which the files were loaded, so a fixed id can go stale.
# 'school_district' holds the source file's stem as a string, so it is cast
# to int before comparing with the numeric district_id column.
lowest_engagement_row = engagement_data.iloc[engagement_data['engagement_index'].argmin()]
districts_info[districts_info['district_id'] == int(lowest_engagement_row['school_district'])]

The highest engagement_index is 213045.47, in a suburb district in the state of Illinois, while the lowest engagement_index is 0.01, in a suburb district in the state of Utah.

# What is the picture of engagement_index in 2020?

In [None]:
# Parse the 'time' strings into pandas timestamps and store them back in place.
parsed_time = pd.to_datetime(engagement_data['time'])
engagement_data['time'] = parsed_time

In [None]:
# Column dtypes, to confirm that 'time' is now a datetime column.
engagement_data.dtypes

In [None]:
# Raise the Agg path chunk size before drawing this very long line.
plt.rcParams['agg.path.chunksize'] = 10000

# Line plot of every engagement_index value against its timestamp, in row order.
fig, ax = plt.subplots(figsize=(19,5))
ax.plot(engagement_data['time'], engagement_data['engagement_index'])
ax.set_title("Engagement Index in 2020")
ax.set_xlabel('time')
ax.set_ylabel('engagement_index')
plt.show()

We see that there is an increase in the engagement_index from March until July. These are the months in which we saw the first wave of the coronavirus all over the world. Most countries imposed lockdowns and people were forced to stay home. If we look at this graph with the actual course of events in mind, we can say that the engagement_index of the students increased because people were forced to stay home and education shifted to an online mode.

In July and August, the engagement_index drops drastically. This may be due to the easing of lockdowns, and possibly also to the summer break. We then see another spike in engagement_index from mid-August, and the index stays consistent until the end of the year with occasional dips lasting a few days. These may be weekends.