In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob # for assembling multipe csvs


#for visualization
import seaborn as sns 
import matplotlib as mpl

import datetime
# Format & locate ticks for dates
from matplotlib.dates import DateFormatter, DayLocator
# general plot formatting
import matplotlib.pyplot as plt
#format plot x/y tick labels
from matplotlib.ticker import FuncFormatter
%matplotlib inline 
#inline plots

#for Ignoring the warnings and errors
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Challenge

We challenge the Kaggle community to explore 

(1) the `state of digital learning` in 2020 and 

(2) how the `engagement of digital learning` relates to `factors` such as `district demographics`, `broadband access`, and `state/national level policies and events`.

We encourage you to guide the analysis with questions that are related to the themes that are described above (in bold font). Below are some examples of questions that relate to our problem statement:

What is the `picture of digital connectivity and engagement` in 2020?

What is the `effect of the COVID-19 pandemic on online and distance learning`, and how might this also `evolve` in the future?

How does `student engagement with different types of education technology` change over the course of the pandemic?

How does `student engagement with online learning platforms` relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?

Do certain `state interventions, practices or policies` (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?

# The Relationship
1. Optimization of `engagement index` using state, locale, cc_ratio, pct_access
2. Top 5 states with the highest engagement
3. How digital connectivity (`cc_ratio` and `pct_access`) influences engagement
4. Effectiveness of `education technology` in `education sector`


In [None]:
# Load the district-level metadata table from the competition input folder.
districts_csv = "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
districts_df = pd.read_csv(filepath_or_buffer=districts_csv)


In [None]:
# Preview the first ten rows of the district table.
districts_df.head(10)

# DISTRICTS DATA

#### The district file districts_info.csv includes information about the characteristics of school districts, including data from NCES (2018-19), FCC (Dec 2018), and Edunomics Lab. In this data set, we removed the identifiable information about the school districts. We also used an open source tool ARX (Prasser et al. 2020) to transform several data fields and reduce the risks of re-identification. For data generalization purposes some data points are released with a range where the actual value falls under. Additionally, there are many missing data marked as 'NaN' indicating that the data was suppressed to maximize anonymization of the dataset.

#### district_id - The unique identifier of the school district


#### state	- The state where the district resides in


#### locale - NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See Locale Boundaries User's Manual for more information.

#### pct_black/hispanic	 - Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data

#### pct_free/reduced	- Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data

#### county_connections_ratio - ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version). See FCC data for more information.

#### pp_total_raw - Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district.


In [None]:
# Column names, dtypes and non-null counts of the district table.
districts_df.info()

In [None]:
# Summary statistics of the district table.
districts_df.describe()

In [None]:
# Get info, if null values exist, or duplicate entries

#It's necessary to know if there are empty values or duplicate values in Pandas.

#Empty values need to be handled by dropping them, or applying algorithms such as Encoder, Mean or so on. Same for duplicates

In [None]:
# Number of missing values in each district column.
districts_df.isnull().sum()

In [None]:
# Share of missing values per district column, expressed in percent.
missing_per_column = districts_df.isna().sum()
percent_missing = missing_per_column * 100 / len(districts_df)
percent_missing

In [None]:
# Remove the two columns that the notebook treats as too sparse to impute
# (more than 30% missing values).
sparse_columns = ["pct_free/reduced", "pp_total_raw"]
districts_df = districts_df.drop(columns=sparse_columns)


In [None]:
# Impute missing state/locale values with each column's most frequent value.
cols = ["state", "locale"]
column_modes = districts_df.mode().iloc[0]  # first mode of every column
districts_df[cols] = districts_df[cols].fillna(value=column_modes)

In [None]:
# Convert pct_black/hispanic from str to numeric

In [None]:
# Split the bracketed range label in 'pct_black/hispanic' at its comma into a
# lower and an upper bound (presumably labels look like "[0.2, 0.4[" - confirm
# against the raw file).
pct_black_hispanic = districts_df['pct_black/hispanic'].str.split(",", n=1, expand=True)
# Strip the '[' markers as a *literal* character: as a regular expression '['
# is an unterminated character class and is rejected by recent pandas.
# Despite their names, 'pct_black' holds the lower bound and 'pct_hispanic'
# the upper bound of the range.
districts_df['pct_black'] = pct_black_hispanic[0].str.replace('[', '', regex=False).str.strip()
districts_df['pct_hispanic'] = pct_black_hispanic[1].str.replace('[', '', regex=False).str.strip()
# Convert both bounds to numbers.
districts_df['pct_black'] = pd.to_numeric(districts_df['pct_black'])
districts_df['pct_hispanic'] = pd.to_numeric(districts_df['pct_hispanic'])
# Represent the range by its midpoint.
districts_df['pct_black_and_hispanic'] = (districts_df['pct_black'] + districts_df['pct_hispanic']) / 2

In [None]:
# The raw range label and its two parsed bounds are no longer needed.
pct_helper_columns = ['pct_black/hispanic', 'pct_black', 'pct_hispanic']
districts_df = districts_df.drop(columns=pct_helper_columns)

In [None]:
# Impute missing pct_black_and_hispanic values with that column's own median.
# (Taking `.iloc[0]` of the frame-wide median would pick the median of the
# first numeric column of the frame instead of this column.)
cols = ["pct_black_and_hispanic"]
districts_df[cols] = districts_df[cols].fillna(districts_df[cols].median())

In [None]:
# Convert county_connections_ratio from str to numeric

In [None]:
# Split the bracketed range label in 'county_connections_ratio' at its comma
# into a lower and an upper bound.
county_connections_ratio = districts_df['county_connections_ratio'].str.split(",", n=1, expand=True)
# Strip the '[' markers as a *literal* character: as a regular expression '['
# is an unterminated character class and is rejected by recent pandas.
# 'county' receives the first (lower) bound, 'connections' the second (upper).
districts_df['county'] = county_connections_ratio[0].str.replace('[', '', regex=False).str.strip()
districts_df['connections'] = county_connections_ratio[1].str.replace('[', '', regex=False).str.strip()
# Convert both bounds to numbers.
districts_df['county'] = pd.to_numeric(districts_df['county'])
districts_df['connections'] = pd.to_numeric(districts_df['connections'])

In [None]:
# Impute missing range bounds with each column's median. The result is
# assigned back explicitly: an in-place fillna on a selected column is a
# chained operation that does not update the frame under copy-on-write.
for bound_col in ('county', 'connections'):
    districts_df[bound_col] = districts_df[bound_col].fillna(districts_df[bound_col].median())
# Represent the connection-ratio range by the midpoint of its two bounds.
districts_df['cc_ratio'] = (districts_df['county'] + districts_df['connections']) / 2

In [None]:
# The raw range label and its two parsed bounds are superseded by cc_ratio.
cc_helper_columns = ['county_connections_ratio', 'county', 'connections']
districts_df = districts_df.drop(columns=cc_helper_columns)

In [None]:
# Now data has been cleaned. Lets do basic visualization

## Top 5 State with Locale Classification

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 7))

# The five states with the most districts, most frequent first.
# (Series.value_counts replaces the deprecated top-level pd.value_counts.)
top_states = districts_df["state"].value_counts().iloc[:5].index
# Draw on the axes created above instead of relying on the implicit current axes.
sns.countplot(x=districts_df["state"], hue=districts_df["locale"], palette="Paired",
              order=top_states, ax=ax)
ax.set_title('Top 5 State with The Locale Classification')
ax.set_ylabel('No. of District')
ax.set_xlabel('The State Where the District Resides in')
# Annotate every bar with its height.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.1f')

1. Connecticut has only 3 `locale classification` types, with the highest no. of districts in `Suburb` (75) and the lowest in `City` (2).
2. Utah has all the `locale classification` types, with the highest no. of districts in `Suburb` (18) and the lowest in `Rural` (2).
3. For Massachusetts, the highest no. of districts is in `Suburb` (20).

## Top 5 State Have The Highest Percentage of Students in the Districts Identified as Black or Hispanic

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 7))

# The five states with the most districts, most frequent first.
# (Series.value_counts replaces the deprecated top-level pd.value_counts.)
top_states = districts_df["state"].value_counts().iloc[:5].index
# One bar per (state, pct_black_and_hispanic value), drawn on the axes created above.
sns.countplot(x=districts_df["state"],
              hue=districts_df["pct_black_and_hispanic"],
              palette="Paired",
              order=top_states,
              ax=ax)
ax.set_title("Top 5 State Have The Highest Percentage of students in the districts identified as Black or Hispanic")
ax.set_xlabel('The State Where the District Resides in')
ax.set_ylabel('No. of District')
# Annotate every bar with its height.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.1f')

In [None]:
# County Connection Ratio by State

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 7))

# Count districts per cc_ratio value, split into one bar per state.
sns.countplot(data=districts_df, x="cc_ratio", hue="state", palette="Paired", ax=ax)
ax.set_title("County Connection Ratio by State")
ax.set_xlabel('County Connection Ratio')
ax.set_ylabel('No. of District')
# Annotate every bar with its height.
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f')

In [None]:
# County Connection Ratio by Locale

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 7))

# Restrict the x axis to the two most common cc_ratio values.
# (Series.value_counts replaces the deprecated top-level pd.value_counts.)
top_ratios = districts_df["cc_ratio"].value_counts().iloc[:2].index
sns.countplot(x=districts_df["cc_ratio"],
              hue=districts_df["locale"],
              palette="Paired",
              order=top_ratios,
              ax=ax)
ax.set_title("County Connection Ratio by Locale")
ax.set_xlabel('County Connection Ratio')
ax.set_ylabel('No. of District')
# Annotate every bar with its height.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.1f')

# PRODUCTS DATA

#### The product file products_info.csv includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. Data were labeled by our team. Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

#### LP ID - The unique identifier of the product

#### URL - Web Link to the specific product

#### Product Name - Name of the specific product

#### Provider/Company Name - Name of the product provider



In [None]:
# Load the product metadata table from the competition input folder.
products_csv = "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
products_df = pd.read_csv(filepath_or_buffer=products_csv)


In [None]:
# Preview the first rows of the product table.
products_df.head()

In [None]:
# Column names, dtypes and non-null counts of the product table.
products_df.info()

In [None]:
# Summary statistics of the product table.
products_df.describe()

In [None]:
# Change LP ID

In [None]:
# Give the product columns snake_case names; 'lp_id' in particular has to
# match the key used in the engagement data.
product_column_names = {
    'LP ID': 'lp_id',
    'Product Name': 'product_name',
    'Provider/Company Name': 'company_provider',
    'Sector(s)': 'sector',
}
products_df = products_df.rename(columns=product_column_names)

In [None]:
# Split 'Primary Essential Function' on ' - ' into its main category (first
# part) and sub-category (second part).
# The vectorised .str accessor leaves missing labels as NaN and also yields
# NaN (rather than raising IndexError) for a label that has no ' - ' separator.
function_parts = products_df['Primary Essential Function'].str.split(' - ')
products_df['main_product_function'] = function_parts.str[0]
products_df['sub_product_function'] = function_parts.str[1]


In [None]:
# Merge the two spellings of the same sub-category into one label.
label_fixes = {'Sites, Resources & References' : 'Sites, Resources & Reference'}
products_df['sub_product_function'] = products_df['sub_product_function'].replace(label_fixes)
# The original combined column has been split into main/sub parts and is dropped.
products_df = products_df.drop(columns="Primary Essential Function")


In [None]:
# Get info, if null values exist, or duplicate entries

#It's necessary to know if there are empty values or duplicate values in Pandas.

#Empty values need to be handled by dropping them, or applying algorithms such as Encoder, Mean or so on. Same for duplicates

In [None]:
# Number of missing values in each product column.
per_col = products_df.isnull().sum()
per_col


In [None]:
# Total number of missing cells in the product table (a single integer, despite the name).
whole_df = products_df.isnull().sum().sum()
whole_df

In [None]:
# Share of missing values per product column, expressed in percent.
missing_per_column = products_df.isna().sum()
percent_missing = missing_per_column * 100 / len(products_df)
percent_missing

In [None]:
# Impute missing categorical product attributes with each column's most frequent value.
cols = ["company_provider", "sector", "main_product_function", "sub_product_function"]
column_modes = products_df.mode().iloc[0]  # first mode of every column
products_df[cols] = products_df[cols].fillna(value=column_modes)

In [None]:
# DV OF PRODUCTS

* The product file `products_info.csv` includes information about the characteristics of the `top 372 products with most users in 2020`.

* LP ID - The unique identifier of the product
* URL - Web Link to the specific product
* Product Name - Name of the specific product
* Provider/Company Name - Name of the product provider

* Sector(s) - `Sector of education` where the product is used

*  Primary Essential Function - The `basic function` of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = `Learning & Curriculum`, CM = `Classroom Management`, and SDO = `School & District Operations`. Each of these categories have multiple sub-categories with which the products were labeled

* Effectiveness of `education technology` in `education sector`


In [None]:
# Number of products (non-null lp_id) per (sector, main function) pair.
sector_main_product = (
    products_df
    .groupby(['sector', 'main_product_function'])[['lp_id']]
    .count()
    .reset_index()
)
sector_main_product.value_counts()

In [None]:
# Keep at most the 50 groups with the largest product counts, largest first.
sector_main_product = sector_main_product.nlargest(n=50, columns=['lp_id'])

In [None]:
# Express each group's product count as a share (in %) of all counted products.
total_products = sector_main_product['lp_id'].sum()
sector_main_product['percent'] = sector_main_product['lp_id'] / total_products * 100
x = list(sector_main_product['percent'])
y = [str(share) for share in x]  # the same shares as strings
sector_main_product

# The Proportion of Main Product Function by Different Education Sector

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 5))

# Order the x axis by how often each main function occurs in the summary table.
function_order = sector_main_product['main_product_function'].value_counts().index[:10]
sns.barplot(x='main_product_function', y='percent', hue='sector',
            data=sector_main_product, order=function_order,
            palette='Paired', ax=ax)
ax.set_title("The Proportion of Main Product Function by Different Education Sector")
ax.legend(title="Sector(s):")
# Annotate every bar with its height.
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f')
fig.tight_layout()
plt.show()

In [None]:
# Number of products (non-null lp_id) per (sector, sub function) pair.
sector_sub_product = (
    products_df
    .groupby(['sector', 'sub_product_function'])[['lp_id']]
    .count()
    .reset_index()
)
sector_sub_product.value_counts()

In [None]:
# Keep at most the 50 groups with the largest product counts, largest first.
sector_sub_product = sector_sub_product.nlargest(n=50, columns=['lp_id'])

In [None]:
# Express each group's product count as a share (in %) of all counted products.
total_products = sector_sub_product['lp_id'].sum()
sector_sub_product['percent'] = sector_sub_product['lp_id'] / total_products * 100
x = list(sector_sub_product['percent'])
y = [str(share) for share in x]  # the same shares as strings
sector_sub_product

# Proportion of Top 5 Sub Product Function by Different Education Sector

In [None]:
# Top sub product functions, split by education sector.
fig, ax = plt.subplots(1, 1, figsize=(17, 6))

# Show only the five sub functions that occur most often in the summary table.
sub_function_order = sector_sub_product['sub_product_function'].value_counts().index[:5]
sns.barplot(x='sub_product_function', y='percent', hue='sector',
            data=sector_sub_product, order=sub_function_order,
            palette='Paired', ax=ax)
ax.set_title("Proportion of Top 5 Sub Product Function by Different Education Sector")
ax.legend(title="Sector(s):")
# Annotate every bar with its height.
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f')
fig.tight_layout()
plt.show()

In [None]:
# Number of products (non-null lp_id) per (provider, sector) pair.
company_sector = (
    products_df
    .groupby(['company_provider', 'sector'])[['lp_id']]
    .count()
    .reset_index()
)
company_sector.value_counts()

In [None]:
# Keep at most the 50 groups with the largest product counts, largest first.
company_sector = company_sector.nlargest(n=50, columns=['lp_id'])

In [None]:
# Express each group's product count as a share (in %) of all counted products.
total_products = company_sector['lp_id'].sum()
company_sector['percent'] = company_sector['lp_id'] / total_products * 100
x = list(company_sector['percent'])
y = [str(share) for share in x]  # the same shares as strings
company_sector

# Proportion of Top 5 Company Product Provider by Different Education Sector

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 7))

# Show only the five providers that occur most often in the summary table.
# (Series.value_counts replaces the deprecated top-level pd.value_counts.)
top_providers = company_sector['company_provider'].value_counts().iloc[:5].index
# Draw on the axes created above instead of relying on the implicit current axes.
sns.barplot(data=company_sector,
            x='company_provider',
            y='percent',
            hue='sector',
            palette='Paired',
            order=top_providers,
            ax=ax)
ax.set_title("Proportion of Top 5 Company Product Provider by Different Education Sector")
ax.legend(title="Sector(s):")
# Annotate every bar with its height.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.1f')
fig.tight_layout()
plt.show()

# ENGAGEMENT

#### The engagement_data folder is based on LearnPlatform’s Student Chrome Extension. The extension collects page load events of over 10K education technology products in our product library, including websites, apps, web apps, software programs, extensions, ebooks, hardware, and services used in educational institutions. The engagement data have been aggregated at school district level, and each file represents data from one school district.

#### The engagement data are aggregated at school district level, and each file in the folder engagement_data represents data from one school district. The 4-digit file name represents district_id which can be used to link to district information in districts_info.csv. The lp_id can be used to link to product information in products_info.csv.

#### time : date in "YYYY-MM-DD"
#### lp_id : The unique identifier of the product. Can be used to link to product information in products_info.csv
#### pct_access : Percentage of students in the district that have at least one page-load event of a given product and on a given day
#### engagement_index : Total page-load events per one thousand students of a given product and on a given day
#### district_id : can be used to link to district information in districts_info.csv

In [None]:
# Read every per-district engagement CSV and stack them into one frame,
# tagging each row with the district id taken from its file name.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# glob returns files in arbitrary order; sort so the row order is reproducible.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No engagement CSV files found under {path!r}")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # The file name without its extension is the district id. Using the
    # basename keeps this independent of how deep `path` is nested.
    district_id = os.path.splitext(os.path.basename(filename))[0]
    df["district_id"] = district_id  # kept as a string here
    li.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
engagement_df = pd.concat(li, ignore_index=True)
engagement_df.head()

In [None]:
# Preview the first rows of the combined engagement table.
engagement_df.head()

In [None]:
# Column names, dtypes and non-null counts of the engagement table.
engagement_df.info()

In [None]:
# Summary statistics of the engagement table.
engagement_df.describe()

In [None]:
# Rename time to date

In [None]:
# The 'time' column holds calendar dates, so call it 'date'.
engagement_df = engagement_df.rename(columns={'time': 'date'})


In [None]:
# Change object to datetime format

In [None]:
# Store the parsed dates back into the frame; the bare conversion only
# displayed the result and left the column as strings.
engagement_df['date'] = pd.to_datetime(engagement_df['date'])
engagement_df['date']


In [None]:
# Number of missing values in each engagement column.
per_col = engagement_df.isnull().sum()
per_col

In [None]:
# Total number of missing cells in the engagement table (a single integer, despite the name).
whole_df = engagement_df.isnull().sum().sum()
whole_df

In [None]:
# Share of missing values per engagement column, expressed in percent.
missing_per_column = engagement_df.isna().sum()
percent_missing = missing_per_column * 100 / len(engagement_df)
percent_missing

In [None]:
# Fillna with mean

In [None]:
# Impute missing pct_access / engagement_index values with the median of the
# respective column. (Taking `.iloc[0]` of the frame-wide median would fill
# both columns with the median of the frame's first numeric column instead.)
cols = ["pct_access", "engagement_index"]
engagement_df[cols] = engagement_df[cols].fillna(engagement_df[cols].median())

In [None]:
# Convert date to day and month

In [None]:
# Derive the month name and weekday name of every record from its date.
parsed_dates = pd.to_datetime(engagement_df['date'])
engagement_df['month'] = parsed_dates.dt.month_name()
engagement_df['day'] = parsed_dates.dt.day_name()


In [None]:
# Time vs Percentage of students access in the district
# pct_access : Percentage of students in the district have at least one page-load event of a given product and on a given day
# engagement_index : Total page-load events per one thousand students of a given product and on a given day

In [None]:
# Median engagement_index per month, groups kept in first-seen order.
month_district = (
    engagement_df
    .groupby(['month'], as_index=False, sort=False)[['engagement_index']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
month_district.value_counts()


In [None]:
# Express each month's median engagement as a share (in %) of the sum over all months.
total_engagement = month_district['engagement_index'].sum()
month_district['percent'] = month_district['engagement_index'] / total_engagement * 100
x = list(month_district['percent'])
y = [str(share) for share in x]  # the same shares as strings
month_district

# Percentage of Student Engagement by Month

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# Single line of monthly engagement share. No `hue` is mapped, so a palette
# would be ignored and is not passed; the line is drawn on the axes created above.
sns.lineplot(data=month_district, x='month', y='percent', ax=ax)
ax.set_title("Percentage of Student Engagement by Month")
ax.set_xlabel('Month')
ax.set_ylabel('Percentage of Student Engagement')



In [None]:
# Median engagement_index per weekday, groups kept in first-seen order.
day_district = (
    engagement_df
    .groupby(['day'], as_index=False, sort=False)[['engagement_index']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
day_district.value_counts()


In [None]:
# Express each weekday's median engagement as a share (in %) of the sum over all weekdays.
total_engagement = day_district['engagement_index'].sum()
day_district['percent'] = day_district['engagement_index'] / total_engagement * 100
x = list(day_district['percent'])
y = [str(share) for share in x]  # the same shares as strings
day_district

# Percentage of Student Engagement by Day

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# Single line of weekday engagement share. No `hue` is mapped, so a palette
# would be ignored and is not passed; the line is drawn on the axes created above.
sns.lineplot(data=day_district, x='day', y='percent', ax=ax)
ax.set_title("Percentage of Student Engagement by Day")
ax.set_xlabel('Day')
ax.set_ylabel('Percentage of Student Engagement')


# COMBINE ENGAGEMENT, DISTRICT AND PRODUCT

* How does `student engagement` with `online learning platforms` relate to different `geography`? Demographic context (e.g., `race/ethnicity`, ESL, `learning disability`)? Learning context? Socioeconomic status?

In [None]:
# Number of distinct district ids in the district table, then in the engagement table.
for frame in (districts_df, engagement_df):
    print(frame["district_id"].nunique())

In [None]:
# Join district attributes onto the engagement records. The engagement ids are
# first cast to integers so that both sides share the key's dtype.
engagement_df["district_id"] = engagement_df["district_id"].astype(str).astype(int)
districts_engagement_data = districts_df.merge(engagement_df, on='district_id')
districts_engagement_data.head()

In [None]:
# Column names, dtypes and non-null counts of the merged district/engagement table.
districts_engagement_data.info()

In [None]:
# Summary statistics of the merged district/engagement table.
districts_engagement_data.describe()

In [None]:
# Number of missing values in each column of the merged table.
per_col = districts_engagement_data.isnull().sum()
per_col

In [None]:
# Total number of missing cells in the merged table (a single integer, despite the name).
whole_df = districts_engagement_data.isnull().sum().sum()
whole_df

In [None]:
# Share of missing values per column of the merged table, expressed in percent.
missing_per_column = districts_engagement_data.isna().sum()
percent_missing = missing_per_column * 100 / len(districts_engagement_data)
percent_missing

In [None]:
# Median pct_access per (month, locale) pair, groups kept in first-seen order.
month_locale = (
    districts_engagement_data
    .groupby(['month', 'locale'], as_index=False, sort=False)[['pct_access']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
month_locale.value_counts()

# Percentage of Student Access in the Locale District by Month

In [None]:
# relplot is figure-level: it creates its own figure and axes, so the title
# and labels must be set through the returned grid. Using a leftover `ax`
# from an earlier cell would decorate that older figure instead.
g = sns.relplot(data=month_locale,
                x='month', y='pct_access', hue='locale', style='locale', kind='line',
                palette='Paired', height=5, aspect=2)
g.ax.set_title("Percentage of Student Access in the Locale District by Month")
g.set_axis_labels('Month', 'Percentage of Student Access')

In [None]:
# Median pct_access per (weekday, locale) pair, groups kept in first-seen order.
day_locale = (
    districts_engagement_data
    .groupby(['day', 'locale'], as_index=False, sort=False)[['pct_access']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
day_locale.value_counts()

# Percentage of Student Access in the Locale District by Day

In [None]:
# relplot is figure-level: it creates its own figure and axes, so the title
# and labels must be set through the returned grid. Using a leftover `ax`
# from an earlier cell would decorate that older figure instead.
g = sns.relplot(data=day_locale,
                x='day', y='pct_access', hue='locale', style='locale', kind='line',
                palette='Paired', height=5, aspect=2)
g.ax.set_title("Percentage of Student Access in the Locale District by Day")
g.set_axis_labels('Day', 'Percentage of Student Access')

In [None]:
# Median engagement_index per state, with the state moved back into a column.
state_engage = (
    districts_engagement_data
    .groupby(['state'])[['engagement_index']]
    .median()
    .reset_index()
)
state_engage.value_counts()

In [None]:
# Keep at most the 50 states with the highest median engagement, highest first.
state_engage = state_engage.nlargest(n=50, columns=['engagement_index'])

In [None]:
# Express each state's median engagement as a share (in %) of the sum over all listed states.
total_engagement = state_engage['engagement_index'].sum()
state_engage['percent'] = state_engage['engagement_index'] / total_engagement * 100
x = list(state_engage['percent'])
y = [str(share) for share in x]  # the same shares as strings
state_engage

# Percentage of Student Engagement by State

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(17, 6))
# Horizontal bars: engagement share on x, one bar per state on y.
sns.barplot(data=state_engage,
            x='percent', y='state',
            palette='Paired', ax=ax)
ax.set_title("Percentage of Student Engagement by State")
ax.set_ylabel('State')
ax.set_xlabel('Percentage')  # label typo corrected
# Annotate every bar with its length.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.1f')
fig.tight_layout()
plt.show()

In [None]:
# Median engagement_index per locale, with the locale moved back into a column.
locale_engage = (
    districts_engagement_data
    .groupby(['locale'])[['engagement_index']]
    .median()
    .reset_index()
)
locale_engage.value_counts()

In [None]:
# Sort locales by median engagement, highest first (at most 50 rows kept).
locale_engage = locale_engage.nlargest(n=50, columns=['engagement_index'])

In [None]:
# Express each locale's median engagement as a share (in %) of the sum over all locales.
total_engagement = locale_engage['engagement_index'].sum()
locale_engage['percent'] = locale_engage['engagement_index'] / total_engagement * 100
x = list(locale_engage['percent'])
y = [str(share) for share in x]  # the same shares as strings
locale_engage

# Percentage of Student Engagement by Locale

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
# Vertical bars: one bar per locale on x, engagement share on y.
sns.barplot(data=locale_engage,
            y='percent', x='locale',
            palette='Set3', ax=ax)
ax.set_title("Percentage of Student Engagement by Locale")
# Labels follow the plotted data: locale on x, percentage on y
# (they were previously swapped and misspelled).
ax.set_xlabel('Locale')
ax.set_ylabel('Percentage')
# Annotate every bar with its height.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.1f')
fig.tight_layout()
plt.show()

In [None]:
# Median engagement_index per (locale, weekday, month), groups kept in first-seen order.
locale_day_month = (
    districts_engagement_data
    .groupby(['locale', 'day', 'month'], as_index=False, sort=False)[['engagement_index']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
locale_day_month.value_counts()

In [None]:
# Sort groups by median engagement, highest first (at most 336 rows kept).
locale_day_month = locale_day_month.nlargest(n=336, columns=['engagement_index'])

In [None]:
# Express each group's median engagement as a share (in %) of the sum over all groups.
total_engagement = locale_day_month['engagement_index'].sum()
locale_day_month['percent'] = locale_day_month['engagement_index'] / total_engagement * 100
x = list(locale_day_month['percent'])
y = [str(share) for share in x]  # the same shares as strings
locale_day_month

In [None]:
# Percentage of Student Engagement by Locale, Day and Month

In [None]:
# One panel per locale: engagement share (x) for every weekday (y), coloured
# by month. Panels do not share axes.
sns.catplot(x='percent', y='day', hue='month', col='locale',
            data=locale_day_month,
            sharex=False, sharey=False,
            height=7, aspect=.7, palette='hls')


In [None]:
# Median pct_access per (locale, weekday, month), groups kept in first-seen order.
locale_time_access = (
    districts_engagement_data
    .groupby(['locale', 'day', 'month'], as_index=False, sort=False)[['pct_access']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
locale_time_access.value_counts()

In [None]:
# Sort groups by median pct_access, highest first (at most 336 rows kept).
locale_time_access = locale_time_access.nlargest(n=336, columns=['pct_access'])

In [None]:
# Express each group's median pct_access as a share (in %) of the sum over all groups.
total_access = locale_time_access['pct_access'].sum()
locale_time_access['percent'] = locale_time_access['pct_access'] / total_access * 100
x = list(locale_time_access['percent'])
y = [str(share) for share in x]  # the same shares as strings
locale_time_access

In [None]:
# Percentage of Student Access by Locale, Day and Month

In [None]:
# One panel per locale: median pct_access (x) for every weekday (y), coloured
# by month. Panels do not share axes.
# NOTE(review): x is the raw 'pct_access' column, not the 'percent' share
# that locale_time_access also carries - confirm which one is intended.
sns.catplot(x='pct_access', y='day', hue='month', col='locale',
            data=locale_time_access,
            sharex=False, sharey=False,
            height=7, aspect=.7, palette='hls')



In [None]:
# Median engagement_index per (locale, weekday), groups kept in first-seen order.
locale_engagement = (
    districts_engagement_data
    .groupby(['locale', 'day'], as_index=False, sort=False)[['engagement_index']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
locale_engagement.value_counts()

In [None]:
# Sort groups by median engagement, highest first (at most 50 rows kept).
locale_engagement = locale_engagement.nlargest(n=50, columns=['engagement_index'])

# Distribution of Daily Student Engagement by Locale

In [None]:
# Univariate conditional distributions: one filled density of the per-weekday
# median engagement for each locale.
fig, ax = plt.subplots(1, 1, figsize=(15, 7))
locale_order = ['Town', 'Rural', 'City', 'Suburb']
sns.kdeplot(x='engagement_index', hue='locale', hue_order=locale_order,
            data=locale_engagement,
            fill=True, alpha=.5, linewidth=0,
            palette='Paired', ax=ax)
ax.set_title("Distribution of Daily Student Engagement by Locale Classification")
ax.set_xlabel('Student Engagement')


In [None]:
# Median pct_access per (locale, month), groups kept in first-seen order.
locale_access = (
    districts_engagement_data
    .groupby(['locale', 'month'], as_index=False, sort=False)[['pct_access']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
locale_access.value_counts()

In [None]:
# Sort groups by median pct_access, highest first (at most 50 rows kept).
locale_access = locale_access.nlargest(n=50, columns=['pct_access'])

# Distribution of Monthly Student Access by Locale 

In [None]:
# Univariate conditional distributions: one filled density of the per-month
# median pct_access for each locale.
fig, ax = plt.subplots(1, 1, figsize=(15, 7))
locale_order = ['Town', 'Rural', 'City', 'Suburb']
sns.kdeplot(x='pct_access', hue='locale', hue_order=locale_order,
            data=locale_access,
            fill=True, alpha=.5, linewidth=0,
            palette='Paired', ax=ax)
ax.set_title("Distribution of Monthly Student Access by Locale Classification")
ax.set_xlabel('Student Access')
# TODO (when time permits): explore how to edit the legend in seaborn
# columns of interest: month day state locale cc_ratio pct_access engagement_index

In [None]:
# Median cc_ratio per (locale, weekday, month), groups kept in first-seen order.
locale_ratio = (
    districts_engagement_data
    .groupby(['locale', 'day', 'month'], as_index=False, sort=False)[['cc_ratio']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
locale_ratio.value_counts()

In [None]:
# Sort groups by median cc_ratio, highest first (at most 336 rows kept).
locale_ratio = locale_ratio.nlargest(n=336, columns=['cc_ratio'])

In [None]:
# Express each group's median cc_ratio as a share (in %) of the sum over all groups.
total_ratio = locale_ratio['cc_ratio'].sum()
locale_ratio['percent'] = locale_ratio['cc_ratio'] / total_ratio * 100
x = list(locale_ratio['percent'])
y = [str(share) for share in x]  # the same shares as strings
locale_ratio

In [None]:
# Percentage of County Connection Ratio by Locale, Day and Month

In [None]:
# One panel per locale: cc_ratio share (x) for every month (y), coloured by weekday.
sns.catplot(x='percent', y='month', hue='day', col='locale',
            data=locale_ratio,
            height=5, aspect=.8, palette='Paired')


# Challenge

We challenge the Kaggle community to explore 

(1) the `state of digital learning` in 2020 and 

(2) how the `engagement of digital learning` relates to `factors` such as `district demographics`, `broadband access`, and `state/national level policies and events`.

We encourage you to guide the analysis with questions that are related to the themes that are described above (in bold font). Below are some examples of questions that relate to our problem statement:

What is the `picture of digital connectivity and engagement` in 2020?

What is the `effect of the COVID-19 pandemic on online and distance learning`, and how might this also `evolve` in the future?

How does `student engagement with different types of education technology` change over the course of the pandemic?

How does `student engagement with online learning platforms` relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?

Do certain `state interventions, practices or policies` (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?

In [None]:
# Attach product attributes to the district/engagement records via the shared lp_id key.
products_engagement_data = products_df.merge(districts_engagement_data, on='lp_id')
products_engagement_data.head()

In [None]:
# How many merged records fall on each cc_ratio value.
products_engagement_data['cc_ratio'].value_counts()

In [None]:
# Column names, dtypes and non-null counts of the fully merged table.
products_engagement_data.info()

In [None]:
# Summary statistics of the fully merged table.
products_engagement_data.describe()

In [None]:
# Number of missing values in each column of the fully merged table.
per_col = products_engagement_data.isnull().sum()
per_col

In [None]:
# Median engagement_index per (locale, sector, weekday, sub function),
# groups kept in first-seen order.
sector_locale_day = (
    products_engagement_data
    .groupby(['locale', 'sector', 'day', 'sub_product_function'],
             as_index=False, sort=False)[['engagement_index']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
sector_locale_day.value_counts()

In [None]:
# Sort groups by median engagement, highest first (at most 1287 rows kept).
sector_locale_day = sector_locale_day.nlargest(n=1287, columns=['engagement_index'])

In [None]:
# Express each group's median engagement as a share (in %) of the sum over all groups.
total_engagement = sector_locale_day['engagement_index'].sum()
sector_locale_day['percent'] = sector_locale_day['engagement_index'] / total_engagement * 100
x = list(sector_locale_day['percent'])
y = [str(share) for share in x]  # the same shares as strings
sector_locale_day

In [None]:
# Percentage of Daily Student Engagement by Locale, Sector and Sub Product Function

In [None]:
# One strip-plot panel per locale: engagement share (x) for every sub product
# function (y), coloured by sector.
sns.catplot(kind='strip',
            x='percent', y='sub_product_function', hue='sector', col='locale',
            data=sector_locale_day,
            height=6, aspect=.8, palette="Paired")


In [None]:
# Median engagement_index per (weekday, main function), groups kept in first-seen order.
day_main = (
    products_engagement_data
    .groupby(['day', 'main_product_function'], as_index=False, sort=False)[['engagement_index']]
    .median()
    .reset_index()  # also adds the positional index as an 'index' column
)
day_main.value_counts()

In [None]:
# Sort groups by median engagement, highest first (at most 50 rows kept).
day_main = day_main.nlargest(n=50, columns=['engagement_index'])

In [None]:
# Express each group's median engagement as a share (in %) of the sum over all groups.
total_engagement = day_main['engagement_index'].sum()
day_main['percent'] = day_main['engagement_index'] / total_engagement * 100
x = list(day_main['percent'])
y = [str(share) for share in x]  # the same shares as strings
day_main

In [None]:
# Percentage of Daily Student Engagement by Main Product Function

In [None]:
# Scatter of percent per main product function; colour and marker size both encode the day.
# `ci` is not a scatter-plot parameter (seaborn >= 0.12 forwards it to matplotlib,
# which rejects it), so it is no longer passed.
sns.relplot(
    data=day_main,
    x='percent',
    y='main_product_function',
    hue='day',
    size='day',
    sizes=(200, 500),
    alpha=.5,
    height=6,
    aspect=2,
)

In [None]:
# The 50,000 individual records with the highest engagement_index.
product_day = products_engagement_data.nlargest(50000, columns=['engagement_index'])
# Preview the selection. A whole-frame value_counts() only tallies fully duplicated rows
# (silently dropping rows with any NaN) and is costly on a frame of this size.
product_day.head()

In [None]:
# Express each row's engagement_index as a percentage of the column total.
# (The former loop that copied the percentages into a list of strings was never used.)
product_day['percent'] = (
    product_day['engagement_index'] / product_day['engagement_index'].sum() * 100
)
product_day

# Top Product Engagement

In [None]:
# Horizontal bars of percent per product name, drawn without error bars.
sns.catplot(
    data=product_day,
    kind='bar',
    x='percent',
    y='product_name',
    palette='Set3',
    ci=None,
    height=6,
    aspect=2,
)


In [None]:
# Mean engagement_index for each (sector, locale, product_name) group.
# as_index=False already returns the group keys as ordinary columns on a fresh
# RangeIndex, so no reset_index() is needed (it only added a stray 'index' column).
sector_locale_day = (
    products_engagement_data
    .groupby(['sector', 'locale', 'product_name'], as_index=False, sort=False)[['engagement_index']]
    .mean()
)
# Preview the aggregate; a whole-row value_counts() is all 1s on a one-row-per-group frame.
sector_locale_day.head()

In [None]:
# Keep only the 80 rows with the highest engagement_index.
sector_locale_day = sector_locale_day.nlargest(n=80, columns='engagement_index')

In [None]:
# Express each row's engagement_index as a percentage of the column total.
# (The former loop that copied the percentages into a list of strings was never used.)
sector_locale_day['percent'] = (
    sector_locale_day['engagement_index'] / sector_locale_day['engagement_index'].sum() * 100
)
sector_locale_day

In [None]:
# Top Product Engagement by Locale and Education Sector

In [None]:
# Scatter of percent per product name; colour encodes sector, marker size encodes locale.
# `ci` is not a scatter-plot parameter (seaborn >= 0.12 forwards it to matplotlib,
# which rejects it), so it is no longer passed.
sns.relplot(
    data=sector_locale_day,
    x='percent',
    y='product_name',
    hue='sector',
    size='locale',
    sizes=(100, 500),
    alpha=.5,
    palette="Set1",
    height=8,
    aspect=1.2,
)


In [None]:
# Mean engagement_index for each (sector, main_product_function, product_name) group.
# as_index=False already returns the group keys as ordinary columns on a fresh
# RangeIndex, so no reset_index() is needed (it only added a stray 'index' column).
sector_locale_day = (
    products_engagement_data
    .groupby(['sector', 'main_product_function', 'product_name'], as_index=False, sort=False)[['engagement_index']]
    .mean()
)
# Preview the aggregate; a whole-row value_counts() is all 1s on a one-row-per-group frame.
sector_locale_day.head()

In [None]:
# Keep only the 20 rows with the highest engagement_index.
sector_locale_day = sector_locale_day.nlargest(n=20, columns='engagement_index')

In [None]:
# Express each row's engagement_index as a percentage of the column total.
# (The former loop that copied the percentages into a list of strings was never used.)
sector_locale_day['percent'] = (
    sector_locale_day['engagement_index'] / sector_locale_day['engagement_index'].sum() * 100
)
sector_locale_day

In [None]:
# Top Product Engagement by Education Sector and Main Product Function

In [None]:
# Scatter of percent per product name; colour encodes sector, marker size encodes
# the main product function.
# `ci` is not a scatter-plot parameter (seaborn >= 0.12 forwards it to matplotlib,
# which rejects it), so it is no longer passed.
sns.relplot(
    data=sector_locale_day,
    x='percent',
    y='product_name',
    hue='sector',
    size='main_product_function',
    sizes=(100, 500),
    alpha=.5,
    palette="Set1",
    height=8,
    aspect=1.2,
)


In [None]:
# Median pct_access for each product_name.
# as_index=False already returns the group key as an ordinary column on a fresh
# RangeIndex, so no reset_index() is needed (it only added a stray 'index' column).
product_access = (
    products_engagement_data
    .groupby(['product_name'], as_index=False, sort=False)[['pct_access']]
    .median()
)
# Preview the aggregate; a whole-row value_counts() is all 1s on a one-row-per-group frame.
product_access.head()

In [None]:
# Keep only the 20 rows with the highest pct_access.
product_access = product_access.nlargest(n=20, columns='pct_access')

# Top Product Access

In [None]:
# Horizontal bars of pct_access per product name, drawn without error bars.
sns.catplot(
    data=product_access,
    kind='bar',
    x='pct_access',
    y='product_name',
    palette='Set3',
    ci=None,
    height=6,
    aspect=2,
)

In [None]:
# Median pct_access for each (locale, product_name) group.
# as_index=False already returns the group keys as ordinary columns on a fresh
# RangeIndex, so no reset_index() is needed (it only added a stray 'index' column).
product_access = (
    products_engagement_data
    .groupby(['locale', 'product_name'], as_index=False, sort=False)[['pct_access']]
    .median()
)
# Preview the aggregate; a whole-row value_counts() is all 1s on a one-row-per-group frame.
product_access.head()

In [None]:
# Keep only the 20 rows with the highest pct_access.
product_access = product_access.nlargest(n=20, columns='pct_access')

In [None]:
# Top Product Access by Locale

In [None]:
# Scatter of pct_access per product name; colour and marker size both encode locale.
# `ci` is not a scatter-plot parameter (seaborn >= 0.12 forwards it to matplotlib,
# which rejects it), so it is no longer passed.
sns.relplot(
    data=product_access,
    x='pct_access',
    y='product_name',
    hue='locale',
    size='locale',
    sizes=(100, 500),
    alpha=.5,
    palette="Set1",
    height=8,
    aspect=1.2,
)


In [None]:
# Median pct_access for each (locale, sector, product_name) group.
# as_index=False already returns the group keys as ordinary columns on a fresh
# RangeIndex, so no reset_index() is needed (it only added a stray 'index' column).
product_access = (
    products_engagement_data
    .groupby(['locale', 'sector', 'product_name'], as_index=False, sort=False)[['pct_access']]
    .median()
)
# Preview the aggregate; a whole-row value_counts() is all 1s on a one-row-per-group frame.
product_access.head()

In [None]:
# Keep only the 20 rows with the highest pct_access.
product_access = product_access.nlargest(n=20, columns='pct_access')

In [None]:
# Top Product Access by Locale and Sector

In [None]:
# Scatter of pct_access per product name; colour encodes locale, marker size encodes sector.
# `ci` is not a scatter-plot parameter (seaborn >= 0.12 forwards it to matplotlib,
# which rejects it), so it is no longer passed.
sns.relplot(
    data=product_access,
    x='pct_access',
    y='product_name',
    hue='locale',
    size='sector',
    sizes=(100, 500),
    alpha=.5,
    palette="Set1",
    height=8,
    aspect=1.2,
)


In [None]:
# Median pct_access for each (main_product_function, sector, product_name) group.
# as_index=False already returns the group keys as ordinary columns on a fresh
# RangeIndex, so no reset_index() is needed (it only added a stray 'index' column).
product_access = (
    products_engagement_data
    .groupby(['main_product_function', 'sector', 'product_name'], as_index=False, sort=False)[['pct_access']]
    .median()
)
# Preview the aggregate; a whole-row value_counts() is all 1s on a one-row-per-group frame.
product_access.head()

In [None]:
# Keep only the 20 rows with the highest pct_access.
product_access = product_access.nlargest(n=20, columns='pct_access')

In [None]:
# Top Product Access by Main Product Function and Sector

In [None]:
# Scatter of pct_access per product name; colour encodes the main product function,
# marker size encodes sector.
# `ci` is not a scatter-plot parameter (seaborn >= 0.12 forwards it to matplotlib,
# which rejects it), so it is no longer passed.
sns.relplot(
    data=product_access,
    x='pct_access',
    y='product_name',
    hue='main_product_function',
    size='sector',
    sizes=(160, 500),
    alpha=.5,
    palette="Set1",
    height=8,
    aspect=1.2,
)


In [None]:
# Number of non-null cc_ratio observations for each locale.
# as_index=False already returns the group key as an ordinary column on a fresh
# RangeIndex, so no reset_index() is needed (it only added a stray 'index' column).
locale_ratio = (
    products_engagement_data
    .groupby(['locale'], as_index=False, sort=False)[['cc_ratio']]
    .count()
)
# Preview the aggregate; a whole-row value_counts() is all 1s on a one-row-per-group frame.
locale_ratio.head()

In [None]:
# Keep only the 4 rows with the highest cc_ratio count.
locale_ratio = locale_ratio.nlargest(n=4, columns='cc_ratio')

# Top Locale County Connections

In [None]:
# Bars of the cc_ratio count per locale, drawn without error bars.
sns.catplot(
    data=locale_ratio,
    kind='bar',
    x='locale',
    y='cc_ratio',
    palette='Set3',
    ci=None,
    height=6,
    aspect=2,
)

In [None]:
# Number of non-null cc_ratio observations for each (locale, sector) group.
# as_index=False already returns the group keys as ordinary columns on a fresh
# RangeIndex, so no reset_index() is needed (it only added a stray 'index' column).
locale_ratio = (
    products_engagement_data
    .groupby(['locale', 'sector'], as_index=False, sort=False)[['cc_ratio']]
    .count()
)
# Preview the aggregate; a whole-row value_counts() is all 1s on a one-row-per-group frame.
locale_ratio.head()

In [None]:
# Keep only the 20 rows with the highest cc_ratio count.
locale_ratio = locale_ratio.nlargest(n=20, columns='cc_ratio')

In [None]:
# Top Locale County Connections by Education Sector

In [None]:
# Bars of the cc_ratio count per locale, split by sector, drawn without error bars.
sns.catplot(
    data=locale_ratio,
    kind='bar',
    x='locale',
    y='cc_ratio',
    hue='sector',
    palette='Set3',
    ci=None,
    height=7,
    aspect=2.6,
)

In [None]:
# Number of non-null cc_ratio observations for each
# (locale, sector, main_product_function) group.
# as_index=False already returns the group keys as ordinary columns on a fresh
# RangeIndex, so no reset_index() is needed (it only added a stray 'index' column).
locale_ratio = (
    products_engagement_data
    .groupby(['locale', 'sector', 'main_product_function'], as_index=False, sort=False)[['cc_ratio']]
    .count()
)
# Preview the aggregate; a whole-row value_counts() is all 1s on a one-row-per-group frame.
locale_ratio.head()

In [None]:
# Keep only the 20 rows with the highest cc_ratio count.
locale_ratio = locale_ratio.nlargest(n=20, columns='cc_ratio')

In [None]:
# Top Locale County Connections By Education Sector and Main Product Function

In [None]:
# Bars of the cc_ratio count per main product function, split by sector,
# one panel per locale, drawn without error bars.
sns.catplot(
    data=locale_ratio,
    kind='bar',
    x='main_product_function',
    y='cc_ratio',
    hue='sector',
    col='locale',
    palette='Set2',
    ci=None,
    height=6,
    aspect=0.6,
)