# Problem Statement

Due to COVID-19, **most state and local governments** across the U.S. **closed educational institutions**. In response, **schools and teachers** have **attempted** to **reach students remotely through distance learning tools and digital platforms**. To this day, **concerns about the exacerbating digital divide and long-term learning loss** among America’s most vulnerable learners continue to grow.

# Challenge

* What is the picture of digital connectivity and engagement in 2020?
* What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?
* How does student engagement with different types of education technology change over the course of the pandemic?
* How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?
* Do certain state interventions, practices or policies (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?

In [None]:
import pandas as pd
import matplotlib.pyplot as  plt
import seaborn as sns
import os
from pathlib import Path
import numpy as np
%matplotlib inline

In [None]:
district_info_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
product_info_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")

In [None]:
product_info_df.isna().sum()

In [None]:
product_info_df.at[61,'Sector(s)'] = 'PreK-12'

In [None]:
product_info_df['Sector(s)'] = product_info_df['Sector(s)'].fillna('Others')

In [None]:
product_info_df[product_info_df.isna().any(axis=1)]

In [None]:
district_info_df

In [None]:
product_info_df['Sector(s)'].unique()

In [None]:
# Map each district id to the path of its engagement file.  File names are
# expected to start with the numeric district id (e.g. "1000.csv"); the id is
# taken from the part of the name before the first dot.
engagement_dir = Path('../input/learnplatform-covid19-impact-on-digital-learning/engagement_data')
# Sorted so the mapping (and the printout) is deterministic; directories that
# happen to match the pattern are skipped.
files = sorted(p for p in engagement_dir.rglob('*.*') if p.is_file())
# Only the paths are needed here -- the files are read later with pd.read_csv --
# so nothing is opened.  The original opened every file and never closed it.
engagement_data = {int(path.name.split(".")[0]): path for path in files}

print(engagement_data)

# Generate DistrictWise Engagement data

In [None]:
# Read every district's engagement file, tag its rows with the district id and
# stack everything into one long frame.  The per-district frames are collected
# first and concatenated once: calling pd.concat inside the loop re-copies all
# previously loaded rows on every iteration.
districts = district_info_df['district_id'].tolist()
district_frames = [
    pd.read_csv(engagement_data[district_id]).assign(district=district_id)
    for district_id in districts
]
# pd.concat rejects an empty list, so fall back to an empty frame as before.
df_district = (
    pd.concat(district_frames, axis=0, ignore_index=True)
    if district_frames
    else pd.DataFrame()
)

In [None]:
df_district

# Data Clean

# A. Engagement data

In [None]:
df_district = df_district.dropna()

In [None]:
df_district = df_district.groupby(['district','time','lp_id']).mean()
df_district.index[0][2]

In [None]:
df_district.reset_index(inplace=True)
df_district.lp_id = df_district.lp_id.astype(int)
df_district

# B. District information data

In [None]:
district_info_df

In [None]:
# Trim '[' and ']' characters from both ends of the range columns so that only
# the comma-separated bounds remain (values presumably look like "[0.2, 0.4[").
for _range_col in ('pct_black/hispanic', 'pct_free/reduced',
                   'county_connections_ratio', 'pp_total_raw'):
    district_info_df[_range_col] = district_info_df[_range_col].str.strip('[]')

In [None]:
# Keep only the districts whose state is known.  The explicit copy makes the
# result an independent frame, so the column assignments in the following
# cells modify it directly rather than a slice of the unfiltered table.
district_info_df = district_info_df[district_info_df['state'].notna()].copy()
district_info_df

In [None]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None
# Cast three of the range columns to text; missing values become the literal
# string 'nan', which find_avg checks for.
for _text_col in ['pct_free/reduced', 'county_connections_ratio', 'pp_total_raw']:
    district_info_df[_text_col] = district_info_df[_text_col].astype(str)

In [None]:
district_info_df1 = district_info_df.copy()
def find_avg(val):
    """Return the midpoint of a "low, high" range string as a float.

    Parameters
    ----------
    val : str or number
        Text such as "0.2, 0.4" (brackets already stripped), the literal
        string 'nan', or an already-numeric value (including float NaN).

    Returns
    -------
    float
        (low + high) / 2 for a range string; NaN for missing input; a numeric
        input is returned unchanged as a float.

    The original returned the *string* 'nan' for missing values and raised
    AttributeError on a real NaN, because a float has no `.split`.  Always
    returning a float keeps the resulting column numeric.
    """
    if isinstance(val, (int, float)):
        # Covers float NaN from columns that were never cast to str.
        return float(val)
    text = val.strip()
    if text == '' or text.lower() == 'nan':
        return float('nan')
    bounds = text.split(',')
    return (float(bounds[0]) + float(bounds[1])) / 2
    
# Replace each range column in the copy with the value find_avg derives from
# the corresponding column of district_info_df.
for _range_col in ('pct_black/hispanic', 'pct_free/reduced',
                   'county_connections_ratio', 'pp_total_raw'):
    district_info_df1[_range_col] = district_info_df[_range_col].apply(find_avg)
district_info_df1

In [None]:
district_info_df1['pct_free/reduced'] =  district_info_df1['pct_free/reduced'].astype(float)
district_info_df1['county_connections_ratio'] =  district_info_df1['county_connections_ratio'].astype(float)
district_info_df1['pp_total_raw'] =  district_info_df1['pp_total_raw'].astype(float)
district_info_df1.dtypes

In [None]:
# Replace missing numeric values with the column mean.  The filled column is
# assigned back instead of calling `fillna(..., inplace=True)` on a column
# selection: that form may act on a temporary object and leaves the frame
# unchanged when pandas' copy-on-write mode is active.
for _num_col in ['pp_total_raw', 'pct_black/hispanic',
                 'pct_free/reduced', 'county_connections_ratio']:
    district_info_df1[_num_col] = district_info_df1[_num_col].fillna(
        district_info_df1[_num_col].mean()
    )
district_info_df1

# C. Product Information Data 

In [None]:
product_info_df = product_info_df.drop(['Provider/Company Name'],axis=1)

In [None]:
product_info_df = product_info_df.fillna('SDO - Other')

In [None]:
product_info_df

# Merge Datasets

In [None]:
df = district_info_df1.merge(df_district, left_on='district_id', right_on='district')

In [None]:
df = product_info_df.merge(df, left_on='LP ID', right_on='lp_id')

In [None]:
df = df.drop(['LP ID','district'],axis=1)

In [None]:
df

# What is the picture of digital connectivity and engagement in 2020?

###### 1. District-wise engagement and connectivity ratio

In [None]:
# Mean connectivity ratio and engagement index per district.
# The columns are selected with a list: the tuple form `[...]['a', 'b']` was
# deprecated and raises in pandas >= 2.0.
df1 = (
    df.groupby(['district_id'])[['county_connections_ratio', 'engagement_index']]
    .mean()
    .reset_index()
)
df1

In [None]:
# Scatter plot: mean engagement index (x) against connectivity ratio (y),
# one point per row of df1.
X = df1['engagement_index']
Y = df1['county_connections_ratio']
Z = df1['district_id']
f = plt.figure(figsize=(10, 5))
plt.xlabel('engagement_index Range')
plt.ylabel('county_connections_ratio Range')
plt.scatter(X, Y)
plt.show()

###### 2. Statewise connectivity ratio and engagement index

In [None]:
# Mean connectivity ratio and engagement index per state.
# List-based column selection is required: the tuple form raises in pandas >= 2.0.
df1 = (
    df.groupby(['state'])[['county_connections_ratio', 'engagement_index']]
    .mean()
    .reset_index()
)
df1

In [None]:
# Bar chart of the mean connectivity ratio for every state in df1.
state_names = df1['state']
ratio_values = df1['county_connections_ratio']
f = plt.figure(figsize=(10, 5))
plt.xlabel('state')
plt.ylabel('county_connections_ratio')
plt.title("Cannection Ratio Statewise")
plt.xticks(rotation=90)
plt.bar(state_names, ratio_values)
plt.show()

In [None]:
# Bar chart of the mean engagement index for every state in df1.
state_names = df1['state']
engagement_values = df1['engagement_index']
f = plt.figure(figsize=(10, 5))
plt.xlabel('state')
plt.ylabel('engagement_index')
plt.title("Enagagement index Statewise")
plt.xticks(rotation=90)
plt.bar(state_names, engagement_values)
plt.show()

###### 3. Top 50 product based on engagement index or based on page surfing

In [None]:
# Find the top 50 most popular products by mean engagement index.
# Only `engagement_index` is aggregated; the original also averaged
# `county_connections_ratio` just to drop it again, and selected the columns
# with a tuple, which raises in pandas >= 2.0.
df1 = (
    df.groupby(['lp_id'])[['engagement_index']]
    .mean()
    .sort_values(by=['engagement_index'], ascending=False)
    .reset_index()
    .head(50)
)
df1

# What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?

In [None]:
df['Sector(s)'].unique()

In [None]:
df_district

In [None]:
df_product_temp = df_district.merge(product_info_df, left_on='lp_id', right_on='LP ID')
df_product_temp.columns

In [None]:
# Drop the columns not needed for the sector analysis, then derive `month`
# (integer) and `year` (text) from the dash-separated `time` column.
_unneeded_cols = ['district', 'LP ID', 'URL', 'Product Name', 'Primary Essential Function']
df_product_temp = df_product_temp.drop(columns=_unneeded_cols)
_time_col = df_product_temp['time']
df_product_temp['month'] = _time_col.map(lambda stamp: stamp.split("-")[1])
df_product_temp['year'] = _time_col.map(lambda stamp: stamp.split("-")[0])
df_product_temp['month'] = df_product_temp['month'].astype(int)
df_product_temp

In [None]:
# Mean pct_access and engagement_index per sector, month and year.
# List-based column selection is required: the tuple form raises in pandas >= 2.0.
df_product_temp = (
    df_product_temp.groupby(['Sector(s)', 'month', 'year'])[['pct_access', 'engagement_index']]
    .mean()
    .reset_index()
)
df_product_temp

In [None]:
def generate_engaement_bar_chart(dfnew,sector):
    """Show a small green bar chart of `engagement_index` per `month`.

    dfnew  -- frame with `month` and `engagement_index` columns.
    sector -- text placed (in quotes) at the end of the chart title.
    """
    months = dfnew['month']
    plt.figure(figsize=(5, 2))
    # One tick per month value present in the data.
    plt.xticks(months, rotation=90)
    plt.bar(months, dfnew['engagement_index'], color="green")
    plt.xlabel("Month")
    plt.ylabel("engagement_index")
    plt.title("Compare engagement index sector-wise for '"+sector+"'")
    plt.show()

In [None]:
def generate_pctaccess_bar_chart(dfnew,sector):
    """Show a small blue bar chart of `pct_access` per `month`.

    dfnew  -- frame with `month` and `pct_access` columns.
    sector -- text placed (in quotes) at the end of the chart title.
    """
    months = dfnew['month']
    plt.figure(figsize=(5, 2))
    # One tick per month value present in the data.
    plt.xticks(months, rotation=90)
    plt.bar(months, dfnew['pct_access'], color="blue")
    plt.xlabel("Month")
    plt.ylabel("pct_access")
    plt.title("Compare pct_access sector-wise for '"+sector+"'")
    plt.show()

In [None]:
# Draw the engagement and pct_access charts for every sector in turn.
sectors = df_product_temp['Sector(s)'].unique()
for sector in sectors:
    _sector_rows = df_product_temp[df_product_temp['Sector(s)'] == sector]
    generate_engaement_bar_chart(_sector_rows, sector)
    generate_pctaccess_bar_chart(_sector_rows, sector)

# How does student engagement with different types of education technology change over the course of the pandemic?

In [None]:
df.columns

In [None]:
total_no_of_function = df['Primary Essential Function'].unique().tolist()
total_no_of_function,len(total_no_of_function)

In [None]:
# Mean engagement index per primary essential function, highest first.
df1 = (
    df.groupby(['Primary Essential Function'])['engagement_index']
    .mean()
    .to_frame()
    .sort_values(by=['engagement_index'], ascending=False)
    .reset_index()
)
df1

###### So we found that the most in-demand products during COVID-19 are related to online classes and learning platforms

In [None]:
df1[0:15]

In [None]:
#bar chart to know maximum share
X = df1['Primary Essential Function']
Y = df1['engagement_index']
f = plt.figure()
f.set_figwidth(10)
f.set_figheight(10)
plt.xlabel('state')
plt.ylabel('engagement_index')
plt.title("Most Demading software in Market for Covid-19")
plt.xticks(rotation=90)
plt.bar(X,Y)
plt.show()

In [None]:
#Pie chart to know maximum share
X = df1['Primary Essential Function']
Y = df1['engagement_index']
f = plt.figure()
f.set_figwidth(10)
f.set_figheight(15)
plt.xlabel('state')
plt.ylabel('engagement_index')
plt.title("Most Demading software in Market for Covid-19")
plt.xticks(rotation=90)
plt.pie(Y,labels=X,radius=10,autopct="%0.1f%%",shadow="true")
plt.show()

# How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?

In [None]:
df_temp = df[['state','locale','lp_id','Primary Essential Function','pct_black/hispanic','pct_free/reduced','county_connections_ratio','pp_total_raw','engagement_index','pct_access']]

In [None]:
df_temp

### Black/Hispanic percentage by locale

In [None]:
# Average Black/Hispanic percentage per locale.
# The original summed the percentage over every merged engagement row, so the
# result grew with the number of engagement records a locale has rather than
# reflecting its demographics.  The mean is used instead, matching the other
# per-locale cells.
df_group_by_hispanic = (
    df_temp.groupby(['locale'])['pct_black/hispanic']
    .mean()
    .reset_index()
)
df_group_by_hispanic

In [None]:
# Pie chart of the per-locale pct_black/hispanic values.
X = df_group_by_hispanic['locale']
Y = df_group_by_hispanic['pct_black/hispanic']
f = plt.figure(figsize=(5, 5))
plt.title("Black percentage based on locality")
# `shadow` takes a bool; the string "true" only worked through truthiness.
# The xticks rotation was dropped because pie() removes the axis ticks.
plt.pie(Y, labels=X, radius=1, autopct="%0.1f%%", shadow=True)
plt.show()

### Engagement index by Black/Hispanic percentage

In [None]:
# Mean engagement index and pct_access for each pct_black/hispanic value.
# List-based column selection is required: the tuple form raises in pandas >= 2.0.
df_group_by_hispanic_engage = (
    df_temp.groupby(['pct_black/hispanic'])[['engagement_index', 'pct_access']]
    .mean()
    .reset_index()
)
df_group_by_hispanic_engage

In [None]:
# Pie chart of mean engagement index per pct_black/hispanic value.
X = df_group_by_hispanic_engage['pct_black/hispanic']
Y = df_group_by_hispanic_engage['engagement_index']
f = plt.figure(figsize=(5, 5))
plt.title("Engagement_index based on hispanic")
# `shadow` takes a bool; the string "true" only worked through truthiness.
# The xticks rotation was dropped because pie() removes the axis ticks.
plt.pie(Y, labels=X, radius=1, autopct="%0.1f%%", shadow=True)
plt.show()

### Eligibility for free/reduced-price lunch by locale

In [None]:
# Mean pct_free/reduced per locale, with `locale` as a regular column.
df_group_by_free_lunch = (
    df_temp.groupby(['locale'])['pct_free/reduced']
    .mean()
    .to_frame()
    .reset_index()
)
df_group_by_free_lunch

In [None]:
# Pie chart of mean pct_free/reduced per locale.
X = df_group_by_free_lunch['locale']
Y = df_group_by_free_lunch['pct_free/reduced']
f = plt.figure(figsize=(5, 5))
plt.title("Free luch based on demography")
# `shadow` takes a bool; the string "true" only worked through truthiness.
# The xticks rotation was dropped because pie() removes the axis ticks.
plt.pie(Y, labels=X, radius=1, autopct="%0.1f%%", shadow=True)
plt.show()

### Per-pupil expenditure by locale

In [None]:
# Mean pp_total_raw per locale, with `locale` as a regular column.
df_group_by_pptraw = (
    df_temp.groupby(['locale'])['pp_total_raw']
    .mean()
    .to_frame()
    .reset_index()
)
df_group_by_pptraw

In [None]:
# Pie chart of mean pp_total_raw per locale.
X = df_group_by_pptraw['locale']
Y = df_group_by_pptraw['pp_total_raw']
f = plt.figure(figsize=(5, 5))
plt.title("Per people expenditure based on locality")
# `shadow` takes a bool; the string "true" only worked through truthiness.
# The xticks rotation was dropped because pie() removes the axis ticks.
plt.pie(Y, labels=X, radius=1, autopct="%0.1f%%", shadow=True)
plt.show()

### Statewise per-pupil expenditure

In [None]:
# Mean pp_total_raw per state, with `state` as a regular column.
df_group_by_state_pptraw = (
    df_temp.groupby(['state'])['pp_total_raw']
    .mean()
    .to_frame()
    .reset_index()
)
df_group_by_state_pptraw

In [None]:
# Bar chart of mean pp_total_raw for every state.
state_names = df_group_by_state_pptraw['state']
spend_values = df_group_by_state_pptraw['pp_total_raw']
f = plt.figure(figsize=(10, 10))
plt.xlabel('state')
plt.ylabel('pp_total_raw')
plt.title("Statewise per peopile expenditure")
plt.xticks(rotation=90)
plt.bar(state_names, spend_values)
plt.show()

### Statewise per-pupil expenditure vs. engagement index

In [None]:
# Mean pp_total_raw and engagement_index per state.
# List-based column selection is required: the tuple form raises in pandas >= 2.0.
df_group_by_state_freelunch = (
    df_temp.groupby(['state'])[['pp_total_raw', 'engagement_index']]
    .mean()
    .reset_index()
)
df_group_by_state_freelunch

In [None]:
# Side-by-side bars per state: pp_total_raw (green) next to engagement_index (red).
state_names = df_group_by_state_freelunch['state']
spend_values = df_group_by_state_freelunch['pp_total_raw']
engagement_values = df_group_by_state_freelunch['engagement_index']
f = plt.figure(figsize=(10, 5))
# Integer slots on the x axis; each bar pair is shifted +/-0.2 around its slot.
xpos = np.arange(len(state_names))
plt.xticks(xpos, state_names, rotation=90)
plt.bar(xpos - 0.2, spend_values, width=0.4, color="green", label="Per-pupil total expenditure")
plt.bar(xpos + 0.2, engagement_values, width=0.4, color="red", label="engagement_index")
plt.xlabel("State")
plt.ylabel("Vale")
plt.title("Compare ")
plt.legend(fontsize="12")
plt.show()

### Engagement_index based on Locality

In [None]:
# Mean engagement index and pct_access per locale.
# List-based column selection is required: the tuple form raises in pandas >= 2.0.
df_group_by_locale = (
    df.groupby(['locale'])[['engagement_index', 'pct_access']]
    .mean()
    .reset_index()
)
df_group_by_locale

In [None]:
# Pie chart of mean engagement index per locale.
X = df_group_by_locale['locale']
Y = df_group_by_locale['engagement_index']
f = plt.figure(figsize=(5, 5))
plt.title("Engagement_index based on demography")
# `shadow` takes a bool; the string "true" only worked through truthiness.
# The xticks rotation was dropped because pie() removes the axis ticks.
plt.pie(Y, labels=X, radius=1, autopct="%0.1f%%", shadow=True)
plt.show()

###### Engagement and PCT access based on locality, hispanic, ppt_raw

In [None]:
# Round the three demographic columns to 2 decimals so equal values fall into
# the same group.  DataFrame.round returns a new frame, which avoids assigning
# into `df_temp` while it is still a column slice of `df`.
df_temp = df_temp.round({'pct_black/hispanic': 2, 'pp_total_raw': 2, 'pct_free/reduced': 2})
# List-based column selection is required: the tuple form raises in pandas >= 2.0.
df_demography = (
    df_temp.groupby(['locale', 'pct_black/hispanic', 'pp_total_raw', 'pct_free/reduced'])
    [['engagement_index', 'pct_access']]
    .mean()
)
df_demography

In [None]:
# Pie chart of mean engagement index per demographic group; each wedge is
# labelled with its (locale, pct_black/hispanic, pp_total_raw, pct_free/reduced)
# index entry.
X = df_demography.index
Y = df_demography['engagement_index']
f = plt.figure(figsize=(5, 5))
plt.title("Engagement_index based on demography(locale,pct_black/hispanic,pp_total_raw,pct_free/reduced)")
# `shadow` takes a bool; the string "true" only worked through truthiness.
# The xticks rotation was dropped because pie() removes the axis ticks.
# NOTE(review): radius=15 is kept from the original; it draws the pie far
# larger than the 5x5 figure's axes area -- confirm this is intended.
plt.pie(Y, labels=X, radius=15, autopct="%0.1f%%", shadow=True,
        labeldistance=1.01, wedgeprops={'linewidth': 3})
plt.show()

# Do certain state interventions, practices or policies (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?

In [None]:
df_district_marge_info = district_info_df1.merge(df_district, left_on='district_id', right_on='district')

In [None]:
df_district_marge_info = df_district_marge_info[['district','state', 'time', 'lp_id', 'pct_access', 'engagement_index']]
df_district_marge_info

In [None]:
df_district_marge_info.dtypes

In [None]:
df_district_marge_info['month'] = df_district_marge_info['time'].apply(lambda x: x.split("-")[1])
df_district_marge_info['year'] = df_district_marge_info['time'].apply(lambda x: x.split("-")[0])

In [None]:
df_district_marge_info['month'] = df_district_marge_info['month'].astype(int)
df_district_marge_info

In [None]:
# Mean pct_access and engagement_index per state, month and year.
# List-based column selection is required: the tuple form raises in pandas >= 2.0.
df_district_marge_info_temp = (
    df_district_marge_info.groupby(['state', 'month', 'year'])[['pct_access', 'engagement_index']]
    .mean()
    .reset_index()
)
df_district_marge_info_temp

In [None]:
df_district_marge_info_temp[df_district_marge_info_temp.state == 'Arizona']

In [None]:
def generate_engaement_bar_chart(dfnew,state):
    """Show a small green bar chart of `engagement_index` per `month` for one state.

    Parameters
    ----------
    dfnew : DataFrame
        Rows to plot; must have `month` and `engagement_index` columns.
    state : str
        State name appended to the chart title.
    """
    months = dfnew['month']
    plt.figure(figsize=(5, 2))
    # One tick per month value present in the data.
    plt.xticks(months, rotation=90)
    plt.bar(months, dfnew['engagement_index'], color="green")
    plt.xlabel("Month")
    plt.ylabel("engagement_index")
    # Use the `state` argument.  The original read the notebook-level loop
    # variable `stat`, which raises NameError when the function is called
    # outside that loop and can put a different state's name in the title.
    plt.title("Compare engagement index statewise for "+state)
    plt.show()

In [None]:
def generate_pctaccess_bar_chart(dfnew,sector):
    """Show a small red bar chart of `pct_access` per `month` for one state.

    Parameters
    ----------
    dfnew : DataFrame
        Rows to plot; must have `month` and `pct_access` columns.
    sector : str
        Text appended to the chart title.  The name is kept for compatibility;
        the loop that follows this definition passes a state name.
    """
    months = dfnew['month']
    plt.figure(figsize=(5, 2))
    # One tick per month value present in the data.
    plt.xticks(months, rotation=90)
    plt.bar(months, dfnew['pct_access'], color="red")
    plt.xlabel("Month")
    plt.ylabel("pct_access")
    # This redefinition is used for the per-state charts, so the title says
    # "statewise" (the original still said "sector-wise").
    plt.title("Compare pct_access statewise for "+sector)
    plt.show()

In [None]:
# Draw the engagement and pct_access charts for every state in turn.
states = df_district_marge_info_temp.state.unique()
for stat in states:
    _state_rows = df_district_marge_info_temp[df_district_marge_info_temp.state == stat]
    generate_engaement_bar_chart(_state_rows, stat)
    generate_pctaccess_bar_chart(_state_rows, stat)