In [None]:
# Import libraries 

In [None]:
import numpy as np 
import pandas as pd 
import math
import glob
import os
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import folium
from geopy.geocoders import Nominatim
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

# Reading data files 👓

### Product information data
> The product file ```products_info.csv``` includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy. 

📝 Some products may not have labels due to being duplicate, lack of accurate url or other reasons.

| Name                       | Description                                                                                                                                                                                                                                                                                                                    |
|----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LP ID                      | The unique identifier of the product <br>製品ID                                                                                                                                                                                                                                                                                          |
| URL                        | Web Link to the specific product <br>製品に対するウェブリンク                                                                                                                                                                                                                                                                                              |
| Product Name               | Name of the specific product <br>製品名                                                                                                                                                                                                                                                                                                  |
| Provider/Company Name      | Name of the product provider <br>製品のプロバイダー名                                                                                                                                                                                                                                                                                                  |
| Sector(s)                  | Sector of education where the product is used <br>製品が使われている教育セクター                                                                                                                                                                                                                                                                                 |
| Primary Essential Function | The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories have multiple sub-categories with which the products were labeled <br>製品の基本機能。2層のラベルがある。まずLC（学習とカリキュラム）,CM（教室管理）,SDO（学校と地区の運営）のいずれかのカテゴリにラベル付けされる。それぞれのカテゴリに対して複数のサブカテゴリがラベル付けされる。|
|                            |                                                                                                                                                                                

In [None]:
# Product information: load products_info.csv into the products_df DataFrame
# and preview the first rows.
products_df = pd.read_csv(
    "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
)
products_df.head()

### District information data

>The district file ```districts_info.csv``` includes information about the **characteristics of school districts**, including data from 
>- NCES (2018-19), 
>- FCC (Dec 2018), and 
>- Edunomics Lab. 

Steps taken to preserve Privacy 🔒 
- Identifiable information about the school districts has been removed. 
- An open source tool ARX (Prasser et al. 2020) was used to transform several data fields and reduce the risks of re-identification. 

📝 To generalize the data, some data points are released as a range within which the actual value falls. Additionally, many missing values are marked as 'NaN', indicating that the data was suppressed to maximize anonymization of the dataset.

| Name                   | Description                                                                                                                                                                                                                                                                              |
|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| district_id            | The unique identifier of the school district <br>学区ID                                                                                                                                                                                                                                            |
| state                  | The state where the district resides in <br>地区が存在する州                                                                                                                                                                                                                                                 |
| locale                 | NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See Locale Boundaries User's Manual for more information. <br>アメリカの領土を都市、郊外、町、地方の4つのタイプの領域に分類するNCESロケール分類。詳細については、ロケール境界ユーザーズマニュアルを参照。                                                                                                         |
| pct_black/hispanic     | Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data <br>2018-19NCESデータに基づいて黒人またはヒスパニックとして識別された地区の学生の割合                                                                                                                                                                                      |
| pct_free/reduced       | Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data <br>2018-19 NCESデータに基づく、無料または割引価格された昼食の対象となる地区の学生の割合                                                                                                                                                                             |
| countyconnectionsratio | ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version). See FCC data for more information. <br>FCC Form 477（2018年12月版）の郡レベルのデータに基づく（少なくとも一方向/世帯で200kbpsを超える住宅用固定高速接続の）比率。詳細については、FCCデータを参照。                                                                         |
| pp_total_raw           | Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERDS) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district.<br>Edunomics研究所のNERDS(National Education Resource Database on Schools)プロジェクトによる生徒1人あたりの総支出（地方と連邦の支出の合計）。支出データは学校ごとであり、中央値を使用して学区の支出を表す。 |
                                                         

In [None]:
# District information: load districts_info.csv into the districts_df DataFrame
# and preview the first rows.
districts_df = pd.read_csv(
    "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
)
districts_df.head()

### Engagement data
> The engagement data are aggregated at school district level, and each file in the folder ```engagement_data``` represents data from **one school district**. 

📝The 4-digit file name represents ```district_id``` which can be used to link to district information in ```districts_info.csv```. 

📝The ```lp_id``` can be used to link to product information in ```products_info.csv```.

| Name             | Description                                                                                                    |
|------------------|----------------------------------------------------------------------------------------------------------------|
| time             | date in "YYYY-MM-DD" <br>日付                                                                                          |
| lp_id            | The unique identifier of the product <br>製品ID                                                                          |
| pct_access       | Percentage of students in the district have at least one page-load event of a given product and on a given day <br>指定された製品が指定された日に少なくとも1回のページ読み込みイベントを行った地区内の学生の割合 |
| engagement_index | Total page-load events per one thousand students of a given product and on a given day <br>指定された製品の指定された日における学生1,000人あたりのページ読み込みイベントの合計                        |

In [None]:
# Per-district, per-product, per-day engagement data.
# Every CSV in the engagement_data folder is read, tagged with the district id
# taken from its file name, and all of them are stacked into engagement_df.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# glob returns files in filesystem order; sorting makes the row order of
# engagement_df reproducible between runs.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Take the id from the file's base name instead of a fixed position in the
    # path, so it does not depend on directory depth or the path separator.
    district_id = os.path.splitext(os.path.basename(filename))[0]
    # Stored as a string here, exactly as it appears in the file name.
    df["district_id"] = district_id
    li.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
engagement_df = pd.concat(li, ignore_index=True)
engagement_df.head()

# Missing values

In [None]:
# Product information: visualise how much data each column of products_df
# holds (missing-value exploration).
msno.bar(
    products_df,
    color='#7209b7',
    sort="ascending",
    figsize=(10, 5),
    fontsize=12,
)
plt.show()

In [None]:
# District information: visualise how much data each column of districts_df
# holds (missing-value exploration).
msno.bar(
    districts_df,
    color='#f72585',
    sort="ascending",
    figsize=(10, 5),
    fontsize=12,
)
plt.show()

In [None]:
# Engagement information: visualise how much data each column of engagement_df
# holds (missing-value exploration).
msno.bar(
    engagement_df,
    color='#4895ef',
    sort="ascending",
    figsize=(10, 5),
    fontsize=12,
)
plt.show()

# EDA 📊

In [None]:
# Colour palette (hex codes) shared by the plots below
pal = [
    "#ffcbf2",
    "#f3c4fb",
    "#ecbcfd",
    "#e5b3fe",
    "#e2afff",
    "#deaaff",
    "#d8bbff",
    "#d0d1ff",
    "#c8e7ff",
    "#c0fdff",
]

In [None]:
# Donut chart of the share of each locale type found in the district
# information (districts_df).

# Count the locales once and reuse the result for both labels and wedge sizes.
locale_counts = districts_df.locale.value_counts()
labels = list(locale_counts.index)
sizes = locale_counts.values
# One offset per wedge, derived from the data: ax.pie() raises when explode
# and sizes differ in length, so this must not be hard-coded to four entries.
explode = (0.05,) * len(sizes)

fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('Locale Distribution', size = 20, font="Serif")
ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.0f%%', pctdistance=0.7, colors=["#d45d00","#ff9100","#eaaa00","#6d6875"])
# A white disc over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()

In [None]:
# Donut chart of the share of each sector listed in the product information
# (products_df). A product's "Sector(s)" value may list several sectors
# separated by ";", and each listed sector is counted once for that product.

labels = ['PreK-12','Higher Ed','Corporate']
sector_counts = dict.fromkeys(labels, 0)
for sectors in products_df["Sector(s)"].dropna():
    for sector in sectors.split(";"):
        sector = sector.strip()
        # Only the three sectors plotted below are tallied.
        if sector in sector_counts:
            sector_counts[sector] += 1
c1, c2, c3 = (sector_counts[label] for label in labels)

fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('Sector Distribution', size = 20, font="Serif")
explode = (0.05, 0.05, 0.05)
sizes = [c1,c2, c3]
ax.pie(sizes, explode=explode,startangle=60, labels=labels,autopct='%1.2f%%', pctdistance=0.7, colors=["#ff228a","#20b1fd","#ffb703"])
# A white disc over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()

In [None]:
# Split each "Primary Essential Function" label at its first "-" into the
# first-layer category (text before it) and the sub-category (text after it),
# and store them as two new columns of products_df. Missing labels stay NaN.
primary_essential_main = []
primary_essential_sub = []
for s in products_df["Primary Essential Function"]:
    if pd.isnull(s):
        primary_essential_main.append(np.nan)
        primary_essential_sub.append(np.nan)
        continue
    # partition() never raises: a label without "-" yields an empty separator,
    # in which case the sub-category is recorded as missing.
    main_part, separator, sub_part = s.partition("-")
    primary_essential_main.append(main_part.strip())
    primary_essential_sub.append(sub_part.strip() if separator else np.nan)

products_df["primary_essential_main"] = primary_essential_main
products_df["primary_essential_sub"] = primary_essential_sub

In [None]:
# Donut chart of the share of each first-layer product label (CM / LC / SDO).
# Counting is by substring occurrence, so a combined label such as
# "LC/CM/SDO" adds one to each of the three tallies.
main_labels = [
    label
    for label in products_df["primary_essential_main"]
    if not pd.isnull(label)
]
c1 = sum(label.count("CM") for label in main_labels)
c2 = sum(label.count("LC") for label in main_labels)
c3 = sum(label.count("SDO") for label in main_labels)

labels = ['CM','LC','SDO']
sizes = [c1, c2, c3]
explode = (0.05, 0.05, 0.05)

fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Primary Essential Function', size = 20, font="Serif")
ax.pie(
    sizes,
    explode=explode,
    startangle=60,
    labels=labels,
    autopct='%1.2f%%',
    pctdistance=0.7,
    colors=["#18ff9f","#2cfbff","#ffb703"],
)
# A white disc over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()

In [None]:
# Engagement index paired with the first-layer product function, plus the
# number of engagement rows per function.
# The two frames are joined here, on only the columns needed, so this cell
# does not depend on products_engagement_data, which is created further down
# in the "Merging files" section and is undefined on a top-to-bottom run.
pem_ei_df = pd.merge(
    products_df[['LP ID', 'primary_essential_main']],
    engagement_df[['lp_id', 'engagement_index']],
    left_on='LP ID',
    right_on='lp_id',
)[['primary_essential_main','engagement_index']]

print(pem_ei_df['primary_essential_main'].value_counts())

In [None]:
# Bar chart of the mean engagement_index per first-layer product function:
# LC (Learning & Curriculum), CM (Classroom Management),
# SDO (School & District Operations).
# The combined 'LC/CM/SDO' group is dropped before plotting.
mean_by_function = pem_ei_df.groupby('primary_essential_main').mean()
mean_by_function.drop('LC/CM/SDO').plot.bar()

# Merging files 📁 

In [None]:
# Parse the "time" column of engagement_df into pandas datetime values.
engagement_df['time'] = pd.to_datetime(engagement_df['time'])

In [None]:
# Number of distinct product ids in each frame, printed before they are joined.
for frame, id_column in ((products_df, "LP ID"), (engagement_df, "lp_id")):
    print(frame[id_column].nunique())

In [None]:
# Join product information onto the engagement rows by product id
# ("LP ID" in products_df, "lp_id" in engagement_df) and preview the result.
products_engagement_data = products_df.merge(
    engagement_df,
    left_on='LP ID',
    right_on='lp_id',
)
products_engagement_data.head()

In [None]:
# Number of distinct district ids in each frame, printed before they are joined.
for frame in (districts_df, engagement_df):
    print(frame["district_id"].nunique())

In [None]:
# Convert district_id in engagement_df to integers, then join the district
# information onto the engagement rows by district_id and preview the result.
district_ids = engagement_df["district_id"].astype(str)
engagement_df["district_id"] = district_ids.astype(int)
districts_engagement_data = districts_df.merge(engagement_df, on='district_id')
districts_engagement_data.head()

# Correlation 📁

In [None]:
# Convert the pct_free/reduced range labels to numbers and show how many rows
# fall into each range, most frequent first.
plt.figure(figsize=(4, 5))
# Each half-open range "[low, high[" is represented by its midpoint, the same
# convention used for pct_black/hispanic, so both columns share one scale.
# (The upper bound is excluded from the range, so it is not a valid stand-in.)
free_reduced_midpoints = {
    '[0, 0.2[': 0.1,
    '[0.2, 0.4[': 0.3,
    '[0.4, 0.6[': 0.5,
    '[0.6, 0.8[': 0.7,
    '[0.8, 1[': 0.9,
}
districts_engagement_data['pct_free/reduced'] = districts_engagement_data['pct_free/reduced'].replace(free_reduced_midpoints)
sns.countplot(y='pct_free/reduced', data=districts_engagement_data, order=districts_engagement_data["pct_free/reduced"].value_counts().index,color = pal[6])
plt.title("pct_free/reduced",font="Serif", size=20)
plt.show()


In [None]:
# Convert the pct_black/hispanic range labels to numbers (range midpoints) and
# show how many rows fall into each range, most frequent first.
plt.figure(figsize=(4, 5))
black_hispanic_midpoints = {
    '[0, 0.2[': 0.1,
    '[0.2, 0.4[': 0.3,
    '[0.4, 0.6[': 0.5,
    '[0.6, 0.8[': 0.7,
    '[0.8, 1[': 0.9,
}
districts_engagement_data['pct_black/hispanic'] = (
    districts_engagement_data['pct_black/hispanic'].replace(black_hispanic_midpoints)
)
frequency_order = districts_engagement_data["pct_black/hispanic"].value_counts().index
sns.countplot(
    y='pct_black/hispanic',
    data=districts_engagement_data,
    order=frequency_order,
    color=pal[6],
)
plt.title("pct_black/hispanic", font="Serif", size=20)
plt.show()


In [None]:
# Correlation between the share of Black/Hispanic students and the share of
# students eligible for free or reduced-price lunch.
corr = districts_engagement_data[['pct_black/hispanic','pct_free/reduced']].corr()
fig = plt.figure(figsize=(6,6),dpi=80)
# Mask the upper triangle (diagonal included) so that only the single
# off-diagonal coefficient is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='BuPu', robust=True, center=0,
            square=True, linewidths=.5, annot=True)
# Title names the plotted columns, in the same form as the pp_total_raw plot.
plt.title('Correlation of pct_black/hispanic and pct_free/reduced', fontsize=15,font="Serif")
plt.show()


In [None]:
# Scatter plot of pct_black/hispanic (x) against pct_free/reduced (y)
df = districts_engagement_data
plt.scatter(
    x=df['pct_black/hispanic'],
    y=df['pct_free/reduced'],
)


In [None]:
# Correlation between the share of Black/Hispanic students and the
# engagement index (page-load events per 1,000 students).
corr = districts_engagement_data[['pct_black/hispanic','engagement_index']].corr()
fig = plt.figure(figsize=(6,6),dpi=80)
# Mask the upper triangle (diagonal included) so that only the single
# off-diagonal coefficient is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='BuPu', robust=True, center=0,
            square=True, linewidths=.5, annot=True)
# Title names the plotted columns, in the same form as the pp_total_raw plot.
plt.title('Correlation of pct_black/hispanic and engagement_index', fontsize=15,font="Serif")
plt.show()

In [None]:
# Scatter plot of pct_black/hispanic (x) against engagement_index (y)
df = districts_engagement_data
plt.scatter(
    x=df['pct_black/hispanic'],
    y=df['engagement_index'],
)

In [None]:
# Correlation between the share of students eligible for free or
# reduced-price lunch and the engagement index (page-load events per 1,000
# students).
corr = districts_engagement_data[['pct_free/reduced','engagement_index']].corr()
fig = plt.figure(figsize=(6,6),dpi=80)
# Mask the upper triangle (diagonal included) so that only the single
# off-diagonal coefficient is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='BuPu', robust=True, center=0,
            square=True, linewidths=.5, annot=True)
# Title names the plotted columns, in the same form as the pp_total_raw plot.
plt.title('Correlation of pct_free/reduced and engagement_index', fontsize=15,font="Serif")
plt.show()

In [None]:
# Scatter plot of pct_free/reduced (x) against engagement_index (y)
df = districts_engagement_data
plt.scatter(
    x=df['pct_free/reduced'],
    y=df['engagement_index'],
)

In [None]:
# Correlation between the share of students eligible for free or
# reduced-price lunch and pct_access (share of students with at least one
# page-load event for a product on a given day).
corr = districts_engagement_data[['pct_free/reduced','pct_access']].corr()
fig = plt.figure(figsize=(6,6),dpi=80)
# Mask the upper triangle (diagonal included) so that only the single
# off-diagonal coefficient is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='BuPu', robust=True, center=0,
            square=True, linewidths=.5, annot=True)
# Title names the plotted columns, in the same form as the pp_total_raw plot.
plt.title('Correlation of pct_free/reduced and pct_access', fontsize=15,font="Serif")
plt.show()

In [None]:
# Correlation between pct_access (share of students with at least one
# page-load event for a product on a given day) and the engagement index
# (page-load events per 1,000 students).
corr = districts_engagement_data[['pct_access','engagement_index']].corr()
fig = plt.figure(figsize=(6,6),dpi=80)
# Mask the upper triangle (diagonal included) so that only the single
# off-diagonal coefficient is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='BuPu', robust=True, center=0,
            square=True, linewidths=.5, annot=True)
# Title names the plotted columns, in the same form as the pp_total_raw plot.
plt.title('Correlation of pct_access and engagement_index', fontsize=15,font="Serif")
plt.show()

In [None]:
# Correlation between per-pupil total expenditure (pp_total_raw) and the
# engagement index (page-load events per 1,000 students).
def _interval_midpoint(value):
    """Return the midpoint of a "[low, high[" range label.

    Non-string values (NaN, or numbers left by an earlier run of this cell)
    are returned unchanged, so applying the function twice is harmless.
    """
    if not isinstance(value, str):
        return value
    low, high = (float(bound) for bound in value.strip("[] ").split(","))
    return (low + high) / 2


# Parse the range rather than listing every label by hand, so that any
# expenditure range present in the data is converted, not only known ones.
districts_engagement_data['pp_total_raw'] = districts_engagement_data['pp_total_raw'].map(_interval_midpoint)
corr = districts_engagement_data[['pp_total_raw', 'engagement_index']].corr()
fig = plt.figure(figsize=(6,6),dpi=80)
# Mask the upper triangle (diagonal included) so that only the single
# off-diagonal coefficient is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='BuPu', robust=True, center=0,
            square=True, linewidths=.5, annot=True)
plt.title('Correlation of pp_total_raw and engagement_index', fontsize=15,font="Serif")
plt.show()

# 【About the current state of digital learning in 2020】
【2020年のデジタル学習の現状について】

We used correlation analysis to examine which factors are related to the amount of digital learning activity.

**相関関係からデジタル学習の学習量へ影響を検証しました。**

**≪Those that had a correlation（相関関係があったもの）≫**

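・Correlation between the share of Black/Hispanic students (`pct_black/hispanic`) and the share of students eligible for free or reduced-price lunch (`pct_free/reduced`)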

　ヒスパニックの学生と無料対象の学生の相関関係
 
・Correlation between the share of students accessing a product (`pct_access`) and page-load volume (`engagement_index`)

 初回アクセスの割合とページ量の相関関係
 
・Correlation between per-pupil total expenditure (`pp_total_raw`) and page-load volume (`engagement_index`)

 プロジェクトによる生徒1人あたりの総支出額とページ量の相関関係


**≪Those that have no correlation（相関関係がないもの）≫**

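・Correlation between the share of students eligible for free or reduced-price lunch (`pct_free/reduced`) and the share of students accessing a product (`pct_access`)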

 無料の対象学生と初回アクセスの相関関係
 
・Correlation between the share of students eligible for free or reduced-price lunch (`pct_free/reduced`) and page-load volume (`engagement_index`)

　無料の対象学生とページ量の相関関係
 
・Correlation between the share of Black/Hispanic students (`pct_black/hispanic`) and page-load volume (`engagement_index`)

　ヒスパニックの学生とページ量の相関関係

Because the share of students accessing a product (`pct_access`) is correlated with page-load volume (`engagement_index`), measures that increase the share of students who access the products can be expected to promote learning.

初回アクセスの割合とページ量の相関関係があるため
初回のアクセス数を増やす施策をすることにより学習を促進させる効果が見込める