In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
from IPython.core.display import HTML
import re
from glob import glob

# Data Analysis

### Product information data
    - The product file products_info.csv includes information about the characteristics of the top 372 products with most users in 2020

In [None]:
# Read the product catalogue and report how many products it describes
products_csv = '../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv'
prod_df = pd.read_csv(products_csv)
print(f"# of Product information available: {len(prod_df)}")
prod_df.head()

In [None]:
# One row per product column, showing its pandas dtype
print("Columns details")
prod_df.dtypes.to_frame(name='dtype')

* **LP ID** - The unique identifier of the product
* **URL** - Web Link to the specific product
* **Product Name** - Name of the specific product
* **Provider/Company Name** - Name of the product provider
* **Sector(s)** - Sector of education where the product is used
* **Primary Essential Function** - Has three main categories  
        - LC = Learning & Curriculum, 
        - CM = Classroom Management, and 
        - SDO = School & District Operations

Check for null values in the data; if there are any, fill them with 'Unknown'.

In [None]:

# Rename the headers to snake_case. The assignment is positional, so the list
# must stay in the same order as the six columns of the loaded frame.
snake_case_columns = [
    'lp_id',
    'url',
    'product_name',
    'provider/company_name',
    'sectors',
    'primary_essential_function',
]
prod_df.columns = snake_case_columns
# Number of missing values in each column
prod_df.isna().sum()

In [None]:
# Fill the missing Provider/Company Name with 'Unknown' and store it as str.
# The result is assigned back to the column rather than using
# fillna(inplace=True) on the column selection: that form is a chained
# assignment, which pandas warns about and which does not update prod_df
# when Copy-on-Write is enabled.
prod_df['provider/company_name'] = (
    prod_df['provider/company_name'].fillna('Unknown').astype(str)
)

***Analysis on Sector(s) variable***

In [None]:
# Fill the missing Sectors values with 'Unknown' and store the column as str.
# Assigned back explicitly: fillna(inplace=True) on a column selection is a
# chained assignment and does not update prod_df under pandas Copy-on-Write.
prod_df['sectors'] = prod_df['sectors'].fillna('Unknown').astype(str)
print("List of sectors:")
list(prod_df['sectors'].unique())

> **Some products aim to provide a learning platform for more than one sector of education** 

In [None]:
# Split the multiple values of Sectors into different rows 
# def splitDataFrameList(df,target_column,separator):
#     ''' df = dataframe to split,
#     target_column = the column containing the values to split
#     separator = the symbol used to perform the split
#     returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
#     The values in the other columns are duplicated across the newly divided rows.
#     '''
#     def splitListToRows(row,row_accumulator,target_column,separator):
#         split_row = row[target_column].split(separator)
#         for s in split_row:
#             new_row = row.to_dict()
#             new_row[target_column] = s.strip()
#             row_accumulator.append(new_row)
#     new_rows = []
#     df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
#     new_df = pd.DataFrame(new_rows)
#     return new_df
# prod_df = splitDataFrameList(prod_df, 'Sector(s)', ';')

In [None]:
# Number of products per sector combination
sectors_data = prod_df.groupby('sectors').size().reset_index(name='count')
sectors = sectors_data['sectors'].values
# plt.pie needs exactly one explode offset per wedge, so size the list from
# the data instead of hard-coding six entries.
explode = [0.01] * len(sectors_data)
plt.figure(figsize=(8,8))
plt.pie(sectors_data['count'], labels=sectors, autopct='%1.1f%%', startangle=15, explode=explode)
plt.title('% of Digital Learning Products per Education Sectors')
plt.axis('equal')
plt.show()

> **As the plot indicates that more than 45% of the learning products targets PreK-12 sector, this could be the result of Prek-12 initiative (The PreK-12 initiative works to ensure that all children attending public elementary and secondary schools have access to and receive high-quality educational experiences, with a particular emphasis on improving equity and outcomes for traditionally underserved students.) https://www.newamerica.org/education-policy/prek-12-education/about**

> **Less than 0.5% of the products provide a learning platform for all the sectors under one roof**


***Analysis on Primary Essential Function variable***

In [None]:
# Distinct Primary Essential Function values (still includes NaN at this point)
prod_df['primary_essential_function'].unique().tolist()

In [None]:
# Fill the missing Primary Essential Function values with 'Unknown' and store
# the column as str. Assigned back explicitly: fillna(inplace=True) on a column
# selection is a chained assignment and does not update prod_df under pandas
# Copy-on-Write.
prod_df['primary_essential_function'] = (
    prod_df['primary_essential_function'].fillna('Unknown').astype(str)
)
# The main category is the first whitespace-separated token of the value
# (e.g. the part before " - "); 'Unknown' maps to itself.
prod_df['PSF_main_category'] = prod_df['primary_essential_function'].apply(lambda x: x.split()[0])

In [None]:
# Number of products per main Primary Essential Function category
pef_data = prod_df.groupby('PSF_main_category').size().reset_index(name='count')
category  = pef_data['PSF_main_category'].values
# plt.pie needs exactly one explode offset per wedge, so size the list from
# the data instead of hard-coding five entries.
explode = [0.05] * len(pef_data)
plt.figure(figsize=(8,8))
plt.pie(pef_data['count'], labels=category, autopct='%1.1f%%', startangle=15, explode=explode)
plt.title('% of Digital Learning Products per Primary Essential Function')
plt.axis('equal')
plt.show()

> **There are three main essential functions that are provided by Digital learning platform providers**  

        - LC = Learning & Curriculum,
        
        - CM = Classroom Management, and 
        
        - SDO = School & District Operations

> **More than 70% of the products focus on providing a platform for learning and curriculum.** 

In [None]:
def plot_percentage_with_hue(plot, feature, Number_of_categories, hue_categories):
    """Label every bar of a hue-split count plot with its share of its x-category.

    Parameters
    ----------
    plot : matplotlib Axes
        Axes holding the bars, e.g. the return value of
        ``sns.countplot(x=..., hue=...)``. Annotations are drawn on this
        Axes (not on a global ``ax``).
    feature : pandas Series
        Column plotted on the x axis. Accepted for backward compatibility;
        the per-category totals are computed from the bars themselves.
    Number_of_categories, hue_categories : int
        Accepted for backward compatibility; bars are no longer located by
        computed index, so these values are not used.

    Notes
    -----
    The total of a category is the sum of the heights of its bars, so the
    percentages of one x-category always add up to 100%. This avoids relying
    on ``value_counts()`` order (sorted by frequency), which does not
    generally match the order of the bars on the axis.
    Assumes a vertical categorical plot where category ``i`` is centred at
    x == i — TODO confirm if used with horizontal or native-scale plots.
    """
    # Keep only real bars: NaN or zero-height patches (empty category/hue
    # combinations or placeholder patches) carry no count to annotate.
    bars = [p for p in plot.patches if p.get_height() > 0]

    def category_of(bar):
        # Hue bars are dodged around the integer position of their category,
        # so rounding the bar centre recovers the category index.
        return int(round(bar.get_x() + bar.get_width() / 2))

    totals = {}
    for bar in bars:
        key = category_of(bar)
        totals[key] = totals.get(key, 0) + bar.get_height()

    for bar in bars:
        percentage = '{:.1f}%'.format(100 * bar.get_height() / totals[category_of(bar)])
        x = bar.get_x() + bar.get_width() / 2
        y = bar.get_y() + bar.get_height()
        plot.annotate(percentage, (x, y), size=12, ha='center', va='bottom')
    plt.show()


plt.figure(figsize=(15,8))
# Pass the x column by keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional column name conflicts
# with data=prod_df.
ax = sns.countplot(x="PSF_main_category", hue="sectors", data=prod_df)
# Derive the number of x categories and hue levels from the data instead of
# hard-coding them.
plot_percentage_with_hue(
    ax,
    prod_df['PSF_main_category'],
    prod_df['PSF_main_category'].nunique(),
    prod_df['sectors'].nunique(),
)

> **There is evidence of more focus on the PreK-12 learning process. More products are built to support the PreK-12 education sector**

*Extract sub categories of Primary Essential Function*

In [None]:
# Strip the leading main-category prefix ("LC - ", "CM - ", "SDO - " or
# "LC/CM/SDO - ") and keep the remainder as the sub category; values without
# such a prefix are kept unchanged. The pattern is a raw string without the
# "\/" escapes: "/" needs no escaping in a regex, and "\/" in a normal string
# literal is an invalid escape sequence that newer Python versions warn about.
prod_df['PSF_sub_categories'] = prod_df['primary_essential_function'].apply(
    lambda x: re.split(r"^(LC - |CM - |SDO - |LC/CM/SDO - )", x)[-1]
)

In [None]:
# Count products for every (main category, sub category) pair
group_keys = ['PSF_main_category', 'PSF_sub_categories']
group_sizes = prod_df.groupby(group_keys).size()
psf_groups = group_sizes.reset_index(name='count')
psf_groups

> **Above table gives the comprehensive list of primary essential services provided by the digital platforms.** 

> **More than 70 % of the Product's focus is on Study Tools primarily for PreK-12 sectors(Refer to the Pie chart above)**

> **The remaining 30% is focused on Teacher Resources, Classroom facility, Career Planning and School management**

### District Information Data

The district file districts_info.csv includes information about the characteristics of school districts, including data from NCES (2018-19), FCC (Dec 2018), and Edunomics Lab.

In [None]:
# Read the district characteristics and report how many districts are listed
districts_csv = '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
dist_df = pd.read_csv(districts_csv)
print(f"# of district information available: {len(dist_df)}")
dist_df.head()

In [None]:
# One row per district column, showing its pandas dtype
print("Columns details")
dist_df.dtypes.to_frame(name='dtype')

* **district_id** - the unique identifier of the school district
* **state** - the state where the district resides
* **locale** - four types of areas: City, Suburban, Town, and Rural
* **pct_black/hispanic** - Percentage of students in the districts identified as Black or Hispanic
* **pct_free/reduced** - Percentage of students in the districts eligible for free or reduced-price lunch
* **county_connections_ratio** - residential fixed high-speed connections over 200 kbps in at least one direction/households
* **pp_total_raw** - Per-pupil total expenditure (sum of local and federal expenditure)

In [None]:
# Number of missing values in each district column
dist_df.isna().sum()

A lot of values are missing in the district information; let's mark those values as unknown.

Remove the rows in which more than 5 columns are NaN.

In [None]:
# Drop districts that have more than MAX_MISSING_COLUMNS missing values.
# The previous condition (`< 7`) compared against the total number of columns
# documented for this file (7), so a row could only be dropped if every column,
# including district_id, were NaN; `<= 5` implements the stated rule
# "remove rows with more than 5 NaN columns".
MAX_MISSING_COLUMNS = 5
dist_df = dist_df[dist_df.isnull().sum(axis=1) <= MAX_MISSING_COLUMNS]
print(dist_df.shape)
dist_df.isnull().sum()

***Analysis on locale variable***

There are four types of areas: City, Suburban, Town, and Rural. Let's understand the distribution of the schools by locale

In [None]:
def plot_percentage_without_hue(plot, feature):
    """Label every bar of a count plot with its share of ``len(feature)``.

    Parameters
    ----------
    plot : matplotlib Axes
        Axes holding the bars, e.g. the return value of ``sns.countplot``.
        Bars are read from and annotations drawn on this Axes (not on a
        global ``ax``).
    feature : sized collection (e.g. pandas Series)
        Its length is the denominator of every percentage, so rows with
        missing values in the plotted column still count towards the total.
    """
    total = len(feature)
    # With an empty feature there is nothing meaningful to annotate and
    # dividing by zero must be avoided.
    if total > 0:
        for p in plot.patches:
            percentage = '{:.1f}%'.format(100 * p.get_height()/total)
            # Shift slightly left of the bar centre so the label looks centred
            x = p.get_x() + p.get_width() / 2 - 0.05
            y = p.get_y() + p.get_height()
            plot.annotate(percentage, (x, y), size = 12)
    plt.show()
    

# Count of school districts per locale, each bar labelled with its percentage
fig, ax = plt.subplots(figsize=(10,8))
sns.countplot(x='locale', data=dist_df, ax=ax)
plt.xticks(size=12)
plt.title("Distribution of School District per locale")
plot_percentage_without_hue(ax, dist_df.locale)
plt.show()

> **The plot indicates that 59.1% of school districts are located in suburban areas. It would be interesting to know what drives the number of school districts in each locale: is it the population, economic conditions, or something else?**

***Analysis on pct_black/hispanic variable***

This variable indicates the proportion of Black and Hispanic students in a given school district

In [None]:
# Distinct pct_black/hispanic bins present in the district data
pd.unique(dist_df['pct_black/hispanic'])

Values are presented in [reverse-bracket](https://proofwiki.org/wiki/Definition:Real_Interval/Notation/Reverse-Bracket) notation 

[𝑎, 𝑏 [ := {𝑥 ∈ ℝ: 𝑎 ≤ 𝑥 < 𝑏} - Half-open on the right

In [None]:
plt.figure(figsize=(10,8))
ax = sns.countplot(x='pct_black/hispanic', data=dist_df)
plt.xticks(size=12)
plt.title("% Distribution of Black/hispanic Students accross the school district")
# Pass the plotted column (not `locale`) as the denominator source; the helper
# uses its length, i.e. the number of districts. The helper already calls
# plt.show(), so no second call is needed.
plot_percentage_without_hue(ax, dist_df['pct_black/hispanic'])

> **Nearly 65.9% of the School districts has 0 to 20% of Black/Hispanic students**

> **3.4% of school districts with only/mostly Black/Hispanic Students**

In [None]:
plt.figure(figsize=(15,8))
# Pass the x column by keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional column name conflicts
# with data=dist_df.
ax = sns.countplot(x="pct_black/hispanic", hue="locale", data=dist_df)
# Derive the number of x categories and hue levels from the data (nunique
# ignores NaN, matching the bars that are drawn) instead of hard-coding them.
plot_percentage_with_hue(
    ax,
    dist_df['pct_black/hispanic'],
    dist_df['pct_black/hispanic'].nunique(),
    dist_df['locale'].nunique(),
)

> **School districts in Rural and Town locales only have pct_black/hispanic values in the 0 to 60% range**

***Analysis on pct_free/reduced variable***

This variable indicates the proportion of free or reduced lunch price eligible students 

In [None]:
# Distinct pct_free/reduced bins present in the district data
pd.unique(dist_df['pct_free/reduced'])

In [None]:
plt.figure(figsize=(10,8))
ax = sns.countplot(x='pct_free/reduced', data=dist_df)
plt.xticks(size=12)
plt.title("% distribution of students eligible for free or reduced lunch accross the school district")
# Pass the plotted column (not `locale`) as the denominator source; the helper
# uses its length, i.e. the number of districts. The helper already calls
# plt.show(), so no second call is needed.
plot_percentage_without_hue(ax, dist_df['pct_free/reduced'])

> **There are only 2.3% of the school districts which has 80 to 100% of students who are eligible for free or reduced lunch**

> **There are 26.1% of school districts which has no or 20% of the students eligible for free or reduced lunch**

***Analysis on countyconnectionsratio variable***

ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households)

In [None]:
# Distinct county_connections_ratio bins present in the district data
pd.unique(dist_df['county_connections_ratio'])

In [None]:
plt.figure(figsize=(10,8))
ax = sns.countplot(x='county_connections_ratio', data=dist_df)
plt.xticks(size=12)
plt.title("% distribution of connection ratio accross the school district")
# Pass the plotted column (not `locale`) as the denominator source; the helper
# uses its length, i.e. the number of districts. The helper already calls
# plt.show(), so no second call is needed.
plot_percentage_without_hue(ax, dist_df['county_connections_ratio'])

***Analysis on pptotalraw variable***

Per-pupil total expenditure (sum of local and federal expenditure)

In [None]:
# Distinct pp_total_raw bins present in the district data
pd.unique(dist_df['pp_total_raw'])

**Total per-pupil expenditure across school districts**

In [None]:
plt.figure(figsize=(15,8))
ax = sns.countplot(x='pp_total_raw', data=dist_df)
plt.xticks(size=8)
plt.title("% distribution per pupil total expenditure  accross the school district")
# Pass the plotted column (not `locale`) as the denominator source; the helper
# uses its length, i.e. the number of districts. The helper already calls
# plt.show(), so no second call is needed.
plot_percentage_without_hue(ax, dist_df['pp_total_raw'])

> **There are less than 1% of school districts whose per pupil expenditure is in the range of 32000 to 34000**

> **Distribution is right tailed, there are less than 10% of school districts which spends more than 20000**

***Analysis on state variable***

In [None]:
# nunique() ignores NaN, so districts without a state are not counted as an
# extra "state" (len(unique()) would include NaN as one value).
print(f"Number of states where school district details are given: {dist_df['state'].nunique()}")

### Analysis on engagement data

In [None]:
eng_data_path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# glob returns files in arbitrary, platform-dependent order; sort so the file
# shown below and the row order of anything built from this list are reproducible.
csv_files = sorted(glob(eng_data_path + "/*.csv"))
csv_files[0]

> Extracting the district id from the file path and concatenating all the files into a single dataframe

> Merging the product, district and engagement data into a single dataframe. This might omit the data points in the engagement data for which product details are not available 

In [None]:
# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the input directory is empty or missing.
if not csv_files:
    raise FileNotFoundError("No engagement CSV files found")

dfs = []

for filename in csv_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Each file is named "<district_id>.csv". Take the id from the end of the
    # path rather than from a fixed path depth, so the cell keeps working if
    # the data directory is moved.
    district_id = re.search(r"(\d+)\.csv$", filename).group(1)
    df["district_id"] = int(district_id)
    # Default (inner) merges: engagement rows without a matching district or
    # product are dropped.
    merge_dist_df = pd.merge(df, dist_df, on ='district_id')
    merge_prod_df = pd.merge(merge_dist_df, prod_df, on ='lp_id')
    dfs.append(merge_prod_df)

# ignore_index=True yields a fresh 0..n-1 index, same as concat + reset_index(drop=True)
eng_df = pd.concat(dfs, ignore_index=True)
print(f"# of data points in engagement data: {eng_df.shape[0]}")
eng_df.head()

In [None]:
# Keep only the rows that have an engagement_index, then count the
# remaining missing values per column
has_engagement_index = eng_df['engagement_index'].notnull()
eng_df_c = eng_df[has_engagement_index]
eng_df_c.isnull().sum()

Some of the data points are missing for few columns 

# WIP ***Analysis on pct_access***

Percentage of students in the district have at least one page-load event of a given product and on a given day

In [None]:
# Load two individual district files for a closer look. Reuse eng_data_path
# (defined with the glob above) instead of repeating the hard-coded directory.
df_1000 = pd.read_csv(f"{eng_data_path}/1000.csv")
df_1039 = pd.read_csv(f"{eng_data_path}/1039.csv")