In [None]:
import os
import numpy as np 
import pandas as pd 
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries imported above, so API changes will not be surfaced.
warnings.simplefilter(action = 'ignore')

# Reading data files 😵

### Product information data
>  The product file products_info.csv includes information about the characteristics of the top 372 products with most users in 2020. The categories listed in this file are part of LearnPlatform's product taxonomy.

> > LP ID                        
> > > The unique identifier of the product

> > URL                           
> > > Web Link to the specific product

> > Product Name                  
> > > Name of the specific product

> > Provider/Company Name         
> > > Name of the product provider

> > Sector(s)                     
> > > Sector of education where the product is used

> > Primary Essential Function    
> > > The basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories has multiple sub-categories with which the products were labeled.



In [None]:
# Load the product metadata table and preview its first rows.
products_df = pd.read_csv(
    filepath_or_buffer="../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv",
)
products_df.head(n=5)  # n=5 is head()'s default, spelled out for clarity

## Districts information data
> The district file districts_info.csv includes information about the characteristics of school districts, including data from
NCES (2018-19),
FCC (Dec 2018), and
Edunomics Lab.

> > district_id       
> > > The unique identifier of the school district

> > state  
> > > The state in which the district is located

> > locale                   
> > > NCES locale classification that categorizes U.S. territory into four types of areas: City, Suburban, Town, and Rural. See Locale Boundaries User's Manual for more information.

> > pct_black/hispanic                    
> > > Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data

> > pct_free/reduced        
> > > Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data

> > countyconnectionsratio               
> > > Ratio (residential fixed high-speed connections over 200 kbps in at least one direction / households) based on the county-level data from FCC Form 477 (December 2018 version). See FCC data for more information.

> > pptotalraw                       
> > > Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district.

In [None]:
# Load the school-district metadata table and preview its first rows.
districts_df = pd.read_csv(
    filepath_or_buffer="../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv",
)
districts_df.head(n=5)  # n=5 is head()'s default, spelled out for clarity

### Engagement data
> The engagement data are aggregated at school district level, and each file in the folder engagement_data represents data from one school district*.

> 📝 The 4-digit file name represents the district_id, which can be used to link to district information in districts_info.csv.

> 📝 The lp_id can be used to link to product information in products_info.csv.

> > Name
> > > Description

> > time
> > > date in "YYYY-MM-DD"

> > lp_id
> > > The unique identifier of the product

> > pct_access
> > > Percentage of students in the district who have at least one page-load event of a given product on a given day

> > engagement_index
> > > Total page-load events per one thousand students of a given product and on a given day

In [None]:
# Build one engagement table from the per-district CSV files.
# Each file in the folder holds one district's data; the file name (without
# extension) is the district id, which is attached to every row as a column.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data'
# Sort so the row order of the combined table is reproducible; glob makes no
# ordering guarantee.
all_files = sorted(glob.glob(path + "/*.csv"))

if not all_files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError('No engagement CSV files found in {!r}'.format(path))

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Take the id from the file's base name rather than from a fixed position
    # in the split path, so this keeps working if `path` changes depth or the
    # platform uses a different separator.
    district_id = os.path.splitext(os.path.basename(filename))[0]
    # NOTE(review): district_id is a string here; cast it before joining with
    # districts_df if that table stores the id as a number - TODO confirm.
    df["district_id"] = district_id
    li.append(df)

# ignore_index=True renumbers the rows 0..n-1 in the same step
# (equivalent to concat followed by reset_index(drop=True)).
engagement_df = pd.concat(li, ignore_index=True)
engagement_df.head()


# Exploratory data analysis 🔎 

### Products table

In [None]:
# Report the (rows, columns) size of the products table.
print(f'Shape of products table= {products_df.shape}')

#### Checking Null values / missing values

In [None]:
# Number of missing (NaN) entries in each column of the products table.
products_df.isna().sum()

In [None]:
# How many rows carry each distinct product name.
products_df['Product Name'].value_counts()

In [None]:
# How many products are listed under each sector label.
products_df['Sector(s)'].value_counts()

In [None]:
# How many products are listed under each primary essential function label.
products_df['Primary Essential Function'].value_counts()

In [None]:
# How many products each provider/company has in the table.
products_df['Provider/Company Name'].value_counts()

#### Data Visualization 📊

In [None]:
# Bar chart: number of products per primary essential function.
sns.set(style = 'darkgrid')
plt.figure(figsize=(12,10))
# Pass the column as `x=`: seaborn >= 0.12 interprets the first positional
# argument as `data`, so a positional Series is no longer counted per category.
sns.countplot(x=products_df['Primary Essential Function'])
# Category labels are long; rotate them so they do not overlap.
plt.xticks(rotation=90);

In [None]:
# Bar chart: number of products per sector label.
plt.figure(figsize=(12,10))
# Pass the column as `x=`: seaborn >= 0.12 interprets the first positional
# argument as `data`, so a positional Series is no longer counted per category.
sns.countplot(x=products_df['Sector(s)'])
plt.xticks(rotation=90);

In [None]:
# Bar chart: number of products per primary essential function.
# NOTE(review): this repeats the plot drawn two cells earlier; a different
# column may have been intended here - TODO confirm.
plt.figure(figsize=(12,10))
# Pass the column as `x=`: seaborn >= 0.12 interprets the first positional
# argument as `data`, so a positional Series is no longer counted per category.
sns.countplot(x=products_df['Primary Essential Function'])
plt.xticks(rotation=90);

### Districts table

In [None]:
# Report the (rows, columns) size of the districts table and its column names.
print(f'Shape of districts table= {districts_df.shape}\n')
print(f'Name of columns:\n {districts_df.columns}')

In [None]:
# Number of missing (NaN) entries in each column of the districts table.
districts_df.isna().sum()

In [None]:
# How many districts fall in each state (rows with a missing state are not counted).
districts_df['state'].value_counts()

In [None]:
# How many districts fall in each locale type (rows with a missing locale are not counted).
districts_df['locale'].value_counts()

#### Data Visualization 📊

In [None]:
# Bar chart: number of districts per state.
plt.figure(figsize=(12,10))
# Pass the column as `x=`: seaborn >= 0.12 interprets the first positional
# argument as `data`, so a positional Series is no longer counted per category.
sns.countplot(x=districts_df['state'])
# Rotate the state names so neighbouring labels do not overlap.
plt.xticks(rotation=90);

In [None]:
# Bar chart: number of districts per locale type.
plt.figure(figsize=(12,10))
# Pass the column as `x=`: seaborn >= 0.12 interprets the first positional
# argument as `data`, so a positional Series is no longer counted per category.
sns.countplot(x=districts_df['locale'])
plt.xticks(rotation=90);

# to be continued....🧐