<h1 style="font-family:'lucida console';"> <center>📚 DIGITAL LEARNING IN THE PANDEMIC PERIOD (2020) 🖥️</center> </h1>

In [None]:
# Libraries and some environment configurations
try:
    import openpyxl
except:
    !pip install openpyxl
import pandas as pd
import re
import glob
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from warnings import simplefilter
import datetime as dt
from IPython.display import Markdown as md
# from IPython.display import Image
import gc
gc.collect()

# Silence pandas PerformanceWarning messages for the rest of the session.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Display options: floats rendered with two decimals, no cap on shown columns.
pd.options.display.float_format = lambda x: '%.2f' % x
pd.options.display.max_columns = None

# Ten-colour hex palette.
pal2 = [
    "#ffcbf2", "#f3c4fb", "#ecbcfd", "#e5b3fe", "#e2afff",
    "#deaaff", "#d8bbff", "#d0d1ff", "#c8e7ff", "#c0fdff",
]

In [None]:
# Functions that will be used in the process

def reduce_mem_usage(df, verbose=False):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Conversion rules, applied column by column:

    * ``object`` columns become ``category``.
    * NumPy integer columns become the narrowest signed integer type
      (``int8`` .. ``int64``) whose range contains the column's min and max.
    * ``float64`` columns become ``float32`` when their min and max fit.
    * Every other dtype (bool, datetime, timedelta, existing categoricals,
      pandas extension dtypes) is left untouched.

    ``float16`` is deliberately never used: it represents integers exactly
    only up to 2048, so identifier-like values stored as floats (for example
    an id column that contains NaN) would be silently rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    verbose : bool, default False
        When True, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (categorical, nullable ints, ...) are not handled here.
        if not isinstance(col_type, np.dtype):
            continue

        # Dispatch on dtype.kind: comparing str(dtype) with 'int' never matches
        # ('int64'), and np.issubdtype would also accept timedelta64 as integer.
        if col_type.kind in ("i", "u"):
            c_min, c_max = df[col].min(), df[col].max()
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values of a type do fit in it.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == "f" and col_type.itemsize > 4:
            c_min, c_max = df[col].min(), df[col].max()
            limits = np.finfo(np.float32)
            # An all-NaN column fails both comparisons and keeps its dtype.
            if limits.min < c_min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file; its first row is used as the header.

    Returns
    -------
    pandas.DataFrame
        The parsed frame after being passed through ``reduce_mem_usage``.
    """
    # No date parsing is requested here: ``parse_dates=True`` only ever applies
    # to an index column (none is used), and ``keep_date_col`` is deprecated in
    # pandas 2.2 and removed in 3.0.
    df = pd.read_csv(file, header=0)
    return reduce_mem_usage(df)


In [None]:
# Single root for every input file so the loaders cannot point at different places.
DATA_DIR = "/kaggle/input/learnplatform-covid19-impact-on-digital-learning"
# 2020 is a leap year; only district files covering all 366 dates are kept.
DAYS_IN_2020 = 366

product_df = pd.read_csv(f"{DATA_DIR}/products_info.csv")

# district_id is a join key: read it as text so it never goes through a numeric
# conversion that could reformat or round it.
district_df = reduce_mem_usage(
    pd.read_csv(f"{DATA_DIR}/districts_info.csv", header=0, dtype={"district_id": object})
)

engagement_files = sorted(glob.glob(f"{DATA_DIR}/engagement_data/*.csv"))
if not engagement_files:
    raise FileNotFoundError(f"No engagement CSV files found under {DATA_DIR}/engagement_data")

complete_frames = []
for filename in engagement_files:
    df = import_data(filename)
    # The district id is the file name without directory and extension; taking
    # the last path component works whatever the depth of DATA_DIR.
    df["district_id"] = filename.replace("\\", "/").rsplit("/", 1)[-1].rsplit(".", 1)[0]
    if df.time.nunique() == DAYS_IN_2020:
        complete_frames.append(df)

# ignore_index=True already yields a fresh 0..n-1 index.
engagement_df = pd.concat(complete_frames, axis=0, ignore_index=True)

engagement_df['district_id'] = engagement_df['district_id'].astype(str)
# Strip a trailing ".0" in case an id was written as a float. regex=True is
# required: since pandas 2.0 str.replace treats the pattern literally by default.
district_df['district_id'] = (
    district_df['district_id'].astype(str).str.replace(r'\.0$', '', regex=True)
)

del complete_frames, engagement_files

gc.collect()

In [None]:
# Report (rows, columns) for each loaded table, with the label printed in bold.
BOLD, RESET = '\033[1m', '\033[0m'
for label, frame in (
    ("Shape of the Engagement File ", engagement_df),
    ("Shape of the District File", district_df),
    ("Shape of the Product File", product_df),
):
    print(f"{BOLD}{label}{RESET}", frame.shape)

In [None]:
# Styled preview of the first rows of the district table: missing values are
# rendered as "MISSING" and selected rows are outlined with a dashed red border.
show = district_df.head().style.format(precision=0, na_rep='MISSING')

cell_hover = {  # for row hover use <tr> instead of <td>
    'selector': 'td:hover',
    'props': [('background-color', '#ffffb3')]
}
headers = {
    'selector': 'th:not(.index_name)',
    'props': 'background-color: #000066; color: white;'
}
show.set_table_styles([cell_hover, headers])
show.set_table_styles([
    {'selector': 'th.col_heading', 'props': 'text-align: center; font-size:1.25em'},
    {'selector': 'td', 'props': 'text-align: center;'},
], overwrite=False)
show.set_caption("First 5 rows from district table")\
 .set_table_styles([{
     'selector': 'caption',
     'props': 'caption-side: bottom; font-size:1.25em;font-style: italic;'
 }], overwrite=False)

show.set_table_styles([  # create internal CSS classes
    {'selector': '.border-red', 'props': 'border: 2px dashed red;'}
], overwrite=False)

# Row positions to outline.
# NOTE(review): presumably the preview rows with missing district details - confirm.
highlighted_rows = [1, 3, 4]

# Build the class table from the preview's own shape so it keeps working when
# the number of columns (or of preview rows) changes.
cell_border = pd.DataFrame(' ', index=show.index, columns=show.columns)
cell_border.iloc[[pos for pos in highlighted_rows if pos < len(cell_border)], :] = 'border-red '

show.set_td_classes(cell_border)

In [None]:
# Per-column count of non-null values among the rows that have exactly six
# missing fields. A boolean mask is used instead of feeding index labels to
# .iloc, which is only correct while the index is the default 0..n-1 range.
rows_with_six_missing = district_df.isnull().sum(axis=1) == 6
district_df[rows_with_six_missing].count()

In [None]:
del show
gc.collect()

# Keep districts that have a state and that appear in the engagement data.
has_state = district_df["state"].notna()
has_engagement = district_df["district_id"].isin(engagement_df["district_id"].unique())
district_df = district_df[has_state & has_engagement].reset_index(drop=True)
district_df.head()

In [None]:
# Style rules for the preview table.
cell_hover = dict(  # for row hover use <tr> instead of <td>
    selector='td:hover',
    props=[('background-color', '#ffffb3')],
)
# Defined but not passed to the Styler below.
index_names = dict(
    selector='.index_name',
    props='font-style: italic; color: darkgrey; font-weight:normal;',
)
headers = dict(
    selector='th:not(.index_name)',
    props='background-color: #000066; color: white;',
)

# Styled preview of the first rows of the product table; missing values are
# rendered as "MISSING".
show = (
    product_df.head()
    .style.format(precision=0, na_rep='MISSING')
    .set_table_styles([cell_hover, headers])
    .set_table_styles(
        [
            {'selector': 'th.col_heading', 'props': 'text-align: center; font-size:1.25em'},
            {'selector': 'td', 'props': 'text-align: center;'},
        ],
        overwrite=False,
    )
    .set_caption("First 5 rows from product table")
    .set_table_styles(
        [{'selector': 'caption',
          'props': 'caption-side: bottom; font-size:1.25em;font-style: italic;'}],
        overwrite=False,
    )
)
show

In [None]:
del show
gc.collect()

# One indicator column per sector named in the "; "-separated 'Sector(s)' field,
# named "sector_<name without spaces>".
sector_flags = product_df['Sector(s)'].str.get_dummies(sep="; ")
sector_flags.columns = [f"sector_{name.replace(' ', '')}" for name in sector_flags.columns]
product_df = product_df.join(sector_flags)

del sector_flags
gc.collect()

# Split 'Primary Essential Function' on " - " into its first and second parts.
# The .str accessor keeps missing values as NaN and yields NaN (instead of
# raising IndexError) when a value has no " - " separator.
function_parts = product_df['Primary Essential Function'].str.split(' - ')
product_df['primary_function_main'] = function_parts.str[0]
product_df['primary_function_sub'] = function_parts.str[1]
del function_parts

# Synchronize similar values
product_df['primary_function_sub'] = product_df['primary_function_sub'].replace(
    {'Sites, Resources & References': 'Sites, Resources & Reference'}
)
product_df = product_df.drop(columns="Primary Essential Function")

# Keep only the products that occur in the engagement data.
product_df = product_df[product_df['LP ID'].isin(engagement_df.lp_id.unique())].reset_index(drop=True)

In [None]:
# Preview the first rows of product_df.
product_df.head()

In [None]:
# Style rules for the preview table.
cell_hover = dict(  # for row hover use <tr> instead of <td>
    selector='td:hover',
    props=[('background-color', '#ffffb3')],
)
# Defined but not passed to the Styler below.
index_names = dict(
    selector='.index_name',
    props='font-style: italic; color: darkgrey; font-weight:normal;',
)
headers = dict(
    selector='th:not(.index_name)',
    props='background-color: #000066; color: white;',
)

# Styled preview of the first rows of the engagement table; missing values are
# rendered as "MISSING".
show = (
    engagement_df.head()
    .style.format(precision=0, na_rep='MISSING')
    .set_table_styles([cell_hover, headers])
    .set_table_styles(
        [
            {'selector': 'th.col_heading', 'props': 'text-align: center; font-size:1.25em'},
            {'selector': 'td', 'props': 'text-align: center;'},
        ],
        overwrite=False,
    )
    .set_caption("First 5 rows from engagement table")
    .set_table_styles(
        [{'selector': 'caption',
          'props': 'caption-side: bottom; font-size:1.25em;font-style: italic;'}],
        overwrite=False,
    )
)
show

In [None]:
# Keep only engagement rows whose product is listed in product_df and whose
# district is listed in district_df, printing the row count at each step.
print(len(engagement_df))

known_products = product_df['LP ID'].unique()
engagement_df = engagement_df.loc[engagement_df['lp_id'].isin(known_products)]
print(len(engagement_df))

known_districts = district_df['district_id'].unique()
engagement_df = engagement_df.loc[engagement_df['district_id'].isin(known_districts)]
print(len(engagement_df))

In [None]:
# Number of missing values in each column of engagement_df.
missing_per_column = engagement_df.isna().sum()
print(missing_per_column)

In [None]:
# Attach product attributes (lp_id -> 'LP ID') and district attributes
# (district_id) to every engagement row; inner joins drop unmatched rows.
full_table = (
    engagement_df
    .merge(product_df, how="inner", left_on='lp_id', right_on='LP ID')
    .merge(district_df, how="inner", on='district_id')
)

# Discard rows without an engagement index or without a state.
full_table = full_table[full_table["engagement_index"].notna()].dropna(subset=["state"])

In [None]:
# (rows, columns) of the merged table.
full_table.shape

In [None]:
# Preview the first rows of the merged table.
full_table.head()

In [None]:
# Date-derived features, computed with vectorized operations instead of
# row-wise apply() calls.
full_table["time"] = pd.to_datetime(full_table["time"])

# "holiday" is 1 when the day of week is 5 or 6 (Saturday/Sunday), else 0.
full_table["holiday"] = full_table["time"].dt.dayofweek.isin([5, 6]).astype(int)

# "is_pandemic" is 0 for the daily timestamps from 2020-01-01 to 2020-01-19
# (inclusive) and 1 for every other value.
pre_pandemic_days = pd.date_range(start="2020-01-01", end="2020-01-19")
full_table["is_pandemic"] = (~full_table["time"].isin(pre_pandemic_days)).astype(int)

In [None]:
# Preview the first rows of full_table, now including the date-derived columns.
full_table.head()

# 1. Acesso e engajamento em fins de semana (coluna `holiday`), pré e pós-pandemia

In [None]:
# Mean pct_access per "holiday" flag for rows with is_pandemic == 0.
full_table.loc[full_table["is_pandemic"] == 0].groupby('holiday')['pct_access'].mean()

In [None]:
# Mean pct_access per "holiday" flag for rows with is_pandemic == 1.
full_table.loc[full_table["is_pandemic"] == 1].groupby('holiday')['pct_access'].mean()

In [None]:
# Mean engagement_index per "holiday" flag for rows with is_pandemic == 0.
full_table.loc[full_table["is_pandemic"] == 0].groupby('holiday')['engagement_index'].mean()

In [None]:
# Mean engagement_index per "holiday" flag for rows with is_pandemic == 1.
full_table.loc[full_table["is_pandemic"] == 1].groupby('holiday')['engagement_index'].mean()

# 2. Ferramenta mais popular

In [None]:
# Mean engagement index per product. The variable name (including its
# spelling) is kept because the next cell refers to it.
most_popular_produscts = full_table.groupby('Product Name')['engagement_index'].agg('mean')

In [None]:
# Ten products with the highest mean engagement index, best first.
product_ranking = most_popular_produscts.reset_index()
product_ranking.sort_values(by='engagement_index', ascending=False, ignore_index=True).head(10)

In [None]:
# TODO: add the historical analysis here

# 3. Estados com maior engajamento

In [None]:
# Ten states with the highest mean engagement index, best first.
most_engaged_states = full_table.groupby('state')['engagement_index'].agg('mean')
state_ranking = most_engaged_states.reset_index()
state_ranking.sort_values(by='engagement_index', ascending=False, ignore_index=True).head(10)

In [None]:
# Mean engagement index per locale, highest first (at most ten rows).
# Note: the result is stored under the name most_engaged_states even though it
# is grouped by locale.
most_engaged_states = full_table.groupby('locale')['engagement_index'].agg('mean')
locale_ranking = most_engaged_states.reset_index()
locale_ranking.sort_values(by='engagement_index', ascending=False, ignore_index=True).head(10)

In [None]:
# Ten (state, locale) pairs with the highest mean engagement index, best first.
# Note: the result is stored under the name most_engaged_states even though it
# is grouped by both state and locale.
most_engaged_states = full_table.groupby(['state', 'locale'])['engagement_index'].agg('mean')
state_locale_ranking = most_engaged_states.reset_index()
state_locale_ranking.sort_values(by='engagement_index', ascending=False, ignore_index=True).head(10)

# 4. Demografia

In [None]:
# Mean pct_access for each value of the 'pct_black/hispanic' column.
full_table.groupby('pct_black/hispanic')['pct_access'].agg('mean')

In [None]:
# Mean engagement_index for each value of the 'pct_black/hispanic' column.
full_table.groupby('pct_black/hispanic')['engagement_index'].agg('mean')

# 5. Empresas com maior número de produtos

In [None]:
# Ten providers with the most products in product_df, most frequent first.
provider_counts = product_df['Provider/Company Name'].value_counts()
provider_counts.head(10).reset_index()

Aqui gostaria ainda de testar se as ferramentas das empresas com mais produtos são aquelas com melhor aceitação (isto é, se há alguma correlação entre as duas coisas) ou se algum fornecedor de apenas um produto (especializado) tem os melhores resultados.