In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:purple;
           font-size:60%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
<a id="districts_data"> </a>
Districts data
</h1>
</div>

Districts data provides information about the characteristics of school districts

Shape of districts_data = (233, 7)

Columns in districts_data: 'district_id', 'state', 'locale', 'pct_black/hispanic','pct_free/reduced', 'county_connections_ratio', 'pp_total_raw'

In [None]:
# Read the district-level metadata that ships with the dataset and preview the first rows.
districts_data = pd.read_csv(
    "../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
)
districts_data.head()

In [None]:
def count_values(df, col_name):
    """Tabulate how often each distinct value of a column occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to summarise.
    col_name : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-null value, most frequent first, with the
        columns ``[col_name, 'count', 'percentage']``. ``percentage`` is the
        share of non-null rows, scaled to 0-100.
    """
    # value_counts() drops NaN by default, so percentages are relative to non-null rows.
    counts = df[col_name].value_counts()

    # Build the frame from the index/values explicitly instead of relying on the
    # name pandas gives the value_counts() result: it is the column name in
    # pandas 1.x but 'count' in pandas 2.x, which made the old lookup fail.
    series = pd.DataFrame({col_name: counts.index, 'count': counts.to_numpy()})
    series['percentage'] = series['count'] / counts.sum() * 100

    return series

In [None]:
def plotbar_horizontal(series, value_col, label_col, percentage_col, title, palette = None,  color = 'gray', percentage_text = True,
                        xlabel_text = "", ylabel_text = "", figsize = (12,10), barwidth=0.5, pctlcnadjust=1):
    """Draw a bar chart with the categories along the x-axis and the values as bar heights.

    Parameters
    ----------
    series : pandas.DataFrame
        Table holding one row per bar.
    value_col : str
        Column of ``series`` plotted on the y-axis (bar height).
    label_col : str
        Column of ``series`` plotted on the x-axis; also used for the tick labels,
        which are rotated by 90 degrees.
    percentage_col : str
        Column of ``series`` whose values are printed above the bars as "NN%".
        Only read when ``percentage_text`` is true.
    title : str
        Figure title.
    palette, color
        Forwarded unchanged to ``seaborn.barplot``.
    percentage_text : bool
        Whether to annotate every bar with its percentage.
    xlabel_text : str
        Label for the value axis. Note that it is applied as the matplotlib y-label.
    ylabel_text : str
        Label for the category axis. Note that it is applied as the matplotlib x-label.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    barwidth : float
        Width every bar is resized to (bars stay centred on their tick).
    pctlcnadjust : float
        Offset added to the bar height to position the percentage text.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(1, 1)
    ax = fig.add_subplot(gs[0, 0])

    # Hide all four spines for a frameless look
    for s in ["right", "top","left", "bottom"]:
        ax.spines[s].set_visible(False)

    value = series[value_col]
    label = series[label_col]

    sns.barplot(ax=ax, x=label, y=value, color=color, palette=palette)

    # Customize/Remove ticks
    ax.tick_params(axis = "y", which = "both", left = False, labelcolor='gray')
    ax.tick_params(width=0, length=0)
    ax.set_xticklabels(label, rotation=90)
    # Axis labels are deliberately crossed: xlabel_text describes the values (y-axis),
    # ylabel_text describes the categories (x-axis); existing callers rely on this.
    ax.set_ylabel(xlabel_text,color='lightgray')
    ax.set_xlabel(ylabel_text,color='lightgray')

    plt.title(title, fontsize=14, weight='bold', color="gray")

    # Change bar width
    def change_width_horizontal(ax, new_value):
        for patch in ax.patches :
            current_width = patch.get_width()
            diff = current_width - new_value

            # we change the bar width
            patch.set_width(new_value)

            # we recenter the bar
            patch.set_x(patch.get_x() + diff * .5)

    # Honour the caller-supplied width (previously a hard-coded .5 ignored `barwidth`)
    change_width_horizontal(ax, barwidth)

    if percentage_text:
        # Patches are paired with the rows of `series` in order
        for p, percentage in zip(ax.patches, series[percentage_col]):
            label_1 = "%.0f%%" %percentage
            x = p.get_x() + p.get_width() / 2
            y = p.get_height() + pctlcnadjust
            ax.text(x, y, label_1, ha='center', va='center', color='gray', fontsize=8)

    plt.show()

In [None]:
# Number of districts per state, plus each state's share of the non-null rows.
col_name = 'state'
state_series = count_values(df=districts_data, col_name=col_name)
state_series

In [None]:
# Chart settings for the per-state district counts.
title = 'Percentage of Districts from each State in the data'
value_col, label_col, percentage_col = 'count', 'state', 'percentage'
xlabel_text, ylabel_text = "Number of rows", ''
figsize = (12, 10)
barwidth = 0.5
pctlcnadjust = 0.8  # gap between the top of a bar and its percentage text
palette, color = None, 'gray'
percentage_text = True

plotbar_horizontal(
    state_series, value_col, label_col, percentage_col, title,
    palette=palette,
    color=color,
    percentage_text=percentage_text,
    xlabel_text=xlabel_text,
    ylabel_text=ylabel_text,
    figsize=figsize,
    barwidth=barwidth,
    pctlcnadjust=pctlcnadjust,
)

In [None]:
def plotbar_vertical(series, value_col, label_col, percentage_col, title, palette = None,  color = 'gray', percentage_text = True,
                    xlabel_text = "", ylabel_text = "", figsize = (12,10), barwidth=0.5, pctlcnadjust=1):
    """Draw a bar chart with the categories along the y-axis and the values as bar lengths.

    ``series`` is a DataFrame with one row per bar: ``value_col`` gives the bar
    length (x-axis), ``label_col`` the category (y-axis) and ``percentage_col``
    the number printed next to each bar as "NN%" when ``percentage_text`` is
    true. ``palette`` and ``color`` are forwarded to ``seaborn.barplot``.
    ``xlabel_text`` is applied as the matplotlib y-label and ``ylabel_text`` as
    the x-label. Every bar is resized to a thickness of ``barwidth`` and the
    percentage text is placed ``pctlcnadjust`` past the end of the bar. The
    x tick labels are removed. Returns nothing; the figure is shown directly.
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Frameless look: hide every spine
    for side in ("right", "top", "left", "bottom"):
        ax.spines[side].set_visible(False)

    sns.barplot(ax=ax, x=series[value_col], y=series[label_col], color=color, palette=palette)

    # Tick and label cosmetics
    ax.tick_params(axis="x", which="both", left=False, labelcolor='gray')
    ax.tick_params(width=0, length=0)
    ax.set_xticklabels([])  # values are conveyed by the annotations instead
    ax.set_ylabel(xlabel_text, color='lightgray')
    ax.set_xlabel(ylabel_text, color='lightgray')

    plt.title(title, fontsize=14, weight='bold', color="gray")

    # Resize each bar to `barwidth` and shift it so it stays centred on its tick
    for bar in ax.patches:
        excess = bar.get_height() - barwidth
        bar.set_height(barwidth)
        bar.set_y(bar.get_y() + excess * .5)

    if percentage_text:
        # Bars are paired with the rows of `series` in order
        for bar, pct in zip(ax.patches, series[percentage_col]):
            ax.text(
                bar.get_width() + pctlcnadjust,
                bar.get_y() + bar.get_height() / 2,
                "%.0f%%" % pct,
                ha='center', va='center', color='gray', fontsize=10,
            )

    plt.show()

In [None]:
# Number of districts per locale type, plus each locale's share of the non-null rows.
col_name = 'locale'
locale_series = count_values(df=districts_data, col_name=col_name)
locale_series

In [None]:
# Chart settings for the locale breakdown; the 'Suburb' bar is drawn in dark red.
title = 'Distribution of locale in the data'
value_col, label_col, percentage_col = 'count', 'locale', 'percentage'
palette = ['darkred' if locale == 'Suburb' else 'gray' for locale in locale_series['locale']]
color = 'gray'
percentage_text = True
xlabel_text, ylabel_text = "", ''
figsize = (8, 6)
barwidth = 0.3
pctlcnadjust = 5

plotbar_vertical(
    locale_series, value_col, label_col, percentage_col, title,
    palette=palette,
    color=color,
    percentage_text=percentage_text,
    xlabel_text=xlabel_text,
    ylabel_text=ylabel_text,
    figsize=figsize,
    barwidth=barwidth,
    pctlcnadjust=pctlcnadjust,
)

**pct_black/hispanic** - Percentage of students in the districts identified as Black or Hispanic based on 2018-19 NCES data

How to interpret the value "[0.8, 1[" for pct_black/hispanic variable?

It means that the share of Black/Hispanic students in that district falls in the range 0.8–1 (i.e. 80–100%). For the purpose of data de-identification, some of the categories only show a range and not the exact value.

In [None]:
col_name = 'pct_black/hispanic'
population_series = count_values(df=districts_data, col_name=col_name)

# Translate the half-open interval codes used in the raw data into readable percentage bands.
ranges = {
    "[0, 0.2[": "0-20%",
    "[0.2, 0.4[": "20-40%",
    "[0.4, 0.6[": "40-60%",
    "[0.6, 0.8[": "60-80%",
    "[0.8, 1[": "80-100%",
}
population_series["Balck/Hispanic_population"] = population_series[col_name].map(ranges)

population_series

In [None]:
# Chart settings for the Black/Hispanic share bands; the '0-20%' bar is drawn in dark red.
title = 'Distribution of Black/Hispanic population in the districts'
value_col, label_col, percentage_col = 'count', 'Balck/Hispanic_population', 'percentage'
palette = ['darkred' if band == '0-20%' else 'gray'
           for band in population_series['Balck/Hispanic_population']]
color = 'gray'
percentage_text = True
xlabel_text, ylabel_text = "", ''
figsize = (8, 6)
barwidth = 0.4
pctlcnadjust = 5

plotbar_vertical(
    population_series, value_col, label_col, percentage_col, title,
    palette=palette,
    color=color,
    percentage_text=percentage_text,
    xlabel_text=xlabel_text,
    ylabel_text=ylabel_text,
    figsize=figsize,
    barwidth=barwidth,
    pctlcnadjust=pctlcnadjust,
)

**pct_free/reduced** - Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data

In [None]:
col_name = 'pct_free/reduced'
income_series = count_values(df=districts_data, col_name=col_name)

# Translate the half-open interval codes used in the raw data into readable percentage bands.
ranges = {
    "[0, 0.2[": "0-20%",
    "[0.2, 0.4[": "20-40%",
    "[0.4, 0.6[": "40-60%",
    "[0.6, 0.8[": "60-80%",
    "[0.8, 1[": "80-100%",
}
income_series["FreeLunchEligible_population"] = income_series[col_name].map(ranges)

income_series

In [None]:
# Chart settings for the free/reduced-price lunch eligibility bands.
title = 'Distribution of Free/Reduced-price lunch eligible population in the districts'
value_col, label_col, percentage_col = 'count', 'FreeLunchEligible_population', 'percentage'
palette, color = None, 'gray'
percentage_text = True
xlabel_text, ylabel_text = "", ''
figsize = (12, 8)
barwidth = 0.3
pctlcnadjust = 1.3

plotbar_vertical(
    income_series, value_col, label_col, percentage_col, title,
    palette=palette,
    color=color,
    percentage_text=percentage_text,
    xlabel_text=xlabel_text,
    ylabel_text=ylabel_text,
    figsize=figsize,
    barwidth=barwidth,
    pctlcnadjust=pctlcnadjust,
)

**county_connections_ratio** - ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version). See FCC data for more information.

In [None]:
# Number of districts per connection-ratio band, plus each band's share of the non-null rows.
col_name = 'county_connections_ratio'
connection_series = count_values(df=districts_data, col_name=col_name)
connection_series

In [None]:
# Chart settings for the county connection-ratio bands.
title = 'Distribution of HighSpeed Internet connection per Household Ratio in the districts'
value_col, label_col, percentage_col = 'count', 'county_connections_ratio', 'percentage'
palette, color = None, 'gray'
percentage_text = True
xlabel_text, ylabel_text = "", ''
figsize = (8, 6)
barwidth = 0.3
pctlcnadjust = 5

plotbar_vertical(
    connection_series, value_col, label_col, percentage_col, title,
    palette=palette,
    color=color,
    percentage_text=percentage_text,
    xlabel_text=xlabel_text,
    ylabel_text=ylabel_text,
    figsize=figsize,
    barwidth=barwidth,
    pctlcnadjust=pctlcnadjust,
)

**pp_total_raw** - Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district.

In [None]:
def find_ppvalue_starting(s):
    """Return the first whitespace-separated token of ``s`` as an int.

    Every '[' and ',' in ``s`` is treated as a space before tokenising, so an
    interval string such as "[4000, 6000[" yields its lower bound, 4000.
    """
    # Map both separator characters to spaces in a single pass, then tokenise.
    tokens = s.translate(str.maketrans('[,', '  ')).split()
    return int(tokens[0])

In [None]:
col_name = 'pp_total_raw'

# Count districts per expenditure band, then order the bands by their numeric
# lower bound (the raw labels are strings and would not sort numerically).
expenditure_series = count_values(df=districts_data, col_name=col_name)
expenditure_series['pp_starting'] = expenditure_series[col_name].map(find_ppvalue_starting)
expenditure_series = expenditure_series.sort_values(by='pp_starting')

expenditure_series

In [None]:
# Chart settings for the per-pupil expenditure bands (already sorted by lower bound).
title = 'Distribution of Per-pupil total expenditure in the districts'
value_col, label_col, percentage_col = 'count', 'pp_total_raw', 'percentage'
palette, color = None, 'gray'
percentage_text = True
xlabel_text, ylabel_text = "", ''
figsize = (12, 8)
barwidth = 0.5
pctlcnadjust = 0.8

plotbar_horizontal(
    expenditure_series, value_col, label_col, percentage_col, title,
    palette=palette,
    color=color,
    percentage_text=percentage_text,
    xlabel_text=xlabel_text,
    ylabel_text=ylabel_text,
    figsize=figsize,
    barwidth=barwidth,
    pctlcnadjust=pctlcnadjust,
)

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:purple;
           font-size:60%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
<a id="Products_data"> </a>
Products data
</h1>
</div>

Product data provides information about the characteristics of the top 372 products with most users in 2020

In [None]:
# Read the product metadata that ships with the dataset and preview the first rows.
products_data = pd.read_csv(
    "../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv"
)
products_data.head()

One-Hot Encoding the Product Sectors column

Reference: https://www.kaggle.com/iamleonie/how-to-approach-analytics-challenges

In [None]:
import re

# 'Sector(s)' holds "; "-separated sector names: expand it into one 0/1 indicator
# column per sector, named 'sector_<name>' with spaces removed from the name.
temp_sectors = products_data['Sector(s)'].str.get_dummies(sep="; ")
temp_sectors = temp_sectors.rename(columns=lambda name: f"sector_{re.sub(' ', '', name)}")

# Attach the indicator columns to the product table (joined on the row index).
products_data = products_data.join(temp_sectors)
products_data.head()

Splitting up the Primary Essential Function column

In [None]:
# 'Primary Essential Function' is a ' - '-separated string: the first segment is the
# main category and the second the sub-category. The .str accessor keeps missing
# values as NaN and yields NaN (instead of raising IndexError, as indexing the
# split list directly would) when a value has no ' - ' separator.
products_data['primary_function_main'] = products_data['Primary Essential Function'].str.split(' - ').str[0]
products_data['primary_function_sub'] = products_data['Primary Essential Function'].str.split(' - ').str[1]

# Synchronize similar values
products_data['primary_function_sub'] = products_data['primary_function_sub'].replace({'Sites, Resources & References' : 'Sites, Resources & Reference'})

products_data.head()

In [None]:
# Products per sector: summing a 0/1 indicator column counts the products tagged with
# that sector. Percentages are relative to the total number of products.
sector_dict = {
    'sector': ['Corporate', 'HigherEd', 'PreK-12'],
    'count': [products_data[f'sector_{name}'].sum() for name in ('Corporate', 'HigherEd', 'PreK-12')],
}

series = pd.DataFrame(sector_dict)
series['percentage'] = series['count'] / len(products_data) * 100
series = series.sort_values(by='count', ascending=False)
series

In [None]:
# Chart settings for the sector breakdown; the 'PreK-12' bar is drawn in dark red.
value_col = 'count'
label_col = 'sector'
percentage_col = 'percentage'
palette = ["darkred" if x=='PreK-12' else 'gray' for x in series['sector']]
title = 'Product sector distribution'
figsize=(12,8)
barwidth=0.3
pctlcnadjust=10

plotbar_vertical(series, value_col, label_col, percentage_col, title, palette=palette, barwidth=barwidth, pctlcnadjust=pctlcnadjust, figsize=figsize)

In [None]:
# For each main function, count how often each sub-function occurs.
# NOTE(review): the result is presumably indexed by (main, sub) with a single
# 'primary_function_sub' column of counts; the next cell unstacks it. Confirm
# the shape with the installed pandas version.
series = products_data.groupby(['primary_function_main']).agg({'primary_function_sub': 'value_counts'})
series

In [None]:
# Pivot the innermost index level into columns.
series_unstacked = series.unstack()
# Turn the remaining index back into an ordinary column.
series_unstacked.reset_index(inplace=True)
# Drop the outer level of the two-level column header left behind by unstack().
series_unstacked.columns = series_unstacked.columns.droplevel(0)
# Give the first column an explicit name so the next cell can melt on it.
series_unstacked.rename(columns={ series_unstacked.columns[0]: "primary_function_main" }, inplace = True)
series_unstacked

In [None]:
# Back to long format: one row per (main function, sub-function) pair, ordered by
# main function. Pairs that never occur come out as NaN and are replaced with 0.
series_melt = (
    series_unstacked
    .melt(id_vars=['primary_function_main'], var_name='sub', value_name='count')
    .sort_values('primary_function_main')
    .fillna(0)
)
series_melt

In [None]:
# Product counts per sub-function, with bars coloured by the main function they belong to.
series = series_melt
value_col = 'count'
label_col = 'sub'
percentage_col = ''  # series_melt has no percentage column; annotations stay disabled below
palette = None
color = 'gray'
percentage_text = False
xlabel_text = ""
ylabel_text = ''
# The title describes this chart (it used to repeat the internet-connection chart's title)
title = 'Number of products per primary essential function sub-category'
figsize = (12,14)

# ------------------------------------------------------------------ #
# Bar graph: sub-functions on the y-axis, counts on the x-axis
# ------------------------------------------------------------------ #

fig = plt.figure(figsize=figsize)
gs = fig.add_gridspec(1, 1)
ax = fig.add_subplot(gs[0, 0])

# Remove some axis lines (the left spine is kept)
for s in ["top","right", "bottom"]:
    ax.spines[s].set_visible(False)

n_unique = series.shape[0]
value = series[value_col]
label = series[label_col]

# dodge=False draws every hue level at the same position instead of side by side
ax_sns = sns.barplot(ax=ax, x=value, y=label, hue=series['primary_function_main'], dodge=False)

# Customize/Remove ticks
ax.tick_params(axis = "x", which = "both", left = False, labelcolor='gray')
ax.tick_params(axis = "x", width=1, length=1)
# Customize/Remove labels (xlabel_text goes to the y-axis and ylabel_text to the x-axis)
ax.set_ylabel(xlabel_text,color='lightgray')
ax.set_xlabel(ylabel_text,color='lightgray')

plt.title(title, fontsize=14, weight='bold', color="gray")

# Change bar width
def change_width(ax, new_value) :
    """Resize every bar of `ax` to thickness `new_value`, keeping it centred."""
    for patch in ax.patches :
        current_width = patch.get_height()
        diff = current_width - new_value

        # we change the bar width
        patch.set_height(new_value)

        # we recenter the bar
        patch.set_y(patch.get_y() + diff * .5)

change_width(ax, .3)

# Only runs when percentage_text is enabled and percentage_col names a real column
if percentage_text:
    for p, percentage in zip(ax.patches, series[percentage_col]):
        label_1 = "%.0f%%" %percentage
        y = p.get_y() + p.get_height() / 2
        x = p.get_width() + 5
        ax.text(x, y, label_1, ha='center', va='center', color='gray', fontsize=10)

plt.show()

## **Thank you for reading my notebook. Please Upvote 🙏🏻**

References:

[🔥💯:COVID-19 Impact on Digital Learning](https://www.kaggle.com/muhammadimran112233/covid-19-impact-on-digital-learning)

[How To Approach Analytics Challenges](https://www.kaggle.com/iamleonie/how-to-approach-analytics-challenges)