In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import missingno as msg
import altair as alt
import plotly.express as px
import plotly.figure_factory as ff
from IPython.display import Markdown, display, Image
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## Resources:

* ../input/learnplatform-covid19-impact-on-digital-learning/engagement_data
* ../input/learnplatform-covid19-impact-on-digital-learning/README.md
* ../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv
* ../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv
* ../input/covid-usa-data/all-states-history.csv
* ../input/covid-usa-data/national-history.csv
* ../input/dataset-1/COVID-19 US state policy database 3_29_2021.xlsx
* ../input/child-population-by-race/Child population by race.xlsx

In [None]:
display(Markdown("../input/learnplatform-covid19-impact-on-digital-learning/README.md"))

## Challenge:

* What is the picture of digital connectivity and engagement in 2020?
* What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?
* How does student engagement with different types of education technology change over the course of the pandemic?
* How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?
* Do certain state interventions, practices or policies (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?

# *What is the picture of digital connectivity and engagement in 2020?*

In [None]:
#taking districts_info.csv dataset
df2=pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv",engine="python",encoding="utf-8");df2

In [None]:
!pip install openpyxl

## Dataset df_child_race

### cleaning,sorting and analysis 

In [None]:
#taking Child population by race.xlsx dataset
df_child_race=pd.read_excel("../input/child-population-by-race/Child population by race.xlsx");df_child_race.head()

In [None]:
#taking timeframe 2020 into account;
df_child_race=df_child_race[df_child_race["TimeFrame"]==2020] ;df_child_race

In [None]:
#taking data only from "Location","Race","Data" columns
df_child_race=df_child_race[["Location","Race","Data"]];df_child_race.columns=["state","Race","Data"];
df_child_race=df_child_race.reset_index().drop(columns=["index"]);df_child_race

In [None]:
#getting unique Race types in Race column
df_child_race["Race"].unique().tolist()

In [None]:
"""excluding Total less than 18','Non-Hispanic White alone','Non-Hispanic Native Hawaiian and Other Pacific Islander alone',
'Non-Hispanic American Indian and Alaskan Native alone from Race column"""
def clean_list(my_list, exclusion_list):
    """Return the entries of ``my_list`` that do not occur in ``exclusion_list``.

    The relative order of the kept entries (and any duplicates among them)
    is preserved; neither input is modified.
    """
    return [entry for entry in my_list if entry not in exclusion_list]

In [None]:
#cleaning the Race unwanted data.
cleanlist=clean_list(df_child_race["Race"].unique().tolist(),['Total less than 18','Non-Hispanic White alone','Non-Hispanic Native Hawaiian and Other Pacific Islander alone',
                                                    'Non-Hispanic American Indian and Alaskan Native alone'])

In [None]:
#getting states that we have in df2 dataset 
df_child_race=df_child_race[df_child_race["state"].isin(df2["state"].unique().tolist())];df_child_race

In [None]:
#removing unwanted symbols in data column.
df_child_race["Data"]=df_child_race["Data"].replace({'<': ''}, regex=True).replace({'%': ''}, regex=True);df_child_race

In [None]:
#conversion datatype to float of data column
df_child_race["Data"]=df_child_race["Data"].astype(dtype = float, errors = 'ignore');df_child_race

In [None]:
df_child_race_1=df_child_race[df_child_race["Data"]>1]
df_child_race_3=df_child_race_1.groupby(["state"]).sum()

In [None]:
df_child_race_2=df_child_race[df_child_race["Data"]<1]
df_merge_child=pd.merge(df_child_race_2, df_child_race_3.groupby(["state"]).sum(),on="state")
df_merge_child["Data"]=df_merge_child["Data_x"]*df_merge_child["Data_y"]
df_merge_child=df_merge_child.drop(columns=["Data_x","Data_y"]);df_merge_child

In [None]:
df_child_race=pd.concat([df_child_race_1,df_merge_child],axis=0);df_child_race

In [None]:
#getting race data that we have selected after excluding unwanted race
df_child_race=df_child_race[df_child_race["Race"].isin(cleanlist)];df_child_race

In [None]:
#data of child counts as per race and geographic region
fig,ax=plt.subplots(figsize=(20,8),facecolor="pink",dpi=120)
sns.boxplot(x="state",y="Data",hue="Race",data=df_child_race,ax=ax)
plt.xticks(rotation=90)
fig.suptitle("boxplot of child counts as per race and geographic region in Timeframe 2020",fontsize=25)
ax.set_ylabel("child counts/data of child")
plt.show()

* The largest numbers of Hispanic or Latino children live in California, Texas, New York and Florida.
* Among the selected races, Hispanic or Latino children (students) are present in large numbers across all US states.

## Dataset df1

### cleaning,sorting and analysis 

In [None]:
#taking products info dataset first
df1=pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv",encoding="utf-8",engine="python")
df1

In [None]:
msg.heatmap(df1,cmap="rainbow",vmin=0,vmax=1);
plt.title("data missing using heatmap");plt.show()

In [None]:
# Top 20 provider companies in the data, ranked by how many rows (products) of df1 list them.
# value_counts() counts rows per provider; head(20) keeps the 20 most frequent ones.
df1["Provider/Company Name"].value_counts().head(20).plot.barh(color="lightblue")
plt.xlim(0,35)
plt.show()

In [None]:
# Split providers into those listing several products and those listing exactly one.
# data_provider: number of (non-null) product names per provider, largest first.
data_provider=df1.groupby("Provider/Company Name").count()["Product Name"].sort_values(ascending=False)
multiple_sells=data_provider[data_provider>1].count()
single_sell=data_provider[data_provider==1].count()

sells=[multiple_sells,single_sell]
labels=["Multiple Product selling companies","single Product selling companies"]

plt.pie(labels=labels,x=sells, autopct= "%.1f%%",explode=[0,.1],colors=["peru","grey"])
plt.title("Sells by Provider companies",fontsize=20);plt.legend()
plt.show()

# Report the same per-provider counts that the pie chart shows. The previous
# version grouped by "Product Name", which counts products, not providers.
print("count of Provider companies who are selling not more than once :",single_sell)
print("count of Provider companies who are selling more than once :",multiple_sells)

In [None]:
data_prov2=data_provider.where(data_provider.values>1).dropna();data_prov2
fig = px.pie(data_prov2, values=data_prov2.values, names=data_prov2.keys(), title='Count of products from the Providers who are Multiple Product selling companies')
fig.show()

In [None]:
!pip install squarify

In [None]:
import squarify
norm = matplotlib.colors.Normalize(vmin=min(df1["Sector(s)"].value_counts().values), vmax=max(df1["Sector(s)"].value_counts().values))
colors = [matplotlib.cm.Dark2(norm(value)) for value in df1["Sector(s)"].value_counts().values]
fig = plt.gcf()
fig.set_size_inches(18, 6)
squarify.plot(label=df1["Sector(s)"].value_counts().index,sizes=df1["Sector(s)"].value_counts().values
              , pad = True,color = colors, alpha=.6)
plt.title("Count of Products in each sector",fontsize=23,fontweight="bold")
plt.axis("off");plt.show()

In [None]:
inside_df1=df1.loc[:,["Product Name","Provider/Company Name","Sector(s)"]]
inside_df1.groupby(["Sector(s)","Provider/Company Name"]).count().sort_values(by="Product Name",ascending=False)

In [None]:
source = inside_df1.groupby(["Sector(s)","Provider/Company Name"]).count().sort_values(by="Product Name",ascending=False)
source['provider sector wise'] = source.index.copy()
source=source.reset_index().drop(columns=["Sector(s)","Provider/Company Name"])
source=source.rename(columns = {'Product Name':"count of products"}, inplace = False)
alt.Chart(source[source["count of products"]>1]).mark_bar().encode(
    x='provider sector wise',
    y="count of products"
).properties(height=400,width=1000
            , title={
      "text": ["Count of Products groupby by Sector, Provider"], 
      "subtitle": ["Count of Products groupby by Sector, Provider having more than one product count"],
      "color": "black",
      "subtitleColor": "grey"
    })

In [None]:
# Products whose provider is Google LLC, counted per product name (highest count first).
# Rows from other providers become NaN under .where() and are ignored by value_counts().
google_counts = (
    df1["Product Name"]
    .where(df1["Provider/Company Name"]=="Google LLC")
    .value_counts()
    .sort_values(ascending=False)
)
formed_df = pd.DataFrame({
    'Google Product Name': google_counts.index,
    'Counts': google_counts.values,
})

sns.barplot(x="Google Product Name", y="Counts", data=formed_df, palette="Accent_r")
plt.xticks(rotation=90)
plt.title("Product's count who are sold by Google LLC")
plt.show()

In [None]:
df1.where(df1["Provider/Company Name"]=="Google LLC").dropna().groupby(["Sector(s)","Primary Essential Function"])["Product Name"].count()

In [None]:
df1.where(df1["Provider/Company Name"]=="Google LLC").dropna().groupby(["Sector(s)","Primary Essential Function"])["Product Name"].count().plot.bar(color="skyblue",title="counts of Products as Per Sector and their Primary Function")
plt.ylabel("count of products")
plt.show()

In [None]:
r=df1.groupby(["Primary Essential Function"]).count()["Product Name"].values
theta=df1.groupby(["Primary Essential Function"]).count()["Product Name"].index
df_theta=pd.DataFrame({"r":r,"theta":theta})
fig = px.line_polar(df_theta, r='r', theta='theta', line_close=True,
                   title="Count of Products from various primary essential function")
fig.update_traces(fill='tonext')
fig.update_layout(
    font_family="Courier New",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
    legend_title_font_color="green",
    title_x=0.05
)
fig.show()

In [None]:
from plotnine import *
tips=df1[["Primary Essential Function","Product Name"]]
tips["count_of_Products"]=1

(ggplot(tips, aes('Primary Essential Function', 'count_of_Products', fill='Product Name'))
 + geom_bar(stat='identity', position='stack')
 + ggtitle('count_of_products as per their Primary Essential Function')
 + theme(axis_text_x=element_text(rotation=90, hjust=1))
)

## Dataset df2

### cleaning,sorting and analysis 

In [None]:
msg.heatmap(df2);plt.show()

### Cleaning df2 dataset from errors:

In [None]:
# Strip the "[" characters from the textual range columns so that the values can
# later be split on ", " and converted to floats.
# regex=False is required: "[" on its own is not a valid regular expression, and
# recent pandas versions no longer treat single-character patterns as literals
# when regex=True.
for i in ["pct_black/hispanic","pct_free/reduced","county_connections_ratio","pp_total_raw"]:
    df2[i]=df2[i].str.replace('[','',regex=False)
# Missing values (including a missing locale) are encoded as 0 from here on.
df2=df2.fillna(0);df2

In [None]:
sns.countplot(x="locale",data=df2);plt.xlabel("Local")
plt.show()

In [None]:
empty_list=[]
for i in df2.state.unique().tolist():
    lista=[i,df2[df2["state"]==i].where(df2["locale"]== "Suburb").count()["district_id"],df2[df2["state"]==i].where(df2["locale"]== 0).count()["district_id"],df2[df2["state"]==i].where(df2["locale"]=="Rural").count()["district_id"],
           df2[df2["state"]==i].where(df2["locale"]== "City").count()["district_id"],df2[df2["state"]==i].where(df2["locale"]== "Town").count()["district_id"]]
    
    empty_list.append(lista)
    
df = pd.DataFrame(empty_list,columns=['state', 'suburb', 'Unknown', 'Rural', 'City',"Town"]);df.style.highlight_max(axis=1,color="lightgreen")

In [None]:
# plot data in stack manner of bar type
fig, ax = plt.subplots(figsize=(20, 10))
df.plot(x='state', kind='barh', stacked=True,title=" Stacked Bar Graph for state's local ",ax=ax)
plt.legend(title="local type")
plt.xlabel("count of local(stacked form)")
plt.show()

In [None]:
def split_making(column):
    """Convert textual values/ranges into one float per entry.

    Every entry is converted to ``str`` and split on ``", "``:

    * one part  -> that part as a float (e.g. the ``0`` used for missing data);
    * two parts -> the midpoint of the two bounds, ``(low + high) / 2``.

    Parameters
    ----------
    column : iterable
        Values such as ``"0.2, 0.4"``, ``"0.5"`` or ``0``.

    Returns
    -------
    list of float
        One number per input entry, in the original order.
    """
    midpoints = []
    for entry in column:
        parts = str(entry).split(", ")
        if len(parts) == 1:
            midpoints.append(float(parts[0]))
        else:
            # Parenthesised on purpose: without the brackets the division binds
            # tighter than the addition and yields low + high/2, not the midpoint.
            midpoints.append((float(parts[0]) + float(parts[1])) / 2)
    return midpoints

df2["pct_black/hispanic"]=split_making(df2["pct_black/hispanic"])
df2["pct_free/reduced"]=split_making(df2["pct_free/reduced"])
df2["county_connections_ratio"]=split_making(df2["county_connections_ratio"])
df2["pp_total_raw"]=split_making(df2["pp_total_raw"])  # pct_black/hispanic,pct_free/reduced	county_connections_ratio	pp_total_raw
df2

In [None]:
df2_dis=df2.drop(columns=["district_id"])
df2_dis=df2_dis.groupby(["state","locale"]).mean()
df2_dis=df2_dis.sort_values(ascending=False,by="pp_total_raw").reset_index()
fig1 = px.sunburst(df2_dis, path=['state',"locale",'pct_black/hispanic',"pct_free/reduced"],
                   values='pp_total_raw',color="state", hover_data=['county_connections_ratio'],
                   color_continuous_scale='RdBu',color_continuous_midpoint=np.average(df2_dis["pct_free/reduced"]),
                  title="Details in Sunburst Form")
fig1.show()

In [None]:
reduced_df2=df2.loc[:,["pct_black/hispanic","pct_free/reduced","county_connections_ratio","pp_total_raw"]]
for a in reduced_df2.columns:
    sns.boxplot(x=a,data= reduced_df2,color="orange")
    plt.title("boxplot for "+ a)
    plt.show()

In [None]:
for a in df2.columns[3:]:
    plt.figure(figsize=(15,6))
    sns.boxplot(x=a,y="state",data=df2,hue="locale",palette='spring_r')
    plt.title("boxplot for "+ a + " for each states")
    plt.show()

In [None]:
reduced_df4=df2.drop(columns="district_id")
sns.pairplot(reduced_df4,hue="locale",dropna=True)
plt.show()

In [None]:
sns.heatmap(reduced_df2.corr(),annot=True,cmap="Blues",vmax=1,vmin=0,
           center=0.5);
plt.title("correlation between variables in dataset");
plt.show()

In [None]:
# One regression panel per locale, coloured by state.
sns.lmplot(x="pct_black/hispanic", y="pct_free/reduced",col="locale", hue="state", data=df2,palette="rainbow_r")
# Figure-level title: plt.title() would only overwrite the title of the last facet.
plt.suptitle("lmplot for population getting free in states categorized in Locale", y=1.03)
plt.show()

* The share of Black/Hispanic students (pct_black/hispanic) in a state increases as the share of students receiving free/reduced meals (pct_free/reduced) increases.

* The population receiving free/reduced meals is largest in the City and Suburb locales.

In [None]:
import os

# Single source of truth for the engagement directory (the listing and the
# reading previously used two different spellings of the path).
ENGAGEMENT_DIR = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data"

# Sorted so that the row order of the concatenated frame is reproducible.
list2 = sorted(name for name in os.listdir(ENGAGEMENT_DIR) if name.endswith(".csv"))
if not list2:
    raise FileNotFoundError("no engagement CSV files found in " + ENGAGEMENT_DIR)

# One DataFrame per file, stacked vertically with a fresh 0..n-1 index.
list_null = [pd.read_csv(os.path.join(ENGAGEMENT_DIR, name)) for name in list2]

vertical_concat = pd.concat(list_null, axis=0, ignore_index=True)
# Missing values are encoded as 0 for the rest of the analysis.
vertical_concat = vertical_concat.fillna(0)
vertical_concat

In [None]:
# True for every lp_id that appears in more than one engagement record.
df_engage=vertical_concat['lp_id'].value_counts()>1
print("unique engagements were done: " ,len(vertical_concat["lp_id"].unique()))

# Derive the two category sizes from the data instead of hard-coding them,
# so the chart stays correct if the input files change.
count_multiple=int(df_engage.sum())
count_single=int((~df_engage).sum())
engage_df=pd.DataFrame([["engagements more than once for a companies Product",count_multiple],["engagement only once for a companies Product",count_single]],columns=["engagement type","count of engagements type"])

print(pd.pivot_table(engage_df,index=engage_df["engagement type"]))

# Pie Chart
plt.pie(engage_df["count of engagements type"], colors=["lightblue","pink"], labels=engage_df["engagement type"].values,
        autopct='%1.1f%%', pctdistance=0.85,explode=[0,0.2])

# draw circle (turns the pie into a donut)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()

# Adding Circle in Pie chart
fig.gca().add_artist(centre_circle)

# Adding Title of chart
plt.title("Engagement done for a company's product",fontsize=20)

# Displaying Chart
plt.show()

Out of all company products, about 95.2% have more than one engagement record and only about 4.8% have exactly one.

In [None]:
# top 20 lp id which have done more engagments
vertical_concat["lp_id"].value_counts().head(20).plot.barh(color="firebrick",title="lp_id which have done more engagements in 2020")
plt.xlabel("count of engagements")
plt.ylabel("lp_id")
plt.show()

* These are the lp_ids of the products that have the most engagement records.

In [None]:
for i in vertical_concat.columns.tolist()[2:]:
    fig, ax = plt.subplots(figsize=(10,5))
    vertical_concat.groupby("time")[i].mean().plot(kind="line",color="peru",ax=ax)
    vertical_concat.groupby("time")[i].median().plot(kind="line",color="hotpink",ax=ax)
    plt.title("mean & median of "+ i +" on a particular day")
    plt.xticks(rotation=90);plt.show()

In [None]:
for i in vertical_concat.columns.tolist()[2:]:
    vertical_concat.groupby("time")[i].median().plot(kind="line",title="median of "+ i +" on a particular day",color="hotpink")
    plt.xticks(rotation=90);plt.show()

In [None]:
# y-axis values: daily mean of pct_access (index = sorted "time" values)
y1 = vertical_concat.groupby("time")["pct_access"].mean()

# secondary y-axis values: daily mean of engagement_index
y2 = vertical_concat.groupby("time")["engagement_index"].mean()

# x is taken from the grouped index so that it is sorted and lines up with
# y1/y2 (unique() returns values in order of appearance, which need not match),
# and converted to datetimes so that the date locator/formatter below apply.
x = pd.to_datetime(y1.index)

fig, ax = plt.subplots(figsize = (20, 8))
ax.set_title('Two Y label bar graph for pct_access mean & engagement_index mean over the time')

# using the twinx() for creating
# another axes object for secondary y-Axis
ax2 = ax.twinx()

# creating a bar plot
ax.bar(x, y1.values, color = 'g')
ax2.bar(x, y2.values, color = 'b',alpha=.95)

# giving labels to the axises
ax.set_xlabel('time from 1 JAN 2020 TO 1 JAN 2021', color = 'r')
ax.set_ylabel('pct_access mean', color = 'g')

# secondary y-axis label
ax2.set_ylabel('engagement_index mean', color = 'b')

# one tick per month, labelled with the abbreviated month name
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%b"))
ax.tick_params(axis="x", rotation=90)

# defining display layout
plt.tight_layout()

# show plot
plt.show()

In [None]:
# Line version of the twin-axis chart: daily mean pct_access (left axis) and
# daily mean engagement_index (right axis).
# Dates come from the index of the grouped series so they are sorted, aligned
# with the values, and real datetimes (required by the date locator/formatter).
plot_dates = pd.to_datetime(y1.index)

fig, ax = plt.subplots(figsize = (10, 5))
ax.set_title('Two Y label line graph for pct_access mean & engagement_index mean over the time')

# using the twinx() for creating another
# axes object for secondary y-Axis
ax2 = ax.twinx()

ax.plot(plot_dates, y1.values, color = 'g')
ax2.plot(plot_dates, y2.values, color = 'b')

# giving labels to the axises
ax.set_xlabel('from January 2020-January 2021', color = 'r')
ax.set_ylabel('pct_access mean', color = 'g')

# secondary y-axis label
ax2.set_ylabel('engagement_index mean', color = 'b')

# one tick per month, labelled with the full month name
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%B"))
ax.tick_params(axis="x", rotation=45)

# defining display layout
plt.tight_layout()

# show plot
plt.show()

In [None]:
# Daily means of pct_access and engagement_index (index = sorted "time").
daily_means=vertical_concat.groupby("time")[["pct_access","engagement_index"]].mean()
avg_pa=daily_means["pct_access"]
avg_ei=daily_means["engagement_index"]

# The 'time' column is taken from the grouped index so that every date sits on
# the same row as its averages; unique() returns dates in order of appearance,
# which is not guaranteed to match the sorted groupby index.
data = {'time':avg_pa.index.to_numpy(),
        'avg_pct_access':avg_pa,
        'avg_engagement_index':avg_ei}

vertical_concat3=pd.DataFrame(data)
vertical_concat3

In [None]:
# Three panels: distribution of each daily average and their relationship.
# Axes are created explicitly; calling plt.title() before plt.subplot() used to
# create an extra full-figure axes behind the three panels.
sns.set_style("darkgrid")
fig, axes = plt.subplots(1, 3, figsize=(12,5), facecolor="whitesmoke", edgecolor="k", dpi=180)
sns.boxplot(y=vertical_concat3["avg_pct_access"], ax=axes[0])
sns.boxplot(y=vertical_concat3["avg_engagement_index"], ax=axes[1])
sns.regplot(x=vertical_concat3["avg_pct_access"], y=vertical_concat3["avg_engagement_index"], ax=axes[2])
fig.suptitle('average pct_access and avg engagement_index over time',fontsize=10)
plt.show()

In [None]:
#pct_access	Percentage of students in the district have at least one page-load event of a given product and on a given day
#engagement_index	Total page-load events per one thousand students of a given product and on a given day
# Number of engagement records (rows) per day, in chronological order.
sort_vertical_concat=vertical_concat.groupby(["time"]).count().sort_index(ascending=True).reset_index()
sns.set_style("white")
fig, ax = plt.subplots(figsize=(20, 10),dpi=120)
# x is converted to datetimes for plotting only (the frame itself is left
# unchanged) so that the month locator/formatter below work on a real date axis.
sns.lineplot(x=pd.to_datetime(sort_vertical_concat["time"]),y=sort_vertical_concat["lp_id"],marker="o",linewidth=0.5,linestyle="dashed",ax=ax)
ax.set_ylabel("count of page_loads per day in year 2020")
ax.set_xlabel("Time from Jan 2020 to Jan 2021")
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%b"))
ax.set_title("Page loads of various products per day in 2020")
plt.show()

In [None]:
df1=df1.rename(columns={"LP ID":"lp_id"},inplace=False)
merged_df=df1.merge(vertical_concat, how='outer', on='lp_id')
merged_df=merged_df[merged_df['Product Name'].notna()]
list_of_values = merged_df.lp_id.value_counts().head(30).keys().tolist() 
n_df=merged_df[merged_df["lp_id"].isin(list_of_values)]
n_df

In [None]:
# Create the figure with the intended size. Assigning to plt.figure.figsize
# only sets an attribute on the function object and does not resize anything.
plt.figure(figsize=(15,8))
sns.countplot(x="Product Name",data=n_df,palette='Set3')
plt.xticks(rotation=90)
plt.title("Count of Products that were interacted by students most times in 2020")
plt.show()

In [None]:
!pip install joypy

In [None]:
#chosen products from top 30 most engaged products.
list_of_values_1=["Google Docs","Wikipedia","Zoom","CoolMath Games","Disney+","Netflix","MIT App Inventor","nytimes.com","Quizlet","ABCya!",
                  "CNN Student News","Canvas","Clever","Desmos","Microsoft Outlook","Scholastic","Epic! - Unlimited Books for Kids","Grammarly","Kahoot!",
                  "TeachersPayTeachers","Weebly","Prodigy","Khan Academy"]
                  
n_df1=merged_df[merged_df["Product Name"].isin(list_of_values_1)]

In [None]:
from joypy import joyplot

joyplot(n_df1, by ='Product Name', column = "pct_access",
        fade = True,colormap=matplotlib.cm.autumn, figsize = (20,8)
        ,range_style="own",fill=False)
plt.xlabel("pct_access")

plt.title("Ridgeplot of pct_access for product name",fontsize=20)

plt.show()

In [None]:
joyplot(n_df1, by ='Product Name', column = "engagement_index",
        fade = True,colormap=matplotlib.cm.rainbow, figsize = (20,8)
        ,range_style="own",fill=False)
plt.xlabel("engagement_index")

plt.title("Ridgeplot of engagement_index for top products",fontsize=20);

plt.show()

In [None]:
(
    ggplot(n_df, aes(x='pct_access', y='engagement_index', color='Sector(s)'))
    + geom_point()
    + geom_smooth(method='lm')
    + labs(x='pct_access', y='engagement_index')
)

In [None]:
data1=merged_df.sort_values(ascending=False,by="pct_access")
fig, (ax1, ax2) = plt.subplots(2, 1,sharex=True,figsize=(20,8))
sns.barplot(x="Product Name",y="pct_access",data=data1.head(1000),ax=ax1,palette="jet_r");
plt.xticks(rotation=90)
sns.swarmplot(x="Product Name",y="pct_access",data=data1.head(1000),ax=ax2);
plt.xticks(rotation=90)
plt.suptitle("Barplot for highest pct_access product along with its swarmplot")
plt.show()

In [None]:
data2=merged_df.sort_values(ascending=False,by="engagement_index")
fig, (ax1, ax2) = plt.subplots(2, 1,sharex=True,figsize=(20,8))
sns.boxenplot(x="Product Name",y="engagement_index",data=data2.head(1000),ax=ax1);
plt.xticks(rotation=90)

sns.swarmplot(x="Product Name",y="engagement_index",data=data2.head(1000),ax=ax2);
plt.xticks(rotation=90)
plt.suptitle("Boxplot for highest engagement_index product along with its swarmplot")
plt.show()

### *picture of digital connectivity and engagement in 2020*

### Conclusion:

* Products like Google Classroom, Canvas, Google Docs, Schoology, Meet, Kahoot!, Savvas Realize, PowerSchool, Freckle by Renaissance, Google Drive, YouTube, Google Forms and Edgenuity are among the products that have the highest engagement_index.
* Google Docs, Google Classroom, Canvas and Schoology have the highest mean engagement_index, in that order.
* Savvas Realize, Prezi, Google Classroom, Schoology, Google Docs, Grammarly, Grammarly by Chrome, Thesaurus.com, Zoom, Canvas, Meet etc. recorded the highest pct_access among products.
* pct_access increases approximately linearly with engagement_index.
* Products like Google Docs, Canvas, Clever, Zoom and Kahoot! have engagement_index recorded the greatest number of times in the year.
* Google Docs, Canvas, Clever, Zoom and Grammarly have pct_access recorded the greatest number of times in the year.
* The number of page-load records per day increased during 2020, but decreased in the middle and at the end of 2020.
* The average pct_access is 0.43; the average engagement_index is approximately 100.
* engagement_index and pct_access are highest among Google products.
* pct_access per day remains about the same over the course of the year, but engagement_index per day increased.
* Products with more than one engagement record make up 95.2% of the total; only 4.8% have a single engagement record.
* The primary function LC-Digital Learning has the widest range of products to choose from.
* Google sells 23 different products for the sector PreK-12; Higher Ed; Corporate LLC.
* iStockphoto LP and Google are the only two companies selling products to each sector of PreK-12; Higher Ed; Corporate LLC.
* Most US states have a county_connections_ratio of no more than 0.70.
* Only 11% of companies sold more than one product, according to the data in ../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv

# *What is the effect of the COVID-19 pandemic on online and distance learning, and how might this also evolve in the future?*

In [None]:
usa_covid_states=pd.read_csv("../input/covid-usa-data/all-states-history.csv");usa_covid_states.sort_index(ascending=False)
usa_covid_states=usa_covid_states[["date","state","deathIncrease","hospitalized","hospitalizedCumulative","death","onVentilatorCurrently","positive"]]
usa_covid_states_2=usa_covid_states.set_index("date").sort_index();usa_covid_states_2

In [None]:
# Per-state totals for deaths and hospitalisations.
# NOTE(review): "death" and "hospitalized" look like running totals (the data
# also has separate deathIncrease / hospitalizedCumulative columns), so the
# per-state maximum is used; summing a running total over dates would count
# every case many times. Confirm against the data dictionary.
state_totals = (
    usa_covid_states.groupby("state")[["death","hospitalized"]]
    .max()
    .sort_values(by="death",ascending=False)
)

# Both measures are drawn from ONE frame so that every pair of bars belongs to
# the state named on the tick. Two independently sorted barh() calls on the same
# axes put different states on the same row.
fig, ax = plt.subplots(figsize=(20, 10))
state_totals.plot.barh(ax=ax,color=["C0","red"])
ax.set_title("Comparison of states with death count and hospitalized count ")
ax.legend();plt.show()

In [None]:
#states having highest deaths and Hopitaized peoples
usa_covid_states=usa_covid_states.fillna(0)
usa_covid_states=usa_covid_states.set_index("date").sort_index()
# situation in 2020-2021 in top 5 states where death occured more
usa_covid_states_1=usa_covid_states[usa_covid_states["state"].isin(["NY","CA","NJ","TX","FL"])];usa_covid_states_1
for i in list(usa_covid_states_1["state"].unique()):
    usa_covid_states_1[usa_covid_states_1["state"]==i].plot(kind="line");
    plt.title("situation of "+ i +" in 2020-21");
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# National (whole-US) COVID history, restricted to the columns used below.
usa_covid=pd.read_csv("../input/covid-usa-data/national-history.csv")
usa_covid=usa_covid[["date","death","deathIncrease","hospitalizedCurrently","hospitalizedCumulative","onVentilatorCurrently","positive"]]
# "hospitalizedCurrently" intentionally keeps its name: the merge cell further
# down reads both "hospitalized" (state data) and "hospitalizedCurrently"
# (national data). The former rename() call here was discarded and had no effect.
usa_covid=usa_covid.set_index("date").sort_index()
# Tag the national rows so they can be told apart from per-state rows after concatenation.
usa_covid["state"]="US"
usa_covid

In [None]:
#USA's situation in 2020
usa_covid.plot(kind="line");plt.show()

In [None]:
#data taken from 
merged_usa_data=pd.concat([usa_covid,usa_covid_states_2],axis=0).fillna(0)
merged_usa_data["Hopitalized"]=merged_usa_data["hospitalized"]+merged_usa_data["hospitalizedCurrently"]
merged_usa_data=merged_usa_data.drop(columns=["hospitalizedCurrently","hospitalized"]);merged_usa_data

In [None]:
# Daily mean pct_access (left axis) and daily mean engagement_index (right axis).
# Dates come from the index of the grouped series so they are sorted, aligned
# with the values, and real datetimes (required by the date locator/formatter).
plot_dates = pd.to_datetime(y1.index)

fig, ax = plt.subplots(figsize = (20, 8))
ax.set_title('Two Y label graph for pct_access mean & engagement_index mean over the time')

# using the twinx() for creating another
# axes object for secondary y-Axis
ax2 = ax.twinx()

ax.plot(plot_dates, y1.values, color = 'g')
ax2.plot(plot_dates, y2.values, color = 'b')

# giving labels to the axises
ax.set_xlabel('time', color = 'r')
ax.set_ylabel('pct_access mean', color = 'g')

# secondary y-axis label
ax2.set_ylabel('engagement_index mean', color = 'b')

# one tick per month, labelled with the full month name
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%B"))

# defining display layout
plt.tight_layout()

# show plot
plt.show()

In [None]:
# Upper panel: daily mean engagement_index; lower panel: national COVID series.
us_data = merged_usa_data[merged_usa_data["state"]=="US"]
# Real datetimes on both panels so the shared x axis is a date axis, not a
# categorical axis of date strings.
us_dates = pd.to_datetime(us_data.index)

fig, (ax,ax1) = plt.subplots(2,1,figsize = (20, 8),sharex=True)

# y2 is the daily mean engagement_index, matching the axis label below.
# NOTE(review): the previous version plotted y1 (daily mean pct_access) under
# this label — confirm engagement_index is the intended series.
ax.plot(pd.to_datetime(y2.index), y2.values, label="engagement_index mean")
ax.set_ylabel("enagagement_index each day")
ax.legend()

for covid_column in ["death","Hopitalized","onVentilatorCurrently"]:
    ax1.plot(us_dates, us_data[covid_column], label=covid_column)
ax1.set_ylabel("death,ventilation,hopilatized")
ax1.legend()

# one tick per month, labelled with the abbreviated month name
ax1.xaxis.set_major_locator(matplotlib.dates.MonthLocator())
ax1.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%b"))
ax1.tick_params(axis="x", rotation=90)

plt.suptitle("Situation of USA in 2020 from Jan 2020 to end 2020")
plt.show()

There appears to be some correlation between the daily engagement_index and onVentilatorCurrently.

In [None]:
# Build a daily engagement_index series from merged_df, indexed by date.
merged_df_ontime=merged_df.rename(columns = {'time': 'date'}).set_index("date").sort_index()
merged_df_ontime.index= pd.to_datetime(merged_df_ontime.index)
# Keep rows from 2020-01-13 onwards.
engagement_after_start=merged_df_ontime[(merged_df_ontime.index > '2020-01-12')]
# Select the column by name rather than by position (iloc[:, -1:]), so the
# result does not silently change if the column order of merged_df changes,
# then average all records that share the same date.
merged_df_ontime_1=engagement_after_start[["engagement_index"]].groupby(level=0).mean()
merged_df_ontime_1.plot()

In [None]:
usa_covid_3=usa_covid[(pd.to_datetime(usa_covid.index) < '2021-01-01')].fillna(0).loc[:,["onVentilatorCurrently"]];usa_covid_3.plot()
plt.xticks(rotation=90)
plt.show()

In [None]:
fig, (ax1,ax2) = plt.subplots(2,1,figsize = (20, 8),sharex=False,sharey=False)
usa_covid_3.plot(y="onVentilatorCurrently",ax=ax1,color="red")
ax1.set_ylabel("ventilation cases's counts")
merged_df_ontime_1.plot(y="engagement_index",ax=ax2,color="green")
ax2.set_ylabel("engagement index per day")
plt.suptitle("comparision b/w onVentilatorCurrently and engagement index from 13-Jan to 31-Dec")
plt.show()

In [None]:
# Ventilator cases and daily engagement_index on one axis, plotted against dates.
us_cov=usa_covid_3.reset_index()
merged_df_ontime_2=merged_df_ontime_1.reset_index()
y=us_cov["onVentilatorCurrently"]
y1=merged_df_ontime_2["engagement_index"]
# Day counter kept for any later use; derived from the data instead of a
# hard-coded range(1,355), which breaks as soon as the number of days changes.
x = range(1,len(y)+1)

fig,ax = plt.subplots()

# Each series is drawn once, against its own dates. The previous version drew
# the ventilator series twice and placed month ticks as if the data started on
# 1 January, although both series are filtered to start later in January.
ax.plot(pd.to_datetime(usa_covid_3.index),y.values,label="no. of patients need ventilation for covid in a day")
ax.plot(merged_df_ontime_1.index,y1.values,label="engagement_index per day")

# one tick per month, labelled with the abbreviated month name
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%b"))
ax.legend()
ax.tick_params(axis="x", rotation=90)
plt.show()

*What's the scope of digital marketing after 2020?*

### *Using ARIMA to predict digital engagement until the end of 2021*

In [None]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from matplotlib.pylab import rcParams

In [None]:
#Determine rolling statistics of the daily engagement_index series
rolmean = merged_df_ontime_1.rolling(window=12).mean() #window=12 spans 12 rows; the frame holds one row per date, so this is a 12-day (not 12-month) rolling mean
rolstd = merged_df_ontime_1.rolling(window=12).std()
print(rolmean,rolstd)

In [None]:
orig = plt.plot(merged_df_ontime_1, color='blue', label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label='Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)

In [None]:
print('Results of Dickey Fuller Test:')
dftest = adfuller(merged_df_ontime_1['engagement_index'], autolag='AIC')

dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
    dfoutput['Critical Value (%s)'%key] = value
    
print(dfoutput)

In [None]:
#merged data in log form
plt.plot(np.log(merged_df_ontime_1))

In [None]:
movingAverage = np.log(merged_df_ontime_1).rolling(window=12).mean()
movingSTD = np.log(merged_df_ontime_1).rolling(window=12).std()
plt.plot(np.log(merged_df_ontime_1))
plt.plot(movingAverage, color='red')

In [None]:
# Subtract the rolling mean from the log series, then discard the leading rows
# that are NaN because the rolling window was not yet full.
datasetLogScaleMinusMovingAverage = (np.log(merged_df_ontime_1) - movingAverage).dropna()
datasetLogScaleMinusMovingAverage.head(10)

In [None]:

def test_stationarity(timeseries, window=12, column='engagement_index'):
    """Plot rolling statistics of ``timeseries`` and print an augmented Dickey-Fuller test.

    Parameters
    ----------
    timeseries : pandas.DataFrame or pandas.Series
        Series to inspect. For a DataFrame the test is run on ``column``;
        a Series is tested directly.
    window : int, default 12
        Number of observations in the rolling mean / standard deviation window.
    column : str, default 'engagement_index'
        Column handed to ``adfuller`` when ``timeseries`` is a DataFrame.

    Returns
    -------
    None
        The function only draws a figure and prints the test summary.
    """
    # Rolling statistics
    movingAverage = timeseries.rolling(window=window).mean()
    movingSTD = timeseries.rolling(window=window).std()

    # Plot the series together with its rolling statistics
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(movingAverage, color='red', label='Rolling Mean')
    plt.plot(movingSTD, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Dickey-Fuller test on a single one-dimensional series
    print('Results of Dickey Fuller Test:')
    tested = timeseries if isinstance(timeseries, pd.Series) else timeseries[column]
    dftest = adfuller(tested, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

In [None]:
# Stationarity check of the log series after removing its rolling mean.
test_stationarity(timeseries=datasetLogScaleMinusMovingAverage)

In [None]:
# Exponentially weighted mean (half-life of 12 observations) of the log series.
exponentialDecayWeightedAverage = (
    np.log(merged_df_ontime_1)
    .ewm(halflife=12, min_periods=0, adjust=True)
    .mean()
)
plt.plot(np.log(merged_df_ontime_1))
plt.plot(exponentialDecayWeightedAverage, color='red')

In [None]:
# Remove the exponentially weighted trend from the log series and test what is left.
datasetLogScaleMinusExponentialMovingAverage = (
    np.log(merged_df_ontime_1) - exponentialDecayWeightedAverage
)
test_stationarity(timeseries=datasetLogScaleMinusExponentialMovingAverage)

In [None]:
# First difference of the log series (each value minus the previous one).
datasetLogDiffShifting = np.log(merged_df_ontime_1).diff()
plt.plot(datasetLogDiffShifting)

In [None]:
# The first difference has no value for the first row; drop it before testing.
datasetLogDiffShifting = datasetLogDiffShifting.dropna()
test_stationarity(timeseries=datasetLogDiffShifting)

In [None]:
# Decompose the log series into trend, seasonal and residual components.
decomposition = seasonal_decompose(np.log(merged_df_ontime_1))

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# One stacked panel per component. Each component gets its own row of the 4x1 grid;
# previously the seasonal and residual curves were both drawn into the first row (411).
decomposition_panels = [
    (np.log(merged_df_ontime_1), 'Original'),
    (trend, 'Trend'),
    (seasonal, 'Seasonality'),
    (residual, 'Residuals'),
]
for panel_row, (component, component_label) in enumerate(decomposition_panels, start=1):
    plt.subplot(4, 1, panel_row)
    plt.plot(component, label=component_label)
    plt.legend(loc='best')

plt.tight_layout()

plt.show()

In [None]:
# Autocorrelation and partial autocorrelation of the differenced log series (20 lags).
lag_acf = acf(datasetLogDiffShifting, nlags=20)
lag_pacf = pacf(datasetLogDiffShifting, nlags=20, method='ols')

# Dashed guides at zero and at +/- 1.96 / sqrt(number of observations).
for slot, correlations, heading in (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'Partial Autocorrelation Function'),
):
    plt.subplot(slot)
    plt.plot(correlations)
    plt.axhline(y=0, linestyle='--', color='gray')
    for sign in (-1.96, 1.96):
        plt.axhline(y=sign/np.sqrt(len(datasetLogDiffShifting)), linestyle='--', color='gray')
    plt.title(heading)
plt.show()


In [None]:
# AR fit: order (p=2, d=1, q=0) on the log series.
# NOTE(review): `fit(disp=...)` belongs to the legacy statsmodels.tsa.arima_model API;
# confirm the installed statsmodels version still provides it.
model = ARIMA(np.log(merged_df_ontime_1), order=(2, 1, 0))
results_AR = model.fit(disp=-1)

ar_errors = results_AR.fittedvalues - datasetLogDiffShifting['engagement_index']
plt.plot(datasetLogDiffShifting)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f' % sum(ar_errors**2))
print('Plotting AR model')

In [None]:
# MA fit: order (p=0, d=1, q=2) on the log series.
model = ARIMA(np.log(merged_df_ontime_1), order=(0, 1, 2))
results_MA = model.fit(disp=-1)

ma_errors = results_MA.fittedvalues - datasetLogDiffShifting['engagement_index']
plt.plot(datasetLogDiffShifting)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f' % sum(ma_errors**2))
print('Plotting MA model')

In [None]:
# Combined ARIMA fit: order (p=2, d=1, q=2) on the log series.
model = ARIMA(np.log(merged_df_ontime_1), order=(2, 1, 2))
results_ARIMA = model.fit(disp=-1)

arima_errors = results_ARIMA.fittedvalues - datasetLogDiffShifting['engagement_index']
plt.plot(datasetLogDiffShifting)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f' % sum(arima_errors**2))
print('Plotting ARIMA model')

In [None]:
# Independent copy of the fitted values of the ARIMA(2,1,2) model.
predictions_ARIMA_diff = pd.Series(data=results_ARIMA.fittedvalues, copy=True)
print(predictions_ARIMA_diff.head())

In [None]:
# Running total of the fitted differences.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum(skipna=True)
print(predictions_ARIMA_diff_cumsum)

In [None]:
# Start every date at the first observed log value, then add the cumulative
# fitted differences (dates without a fitted value contribute 0).
log_engagement = np.log(merged_df_ontime_1)['engagement_index']
baseline_log = pd.Series(log_engagement.iloc[0], index=log_engagement.index)
predictions_ARIMA_log = baseline_log.add(predictions_ARIMA_diff_cumsum, fill_value=0)
predictions_ARIMA_log.head()

In [None]:
# exp() undoes the earlier log transform, bringing predictions back to the original scale.
predictions_ARIMA = predictions_ARIMA_log.pipe(np.exp)
plt.plot(merged_df_ontime_1)
plt.plot(predictions_ARIMA)

In [None]:
# Plot in-sample fit plus forecast from observation 1 through observation 720
# (the original note describes this as predicting the next 365 days).
results_ARIMA.plot_predict(start=1, end=720)

## Conclusions:

* From the ARIMA forecast, the scope of digital marketing appears to dip slightly.
* Due to July's peak in COVID-19 cases, engagement_index and pct_access also went down.
* The number of patients currently on ventilators (onVentilatorCurrently) and engagement_index appear to be linked to each other.

# *How does student engagement with different types of education technology change over the course of the pandemic?*

In [None]:
# Top products from different categories, used to follow engagement over the course of the pandemic.
# Every name needs its own trailing comma: adjacent string literals are silently concatenated,
# which previously fused "CNN Student News" and "Grammarly" into one non-existent product.
list_of_products = [
    "Google Classroom",
    "Wikipedia",
    "Netflix",
    "Khan Academy",
    "Zoom",
    "CNN Student News",
    "Grammarly",
    "MIT App Inventor",
    "CoolMath Games",
    "Quora",
    "Spotify: Music and podcasts",
    "Meet",
    "Encyclopedia Britannica",
    "Goodreads",
    "Among Us",
    "Code.org",
]

# Keep only the rows of merged_df that belong to one of the selected products.
n_df2 = merged_df[merged_df["Product Name"].isin(list_of_products)]

In [None]:
# Ridge plot: one engagement_index density per selected product, in overlapping rows.
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})


def label(x, color, label):
    """Write the facet's label in bold at the left edge of the current axes (axes coordinates)."""
    facet_axes = plt.gca()
    facet_axes.text(0, .2, label, fontweight="bold", color=color,
                    ha="left", va="center", transform=facet_axes.transAxes)


# One palette entry per distinct product, one facet row per product.
product_count = len(n_df2["Product Name"].unique())
pal = sns.cubehelix_palette(product_count, rot=-.25, light=.7)
g = sns.FacetGrid(n_df2, row="Product Name", hue="Product Name", aspect=15, height=1, palette=pal)

# Layers are drawn in this order: hue-coloured outline, red outline, baseline, then the label.
hue_outline_kwargs = dict(bw_adjust=1, clip_on=False, fill=False, alpha=1, linewidth=1.5)
red_outline_kwargs = dict(clip_on=False, color="red", lw=2, bw_adjust=.5)
g.map(sns.kdeplot, "engagement_index", **hue_outline_kwargs)
g.map(sns.kdeplot, "engagement_index", **red_outline_kwargs)
g.map(plt.axhline, y=0, lw=2, clip_on=False)
g.map(label, "engagement_index")

# Negative spacing makes the rows overlap; titles, y ticks and spines would clutter the overlap.
g.fig.subplots_adjust(hspace=-.25)
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

plt.show()

# uncomment the following line if there's a tight layout warning
# g.fig.tight_layout()

* Google Classroom and Meet have the most widely spread densities among the products, which means children engaged more with these products.

In [None]:
# Engagement of each selected product, coloured by its primary essential function.
fig2 = px.scatter(
    n_df2,
    x="Product Name",
    y="engagement_index",
    color='Primary Essential Function',
)
fig2.show()

* Google Classroom performed well among Learning Management Systems (LMS).
* Zoom and Meet performed well in video conferencing and screen sharing.
* Khan Academy performed well among digital learning platforms.
* CoolMath Games has the highest engagement_index in the games and simulation category of primary essential function.

In [None]:
# Descriptive columns plus engagement for the selected products, with `time` parsed as datetime.
# `.assign` builds a new frame, so n_df2 is never written through a slice.
n_df4 = n_df2.loc[:, ["Product Name", "Provider/Company Name", "Sector(s)",
                      "Primary Essential Function", "time", "engagement_index"]].assign(
    time=lambda frame: pd.to_datetime(frame["time"], format='%Y-%m-%d')
)

# Mean engagement per month name, computed once and only on the numeric column.
monthly_engagement = n_df4.groupby(n_df4['time'].dt.strftime('%B'))["engagement_index"].mean()

# groupby sorts month names alphabetically (April, August, ...); put them back in calendar
# order so the line connects consecutive months.
calendar_order = sorted(monthly_engagement.index,
                        key=lambda month_name: pd.to_datetime(month_name, format='%B'))
monthly_engagement = monthly_engagement.reindex(calendar_order).rename_axis("time").reset_index()

# sort=False keeps the calendar order instead of re-sorting the x values.
sns.lineplot(x="time", y="engagement_index", data=monthly_engagement, sort=False,
             linestyle="dashed", marker="o")
plt.title("Average Engagements with above given Product in each month")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean engagement per product / day / sector / function / provider, ordered by date.
grouping_keys = ["Product Name","time","Sector(s)","Primary Essential Function","Provider/Company Name"]
n_df3 = (
    n_df4
    .groupby(grouping_keys)
    .mean()
    .reset_index()
    .sort_values(by=["time"], ascending=True)
)

# Scatter over time per provider with an OLS trendline and marginal distributions.
fig = px.scatter(
    n_df3,
    x="time",
    y="engagement_index",
    color="Provider/Company Name",
    marginal_y="violin",
    marginal_x="box",
    trendline="ols",
    template="simple_white",
)
fig.show()

In [None]:
from datetime import datetime

# Month names present in the data, then the same names in calendar order.
days = n_df4.groupby(n_df4['time'].dt.strftime('%B'))["engagement_index"].mean().index.unique()
sorted_month = sorted(days, key=lambda month_name: datetime.strptime(month_name, "%B"))

# Mean engagement per day and sector. numeric_only=True states explicitly that the text
# columns are left out of the mean; newer pandas raises on them instead of dropping them.
n_df5 = (
    n_df3
    .groupby(["time", "Sector(s)"])
    .mean(numeric_only=True)
    .reset_index()
    .sort_values(by=["time"], ascending=True)
)
# Replace each date by its month name so the next step can aggregate per month.
n_df5["time"] = n_df5['time'].dt.strftime('%B')

In [None]:
# Mean engagement per (month name, sector) pair, with the keys kept as ordinary columns.
n_df6 = n_df5.groupby(["time", "Sector(s)"], as_index=False).mean()

In [None]:
# Mean engagement for every (sector, month) combination, sector-major so it can be
# reshaped into one row per sector and one column per month.
sector_names = list(n_df3["Sector(s)"].unique())
empty_list_sec = [
    n_df6.loc[
        (n_df6["Sector(s)"] == sector_name) & (n_df6["time"] == month_name),
        "engagement_index",
    ].mean()  # NaN when a sector has no rows for that month
    for sector_name in sector_names
    for month_name in sorted_month
]

# Shape follows the data instead of assuming exactly 3 sectors and 12 months.
new_list = np.array(empty_list_sec).reshape(len(sector_names), len(sorted_month))

In [None]:
def heatmap(data, row_labels, col_labels, ax=None,
            cbar_kw=None, cbarlabel="", **kwargs):
    """Draw ``data`` as an image with labelled rows/columns and a colorbar.

    Parameters
    ----------
    data : 2-D array
        Values to draw; ``data.shape`` determines the number of ticks.
    row_labels, col_labels : sequence
        Tick labels for the rows (y axis) and the columns (x axis).
    ax : matplotlib Axes, optional
        Axes to draw into; the current axes are used when omitted.
    cbar_kw : dict, optional
        Extra keyword arguments forwarded to ``Figure.colorbar``.
    cbarlabel : str, optional
        Label written along the colorbar.
    **kwargs
        Forwarded to ``ax.imshow``.

    Returns
    -------
    (im, cbar)
        The image returned by ``imshow`` and the colorbar attached to it.
    """
    if ax is None:
        ax = plt.gca()

    # A fresh dict per call avoids sharing one mutable default between calls.
    if cbar_kw is None:
        cbar_kw = {}

    # Plot the heatmap
    im = ax.imshow(data, **kwargs)

    # Create colorbar
    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")

    # Show one major tick per row/column and label it with the given entries.
    ax.set_xticks(np.arange(data.shape[1]))
    ax.set_yticks(np.arange(data.shape[0]))
    ax.set_xticklabels(col_labels)
    ax.set_yticklabels(row_labels)

    # Let the horizontal axis labelling appear on top.
    ax.tick_params(top=True, bottom=False,
                   labeltop=True, labelbottom=False)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=-30, ha="right",
             rotation_mode="anchor")

    # Turn spines off and use minor ticks at the cell borders to draw a white grid.
    ax.spines[:].set_visible(False)

    ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True)
    ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True)
    ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
    ax.tick_params(which="minor", bottom=False, left=False)

    return im, cbar


def annotate_heatmap(im, data=None, valfmt="{x:.2f}",
                     textcolors=("black", "white"),
                     threshold=None, **textkw):
    """Write the value of every heatmap cell on top of the image.

    Parameters
    ----------
    im : image object
        Must provide ``norm``, ``get_array()`` and ``axes.text`` (e.g. the result of ``imshow``).
    data : list or numpy.ndarray, optional
        Values to annotate with. Lists are converted to arrays; anything else
        (including ``None``) falls back to ``im.get_array()``.
    valfmt : str or callable, optional
        A ``str.format`` pattern using ``{x}``, or a callable invoked as ``valfmt(value, None)``.
    textcolors : pair of colors
        First entry is used for values at or below ``threshold``, second for values above.
    threshold : float, optional
        Value in data units separating the two colors; defaults to half of the
        normalized maximum of ``data``.
    **textkw
        Forwarded to ``im.axes.text``; may override the centered alignment.

    Returns
    -------
    list
        The objects returned by ``im.axes.text``, in row-major order.
    """
    if isinstance(data, (list, np.ndarray)):
        # Lists pass the type check but have no .max()/.shape, so normalize to an array.
        data = np.asarray(data)
    else:
        data = im.get_array()

    # Normalize the threshold to the image's color range.
    if threshold is not None:
        threshold = im.norm(threshold)
    else:
        threshold = im.norm(data.max())/2.

    # Center the text by default, but let textkw override it.
    kw = dict(horizontalalignment="center",
              verticalalignment="center")
    kw.update(textkw)

    # Turn a format string into a formatter object.
    if isinstance(valfmt, str):
        valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)

    # One text per cell; the color depends on which side of the threshold the value falls.
    texts = []
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
            text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
            texts.append(text)

    return texts

In [None]:
# Rows are sectors, columns are calendar months, cells are mean engagement_index.
sector_labels = list(n_df3["Sector(s)"].unique())
month_labels = sorted_month
sector_month_means = new_list

fig, ax = plt.subplots(figsize=(15,10))

# heatmap() draws the image, the ticks, the tick labels and the colorbar itself, so the
# image and ticks are no longer drawn a second time by hand before calling it.
im, cbar = heatmap(sector_month_means, sector_labels, month_labels, ax=ax,
                   cmap="YlGn", cbarlabel="engagement_index average")

texts = annotate_heatmap(im, valfmt="{x:.1f}")

# The rows come from the "Sector(s)" column, so the title names sectors.
ax.set_title("average of engagement_index every month as per Sector(s)")
fig.tight_layout()
plt.show()

In [None]:
# Distribution of the record dates of the selected products, as filled bars and as a step outline.
n_bins = 10
x = n_df4["time"]
fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1,figsize=(20,10))

# The histogram needs a label, otherwise legend() has nothing to show.
ax0.hist(x, n_bins, density=True, histtype='bar', label="records of selected products")
ax0.legend(prop={'size': 10})
ax0.set_title('bars with legend')

ax1.hist(x, n_bins, histtype='step', stacked=True, fill=False)
ax1.set_title('stack step (unfilled)')

plt.suptitle("Histograms for each month of above given Product")
fig.tight_layout()
plt.show()

In [None]:
# Product name, month number and engagement only.
# NOTE(review): this rebinds n_df6, which an earlier cell uses for the month/sector means.
n_df6 = (
    n_df4
    .drop(columns=["Provider/Company Name","Sector(s)","Primary Essential Function"])
    .assign(time=lambda frame: frame['time'].dt.month)
)

# Mean engagement per (month, product), drawn as grouped bars.
monthly_product_mean = n_df6.groupby(["time","Product Name"]).mean().reset_index()
fig, ax = plt.subplots(figsize=(20,8))
sns.barplot(x="time", y="engagement_index", hue="Product Name",
            data=monthly_product_mean, ax=ax, palette="jet_r")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 0.5, 0.5, 0.5), facecolor="white",
          title="Products from different categories(selected)")
ax.set_xlabel("time in month")
ax.set_ylabel("engagement index mean each month")
ax.set_title("Engagement index each month for different Products")
plt.show()

##  Conclusions:

* Among the selected products, Google Classroom, Meet, Zoom, and Among Us had the highest mean engagement index.
* The highest engagement across all these products came in the 11th month of 2020.
* PreK-12, Higher Ed, and Corporate are the sectors with the most engagement each month in 2020.
* Among providers, Innersloth had the lowest engagement_index each month.
* Most engagement came between April and October 2020, but engagement_index dipped in July.

# *How does student engagement with online learning platforms relate to different geography? Demographic context (e.g., race/ethnicity, ESL, learning disability)? Learning context? Socioeconomic status?*

In [None]:
# District information without the district id and locale columns.
df41 = df2.drop(columns=["district_id", "locale"])
df41

In [None]:
# n_df (defined elsewhere in the notebook) without the access, id and URL columns.
df42 = n_df.drop(columns=["pct_access", "lp_id", "URL"])
df42

In [None]:
# NOTE(review): df_8 is only built in a later cell of the notebook (policy section), so this
# cell fails on a fresh kernel until that cell has been run — consider moving it above.
df_9 = pd.merge(df_8, df41, how="inner", on=["state"])

# Keep the rows that have a policy date. The explicit copy makes df_16 an independent
# frame, so the later assignment to df_16["time"] does not write into a derived slice.
df_16 = df_9.dropna(subset=["time"]).copy()

In [None]:
# Split every non-null policy timestamp on whitespace.
# The previous loop kept every value as well: `i != 0` is always true for text, so its
# "N/A" and else branches never ran. It was also wrapped in a try/except that would have
# cut the list short on a TypeError, leaving it out of step with df_16. One entry per
# non-null row is required because the result is later assigned back to df_16["time"].
zero_list = [stamp.split() for stamp in df_9.dropna(subset=["time"])["time"]]
    

In [None]:
# First whitespace-separated token of every split timestamp.
another_zero_list = [tokens[0] for tokens in zero_list]

In [None]:
# Replace the timestamp text by its first token (one entry per row of df_16).
df_16["time"] = another_zero_list
df_16

In [None]:
# Outer join of the policy dates with the product engagement rows on the date text.
df_10 = df_16.merge(df42, how="outer", on="time")

In [None]:
# Only rows that carry a state.
df_11 = df_10.dropna(subset=['state'])
df_11

In [None]:
# Rows without a single 0 in any column, reduced to three columns and
# ordered from highest to lowest engagement.
rows_without_zero = (df_11 != 0).all(axis=1)
df_0_drop = (
    df_11[rows_without_zero]
    .loc[:, ["emergency", "state", "engagement_index"]]
    .sort_values(by="engagement_index", ascending=False)
)

In [None]:
# Horizontal bars of the mean engagement_index per state.
state_mean_engagement = df_0_drop.groupby("state")["engagement_index"].mean()
state_mean_engagement.plot.barh()

In [None]:
# Side-by-side tables: mean engagement per emergency type and per state.
from IPython.display import display_html

emergency_state_engagement = df_11.loc[:, ["emergency", "state", "engagement_index"]]

# numeric_only=True keeps the mean on engagement_index only; the remaining text column
# cannot be averaged and newer pandas raises on it instead of silently dropping it.
df1_styler = (
    emergency_state_engagement.groupby(["emergency"]).mean(numeric_only=True)
    .style.set_table_attributes("style='display:inline'").set_caption('df1')
)
df2_styler = (
    emergency_state_engagement.groupby(["state"]).mean(numeric_only=True)
    .style.set_table_attributes("style='display:inline'").set_caption('df2')
)

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

The highest average engagement_index occurred when the second COVID-19 hardship period started, and the highest engagement came from the state of Florida.


In [None]:
# Mean engagement per state; numeric_only=True leaves the text column out of the mean
# (newer pandas raises on it instead of silently dropping it).
df13 = (
    df_11.loc[:, ["emergency", "state", "engagement_index"]]
    .groupby(["state"])
    .mean(numeric_only=True)
)

fig, ax = plt.subplots(figsize=(20,10))
state_axes = df13.plot.bar(
    title="engagements index for state during covid emergencies for top selected item as per engagement index",
    ax=ax,
)
# Reference line: overall mean engagement of the selected products (n_df2).
state_axes.axhline(y=n_df2["engagement_index"].mean(), color="r",
                   label="meanline of engagement index")
plt.legend(facecolor="white")
plt.show()

In [None]:
# Join child population by race with the first 100 rows of df_s_e on state, drop exact
# duplicates, and add engagement divided by the "Data" column.
df_new = (
    df_child_race
    .merge(df_s_e.head(100), how="inner", on=["state"])
    .drop_duplicates()
    .assign(engagement_race_count=lambda merged: merged["engagement_index"] / merged["Data"])
)
df_new

In [None]:
# "race count" is the population of a race; the y axis is engagement divided by that count.
plt.figure(figsize=(15, 8))
sns.set_style("darkgrid")
sns.violinplot(
    data=df_new,
    x="state",
    y="engagement_race_count",
    hue="Race",
)
plt.title("engagement per race_count for each state")
plt.show()

* New Hampshire ranks top in the ratio of engagement_index to the student population count by race, for engagement with the products listed in n_df2.

## Conclusions:

* In most states, student engagement was above the average during the COVID-19 emergency period.
 
* People used Google Classroom and Canvas during COVID-19 restrictions such as closed gyms, closed restaurants, and closed movie theaters.
 
* engagement index was quite high for Google classroom as that of canvas.

* By geography, engagement_index with online platforms was high for Florida and Wisconsin.

* The engagement index was highest during the second COVID-19 hardship period.

# *Do certain state interventions, practices or policies (e.g., stimulus, reopening, eviction moratorium) correlate with the increase or decrease online engagement?*

In [None]:
# State policy workbook (first sheet).
df4 = pd.read_excel(
    "../input/dataset-1/COVID-19 US state policy database 3_29_2021.xlsx"
)
df4.head()

In [None]:
# Distinct values of the "category" entry after transposing the STATE-indexed sheet.
df4.set_index('STATE').transpose().category.unique()

In [None]:
# Row 0 provides the column names; rows 0, 2 and 3 are then removed.
# NOTE(review): this cell rewrites df4 in place, so it is not safe to run twice.
header_row = df4.loc[0, :]
df4.columns = header_row
df4 = df4.drop(index=[0, 2, 3])
df4.head()

In [None]:
# Index by the 'State' column, then transpose.
df4 = df4.set_index('State').transpose()

In [None]:
# Number of rows per policy category.
sns.countplot(data=df4, x="category", palette="jet_r")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Keep only the policy categories analysed below.
selected_categories = [
    'state_of_emergency', 'Reopening', 'reopening', "quarantines",
    'physical_distance_closures', 'second_closures', "vaccine",
    'third_closures', 'housing', 'unemployment',
]
df4 = df4[df4["category"].isin(selected_categories)]

In [None]:
# Transpose back after the category filter.
df4 = df4.transpose()

In [None]:
# Policy columns kept for the analysis, in the order used downstream.
policy_columns = [
    "State of emergency issued",
    "State of emergency expired",
    "Reopen day cares",
    "Began to reopen businesses",
    "Closed restaurants except take out",
    "Reopen restaurants",
    "Closed gyms",
    "Reopened gyms",
    "Closed movie theaters",
    "Reopened movie theaters",
    "Mandate quarantine for those entering the state from specific settings",
    "Date all mandated quarantines ended",
    "Date vaccine allocation plan last updated",
    "Overall eviction moratorium start",
    "Overall eviction moratorium expiration",
    "Second overall eviction moratorium start",
    "Second overall eviction moratorium end",
    "Freeze enforcement of evictions",
    "Lift freeze of eviction enforcement",
    "COVID-19 hardship start",
    "COVID-19 hardship expiration",
    "Second COVID-19 hardship start",
    "Second COVID-19 hardship expiration",
    "Waived one week waiting period for unemployment insurance",
    "Reinstated one week waiting period for unemployment insurance",
    "Extended Benefits program activated",
    "Extended Benefits program deactivated",
]
df5 = df4[policy_columns]

In [None]:
# Remove the "category" row.
df6 = df5.drop(index=["category"])

In [None]:
# Transpose, move the index into a column, and carry the column labelled 0 over as
# "emergency" (appended as the last column).
df_7 = df6.T.reset_index()
df_7 = df_7.assign(emergency=df_7[0]).drop(columns=[0])
df_7.head()

In [None]:
# Long format: one row per (emergency, state) with its date stored as text.
df_8 = (
    df_7
    .melt(id_vars=['emergency'], var_name='state', value_name='time')
    .astype({"time": "string"})
)

df_8

In [None]:
# Mean engagement per emergency / policy type as horizontal bars.
fig, ax = plt.subplots(figsize=(15,8))

# numeric_only=True leaves the text column out of the mean; newer pandas raises on it
# instead of silently dropping it.
emergency_mean_engagement = (
    df_11.loc[:, ["emergency", "state", "engagement_index"]]
    .groupby(["emergency"])
    .mean(numeric_only=True)
)
emergency_mean_engagement.plot.barh(
    color="teal",
    title="engagement_index during these emergency situation",
    ax=ax,
)
plt.legend(facecolor="white")

# Hide the frame around the bars.
for side in ('left', 'bottom', 'right', 'top'):
    ax.spines[side].set_visible(False)
plt.show()

In [None]:
# Treemap of the 100 highest mean engagement values per (emergency, state) pair, i.e. which
# state had the highest engagement under which intervention, practice or policy.
# numeric_only=True restricts the mean to numeric columns; newer pandas raises on text columns.
top_emergency_state = (
    df_11.groupby(["emergency", "state"])
    .mean(numeric_only=True)
    .sort_values(by="engagement_index", ascending=False)
    .head(100)
    .reset_index()
)
fig = px.treemap(
    top_emergency_state,
    path=["state", "emergency"],
    values='engagement_index',
    color='state',
    color_continuous_scale='RdBu',
    color_continuous_midpoint=np.average(df_11['engagement_index']),
    title="state has highest engagement_index due to which(state interventions, practices or policies)",
)
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [None]:
# Daily engagement_index against the running observation number, with month ticks.
fig, ax = plt.subplots(figsize=(24,8))
y1 = merged_df_ontime_2["engagement_index"]

# One x position per observation, derived from the data rather than a fixed count.
x = range(1, len(y1) + 1)

# Day-of-year on which each month of 2020 starts.
month_starts = [1,32,61,92,122,153,183,214,245,275,306,336]
month_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

# Draw the series once, labelled, on the same x positions the month ticks refer to.
ax.plot(x, y1.to_numpy(), label="engagement_index per day")

ax.set_xticks(month_starts)
ax.set_xticklabels(month_names)
plt.legend(loc='upper left')
plt.show()

# Parse every policy column as dates; values that cannot be parsed become NaT.
df6 = df6.apply(pd.to_datetime, errors='coerce')

# Earliest and latest date per policy. Series.min()/max() skip NaT, whereas the builtin
# min()/max() give an order-dependent result as soon as a NaT is present.
dicts = [
    {"Task": str(policy), "Start": df6[policy].min(), "End": df6[policy].max()}
    for policy in df6.columns.tolist()
]
source = pd.DataFrame(dicts)

# One horizontal bar per policy, from its first to its last date.
alt.Chart(source).mark_bar().encode(
    x='Start',
    x2='End',
    y='Task'
).properties(width=1000,height=500)

In [None]:
# One Gantt task per policy column, spanning its earliest to its latest date.
# The earlier comparisons against the text "NaT" never matched and their `==` statements
# assigned nothing, so missing dates are now handled explicitly.
empty_list = []
for policy in df6.columns:
    policy_dates = df6[policy].dropna()
    if policy_dates.empty:
        # No state has a date for this policy, so there is no span to draw.
        continue
    empty_list.append(dict(Task=str(policy),
                           Start=policy_dates.min(),
                           Finish=policy_dates.max(),
                           Resource=str(policy)))

df = empty_list

fig = ff.create_gantt(df,title="Gnatt chart for state interventions, practices or policies during Covid 2020",group_tasks=True, width=1500, height=800)

fig.show()

In [None]:
# Mean engagement per emergency type. numeric_only=True leaves the text column out of the
# mean; newer pandas raises on it instead of silently dropping it.
df_gorb = (
    df_11.loc[:, ["emergency", "state", "engagement_index"]]
    .groupby(["emergency"])
    .mean(numeric_only=True)
    .reset_index()
)

# Flag emergencies whose name contains one of the keywords below.
# NOTE(review): the keyword list also contains 'Closed' and 'Freeze'; confirm these are
# meant to count as "good news".
df_gorb["Good_News"] = df_gorb["emergency"].str.contains('reopen|Began|expiration|Closed|ended|updated|activated|Freeze|enforcement|Reopen', regex=True)

In [None]:
# Average engagement for emergencies flagged / not flagged as good news, drawn as bars
# and as a line on the same axes.
color = ["red", "green"]
fig, ax = plt.subplots()

# Aggregate once; numeric_only=True leaves text columns out of the mean (newer pandas
# raises on them instead of silently dropping them).
good_news_mean = df_gorb.groupby("Good_News").mean(numeric_only=True)

good_news_mean.plot.bar(color=color[0],
                        title="Barplot for avg.of engagement_index during start of Good News",
                        ax=ax)
good_news_mean.plot(color=color[1],
                    title="lineplot for avg.of engagement_index during start of Good News",
                    ax=ax)
plt.show()

## Conclusions:

* After state interventions, practices, or policies from the government, the engagement index also started to rise.

* The average engagement index recorded during state interventions, practices, policies, or emergencies was above the daily average engagement_index.