# ONTARIO SCHOOLS WITH COVID-19 CASES ANALYSIS
## Peter Stangolis

#### [Data URL:](https://data.ontario.ca/dataset/b1fef838-8784-4338-8ef9-ae7cfd405b41/resource/7fbdbb48-d074-45d9-93cb-f7de58950418/download/schoolcovidsummary.csv) 

### Import the required libraries for the analysis

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from matplotlib.pyplot import figure

from pylab import rcParams
#rcParams['figure.figsize'] = 18, 6

import datetime
import seaborn as sns

In [2]:
# Graph Style Settings:

plt.figure(figsize=(7, 4))

plt.style.use(['default', 'ggplot'])

%matplotlib inline


### Import the data set into a dataframe

In [3]:
url = 'https://data.ontario.ca/dataset/b1fef838-8784-4338-8ef9-ae7cfd405b41/resource/7fbdbb48-d074-45d9-93cb-f7de58950418/download/schoolcovidsummary.csv'

df = pd.read_csv(url)

# View the last 5 rows of the data set
df.tail()

Unnamed: 0,collected_date,reported_date,current_schools_w_cases,current_schools_closed,current_total_number_schools,new_total_school_related_cases,new_school_related_student_cases,new_school_related_staff_cases,new_school_related_unspecified_cases,recent_total_school_related_cases,...,recent_school_related_unspecified_cases,past_total_school_related_cases,past_school_related_student_cases,past_school_related_staff_cases,past_school_related_unspecified_cases,cumulative_school_related_cases,cumulative_school_related_student_cases,cumulative_school_related_staff_cases,cumulative_school_related_unspecified_cases,Unnamed: 21
45,2020-11-13,2020-11-16,683,1,4828,106,63,14,29.0,1143.0,...,354.0,2244.0,1249.0,298.0,697.0,3387,1913,423,1051.0,
46,2020-11-16,2020-11-17,670,1,4828,133,73,23,37.0,1126.0,...,352.0,2392.0,1342.0,314.0,736.0,3518,1985,445,1088.0,
47,2020-11-17,2020-11-18,670,3,4828,109,92,17,0.0,1121.0,...,324.0,2505.0,1419.0,322.0,764.0,3626,2077,461,1088.0,
48,2020-11-18,2020-11-19,680,3,4828,91,75,16,0.0,1143.0,...,299.0,2567.0,1451.0,328.0,788.0,3710,2147,476,1087.0,
49,2020-11-19,2020-11-20,681,3,4828,87,60,27,0.0,1150.0,...,271.0,2653.0,1503.0,334.0,816.0,3803,2209,507,1087.0,


## Initial Exploratory Data Analysis of the data set

In [4]:
# retrieve information concerning the data in columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 22 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   collected_date                               50 non-null     object 
 1   reported_date                                50 non-null     object 
 2   current_schools_w_cases                      50 non-null     int64  
 3   current_schools_closed                       50 non-null     int64  
 4   current_total_number_schools                 50 non-null     int64  
 5   new_total_school_related_cases               50 non-null     int64  
 6   new_school_related_student_cases             50 non-null     int64  
 7   new_school_related_staff_cases               50 non-null     int64  
 8   new_school_related_unspecified_cases         48 non-null     float64
 9   recent_total_school_related_cases            36 non-null     float64
 10  rece

### Change the datatype of the 'reported_date' column to datetime from object

In [5]:
df['reported_date'] = pd.to_datetime(df['reported_date'])


In [6]:
df['reported_date'].head()

0   2020-09-11
1   2020-09-14
2   2020-09-15
3   2020-09-16
4   2020-09-17
Name: reported_date, dtype: datetime64[ns]

In [7]:
df.describe()

Unnamed: 0,current_schools_w_cases,current_schools_closed,current_total_number_schools,new_total_school_related_cases,new_school_related_student_cases,new_school_related_staff_cases,new_school_related_unspecified_cases,recent_total_school_related_cases,recent_school_related_student_cases,recent_school_related_staff_cases,recent_school_related_unspecified_cases,past_total_school_related_cases,past_school_related_student_cases,past_school_related_staff_cases,past_school_related_unspecified_cases,cumulative_school_related_cases,cumulative_school_related_student_cases,cumulative_school_related_staff_cases,cumulative_school_related_unspecified_cases,Unnamed: 21
count,50.0,50.0,50.0,50.0,50.0,50.0,48.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,50.0,50.0,50.0,48.0,0.0
mean,415.88,1.96,4828.0,77.2,44.98,10.2,22.9375,832.305556,476.111111,99.388889,256.75,1162.805556,646.805556,171.027778,344.944444,1476.68,827.46,203.22,464.604167,
std,214.336341,1.538088,0.0,43.238305,26.881061,6.543419,15.213944,216.242565,128.196674,24.108419,77.833293,825.850205,470.260768,102.336277,254.324084,1211.690663,694.32502,148.194755,366.279264,
min,13.0,0.0,4828.0,0.0,0.0,0.0,0.0,386.0,212.0,59.0,114.0,61.0,21.0,23.0,17.0,13.0,4.0,9.0,6.0,
25%,257.25,1.0,4828.0,51.25,29.0,5.0,12.0,679.75,396.75,84.75,184.5,443.0,228.0,82.0,133.0,315.75,170.0,50.5,125.5,
50%,484.0,2.0,4828.0,74.0,41.0,9.0,22.5,889.5,480.5,99.0,290.5,1020.5,582.0,162.5,276.0,1276.0,713.0,198.5,384.0,
75%,581.0,3.0,4828.0,105.25,59.0,13.75,31.25,936.5,532.0,107.75,318.75,1833.0,1027.25,256.75,549.0,2447.0,1378.75,311.25,770.0,
max,683.0,5.0,4828.0,198.0,116.0,27.0,65.0,1150.0,706.0,173.0,354.0,2653.0,1503.0,334.0,816.0,3803.0,2209.0,507.0,1088.0,


### Missing Value Analysis

In [8]:
df.isnull().sum()

collected_date                                  0
reported_date                                   0
current_schools_w_cases                         0
current_schools_closed                          0
current_total_number_schools                    0
new_total_school_related_cases                  0
new_school_related_student_cases                0
new_school_related_staff_cases                  0
new_school_related_unspecified_cases            2
recent_total_school_related_cases              14
recent_school_related_student_cases            14
recent_school_related_staff_cases              14
recent_school_related_unspecified_cases        14
past_total_school_related_cases                14
past_school_related_student_cases              14
past_school_related_staff_cases                14
past_school_related_unspecified_cases          14
cumulative_school_related_cases                 0
cumulative_school_related_student_cases         0
cumulative_school_related_staff_cases           0


### Drop the columns not required for the analysis 


* collected_date                                  
* recent_total_school_related_cases  
* recent_school_related_student_cases            
* recent_school_related_staff_cases              
* recent_school_related_unspecified_cases        
* past_total_school_related_cases                
* past_school_related_student_cases              
* past_school_related_staff_cases                
* past_school_related_unspecified_cases          
* Unnamed: 21



In [9]:
df = df.iloc[:,  [1,2,3,4,5,6,7,8,9, 17, 18, 19, 20]]

In [10]:
df.tail()

Unnamed: 0,reported_date,current_schools_w_cases,current_schools_closed,current_total_number_schools,new_total_school_related_cases,new_school_related_student_cases,new_school_related_staff_cases,new_school_related_unspecified_cases,recent_total_school_related_cases,cumulative_school_related_cases,cumulative_school_related_student_cases,cumulative_school_related_staff_cases,cumulative_school_related_unspecified_cases
45,2020-11-16,683,1,4828,106,63,14,29.0,1143.0,3387,1913,423,1051.0
46,2020-11-17,670,1,4828,133,73,23,37.0,1126.0,3518,1985,445,1088.0
47,2020-11-18,670,3,4828,109,92,17,0.0,1121.0,3626,2077,461,1088.0
48,2020-11-19,680,3,4828,91,75,16,0.0,1143.0,3710,2147,476,1087.0
49,2020-11-20,681,3,4828,87,60,27,0.0,1150.0,3803,2209,507,1087.0


### Save the modified dataframe to a new csv file

In [11]:
df.to_csv("C:/Users/P/data_analyst_role/Ontario_Schools_COVID-19_Analysis_2020/ontario_schools_covid19_analysis/ont_school_covid19_101.csv", 
          index = False, header = True)