In [1]:
import pandas as pd
pd.set_option('display.max_row', 100)
pd.options.mode.chained_assignment = None

import numpy as np

from datetime import datetime
from datetime import timedelta

import altair as alt
from altair import datum
import altair_viewer
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [None]:
"Vicens' second attempt at exploration and wrangling. The wrangling done here was incorrect, and improved upon in EricWrangling.ipynb"

# 1. Describe the data set

The Canada Emergency Wage Subsidy Regional and Community-level Database (the database) is a custom dataset constructed with CEWS microdata and other administrative data sources available at Statistics Canada.

In terms of geographic levels, the database provides values for Canada, province/territory, rural and urban breakdowns by province/territory, census metropolitan area (CMA), census agglomeration (CA), and census subdivision (CSD). These values are provided at the total industry level and by 2- and 3-digit North American Industry Classification System (NAICS) industries.

The main variables included in the database are total CEWS supported employment and total CEWS claim dollar amounts. To generate this regional and community-level database, CEWS microdata were first aggregated to the business enterprise level, and subsequently allocated to the business locations of the enterprise.

# 2. Load the data set

In [2]:
cews = pd.read_csv('CEWS_SSUC_DB_En_v1.0.csv', encoding = "ISO-8859-1") 

FileNotFoundError: [Errno 2] No such file or directory: 'CEWS_SSUC_DB_En_v1.0.csv'

# 3. Explore the data set

In [3]:
cews.head(3)

Unnamed: 0,Start_date_of_CEWS_period,RegionCode,RegionName,RuralUrbanFlag,CMACAFlag,IndustryCode,IndustryName,Number_business_locations,Subsidy_amount,Supported_employees,CEWS_rehire_count
0,2020-03-15,10,Newfoundland and Labrador,Not applicable,Not applicable,11,"Agriculture, forestry, fishing and hunting",30,823000,362,0
1,2020-03-15,10,Newfoundland and Labrador,Not applicable,Not applicable,111,Crop production,10,X,90,0
2,2020-03-15,10,Newfoundland and Labrador,Not applicable,Not applicable,112,Animal production and aquaculture,10,X,X,0


In [4]:
cews.tail(3)

Unnamed: 0,Start_date_of_CEWS_period,RegionCode,RegionName,RuralUrbanFlag,CMACAFlag,IndustryCode,IndustryName,Number_business_locations,Subsidy_amount,Supported_employees,CEWS_rehire_count
562488,2020-09-27,urban,Canada,URBAN,Not applicable,722,Food services and drinking places,23865,194079000,453171,2700
562489,2020-09-27,urban,Canada,URBAN,Not applicable,99,Other and Missing NAICS,21770,162310000,197676,1285
562490,2020-09-27,urban,Canada,URBAN,Not applicable,TOTAL,All Industries,178890,2373105000,2861910,11335


In [5]:
cews.sample(3)

Unnamed: 0,Start_date_of_CEWS_period,RegionCode,RegionName,RuralUrbanFlag,CMACAFlag,IndustryCode,IndustryName,Number_business_locations,Subsidy_amount,Supported_employees,CEWS_rehire_count
448482,2020-08-30,24,Quebec,Not applicable,Not applicable,321,Wood product manufacturing,240,5629000,7824,5
21766,2020-03-15,2458033,Boucherville,URBAN,Not applicable,53,Real estate and rental and leasing,15,81000,34,0
59824,2020-03-15,5901039,Invermere,RURAL,Not applicable,56,"Administrative and support, waste management a...",0,X,X,0


In [6]:
cews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562491 entries, 0 to 562490
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   Start_date_of_CEWS_period  562491 non-null  object
 1   RegionCode                 562491 non-null  object
 2   RegionName                 562491 non-null  object
 3   RuralUrbanFlag             562491 non-null  object
 4   CMACAFlag                  562491 non-null  object
 5   IndustryCode               562491 non-null  object
 6   IndustryName               562491 non-null  object
 7   Number_business_locations  562491 non-null  object
 8   Subsidy_amount             562491 non-null  object
 9   Supported_employees        562491 non-null  object
 10  CEWS_rehire_count          562491 non-null  object
dtypes: object(11)
memory usage: 47.2+ MB


In [7]:
cews.describe()

Unnamed: 0,Start_date_of_CEWS_period,RegionCode,RegionName,RuralUrbanFlag,CMACAFlag,IndustryCode,IndustryName,Number_business_locations,Subsidy_amount,Supported_employees,CEWS_rehire_count
count,562491,562491,562491,562491,562491,562491,562491,562491,562491,562491,562491
unique,8,3597,3340,3,3,108,108,1766,16468,12991,565
top,2020-04-12,35,Canada,URBAN,Not applicable,TOTAL,All Industries,0,X,X,0
freq,77108,864,2585,291820,494555,26772,26772,258810,379698,374068,484306


In [8]:
cews.shape

(562491, 11)

In [9]:
cews['Start_date_of_CEWS_period'].value_counts()

2020-04-12    77108
2020-05-10    76960
2020-07-05    71729
2020-06-07    71669
2020-03-15    71266
2020-08-02    69500
2020-08-30    65319
2020-09-27    58940
Name: Start_date_of_CEWS_period, dtype: int64

In [None]:
cews['Number_business_locations'].value_counts(normalize=True)

In [11]:
cews[cews['Number_business_locations']=='0'].sample(3)

Unnamed: 0,Start_date_of_CEWS_period,RegionCode,RegionName,RuralUrbanFlag,CMACAFlag,IndustryCode,IndustryName,Number_business_locations,Subsidy_amount,Supported_employees,CEWS_rehire_count
421844,2020-08-02,4805012,Wheatland County,RURAL,Not applicable,515,Broadcasting (except Internet),0,X,X,0
3217,2020-03-15,1102052,Lot 34,URBAN,Not applicable,112,Animal production and aquaculture,0,X,X,0
198263,2020-05-10,35595,Thunder Bay,URBAN,CMA,213,"Support activities for mining, and oil and gas...",0,X,X,0


In [12]:
cews['Subsidy_amount'].value_counts(normalize=True)

X              0.675029
0              0.010516
17,000         0.001916
20,000         0.001888
22,000         0.001865
                 ...   
14,214,000     0.000002
47,354,000     0.000002
4,567,000      0.000002
13,817,000     0.000002
104,049,000    0.000002
Name: Subsidy_amount, Length: 16468, dtype: float64

In [13]:
cews[cews['Subsidy_amount']=='X'].sample(3)

Unnamed: 0,Start_date_of_CEWS_period,RegionCode,RegionName,RuralUrbanFlag,CMACAFlag,IndustryCode,IndustryName,Number_business_locations,Subsidy_amount,Supported_employees,CEWS_rehire_count
516600,2020-09-27,2434065,Saint-Marc-des-Carrières,RURAL,Not applicable,327,Non-metallic mineral product manufacturing,5,X,X,0
275281,2020-06-07,4704048,Maple Creek,RURAL,Not applicable,713,"Amusement, gambling and recreation industries",0,X,X,0
226652,2020-06-07,1001559,Witless Bay,URBAN,Not applicable,31-33,Manufacturing,0,X,X,0


In [14]:
cews['Supported_employees'].value_counts(normalize=True)

X         0.665020
0         0.010779
12        0.003127
13        0.003095
14        0.003052
            ...   
7,475     0.000002
48,730    0.000002
4,688     0.000002
11,033    0.000002
27,567    0.000002
Name: Supported_employees, Length: 12991, dtype: float64

In [15]:
cews[cews['Supported_employees']=='X'].sample(3)

Unnamed: 0,Start_date_of_CEWS_period,RegionCode,RegionName,RuralUrbanFlag,CMACAFlag,IndustryCode,IndustryName,Number_business_locations,Subsidy_amount,Supported_employees,CEWS_rehire_count
12653,2020-03-15,2417055,Saint-Aubert,RURAL,Not applicable,722,Food services and drinking places,0,X,X,0
144287,2020-04-12,5951015,Bulkley-Nechako C,RURAL,Not applicable,51,Information and cultural industries,0,X,X,0
5482,2020-03-15,1210012,"Colchester, Subd. A",RURAL,Not applicable,621,Ambulatory health care services,0,X,X,0


In [5]:
cews['CEWS_rehire_count'].value_counts(normalize=True)

0         0.861002
5         0.087699
10        0.015524
15        0.007540
20        0.004647
            ...   
3,480     0.000002
34,595    0.000002
2,395     0.000002
32,480    0.000002
8,030     0.000002
Name: CEWS_rehire_count, Length: 565, dtype: float64

In [17]:
cews[cews['CEWS_rehire_count']=='0'].sample(3)

Unnamed: 0,Start_date_of_CEWS_period,RegionCode,RegionName,RuralUrbanFlag,CMACAFlag,IndustryCode,IndustryName,Number_business_locations,Subsidy_amount,Supported_employees,CEWS_rehire_count
8108,2020-03-15,1307045,Dieppe,URBAN,Not applicable,447,Gasoline stations,0,X,X,0
508928,2020-09-27,1217030,Cape Breton,URBAN,Not applicable,446,Health and personal care stores,5,X,X,0
208489,2020-05-10,4807054,Wainwright,RURAL,Not applicable,453,Miscellaneous store retailers,5,13000,8,0


# 4. Initial thoughts

The database contains variables on the amount of approved CEWS claims and businesses supported, number of CEWS supported and rehired employees, among other variables of interest. Data are available at sub-provincial levels of geography, notably rural and urban breakdowns, and by industry sectors and subsectors.

1. The ***CEWS_rehire_count*** column/attribute has 46% observations with a value of 0.
2. The ***Subsidy_amount*** column/attribute has 68% of observations suppressed to meet the confidentiality requirements of the Statistics Act. These observations are marked by an 'X' value.
3. The ***Supported_employees*** column/attribute has 67% observations suppressed to meet the confidentiality requirements of the Statistics Act. These observations are marked by an 'X' value.
4. The ***CEWS_rehire_count*** column/attribute has 86% observations with a value of 0.
5. The ***Start_date_of_CEWS_period*** column/attribute only has information of 8 4-week periods, from March 15th, 2020 to Septemeber 27, 2020.
6. The ***RegionCodE, RegionName, RuralUrbanFlag*** and ***CMACAFlag*** columns/attributes comprise a hierarchical coding system that provides a unique identifier for each level within the geographic hierarchy. The geographic levels used are: Canada provinces and territories, census metropolitan areas/census agglomerations (CMA/CAs), and census subdivisions (CSDs). Aggregation to Canada's province and territory in ***RegionCode** and **RegionName** needs to be performed.
7. The ***IndustryCode*** and ***IndustryName*** columns/attributes show industry levels at industry sector (2-digit NAICS) and subsector (3-digit NAICS). Some values are aggregated by 'All industries' (IndustryCode = TOTAL).  

Data types are all strings.

# 5. Data wranlging

In [6]:
# Creating a copy of the database to work on and handle new values
cl_cews = cews.replace('X', np.NaN)

# Renaming columns to more familiar names
cl_cews.rename(columns={'Start_date_of_CEWS_period': 'Period',
                        'RegionCode' : 'Region Code', 
                        'RegionName' : 'Region', 
                        'RuralUrbanFlag' : 'Geographical Classification',
                        'CMACAFlag' : 'Census Level', 
                        'IndustryCode': 'Industry Code', 
                        'IndustryName': 'Industry', 
                        'Number_business_locations' : 'Business Locations', 
                        'Subsidy_amount': 'Subsidy', 
                        'Supported_employees' : 'Supported Employees', 
                        'CEWS_rehire_count': 'Rehire Employees'}, inplace=True)

# Correcting data types
num_cols = ['Business Locations', 'Subsidy', 'Supported Employees', 'Rehire Employees']
cl_cews[num_cols] = cl_cews[num_cols].replace(',', '', regex=True).astype(np.float)
cl_cews.Period = pd.to_datetime(cl_cews.Period, infer_datetime_format=True)

# Re-maping code values for analysis and changing data types
dic_ind = {'31-33' : '31', '44-45' : '44', '48-49' : '48', 'TOTAL' : '1'}
cl_cews['Industry Code'].replace(dic_ind, inplace=True)
cl_cews['Industry Code'] = cl_cews['Industry Code'].astype(int)

cl_cews['Region Code'] = cl_cews['Region Code'].str.split('-').str[0].str.rstrip()
dic_reg = {'TOTAL' : '1', 'rural' : '2', 'urban' : '3'}
cl_cews['Region Code'].replace(dic_reg, inplace=True)
cl_cews['Region Code'] = cl_cews['Region Code'].astype(int)

cl_cews['Region'] = cl_cews['Region'].str.split('-').str[0].str.rstrip()

In [7]:
cl_cews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562491 entries, 0 to 562490
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   Period                       562491 non-null  datetime64[ns]
 1   Region Code                  562491 non-null  int32         
 2   Region                       562491 non-null  object        
 3   Geographical Classification  562491 non-null  object        
 4   Census Level                 562491 non-null  object        
 5   Industry Code                562491 non-null  int32         
 6   Industry                     562491 non-null  object        
 7   Business Locations           562491 non-null  float64       
 8   Subsidy                      182793 non-null  float64       
 9   Supported Employees          188423 non-null  float64       
 10  Rehire Employees             562491 non-null  float64       
dtypes: datetime64[ns](1), floa

In [8]:
cl_cews.head(3)

Unnamed: 0,Period,Region Code,Region,Geographical Classification,Census Level,Industry Code,Industry,Business Locations,Subsidy,Supported Employees,Rehire Employees
0,2020-03-15,10,Newfoundland and Labrador,Not applicable,Not applicable,11,"Agriculture, forestry, fishing and hunting",30.0,823000.0,362.0,0.0
1,2020-03-15,10,Newfoundland and Labrador,Not applicable,Not applicable,111,Crop production,10.0,,90.0,0.0
2,2020-03-15,10,Newfoundland and Labrador,Not applicable,Not applicable,112,Animal production and aquaculture,10.0,,,0.0


In [9]:
cl_cews.isnull().sum()

Period                              0
Region Code                         0
Region                              0
Geographical Classification         0
Census Level                        0
Industry Code                       0
Industry                            0
Business Locations                  0
Subsidy                        379698
Supported Employees            374068
Rehire Employees                    0
dtype: int64

# 6. Research questions

Identify trends across numerous levels of geographies, industries, and week-periods of claims with a focus on determining the CEWS outcomes in avoiding layoffs, rehiring employees, and creating new jobs.

# 7. Data analysis and visualizations

## Total geographical classification aggregation

In [10]:
cews_totals = cl_cews.loc[(cl_cews['Region'] == 'Canada') & (cl_cews['Industry Code'] == 1) & (cl_cews['Geographical Classification'] != 'Not applicable')].reset_index(drop=True)
cews_totals.drop(columns=['Region Code', 'Region', 'Census Level', 'Industry Code', 'Industry'], inplace=True)
cews_totals[num_cols] = cews_totals[num_cols].astype(int)

In [11]:
cews_totals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Period                       16 non-null     datetime64[ns]
 1   Geographical Classification  16 non-null     object        
 2   Business Locations           16 non-null     int32         
 3   Subsidy                      16 non-null     int32         
 4   Supported Employees          16 non-null     int32         
 5   Rehire Employees             16 non-null     int32         
dtypes: datetime64[ns](1), int32(4), object(1)
memory usage: 640.0+ bytes


In [12]:
cews_totals.head(3)

Unnamed: 0,Period,Geographical Classification,Business Locations,Subsidy,Supported Employees,Rehire Employees
0,2020-03-15,RURAL,36135,688658000,326654,3380
1,2020-03-15,URBAN,257935,-2147483648,3162560,27140
2,2020-04-12,RURAL,43440,890231000,382773,4350


In [13]:
title = alt.Chart(cews_totals).mark_text(
            ).properties(title={'text': 'CEWS Evolution', 'subtitle': 'A negative trend is visible in all variables',
                                "subtitleFontWeight": 900, "subtitleFontSize": 28})

evolution = alt.Chart(cews_totals).mark_area(point=True).encode(
                x=alt.X('Period:T', title=None),
                y=alt.Y(alt.repeat(), type='quantitative', aggregate='sum', 
                        axis=alt.Axis(titleAngle=0, titleAnchor='start', titleY=-10)),
                color=alt.Color('Geographical Classification:N', legend=alt.Legend(orient='bottom')),
                tooltip=[alt.Tooltip('Period:T'),
                        alt.Tooltip(alt.repeat(), type='quantitative'),
                        alt.Tooltip('Geographical Classification:N')]
                        ).repeat(num_cols, columns=2)

(title & evolution).configure_title(fontSize=32, fontWeight=900, anchor='start'
                                   ).configure_axis(domain=False, grid=False, ticks=False
                                                   ).configure_view(stroke="transparent")

## Province and territories and industry sector aggregation

In [14]:
pt_is_cews = cl_cews.loc[(cl_cews['Region Code'] < 100) & (cl_cews['Industry Code'] < 100)].reset_index(drop=True)

In [15]:
pt_is_cews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6270 entries, 0 to 6269
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Period                       6270 non-null   datetime64[ns]
 1   Region Code                  6270 non-null   int32         
 2   Region                       6270 non-null   object        
 3   Geographical Classification  6270 non-null   object        
 4   Census Level                 6270 non-null   object        
 5   Industry Code                6270 non-null   int32         
 6   Industry                     6270 non-null   object        
 7   Business Locations           6270 non-null   float64       
 8   Subsidy                      4886 non-null   float64       
 9   Supported Employees          5044 non-null   float64       
 10  Rehire Employees             6270 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int32(2),

In [16]:
pt_is_cews.head(3)

Unnamed: 0,Period,Region Code,Region,Geographical Classification,Census Level,Industry Code,Industry,Business Locations,Subsidy,Supported Employees,Rehire Employees
0,2020-03-15,10,Newfoundland and Labrador,Not applicable,Not applicable,11,"Agriculture, forestry, fishing and hunting",30.0,823000.0,362.0,0.0
1,2020-03-15,10,Newfoundland and Labrador,Not applicable,Not applicable,21,"Mining, quarrying, and oil and gas extraction",35.0,3562000.0,1158.0,0.0
2,2020-03-15,10,Newfoundland and Labrador,Not applicable,Not applicable,22,Utilities,0.0,,,0.0
