### HR Analytics Feature Importances

#### Set source path to import code

In [1]:
%pwd

'C:\\Users\\t0272m1\\Projects\\HR Analytics'

In [2]:
source_path = 'E:\HR-Analytics\source'
source_path

'E:\\HR-Analytics\\source'

In [3]:
import os
os.chdir(source_path)
%pwd

'E:\\HR-Analytics\\source'

In [4]:
ls

 Volume in drive E is DATA
 Volume Serial Number is AEFF-63BE

 Directory of E:\HR-Analytics\source

06/04/2019  02:09 PM    <DIR>          .
06/04/2019  02:09 PM    <DIR>          ..
05/02/2019  11:01 AM                 2 __init__.py
05/16/2019  11:42 AM             8,585 __main__.py
06/04/2019  06:24 PM    <DIR>          __pycache__
05/31/2019  02:33 PM            16,184 base_table.py
06/04/2019  06:26 PM            26,135 calendrical.py
06/04/2019  02:22 PM             2,210 config_jnap.yml
06/04/2019  02:21 PM            10,380 config_shap.yml
06/03/2019  11:21 AM             4,713 config_shap2.yml
06/03/2019  11:23 AM             2,210 config_tac.yml
05/01/2019  11:07 AM               498 config_wap.yml
05/17/2019  09:09 AM             6,131 database.py
06/04/2019  02:27 PM             8,585 main.py
06/04/2019  06:18 PM            23,878 model.py
04/30/2019  03:23 PM             2,842 pipeline.yml
05/01/2019  02:09 PM             3,673 pipeline_brap.py
05/01/2019  02:08 PM        

#### Imports

In [5]:
import calendar
import datetime
import itertools
import jaydebeapi as jdb
import json
import math
import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2
import random
import requests
import seaborn as sns
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
import statsmodels as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
import urllib.request

In [6]:
# Internal Python Packages
import calendrical
from database import connect_greenplum
from database import write_frame_to_pg
from database import create_sqlalchemy_engine
from main import get_pipeline_config
from main import get_plant_config

#### Get specifications

In [7]:
pipeline_specs = get_pipeline_config(source_path)
pipeline_specs

{'datalake': {'schema': 'lab_datasci',
  'host': 'shbdmdwp001.servers.chrysler.com',
  'port': 5432,
  'user': 'datasci',
  'password': 'datasci_01',
  'database': 'odshawq'},
 'jdbc': {'driver': 'com.ibm.db2.jcc.DB2Driver',
  'server': 'jdbc:db2://SRVR1874.dbms.chrysler.com:18740/AUCERPTP',
  'user': 'datasci',
  'password': 'datasci_01',
  'jar_file': 'c:/installed/sqllib/java/db2jcc4.jar'},
 'holidays': {'calendar_us': ['2016-01-01',
   '2016-01-18',
   '2016-03-25',
   '2016-03-28',
   '2016-05-30',
   '2016-07-04',
   '2016-09-05',
   '2016-11-08',
   '2016-11-11',
   '2016-11-24',
   '2016-11-25',
   '2016-12-26',
   '2016-12-27',
   '2016-12-28',
   '2016-12-29',
   '2016-12-30',
   '2017-01-02',
   '2017-01-22',
   '2017-04-14',
   '2017-04-17',
   '2017-05-29',
   '2017-07-04',
   '2017-09-04',
   '2017-11-10',
   '2017-11-22',
   '2017-11-23',
   '2017-12-25',
   '2017-12-26',
   '2017-12-29',
   '2017-12-30',
   '2017-12-31',
   '2018-01-01',
   '2018-01-15',
   '2018-03-30'

In [8]:
plant_id = 'jnap'
plant_id

'jnap'

In [9]:
pipeline_specs['plant_id'] = plant_id
pipeline_specs['project_directory'] = source_path

In [10]:
plant_specs = get_plant_config(pipeline_specs)
plant_specs

{'plant': {'code': 4012,
  'shift_days': 4,
  'shift_hours': 10,
  'absence_codes': ['BERC',
   'BERE',
   'BERU',
   'BERX',
   'CARE',
   'CARU',
   'FMLA',
   'FMLD',
   'FMLU',
   'HOMD',
   'ILFE',
   'ILFU',
   'IPBE',
   'IPME',
   'IPNU',
   'IPSE',
   'JURE',
   'MISE',
   'MISU',
   'PERU',
   'PPAA',
   'PPAU',
   'TRAG',
   'WTRU']},
 'base_table': {'start_date': datetime.date(2018, 1, 1),
  'end_date': None,
  'write_table': True},
 'model': {'target': 'absences_unplanned',
  'prediction_date': datetime.date(2019, 6, 9),
  'npreds': 4,
  'p_arima': 1,
  'd_arima': 0,
  'q_arima': 0,
  'top_features': 5,
  'band_pct': 0.3,
  'levels': ['crew', 'production_line'],
  'crews': ['A', 'B', 'C'],
  'departments': {'Material': '3300',
   'Body Shop': '3310',
   'Paint Shop': '3330',
   'Trim': '9150',
   'Chassis': '9170',
   'Final': '9173',
   'Quality 1': '9190',
   'Quality 2': '9193'},
  'production_lines': {'Material': 'Material',
   'Body Shop': 'Body Shop',
   'Paint Shop'

#### Holiday Calendar

In [11]:
chrysler_holiday_table = pipeline_specs['holidays']['calendar_us']
chrysler_holiday_table

['2016-01-01',
 '2016-01-18',
 '2016-03-25',
 '2016-03-28',
 '2016-05-30',
 '2016-07-04',
 '2016-09-05',
 '2016-11-08',
 '2016-11-11',
 '2016-11-24',
 '2016-11-25',
 '2016-12-26',
 '2016-12-27',
 '2016-12-28',
 '2016-12-29',
 '2016-12-30',
 '2017-01-02',
 '2017-01-22',
 '2017-04-14',
 '2017-04-17',
 '2017-05-29',
 '2017-07-04',
 '2017-09-04',
 '2017-11-10',
 '2017-11-22',
 '2017-11-23',
 '2017-12-25',
 '2017-12-26',
 '2017-12-29',
 '2017-12-30',
 '2017-12-31',
 '2018-01-01',
 '2018-01-15',
 '2018-03-30',
 '2018-04-02',
 '2018-05-28',
 '2018-07-04',
 '2018-09-03',
 '2018-11-06',
 '2018-11-12',
 '2018-11-22',
 '2018-11-23',
 '2018-12-24',
 '2018-12-25',
 '2018-12-26',
 '2018-12-27',
 '2018-12-28',
 '2018-12-31',
 '2019-01-01',
 '2019-01-21',
 '2019-04-19',
 '2019-04-22',
 '2019-05-27',
 '2019-07-04',
 '2019-09-02']

#### Read in prediction table

In [12]:
data_path = 'E:/HR-Analytics/data'
data_path

'E:/HR-Analytics/data'

In [182]:
# input_file is a sequenced model file
file_name = '.'.join(['abs_jnap_plant_20190609_tbl', 'csv'])
file_path = '/'.join([data_path, file_name])
dfp = pd.read_csv(file_path)

In [183]:
dfp.columns

Index(['crew', 'model_feature1', 'model_feature2', 'model_feature3',
       'model_feature4', 'model_feature5', 'predicted', 'predicted_high',
       'predicted_low', 'production_line', 'workdate', 'group_total_cid',
       'lost_hours', 'actual_hours', 'paa_hours', 'absences_unplanned',
       'absences_late', 'absences_noshow', 'absences_any', 'mean_absence_pct',
       'mean_experience', 'absences_planned', 'home_canvasses',
       'absences_fmla', 'peia_count', 'tpt_count', 'streak_1', 'streak_2',
       'streak_3', 'streak_4_plus', 'cluster', 'group_total', 'tpt_unplanned',
       'tpt_extra', 'absences_unplanned_rolling_sum_5',
       'absences_unplanned_rolling_median_5',
       'absences_unplanned_rolling_sum_12',
       'absences_unplanned_rolling_median_12',
       'absences_unplanned_rolling_sum_20',
       'absences_unplanned_rolling_median_20', 'actual_hours_rolling_mean_20',
       'actual_hours_rolling_median_20', 'lost_hours_rolling_mean_20',
       'lost_hours_rolling_

In [171]:
dfp.drop(columns=['dept', 'dept_name'], inplace=True)

In [172]:
dfp.shape

(9961, 11)

In [191]:
dfp.sample(50)

Unnamed: 0,crew,model_feature1,model_feature2,model_feature3,model_feature4,model_feature5,predicted,predicted_high,predicted_low,production_line,...,au_holiday_pct,day_name,month_name,feature1,feature2,feature3,feature4,feature5,dept_name,dept
1476,A,absences_unplanned_rolling_median_5,absences_unplanned_rolling_sum_5,absences_unplanned_rolling_sum_12,absences_unplanned_rolling_sum_20,next_day_delta,4.0,5,3,Chassis 4,...,-11,Thursday,March,Thursday: -5%,Week 12: -16%,March: -12%,4th Thursday: -10%,5 days after St. Patrick's Day: -11%,Chassis,9170
742,C,peia_count,actual_hours_rolling_mean_20,absences_unplanned_rolling_sum_20,actual_hours_rolling_median_20,next_day_delta,8.0,10,6,Final 1,...,54,Saturday,February,Saturday: -2%,Week 6: 18%,February: 0%,2nd Saturday: 7%,4 days before Valentine's Day: 54%,Final,9173
429,B,cluster,absences_any,streak_2,streak_3,group_total,5.0,6,4,Chassis 1 & 2,...,25,Thursday,January,Thursday: -11%,Week 4: -17%,January: -23%,4th Thursday: -15%,10 days after MLK Day: 25%,Chassis,9170
6481,C,cluster,absences_any,streak_3,group_total,home_canvasses,2.0,3,1,Rolls,...,-100,Saturday,December,Saturday: -2%,Week 52: -100%,December: -4%,5th Saturday: -10%,3 days before New Year's Day: -100%,Final,9173
3763,A,absences_unplanned_rolling_median_5,lost_hours_rolling_median_20,mean_absence_pct,quarter,absences_unplanned_rolling_sum_5,2.0,3,1,Chassis 3,...,-12,Thursday,July,Thursday: -5%,Week 30: 1%,July: 6%,4th Thursday: -10%,22 days after Independence Day: -12%,Chassis,9170
8480,B,day_of_year,absences_noshow,actual_hours_rolling_median_20,next_day_delta,actual_hours,4.0,5,3,Chassis 4,...,-14,Thursday,April,Thursday: -11%,Week 15: -4%,April: -5%,2nd Thursday: -14%,8 days before Good Friday: -14%,Chassis,9170
9698,C,actual_hours,lost_hours,day,month,week,14.0,18,10,Material,...,1,Saturday,June,Saturday: -2%,Week 23: -13%,June: 5%,2nd Saturday: 7%,8 days before Father's Day: 1%,Material,3300
8298,A,paa_hours,day_of_year,kp_residual_20,mean_experience,home_canvasses,12.0,16,8,Chassis 1 & 2,...,0,Wednesday,April,Wednesday: -9%,Week 14: -11%,April: -11%,1st Wednesday: -6%,16 days before Good Friday: 0%,Chassis,9170
34,B,paa_hours,cluster,mean_experience,absences_unplanned_rolling_sum_12,absences_unplanned_rolling_median_12,1.0,1,1,Chassis 3,...,-60,Wednesday,January,Wednesday: 4%,Week 1: -60%,January: -23%,1st Wednesday: -2%,2 days after New Year's Day: -60%,Chassis,9170
207,B,absences_any,home_canvasses,streak_2,streak_1,group_total,6.0,8,4,Trim 1,...,-13,Friday,January,Friday: 4%,Week 2: -34%,January: -23%,2nd Friday: 12%,3 days before MLK Day: -13%,Trim,9150


In [187]:
dfp['year'].value_counts()

2018    6482
2019    3503
Name: year, dtype: int64

In [156]:
dfp.drop(columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5'], inplace=True)

In [174]:
dfp.to_csv(file_path, index=False)

#### Read in model table

In [12]:
data_path = 'E:/HR-Analytics/data'
data_path

'E:/HR-Analytics/data'

In [13]:
table_date = '20190609'
table_date

'20190609'

In [14]:
table_name = '_'.join(['abs', plant_id, 'model', table_date, 'tbl'])
table_name

'abs_jnap_model_20190609_tbl'

In [15]:
# input_file is a sequenced model file
file_name = '.'.join([table_name, 'csv'])
file_path = '/'.join([data_path, file_name])
dfm = pd.read_csv(file_path)

In [16]:
dfm.shape

(14287, 48)

In [17]:
dfm.columns

Index(['workdate', 'crew', 'production_line', 'group_total_cid', 'lost_hours',
       'actual_hours', 'paa_hours', 'absences_unplanned', 'absences_late',
       'absences_noshow', 'absences_any', 'mean_absence_pct',
       'mean_experience', 'absences_planned', 'home_canvasses',
       'absences_fmla', 'peia_count', 'tpt_count', 'streak_1', 'streak_2',
       'streak_3', 'streak_4_plus', 'cluster', 'group_total', 'tpt_unplanned',
       'tpt_extra', 'absences_unplanned_rolling_sum_5',
       'absences_unplanned_rolling_median_5',
       'absences_unplanned_rolling_sum_12',
       'absences_unplanned_rolling_median_12',
       'absences_unplanned_rolling_sum_20',
       'absences_unplanned_rolling_median_20', 'actual_hours_rolling_mean_20',
       'actual_hours_rolling_median_20', 'lost_hours_rolling_mean_20',
       'lost_hours_rolling_median_20', 'kp_residual_5', 'kp_residual_12',
       'kp_residual_20', 'year', 'quarter', 'month', 'week', 'day',
       'day_of_week', 'day_of_year', 

In [18]:
dfm['workdate'].min()

'2018-01-02'

In [19]:
dfm['workdate'].max()

'2019-06-01'

#### Historical Calendrical Analysis

In [20]:
dfm_mean = dfm.groupby(['crew'])[['absences_unplanned']].mean().reset_index()
dfm_mean

Unnamed: 0,crew,absences_unplanned
0,A,4.442304
1,B,4.731236
2,C,5.935333


In [21]:
dfm1 = pd.merge(dfm, dfm_mean, left_on=['crew'], right_on=['crew'], how='left')
dfm1.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_crew_mean'}, inplace=True)
dfm1.sample(20)

Unnamed: 0,workdate,crew,production_line,group_total_cid,lost_hours,actual_hours,paa_hours,absences_unplanned,absences_late,absences_noshow,...,year,quarter,month,week,day,day_of_week,day_of_year,nth_kday,next_day_delta,au_crew_mean
7434,2018-09-20,B,Door Line,51.0,90.0,520.8,0.0,5,0.0,9.0,...,2018,3,9,38,20,3,263,3,1.0,4.731236
4515,2018-06-09,C,Paint Shop,2.0,0.0,22.5,0.0,0,0.0,0.0,...,2018,2,6,23,9,5,160,2,2.0,5.935333
10153,2019-01-07,A,Sunroof Deck,3.0,10.0,30.0,10.0,1,0.0,1.0,...,2019,1,1,2,7,0,7,1,1.0,4.442304
540,2018-01-20,C,Rolls,24.0,20.0,242.9,0.0,3,0.0,2.0,...,2018,1,1,3,20,5,20,3,2.0,5.935333
9183,2018-11-24,B,Chassis 3,74.0,21.1,727.4,20.0,11,2.0,2.0,...,2018,4,11,47,24,5,328,4,4.0,4.731236
11934,2019-03-09,C,Paint Shop,2.0,0.0,20.5,0.0,0,0.0,0.0,...,2019,1,3,10,9,5,68,2,2.0,5.935333
6046,2018-08-02,B,Quality 1,44.0,40.5,413.5,20.0,1,1.0,4.0,...,2018,3,8,31,2,3,214,1,1.0,4.731236
493,2018-01-19,C,,6.0,0.0,65.5,0.0,0,0.0,0.0,...,2018,1,1,3,19,4,19,3,1.0,5.935333
10596,2019-01-23,A,Trim 1,110.0,61.0,1073.6,0.0,22,0.0,6.0,...,2019,1,1,4,23,2,23,4,1.0,4.442304
3701,2018-05-11,C,Quality 2,33.0,60.0,317.8,30.0,4,0.0,6.0,...,2018,2,5,19,11,4,131,2,1.0,5.935333


In [22]:
dfm_dow = dfm.groupby(['crew', 'day_of_week'])[['absences_unplanned']].mean().reset_index()
dfm_dow

Unnamed: 0,crew,day_of_week,absences_unplanned
0,A,0,5.46592
1,A,1,4.149959
2,A,2,4.059689
3,A,3,4.216858
4,B,2,4.965574
5,B,3,4.248157
6,B,4,4.867003
7,B,5,4.84684
8,C,0,6.073763
9,C,1,6.326429


In [23]:
dfm2 = pd.merge(dfm1, dfm_dow, left_on=['crew', 'day_of_week'], right_on=['crew', 'day_of_week'], how='left')
dfm2.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_dow_mean'}, inplace=True)
dfm2.sample(20)

Unnamed: 0,workdate,crew,production_line,group_total_cid,lost_hours,actual_hours,paa_hours,absences_unplanned,absences_late,absences_noshow,...,quarter,month,week,day,day_of_week,day_of_year,nth_kday,next_day_delta,au_crew_mean,au_dow_mean
12297,2019-03-22,B,Trim 2,70.0,40.8,705.7,0.0,8,1.0,4.0,...,1,3,12,22,4,81,4,1.0,4.731236,4.867003
14188,2019-05-30,A,Chassis 3,79.0,31.0,723.7,20.0,7,1.0,3.0,...,2,5,22,30,3,150,5,0.0,4.442304,4.216858
9367,2018-11-30,C,,6.0,0.0,62.5,0.0,0,0.0,0.0,...,4,11,48,30,4,334,5,1.0,5.935333,5.574916
1821,2018-03-06,C,Chassis 1 & 2,89.0,30.9,863.9,5.0,4,1.0,3.0,...,1,3,10,6,1,65,1,3.0,5.935333,6.326429
363,2018-01-13,C,Engine Line,62.0,60.0,587.3,10.0,6,0.0,6.0,...,1,1,2,13,5,13,2,3.0,5.935333,5.779757
9201,2018-11-24,C,Chassis 4,63.0,42.3,597.7,0.0,3,4.0,4.0,...,4,11,47,24,5,328,4,2.0,5.935333,5.779757
12960,2019-04-13,C,Trim 2,75.0,51.1,697.4,10.0,8,2.0,5.0,...,2,4,15,13,5,103,2,2.0,5.935333,5.779757
182,2018-01-08,A,Quality 2,47.0,30.5,424.7,10.0,10,1.0,3.0,...,1,1,2,8,0,8,2,1.0,4.442304,5.46592
12071,2019-03-14,B,Quality 1,46.0,30.9,441.3,10.0,2,1.0,3.0,...,1,3,11,14,3,73,2,1.0,4.731236,4.248157
6954,2018-09-04,C,Body Shop,24.0,28.0,234.2,0.0,3,0.0,2.0,...,3,9,36,4,1,247,1,3.0,5.935333,6.326429


In [24]:
dfm_week = dfm.groupby(['crew', 'week'])[['absences_unplanned']].mean().reset_index()
dfm_week

Unnamed: 0,crew,week,absences_unplanned
0,A,1,1.987805
1,A,2,3.970588
2,A,3,3.462185
3,A,4,3.915966
4,A,5,4.727941
5,A,6,4.823529
6,A,7,4.066176
7,A,8,4.154412
8,A,9,4.272059
9,A,10,4.375000


In [25]:
dfm3 = pd.merge(dfm2, dfm_week, left_on=['crew', 'week'], right_on=['crew', 'week'], how='left')
dfm3.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_week_mean'}, inplace=True)
dfm3.sample(20)

Unnamed: 0,workdate,crew,production_line,group_total_cid,lost_hours,actual_hours,paa_hours,absences_unplanned,absences_late,absences_noshow,...,month,week,day,day_of_week,day_of_year,nth_kday,next_day_delta,au_crew_mean,au_dow_mean,au_week_mean
6957,2018-09-04,C,Chassis 4,60.0,51.0,602.7,20.0,11,1.0,5.0,...,9,36,4,1,247,1,3.0,5.935333,6.326429,6.607843
5119,2018-06-30,C,Chassis 1 & 2,87.0,51.0,826.9,20.0,9,1.0,5.0,...,6,26,30,5,181,5,2.0,5.935333,5.779757,5.970588
1203,2018-02-13,A,Rolls,26.0,20.8,271.5,0.0,2,1.0,2.0,...,2,7,13,1,44,2,1.0,4.442304,4.149959,4.066176
11323,2019-02-16,C,Quality 1,52.0,50.9,501.4,30.0,2,1.0,5.0,...,2,7,16,5,47,3,2.0,5.935333,5.779757,5.455882
11301,2019-02-16,B,Engine Line,60.0,83.3,608.2,10.0,3,5.0,8.0,...,2,7,16,5,47,3,4.0,4.731236,4.84684,4.588235
6686,2018-08-24,C,Door Line,60.0,60.0,532.8,0.0,5,0.0,6.0,...,8,34,24,4,236,4,1.0,5.935333,5.574916,4.676471
4542,2018-06-11,C,Chassis 3,71.0,50.9,703.7,40.0,0,1.0,5.0,...,6,24,11,0,162,2,1.0,5.935333,6.073763,6.573529
9690,2018-12-12,A,,28.0,0.0,280.4,0.0,0,0.0,0.0,...,12,50,12,2,346,2,1.0,4.442304,4.059689,4.602941
244,2018-01-10,A,Engine Line,59.0,50.0,596.4,20.0,3,0.0,5.0,...,1,2,10,2,10,2,1.0,4.442304,4.059689,3.970588
11832,2019-03-06,B,Paint Shop,2.0,0.0,20.5,0.0,0,0.0,0.0,...,3,10,6,2,65,1,1.0,4.731236,4.965574,5.25


In [26]:
dfm_month = dfm.groupby(['crew', 'month'])[['absences_unplanned']].mean().reset_index()
dfm_month

Unnamed: 0,crew,month,absences_unplanned
0,A,1,3.791304
1,A,2,4.297794
2,A,3,3.910873
3,A,4,3.975045
4,A,5,5.420168
5,A,6,5.580882
6,A,7,4.858131
7,A,8,4.330065
8,A,9,4.670588
9,A,10,4.182663


In [27]:
dfm4 = pd.merge(dfm3, dfm_month, left_on=['crew', 'month'], right_on=['crew', 'month'], how='left')
dfm4.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_month_mean'}, inplace=True)
dfm4.sample(20)

Unnamed: 0,workdate,crew,production_line,group_total_cid,lost_hours,actual_hours,paa_hours,absences_unplanned,absences_late,absences_noshow,...,week,day,day_of_week,day_of_year,nth_kday,next_day_delta,au_crew_mean,au_dow_mean,au_week_mean,au_month_mean
7257,2018-09-14,B,Trim 1,116.0,62.4,1167.1,10.0,5,4.0,6.0,...,37,14,4,257,2,1.0,4.731236,4.867003,4.161765,4.387543
11835,2019-03-06,B,Rolls,29.0,0.0,307.7,5.0,2,0.0,0.0,...,10,6,2,65,1,1.0,4.731236,4.965574,5.25,4.648693
3596,2018-05-08,C,Material,115.0,82.0,1100.7,0.0,14,5.0,8.0,...,19,8,1,128,2,3.0,5.935333,6.326429,7.698529,7.196691
4516,2018-06-09,C,Quality 1,50.0,103.1,483.1,50.0,5,3.0,10.0,...,23,9,5,160,2,2.0,5.935333,5.779757,7.338235,6.919505
1000,2018-02-06,A,Sunroof Deck,33.0,20.0,333.3,10.0,3,0.0,2.0,...,6,6,1,37,1,1.0,4.442304,4.149959,4.823529,4.297794
4595,2018-06-13,A,Door Line,48.0,70.0,472.5,20.0,5,0.0,7.0,...,24,13,2,164,2,1.0,4.442304,4.059689,6.205882,5.580882
6318,2018-08-11,C,Quality 1,54.0,28.0,518.1,10.0,4,0.0,2.0,...,32,11,5,223,2,2.0,5.935333,5.779757,6.294118,5.892734
12412,2019-03-26,C,Quality 2,38.0,51.6,378.3,10.0,3,3.0,5.0,...,13,26,1,85,4,3.0,5.935333,6.326429,5.210084,5.660504
4184,2018-05-30,A,Chassis 1 & 2,86.0,34.4,840.3,0.0,10,5.0,3.0,...,22,30,2,150,5,1.0,4.442304,4.059689,5.107843,5.420168
9492,2018-12-05,A,Engine Line,61.0,51.0,611.3,0.0,8,1.0,5.0,...,49,5,2,339,1,1.0,4.442304,4.059689,5.191176,4.676471


In [28]:
dfm_nth = dfm.groupby(['crew', 'nth_kday', 'day_of_week'])[['absences_unplanned']].mean().reset_index()
dfm_nth

Unnamed: 0,crew,nth_kday,day_of_week,absences_unplanned
0,A,1,0,5.361345
1,A,1,1,4.576471
2,A,1,2,4.199262
3,A,1,3,4.125436
4,A,2,0,6.051471
5,A,2,1,4.425606
6,A,2,2,4.055363
7,A,2,3,4.197232
8,A,3,0,5.670588
9,A,3,1,3.986159


In [118]:
dfm5 = pd.merge(dfm4, dfm_nth, left_on=['crew', 'nth_kday', 'day_of_week'], right_on=['crew', 'nth_kday', 'day_of_week'], how='left')
dfm5.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_nth_kday_mean'}, inplace=True)
dfm5.sample(20)

Unnamed: 0,workdate,crew,production_line,group_total_cid,lost_hours,actual_hours,paa_hours,absences_unplanned,absences_late,absences_noshow,...,day,day_of_week,day_of_year,nth_kday,next_day_delta,au_crew_mean,au_dow_mean,au_week_mean,au_month_mean,au_nth_kday_mean
1570,2018-02-26,A,Engine Line,61.0,50.8,605.5,0.0,5,1.0,5.0,...,26,0,57,4,1.0,4.442304,5.46592,4.272059,4.297794,4.80543
9168,2018-11-21,B,Door Line,61.0,11.5,624.0,0.0,15,3.0,1.0,...,21,2,325,3,3.0,4.731236,4.965574,6.352941,4.654412,4.899654
6223,2018-08-09,A,Body Shop,23.0,1.0,234.4,0.0,1,1.0,0.0,...,9,3,221,2,4.0,4.442304,4.216858,4.764706,4.330065,4.197232
10271,2019-01-10,B,Rolls,29.0,10.4,274.8,0.0,2,1.0,1.0,...,10,3,10,2,1.0,4.731236,4.248157,3.205882,3.682676,4.096886
3168,2018-04-24,A,Engine Line,61.0,50.4,607.1,25.0,8,1.0,5.0,...,24,1,114,4,1.0,4.442304,4.149959,3.689076,3.975045,3.713235
11940,2019-03-09,C,Trim 2,71.0,61.9,684.5,0.0,8,2.0,6.0,...,9,5,68,2,2.0,5.935333,5.779757,5.345588,5.660504,6.408304
4932,2018-06-25,A,Chassis 1 & 2,84.0,50.9,833.5,30.0,5,1.0,5.0,...,25,0,176,4,1.0,4.442304,5.46592,4.352941,5.580882,4.80543
4348,2018-06-04,C,Rolls,25.0,40.5,263.1,40.0,6,1.0,4.0,...,4,0,155,1,1.0,5.935333,6.073763,7.338235,6.919505,6.058824
5466,2018-07-13,C,Material,108.0,171.8,1060.3,40.0,15,5.0,17.0,...,13,4,194,2,1.0,5.935333,5.574916,7.558824,6.045752,5.851211
4252,2018-06-01,B,Chassis 1 & 2,83.0,11.6,818.7,10.0,8,2.0,1.0,...,1,4,152,1,1.0,4.731236,4.867003,6.220588,5.696594,4.923345


#### Holiday Calendrical Analysis

In [30]:
def get_holidays(df):
    years = sorted(df['year'].unique().tolist())
    all_holidays = []
    for y in years:
        holidays = calendrical.set_holidays(y, True)
        htuples = list(holidays.items())
        all_holidays += htuples
    return all_holidays

In [51]:
def calendrical_features(row, holidays):
    gyear = row['year']
    gmonth = row['month']
    gday = row['day']
    rdate = calendrical.gdate_to_rdate(gyear, gmonth, gday)
    holiday_rdates = [x[1] for x in holidays]
    rdate_nearest = min(holiday_rdates, key=lambda x: abs(x - rdate))
    rdate_offset = rdate - rdate_nearest
    nearest_holiday = [item for item in holidays if item[1] == rdate_nearest][0][0]
    holiday_dates = []
    for h in holidays:
        if h[0] == nearest_holiday:
            rd = calendrical.rdate_to_gdate(h[1] + rdate_offset)
            date_str = datetime.datetime(rd[0], rd[1], rd[2]).strftime("%Y-%m-%d")
            holiday_dates.append(date_str)
    return pd.Series([nearest_holiday, rdate_offset, holiday_dates], index=['holiday', 'holiday_offset', 'holiday_dates'])

In [32]:
holidays = get_holidays(dfm5)
holidays

[("New Year's Day", 736695),
 ('MLK Day', 736709),
 ("Valentine's Day", 736739),
 ("President's Day", 736744),
 ("St. Patrick's Day", 736770),
 ('Good Friday', 736783),
 ('Easter', 736785),
 ('Cinco de Mayo', 736819),
 ("Mother's Day", 736827),
 ('Memorial Day', 736842),
 ("Father's Day", 736862),
 ('Independence Day', 736879),
 ('Labor Day', 736940),
 ('Halloween', 736695),
 ("Veteran's Day", 737010),
 ('Thanksgiving', 737020),
 ('Christmas', 737053),
 ("New Year's Day", 737060),
 ('MLK Day', 737080),
 ("Valentine's Day", 737104),
 ("President's Day", 737108),
 ("St. Patrick's Day", 737135),
 ('Good Friday', 737168),
 ('Easter', 737170),
 ('Cinco de Mayo', 737184),
 ("Mother's Day", 737191),
 ('Memorial Day', 737206),
 ("Father's Day", 737226),
 ('Independence Day', 737244),
 ('Labor Day', 737304),
 ('Halloween', 737060),
 ("Veteran's Day", 737374),
 ('Thanksgiving', 737391),
 ('Christmas', 737418)]

In [33]:
holiday_rdates = [x[1] for x in holidays]
holiday_rdates

[736695,
 736709,
 736739,
 736744,
 736770,
 736783,
 736785,
 736819,
 736827,
 736842,
 736862,
 736879,
 736940,
 736695,
 737010,
 737020,
 737053,
 737060,
 737080,
 737104,
 737108,
 737135,
 737168,
 737170,
 737184,
 737191,
 737206,
 737226,
 737244,
 737304,
 737060,
 737374,
 737391,
 737418]

In [53]:
dfh = dfm5.apply(calendrical_features, holidays=holidays, axis=1)
dfh.sample(20)

Unnamed: 0,holiday,holiday_offset,holiday_dates
3101,Cinco de Mayo,-14,"[2018-04-21, 2019-04-21]"
1749,President's Day,12,"[2018-03-03, 2019-03-02]"
5524,Independence Day,12,"[2018-07-16, 2019-07-16]"
14202,Memorial Day,3,"[2018-05-31, 2019-05-30]"
5414,Independence Day,8,"[2018-07-12, 2019-07-12]"
6392,Labor Day,-19,"[2018-08-15, 2019-08-14]"
553,MLK Day,7,"[2018-01-22, 2019-01-28]"
9028,Veteran's Day,4,"[2018-11-16, 2019-11-15]"
1193,Valentine's Day,-1,"[2018-02-13, 2019-02-13]"
13468,Cinco de Mayo,-2,"[2018-05-03, 2019-05-03]"


In [119]:
dfm5 = pd.concat([dfm5, dfh], axis=1)

In [120]:
dfm5[['workdate', 'holiday_dates']].tail(20)

Unnamed: 0,workdate,holiday_dates
14267,2019-06-01,"[2018-06-02, 2019-06-01]"
14268,2019-06-01,"[2018-06-02, 2019-06-01]"
14269,2019-06-01,"[2018-06-02, 2019-06-01]"
14270,2019-06-01,"[2018-06-02, 2019-06-01]"
14271,2019-06-01,"[2018-06-02, 2019-06-01]"
14272,2019-06-01,"[2018-06-02, 2019-06-01]"
14273,2019-06-01,"[2018-06-02, 2019-06-01]"
14274,2019-06-01,"[2018-06-02, 2019-06-01]"
14275,2019-06-01,"[2018-06-02, 2019-06-01]"
14276,2019-06-01,"[2018-06-02, 2019-06-01]"


In [35]:
dfm_nth = dfm.groupby(['crew', 'nth_kday', 'day_of_week'])[['absences_unplanned']].mean().reset_index()
dfm_nth

Unnamed: 0,crew,nth_kday,day_of_week,absences_unplanned
0,A,1,0,5.361345
1,A,1,1,4.576471
2,A,1,2,4.199262
3,A,1,3,4.125436
4,A,2,0,6.051471
5,A,2,1,4.425606
6,A,2,2,4.055363
7,A,2,3,4.197232
8,A,3,0,5.670588
9,A,3,1,3.986159


In [44]:
dfm5['holiday_dates'].iloc[1]

['2018-01-02', '2019-01-02']

In [47]:
def holiday_mean(row, df):
    holiday_dates = row['holiday_dates']
    holiday_mean = df['absences_unplanned'][df['workdate'].isin(holiday_dates)].mean()
    return holiday_mean

In [121]:
dfm5['au_holiday_mean'] = dfm5.apply(holiday_mean, df=dfm5, axis=1)
dfm5[['workdate', 'au_holiday_mean']].tail(20)

Unnamed: 0,workdate,au_holiday_mean
14267,2019-06-01,7.676471
14268,2019-06-01,7.676471
14269,2019-06-01,7.676471
14270,2019-06-01,7.676471
14271,2019-06-01,7.676471
14272,2019-06-01,7.676471
14273,2019-06-01,7.676471
14274,2019-06-01,7.676471
14275,2019-06-01,7.676471
14276,2019-06-01,7.676471


In [56]:
dfm5.columns

Index(['workdate', 'crew', 'production_line', 'group_total_cid', 'lost_hours',
       'actual_hours', 'paa_hours', 'absences_unplanned', 'absences_late',
       'absences_noshow', 'absences_any', 'mean_absence_pct',
       'mean_experience', 'absences_planned', 'home_canvasses',
       'absences_fmla', 'peia_count', 'tpt_count', 'streak_1', 'streak_2',
       'streak_3', 'streak_4_plus', 'cluster', 'group_total', 'tpt_unplanned',
       'tpt_extra', 'absences_unplanned_rolling_sum_5',
       'absences_unplanned_rolling_median_5',
       'absences_unplanned_rolling_sum_12',
       'absences_unplanned_rolling_median_12',
       'absences_unplanned_rolling_sum_20',
       'absences_unplanned_rolling_median_20', 'actual_hours_rolling_mean_20',
       'actual_hours_rolling_median_20', 'lost_hours_rolling_mean_20',
       'lost_hours_rolling_median_20', 'kp_residual_5', 'kp_residual_12',
       'kp_residual_20', 'year', 'quarter', 'month', 'week', 'day',
       'day_of_week', 'day_of_year', 

In [122]:
dfm5['au_dow_pct'] = round(100 * (dfm5['au_dow_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int)
dfm5['au_week_pct'] = round(100 * (dfm5['au_week_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int)
dfm5['au_month_pct'] = round(100 * (dfm5['au_month_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int)
dfm5['au_nth_kday_pct'] = round(100 * (dfm5['au_nth_kday_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int)   
dfm5['au_holiday_pct'] = round(100 * (dfm5['au_holiday_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int)

In [123]:
dfm5['workdate_dt'] = pd.to_datetime(dfm5['workdate'])
dfm5['day_name'] = dfm5['workdate_dt'].dt.day_name()
dfm5['month_name'] = dfm5['workdate_dt'].dt.month_name()

In [124]:
dfm5.sample(20)

Unnamed: 0,workdate,crew,production_line,group_total_cid,lost_hours,actual_hours,paa_hours,absences_unplanned,absences_late,absences_noshow,...,holiday_dates,au_holiday_mean,au_dow_pct,au_week_pct,au_month_pct,au_nth_kday_pct,au_holiday_pct,workdate_dt,day_name,month_name
13581,2019-05-08,A,Final 2,44.0,70.0,443.5,50.0,6,0.0,7.0,...,"[2018-05-08, 2019-05-08]",6.117647,-9,30,22,-9,38,2019-05-08,Wednesday,May
13445,2019-05-03,B,Final 2,43.0,41.2,429.8,8.0,5,2.0,4.0,...,"[2018-05-03, 2019-05-03]",5.411765,3,17,25,4,14,2019-05-03,Friday,May
3608,2018-05-09,A,Chassis 4,62.0,42.5,595.1,5.0,6,3.0,4.0,...,"[2018-05-09, 2019-05-09]",5.367647,-9,30,22,-9,21,2018-05-09,Wednesday,May
8771,2018-11-05,C,Trim 2,74.0,53.6,691.5,20.0,5,4.0,5.0,...,"[2018-11-05, 2019-11-04]",4.323529,2,-8,-2,2,-27,2018-11-05,Monday,November
2524,2018-03-31,B,Final 2,48.0,20.1,475.3,0.0,3,1.0,2.0,...,"[2018-03-31, 2019-04-20]",5.426471,2,-10,-2,-16,15,2018-03-31,Saturday,March
13329,2019-04-29,C,Quality 1,48.0,41.8,487.2,40.0,3,3.0,4.0,...,"[2018-04-29, 2019-04-29]",6.0,2,8,1,2,1,2019-04-29,Monday,April
7588,2018-09-26,A,Engine Line,64.0,30.0,614.8,0.0,2,0.0,3.0,...,"[2018-09-26, 2019-09-25]",3.735294,-9,-4,5,-10,-16,2018-09-26,Wednesday,September
12650,2019-04-03,B,Quality 2,38.0,11.0,369.9,0.0,5,1.0,1.0,...,"[2018-03-14, 2019-04-03]",4.382353,5,-12,-7,0,-7,2019-04-03,Wednesday,April
4641,2018-06-14,B,,21.0,20.0,226.9,0.0,2,0.0,2.0,...,"[2018-06-14, 2019-06-13]",6.147059,-10,22,20,-13,30,2018-06-14,Thursday,June
9604,2018-12-08,B,Trim 2,72.0,51.1,712.1,10.0,6,1.0,5.0,...,"[2018-12-08, 2019-12-14]",5.764706,2,9,-1,0,22,2018-12-08,Saturday,December


In [125]:
dfm5['feature1'] = dfm5['day_name'] + ': ' + dfm5['au_dow_pct'].astype(str) + '%' 
dfm5['feature1'].sample(20)

13143      Tuesday: -7%
9149     Wednesday: -9%
12584        Monday: 2%
4835      Thursday: -5%
11218     Wednesday: 5%
6920      Saturday: -3%
4908       Saturday: 2%
76        Thursday: -5%
794        Tuesday: -7%
1258      Thursday: -5%
475      Thursday: -10%
8499        Friday: -6%
6887        Friday: -6%
10713     Saturday: -3%
1545       Saturday: 2%
1009        Tuesday: 7%
3775       Tuesday: -7%
7253         Friday: 3%
13882      Saturday: 2%
3484         Friday: 3%
Name: feature1, dtype: object

In [126]:
dfm5['feature2'] = 'Week ' + dfm5['week'].astype(str) + ': ' + dfm5['au_week_pct'].astype(str) + '%'
dfm5['feature2'].sample(20)

4070      Week 21: 26%
2730      Week 15: -5%
11588       Week 9: 8%
6713       Week 34: 0%
12557     Week 14: -9%
8402     Week 43: -15%
10981      Week 6: 18%
13469      Week 18: 8%
4774      Week 25: 27%
1598       Week 9: -4%
10522      Week 3: -8%
5840      Week 30: 10%
4902      Week 25: 14%
3695      Week 19: 30%
1989       Week 11: 0%
2076      Week 11: -9%
12708     Week 14: 11%
6406       Week 33: 6%
7252     Week 37: -12%
10835       Week 5: 6%
Name: feature2, dtype: object

In [127]:
dfm5['feature3'] = dfm5['month_name'] + ': ' + dfm5['au_month_pct'].astype(str) + '%'
dfm5['feature3'].sample(20)

9645      December: -3%
7823       October: -6%
12356       March: -12%
12240        March: -2%
13221        April: -7%
2562        April: -11%
5676           July: 2%
2541         March: -5%
8512       October: -4%
3658           May: 25%
2055        March: -12%
5340           July: 9%
4164           May: 22%
9892      December: -3%
6656        August: -5%
7648     September: -7%
2839         April: -7%
8216       October: -4%
441       January: -22%
1419      February: -3%
Name: feature3, dtype: object

In [128]:
dfm5['feature4'] = dfm5['nth_kday'].astype(str) + ' ' + dfm5['day_name'] +  ': ' + dfm5['au_nth_kday_pct'].astype(str) + '%'
dfm5['feature4'] = dfm5['feature4'].str.replace('^1', '1st', regex=True)
dfm5['feature4'] = dfm5['feature4'].str.replace('^2', '2nd', regex=True)
dfm5['feature4'] = dfm5['feature4'].str.replace('^3', '3rd', regex=True)
dfm5['feature4'] = dfm5['feature4'].str.replace('^4', '4th', regex=True)
dfm5['feature4'] = dfm5['feature4'].str.replace('^5', '5th', regex=True)
dfm5['feature4'].sample(20)

164        1st Saturday: 0%
10921       1st Monday: 21%
10262    2nd Thursday: -13%
11482       4th Friday: -9%
3220      4th Wednesday: 4%
1925        2nd Friday: -1%
6029      1st Thursday: -7%
1978        2nd Monday: 36%
1675      1st Thursday: -7%
9933      3rd Thursday: -7%
10062     1st Thursday: -7%
4469        2nd Friday: 10%
4625      2nd Thursday: -6%
4486        2nd Friday: -1%
13700      2nd Saturday: 8%
210         2nd Tuesday: 0%
1857      1st Wednesday: 0%
4894        4th Friday: -9%
102          1st Friday: 4%
6033      1st Thursday: -7%
Name: feature4, dtype: object

In [129]:
dfm5['holiday_plus_minus'] = dfm5['holiday_offset'].apply(lambda x: 'after' if x > 0 else 'before')
dfm5.loc[dfm5['holiday_offset'] == 0, 'holiday_plus_minus'] = 'it\'s'
dfm5['feature5'] = abs(dfm5['holiday_offset']).astype(str) + ' days ' + dfm5['holiday_plus_minus'] + ' ' \
                   + dfm5['holiday'] + ': ' + dfm5['au_holiday_pct'].astype(str) + '%'

2309     5 days after St. Patrick's Day: -14%
6638           11 days before Labor Day: -11%
4156           1 days after Memorial Day: 43%
2113      1 days before St. Patrick's Day: 0%
6346            21 days before Labor Day: 19%
11225       1 days before Valentine's Day: 5%
5576      14 days after Independence Day: -9%
13118           1 days after Good Friday: -9%
9044         5 days after Veteran's Day: -12%
7062             4 days after Labor Day: -10%
11729      12 days after President's Day: -4%
7983       34 days before Veteran's Day: -22%
12006    5 days before St. Patrick's Day: -8%
4107           3 days before Memorial Day: 5%
9064         5 days after Veteran's Day: -30%
11971    6 days before St. Patrick's Day: 15%
2968                16 days after Easter: 41%
5667     16 days after Independence Day: -15%
13407         3 days before Cinco de Mayo: 7%
12696         14 days before Good Friday: -5%
Name: feature5, dtype: object

In [142]:
dfm5['feature5'].sample(20)

7976         34 days before Veteran's Day: -22%
2764                   9 days after Easter: 18%
9217             4 days after Thanksgiving: 52%
12470     11 days after St. Patrick's Day: -24%
11340          0 days it's President's Day: 29%
14146             1 days after Memorial Day: 7%
1828     11 days before St. Patrick's Day: -19%
11003        8 days before Valentine's Day: 37%
10015       3 days before New Year's Day: -100%
4830             4 days after Father's Day: -3%
9676             14 days before Christmas: -20%
6596              12 days before Labor Day: -6%
4885            5 days after Father's Day: -10%
7417               17 days after Labor Day: -5%
12720          14 days before Good Friday: -24%
7607              23 days after Labor Day: -21%
1739         12 days after President's Day: 21%
11295         2 days after Valentine's Day: -7%
872        13 days before Valentine's Day: -10%
5216         1 days after Independence Day: 48%
Name: feature5, dtype: object

In [146]:
def create_calendrical_stats(dfm):
    dfm_mean = dfm.groupby(['crew'])[['absences_unplanned']].mean().reset_index()
    dfm1 = pd.merge(dfm, dfm_mean, left_on=['crew'], right_on=['crew'], how='left')
    dfm1.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_crew_mean'}, inplace=True)
    dfm_dow = dfm.groupby(['crew', 'day_of_week'])[['absences_unplanned']].mean().reset_index()
    dfm2 = pd.merge(dfm1, dfm_dow, left_on=['crew', 'day_of_week'], right_on=['crew', 'day_of_week'], how='left')
    dfm2.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_dow_mean'}, inplace=True)
    dfm_week = dfm.groupby(['crew', 'week'])[['absences_unplanned']].mean().reset_index()
    dfm3 = pd.merge(dfm2, dfm_week, left_on=['crew', 'week'], right_on=['crew', 'week'], how='left')
    dfm3.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_week_mean'}, inplace=True)
    dfm_month = dfm.groupby(['crew', 'month'])[['absences_unplanned']].mean().reset_index()
    dfm4 = pd.merge(dfm3, dfm_month, left_on=['crew', 'month'], right_on=['crew', 'month'], how='left')
    dfm4.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_month_mean'}, inplace=True)
    dfm_nth = dfm.groupby(['crew', 'nth_kday', 'day_of_week'])[['absences_unplanned']].mean().reset_index()
    dfm5 = pd.merge(dfm4, dfm_nth, left_on=['crew', 'nth_kday', 'day_of_week'], right_on=['crew', 'nth_kday', 'day_of_week'], how='left')
    dfm5.rename(index=str, columns={'absences_unplanned_x': 'absences_unplanned', 'absences_unplanned_y': 'au_nth_kday_mean'}, inplace=True)
    holidays = get_holidays(dfm5)
    dfh = dfm5.apply(calendrical_features, holidays=holidays, axis=1)
    dfm5 = pd.concat([dfm5, dfh], axis=1)
    dfm5['au_holiday_mean'] = dfm5.apply(holiday_mean, df=dfm5, axis=1)
    dfm5['au_dow_pct'] = round(100 * (dfm5['au_dow_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int)
    dfm5['au_week_pct'] = round(100 * (dfm5['au_week_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int)
    dfm5['au_month_pct'] = round(100 * (dfm5['au_month_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int)
    dfm5['au_nth_kday_pct'] = round(100 * (dfm5['au_nth_kday_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int) 
    dfm5['au_holiday_pct'] = round(100 * (dfm5['au_holiday_mean'] / dfm5['au_crew_mean'] - 1), 0).astype(int)
    dfm5['workdate_dt'] = pd.to_datetime(dfm5['workdate'])
    dfm5['day_name'] = dfm5['workdate_dt'].dt.day_name()
    dfm5['month_name'] = dfm5['workdate_dt'].dt.month_name()
    dfm5['feature1'] = dfm5['day_name'] + ': ' + dfm5['au_dow_pct'].astype(str) + '%'
    dfm5['feature2'] = 'Week ' + dfm5['week'].astype(str) + ': ' + dfm5['au_week_pct'].astype(str) + '%'
    dfm5['feature3'] = dfm5['month_name'] + ': ' + dfm5['au_month_pct'].astype(str) + '%'
    dfm5['feature4'] = dfm5['nth_kday'].astype(str) + ' ' + dfm5['day_name'] +  ': ' + dfm5['au_nth_kday_pct'].astype(str) + '%'
    dfm5['feature4'] = dfm5['feature4'].str.replace('^1', '1st', regex=True)
    dfm5['feature4'] = dfm5['feature4'].str.replace('^2', '2nd', regex=True)
    dfm5['feature4'] = dfm5['feature4'].str.replace('^3', '3rd', regex=True)
    dfm5['feature4'] = dfm5['feature4'].str.replace('^4', '4th', regex=True)
    dfm5['feature4'] = dfm5['feature4'].str.replace('^5', '5th', regex=True)
    dfm5['holiday_plus_minus'] = dfm5['holiday_offset'].apply(lambda x: 'after' if x > 0 else 'before')
    dfm5.loc[dfm5['holiday_offset'] == 0, 'holiday_plus_minus'] = 'it\'s'
    dfm5['feature5'] = abs(dfm5['holiday_offset']).astype(str) + ' days ' + dfm5['holiday_plus_minus'] + ' ' \
                       + dfm5['holiday'] + ': ' + dfm5['au_holiday_pct'].astype(str) + '%'
    drop_cols = ['workdate_dt', 'holiday_plus_minus']
    dfm5.drop(columns=drop_cols, inplace=True)
    return dfm5

In [147]:
dfm_out = create_calendrical_stats(dfm)

In [151]:
dfm_out[['workdate', 'feature1', 'feature2', 'feature3', 'feature4', 'feature5']].sample(50)

Unnamed: 0,workdate,feature1,feature2,feature3,feature4,feature5
4318,2018-06-04,Monday: 23%,Week 23: 38%,June: 26%,1st Monday: 21%,7 days after Memorial Day: 58%
7399,2018-09-19,Wednesday: 5%,Week 38: 11%,September: -7%,3rd Wednesday: 4%,16 days after Labor Day: -6%
7117,2018-09-10,Monday: 23%,Week 37: 14%,September: 5%,2nd Monday: 36%,7 days after Labor Day: 54%
1538,2018-02-24,Saturday: 2%,Week 8: -7%,February: -3%,4th Saturday: 1%,5 days after President's Day: -1%
12683,2019-04-04,Thursday: -10%,Week 14: -12%,April: -7%,1st Thursday: -5%,15 days before Good Friday: -19%
12370,2019-03-25,Monday: 2%,Week 13: -12%,March: -5%,4th Monday: 6%,8 days after St. Patrick's Day: 2%
3754,2018-05-14,Monday: 23%,Week 20: 44%,May: 22%,2nd Monday: 36%,1 days after Mother's Day: 66%
561,2018-01-22,Monday: 2%,Week 4: -24%,January: -18%,4th Monday: 6%,7 days after MLK Day: 6%
2314,2018-03-23,Friday: 3%,Week 12: -18%,March: -2%,4th Friday: -5%,6 days after St. Patrick's Day: -5%
8610,2018-10-31,Wednesday: -9%,Week 44: -4%,October: -6%,5th Wednesday: 6%,12 days before Veteran's Day: 19%


In [None]:
holidays_2018 = calendrical.set_holidays(2018, True)
holidays_2018

In [None]:
tuples_2018 = list(holidays_2018.items())
tuples_2018

In [None]:
holidays_2019 = calendrical.set_holidays(2019, True)
holidays_2019

In [None]:
tuples_2019 = list(holidays_2019.items())
tuples_2019

In [None]:
all_tuples = tuples_2018 + tuples_2019
all_tuples

In [None]:
test_rdate = calendrical.gdate_to_rdate(2019, 5, 20)
test_rdate

In [None]:
ph = calendrical.previous_holiday(test_rdate, holidays_2019)
ph

In [None]:
htuple = [item for item in all_tuples if item[1] == ph][0]
htuple[0]

In [None]:
nh = calendrical.next_holiday(test_rdate, holidays_2019)
nh

In [None]:
htuple = [item for item in all_tuples if item[1] == nh][0]
htuple[0]

In [None]:
test_gdate = calendrical.rdate_to_gdate(test_rdate)
test_gdate

In [None]:
date_str = datetime.datetime(test_gdate[0], test_gdate[1], test_gdate[2]).strftime("%Y-%m-%d")
date_str

### End of Notebook