### HR Analytics Peak Model

#### Set source path to import code

In [1]:
%pwd

'C:\\Users\\t0272m1\\Projects\\HR Analytics\\notebooks'

In [2]:
source_path = 'E:\HR-Analytics\source'
source_path

'E:\\HR-Analytics\\source'

In [3]:
import os
os.chdir(source_path)
%pwd

'E:\\HR-Analytics\\source'

In [4]:
ls

 Volume in drive E is DATA
 Volume Serial Number is AEFF-63BE

 Directory of E:\HR-Analytics\source

09/10/2019  12:08 PM    <DIR>          .
09/10/2019  12:08 PM    <DIR>          ..
05/02/2019  11:01 AM                 2 __init__.py
09/10/2019  12:25 PM    <DIR>          __pycache__
09/09/2019  12:33 PM        78,023,896 abs_fmla_incapacities_tbl.csv
07/15/2019  02:57 PM            16,924 base_table.py
06/13/2019  11:21 AM            26,727 calendrical.py
09/06/2019  10:06 AM             3,984 config_jnap.yml
09/06/2019  10:06 AM            13,274 config_shap.yml
09/06/2019  10:21 AM             6,370 config_tac.yml
05/01/2019  11:07 AM               498 config_wap.yml
09/06/2019  10:06 AM             6,143 config_wtap.yml
09/06/2019  10:06 AM             6,317 database.py
09/05/2019  11:13 AM             9,793 main.py
09/10/2019  12:25 PM            45,083 model.py
08/27/2019  10:16 AM             3,072 pipeline.yml
05/01/2019  02:09 PM             3,673 pipeline_brap.py
05/01/2019 

#### Imports

In [5]:
import calendar
import datetime
import itertools
import jaydebeapi as jdb
import json
import math
import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2
import random
import requests
import seaborn as sns
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
import statsmodels as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
import urllib.request

In [12]:
# Internal Python Packages
from database import connect_greenplum
from database import create_frame_from_pg
from database import create_sqlalchemy_engine
from database import write_frame_to_pg
from main import get_pipeline_config
from main import get_plant_config

#### Get specifications

In [7]:
pipeline_specs = get_pipeline_config(source_path)
pipeline_specs

{'datalake': {'schema': 'lab_datasci',
  'host': 'shbdmdwp001.servers.chrysler.com',
  'port': 5432,
  'user': 'datasci',
  'password': 'datasci_01',
  'database': 'odshawq'},
 'jdbc': {'driver': 'com.ibm.db2.jcc.DB2Driver',
  'server': 'jdbc:db2://SRVR1874.dbms.chrysler.com:18740/AUCERPTP',
  'user': 'datasci',
  'password': 'datasci_01',
  'jar_file': 'c:/installed/sqllib/java/db2jcc4.jar'},
 'holidays': {'calendar_us': ['2016-01-01',
   '2016-01-18',
   '2016-03-25',
   '2016-03-28',
   '2016-05-30',
   '2016-07-04',
   '2016-09-05',
   '2016-11-08',
   '2016-11-11',
   '2016-11-24',
   '2016-11-25',
   '2016-12-26',
   '2016-12-27',
   '2016-12-28',
   '2016-12-29',
   '2016-12-30',
   '2017-01-02',
   '2017-01-22',
   '2017-04-14',
   '2017-04-17',
   '2017-05-29',
   '2017-07-04',
   '2017-09-04',
   '2017-11-10',
   '2017-11-22',
   '2017-11-23',
   '2017-12-25',
   '2017-12-26',
   '2017-12-29',
   '2017-12-30',
   '2017-12-31',
   '2018-01-01',
   '2018-01-15',
   '2018-03-30'

In [8]:
pipeline_specs['datalake']

{'schema': 'lab_datasci',
 'host': 'shbdmdwp001.servers.chrysler.com',
 'port': 5432,
 'user': 'datasci',
 'password': 'datasci_01',
 'database': 'odshawq'}

In [9]:
plant_id = 'jnap'
plant_id

'jnap'

In [10]:
pipeline_specs['plant_id'] = plant_id
pipeline_specs['project_directory'] = source_path
pipeline_specs

{'datalake': {'schema': 'lab_datasci',
  'host': 'shbdmdwp001.servers.chrysler.com',
  'port': 5432,
  'user': 'datasci',
  'password': 'datasci_01',
  'database': 'odshawq'},
 'jdbc': {'driver': 'com.ibm.db2.jcc.DB2Driver',
  'server': 'jdbc:db2://SRVR1874.dbms.chrysler.com:18740/AUCERPTP',
  'user': 'datasci',
  'password': 'datasci_01',
  'jar_file': 'c:/installed/sqllib/java/db2jcc4.jar'},
 'holidays': {'calendar_us': ['2016-01-01',
   '2016-01-18',
   '2016-03-25',
   '2016-03-28',
   '2016-05-30',
   '2016-07-04',
   '2016-09-05',
   '2016-11-08',
   '2016-11-11',
   '2016-11-24',
   '2016-11-25',
   '2016-12-26',
   '2016-12-27',
   '2016-12-28',
   '2016-12-29',
   '2016-12-30',
   '2017-01-02',
   '2017-01-22',
   '2017-04-14',
   '2017-04-17',
   '2017-05-29',
   '2017-07-04',
   '2017-09-04',
   '2017-11-10',
   '2017-11-22',
   '2017-11-23',
   '2017-12-25',
   '2017-12-26',
   '2017-12-29',
   '2017-12-30',
   '2017-12-31',
   '2018-01-01',
   '2018-01-15',
   '2018-03-30'

In [11]:
plant_specs = get_plant_config(pipeline_specs)
plant_specs

{'plant': {'code': 4012,
  'market_id': 7,
  'shift_days': 4,
  'shift_hours': 10,
  'absence_codes': ['BERC',
   'BERE',
   'BERU',
   'BERX',
   'CARE',
   'CARU',
   'FMLA',
   'FMLD',
   'FMLU',
   'HOMD',
   'ILFE',
   'ILFU',
   'IPBE',
   'IPME',
   'IPNU',
   'IPSE',
   'JURE',
   'MISE',
   'MISU',
   'PERU',
   'PPAA',
   'PPAU',
   'TRAG',
   'WTRU'],
  'exclude_dates': [['2018-08-25', '2018-08-31'],
   ['2019-01-02', '2019-01-05'],
   ['2020-08-10', '2020-08-14']]},
 'base_table': {'start_date': datetime.date(2017, 1, 1),
  'end_date': None,
  'write_table': True},
 'model': {'models': ['sarimax'],
  'target': 'absences_unplanned',
  'npreds': 4,
  'p_arima': 1,
  'd_arima': 0,
  'q_arima': 0,
  'features': ['actual_hours',
   'lost_hours',
   'absences_unplanned_rolling_median_12',
   'absences_unplanned_rolling_median_20',
   'quarter',
   'month',
   'week',
   'day',
   'day_of_week',
   'day_of_year'],
  'top_features': 5,
  'band_pct': 0.3,
  'levels': ['crew', 'produ

#### Connect to Postgres

In [13]:
engine_dl = create_sqlalchemy_engine(pipeline_specs['datalake'])
engine_dl

Engine(postgresql://datasci:***@shbdmdwp001.servers.chrysler.com:5432/odshawq)

In [14]:
conn_dl, curs_dl = connect_greenplum(pipeline_specs['datalake'])
conn_dl, curs_dl

(<connection object at 0x0000000016F92CA8; dsn: 'user=datasci password=xxx dbname=odshawq host=shbdmdwp001.servers.chrysler.com port=5432', closed: 0>,
 <cursor object at 0x00000000056F3748; closed: 0>)

#### Read in model table

In [23]:
output_path = 'E:/HR-Analytics/data'
output_path

'E:/HR-Analytics/data'

In [24]:
table_date = '20190915'
table_date

'20190915'

In [25]:
table_name = '_'.join(['abs', plant_id, 'model', table_date, 'tbl'])
table_name

'abs_jnap_model_20190915_tbl'

In [26]:
# input_file is a model file
file_name = '.'.join([table_name, 'csv'])
file_path = '/'.join([output_path, file_name])
df_model = pd.read_csv(file_path)

In [28]:
df_model.shape

(26770, 48)

In [29]:
df_model.columns

Index(['workdate', 'crew', 'production_line', 'group_total_cid', 'lost_hours',
       'actual_hours', 'paa_hours', 'absences_unplanned', 'absences_late',
       'absences_noshow', 'absences_any', 'mean_absence_pct',
       'mean_experience', 'absences_planned', 'home_canvasses',
       'absences_fmla', 'peia_count', 'tpt_count', 'streak_1', 'streak_2',
       'streak_3', 'streak_4_plus', 'cluster', 'group_total', 'tpt_unplanned',
       'tpt_extra', 'absences_unplanned_rolling_sum_5',
       'absences_unplanned_rolling_median_5',
       'absences_unplanned_rolling_sum_12',
       'absences_unplanned_rolling_median_12',
       'absences_unplanned_rolling_sum_20',
       'absences_unplanned_rolling_median_20', 'actual_hours_rolling_mean_20',
       'actual_hours_rolling_median_20', 'lost_hours_rolling_mean_20',
       'lost_hours_rolling_median_20', 'kp_residual_5', 'kp_residual_12',
       'kp_residual_20', 'year', 'quarter', 'month', 'week', 'day',
       'day_of_week', 'day_of_year', 

In [30]:
df_model['workdate'].min()

'2017-01-03'

In [31]:
df_model['workdate'].max()

'2019-09-07'

#### Read in Peak Table

In [15]:
schema = pipeline_specs['datalake']['schema']
schema

'lab_datasci'

In [16]:
peak_table = 'abs_ad_median_replace_tbl'
peak_table

'abs_ad_median_replace_tbl'

In [18]:
query = "select * from \"" + schema + "\".\"" + peak_table + "\""
query

'select * from "lab_datasci"."abs_ad_median_replace_tbl"'

In [19]:
df_peak = create_frame_from_pg(conn_dl, peak_table, query)

In [20]:
df_peak.shape

(80775, 7)

In [21]:
df_peak.columns

Index(['workdate', 'plant', 'crew', 'production_line', 'absences_unplanned',
       'extrema', 'imp_extrema'],
      dtype='object')

In [34]:
df_peak = df_peak[df_peak['plant'] == plant_id.upper()]

In [35]:
df_peak.sample(20)

Unnamed: 0,workdate,plant,crew,production_line,absences_unplanned,extrema,imp_extrema
6926,2019-03-04,JNAP,C,Trim 1,12.0,not extrema,not important extrema
5106,2019-03-18,JNAP,C,Door Line,7.0,min,imp_min
851,2018-12-12,JNAP,B,Trim 2,7.0,max,not important extrema
7570,2019-04-01,JNAP,A,Body Shop,7.0,max,imp_max
13367,2019-07-03,JNAP,B,Chassis 4,3.0,not extrema,not important extrema
12295,2019-03-05,JNAP,C,Material,11.0,min,not important extrema
3900,2019-06-11,JNAP,C,Material,20.0,max,not important extrema
10308,2018-07-24,JNAP,A,Chassis 1 & 2,5.0,not extrema,not important extrema
1436,2019-05-23,JNAP,B,Body Shop,8.0,not extrema,not important extrema
5890,2018-10-13,JNAP,B,Door Line,8.0,max,imp_max


#### Replace Model Table Values with Peak Values

In [36]:
join_cols = ['workdate', 'crew', 'production_line']
join_cols

['workdate', 'crew', 'production_line']

In [37]:
dfmp = pd.merge(df_model, df_peak,
                left_on=join_cols,
                right_on=join_cols,
                how='left')

In [50]:
dfmp[['absences_unplanned_x', 'absences_unplanned_y']].sample(50)

Unnamed: 0,absences_unplanned_x,absences_unplanned_y
25042,6,6.0
13721,9,
25459,6,10.0
23026,3,3.0
8421,9,
4094,8,
6407,3,
9199,7,
9878,2,
4907,12,


In [51]:
dfmp.columns

Index(['workdate', 'crew', 'production_line', 'group_total_cid', 'lost_hours',
       'actual_hours', 'paa_hours', 'absences_unplanned_x', 'absences_late',
       'absences_noshow', 'absences_any', 'mean_absence_pct',
       'mean_experience', 'absences_planned', 'home_canvasses',
       'absences_fmla', 'peia_count', 'tpt_count', 'streak_1', 'streak_2',
       'streak_3', 'streak_4_plus', 'cluster', 'group_total', 'tpt_unplanned',
       'tpt_extra', 'absences_unplanned_rolling_sum_5',
       'absences_unplanned_rolling_median_5',
       'absences_unplanned_rolling_sum_12',
       'absences_unplanned_rolling_median_12',
       'absences_unplanned_rolling_sum_20',
       'absences_unplanned_rolling_median_20', 'actual_hours_rolling_mean_20',
       'actual_hours_rolling_median_20', 'lost_hours_rolling_mean_20',
       'lost_hours_rolling_median_20', 'kp_residual_5', 'kp_residual_12',
       'kp_residual_20', 'year', 'quarter', 'month', 'week', 'day',
       'day_of_week', 'day_of_year'

In [49]:
dfmp['absences_unplanned'] = dfmp['absences_unplanned_y'].fillna(dfmp['absences_unplanned_x'])

In [52]:
drop_cols = ['absences_unplanned_x',
             'absences_unplanned_y',
             'plant',
             'extrema',
             'imp_extrema']
drop_cols

['absences_unplanned_x',
 'absences_unplanned_y',
 'plant',
 'extrema',
 'imp_extrema']

In [53]:
dfmp.drop(columns=drop_cols, inplace=True)

In [54]:
dfmp.sample(10)

Unnamed: 0,workdate,crew,production_line,group_total_cid,lost_hours,actual_hours,paa_hours,absences_late,absences_noshow,absences_any,...,year,quarter,month,week,day,day_of_week,day_of_year,nth_kday,next_day_delta,absences_unplanned
18350,2018-10-29,A,Quality 1,58,50.5,580.4,10.0,1,5,15,...,2018,4,10,44,29,0,302,5,1.0,6.0
18071,2018-10-19,B,Chassis 4,65,31.0,634.0,10.0,1,3,8,...,2018,4,10,42,19,4,292,3,1.0,4.0
18749,2018-11-14,A,Chassis 1 & 2,85,70.0,825.4,10.0,0,7,15,...,2018,4,11,46,14,2,318,2,1.0,7.0
7353,2017-09-20,A,Door Line,57,30.5,537.0,0.0,1,3,12,...,2017,3,9,38,20,2,263,3,1.0,4.0
15445,2018-07-12,B,Rolls,25,10.0,275.7,0.0,0,1,4,...,2018,3,7,28,12,3,193,2,1.0,2.0
11737,2018-03-02,C,Quality 1,63,60.9,591.6,20.0,1,6,17,...,2018,1,3,9,2,4,61,1,1.0,7.0
3119,2017-04-22,B,Trim 1,108,92.6,1074.2,20.0,3,9,26,...,2017,2,4,16,22,5,112,4,4.0,12.0
7814,2017-10-06,C,Final 1,61,41.0,582.9,10.0,1,4,28,...,2017,4,10,40,6,4,279,1,1.0,5.0
18315,2018-10-27,B,Paint Shop,108,50.5,1096.5,10.0,1,5,20,...,2018,4,10,43,27,5,300,4,4.0,6.0
3316,2017-04-29,B,Final 2,42,11.6,414.4,0.0,2,1,7,...,2017,2,4,17,29,5,119,5,4.0,3.0


In [55]:
df_model_seq = df_model.copy(deep=True)

In [56]:
group_levels = plant_specs['model']['levels']
group_levels

['crew', 'production_line']

In [57]:
shift_cols = ['group_total_cid',
              'lost_hours',
              'actual_hours',
              'paa_hours',
              'absences_late',
              'absences_noshow',
              'absences_any',
              'mean_absence_pct',
              'mean_experience',
              'absences_planned',
              'home_canvasses',
              'absences_fmla',
              'peia_count',
              'tpt_count',
              'streak_1',
              'streak_2',
              'streak_3',
              'streak_4_plus',
              'cluster',
              'absences_unplanned_rolling_sum_5',
              'absences_unplanned_rolling_median_5',
              'absences_unplanned_rolling_sum_12',
              'absences_unplanned_rolling_median_12',
              'absences_unplanned_rolling_sum_20',
              'absences_unplanned_rolling_median_20',
              'actual_hours_rolling_mean_20',
              'actual_hours_rolling_median_20',
              'lost_hours_rolling_mean_20',
              'lost_hours_rolling_median_20',
              'group_total',
              'tpt_unplanned',
              'tpt_extra',
              'kp_residual_5',
              'kp_residual_12',
              'kp_residual_20']
shift_cols

['group_total_cid',
 'lost_hours',
 'actual_hours',
 'paa_hours',
 'absences_late',
 'absences_noshow',
 'absences_any',
 'mean_absence_pct',
 'mean_experience',
 'absences_planned',
 'home_canvasses',
 'absences_fmla',
 'peia_count',
 'tpt_count',
 'streak_1',
 'streak_2',
 'streak_3',
 'streak_4_plus',
 'cluster',
 'absences_unplanned_rolling_sum_5',
 'absences_unplanned_rolling_median_5',
 'absences_unplanned_rolling_sum_12',
 'absences_unplanned_rolling_median_12',
 'absences_unplanned_rolling_sum_20',
 'absences_unplanned_rolling_median_20',
 'actual_hours_rolling_mean_20',
 'actual_hours_rolling_median_20',
 'lost_hours_rolling_mean_20',
 'lost_hours_rolling_median_20',
 'group_total',
 'tpt_unplanned',
 'tpt_extra',
 'kp_residual_5',
 'kp_residual_12',
 'kp_residual_20']

In [58]:
df_model_seq[shift_cols] = df_model_seq.groupby(group_levels)[shift_cols].transform('shift')

In [59]:
df_model_seq.sample(20)

Unnamed: 0,workdate,crew,production_line,group_total_cid,lost_hours,actual_hours,paa_hours,absences_unplanned,absences_late,absences_noshow,...,kp_residual_20,year,quarter,month,week,day,day_of_week,day_of_year,nth_kday,next_day_delta
16685,2018-08-24,B,Quality 2,42.0,30.7,419.1,10.0,3,1.0,3.0,...,0.0,2018,3,8,34,24,4,236,4,8.0
13559,2018-05-07,A,Sunroof Deck,39.0,10.0,365.2,0.0,5,0.0,1.0,...,0.0,2018,2,5,19,7,0,127,1,1.0
15359,2018-07-10,A,Quality 2,40.0,60.0,400.2,20.0,6,0.0,6.0,...,-4.0,2018,3,7,28,10,1,191,2,1.0
20825,2019-02-11,A,Door Line,53.0,30.8,522.1,0.0,5,1.0,3.0,...,1.111111,2019,1,2,7,11,0,42,2,1.0
25828,2019-08-06,A,Paint Shop,100.0,102.0,993.3,20.0,5,2.0,10.0,...,-1.333333,2019,3,8,32,6,1,218,1,1.0
8277,2017-10-23,A,Quality 1,57.0,0.0,581.4,0.0,6,0.0,0.0,...,-0.4,2017,4,10,43,23,0,296,4,1.0
21766,2019-03-14,B,Quality 1,46.0,30.9,441.3,10.0,2,1.0,3.0,...,-2.0,2019,1,3,11,14,3,73,2,1.0
16510,2018-08-18,B,Final 1,64.0,50.5,627.2,0.0,9,2.0,5.0,...,0.888889,2018,3,8,33,18,5,230,3,4.0
14678,2018-06-15,B,Quality 1,40.0,60.9,399.8,10.0,1,2.0,6.0,...,-0.285714,2018,2,6,24,15,4,166,3,1.0
14072,2018-05-24,B,,18.0,21.0,183.5,10.0,5,1.0,2.0,...,1.2,2018,2,5,21,24,3,144,4,1.0


#### Historical Predictions

In [60]:
departments = plant_specs['model']['departments']
departments

{'Material': '3300',
 'Body Shop': '9110',
 'Paint Shop': '9130',
 'Trim': '9150',
 'Chassis': '9170',
 'Final': '9173',
 'Quality 1': '9190',
 'Quality 2': '9193'}

In [61]:
crews = plant_specs['model']['crews']
crews

['A', 'B', 'C']

In [62]:
pline_specs = plant_specs['model']['production_lines']
pline_map = {}
for k, v in pline_specs.items():
    pline_map[k] = v[0]
pline_map

{'Material': 'Material',
 'Body Shop': 'Body Shop',
 'Paint Shop': 'Paint Shop',
 'Trim 1': 'Trim',
 'Trim 2': 'Trim',
 'Door Line': 'Trim',
 'Engine Line': 'Chassis',
 'Chassis 1 & 2': 'Chassis',
 'Sunroof Deck': 'Chassis',
 'Chassis 3': 'Chassis',
 'Chassis 4': 'Chassis',
 'Final 1': 'Final',
 'Final 2': 'Final',
 'Rolls': 'Final',
 'Quality 1': 'Quality 1',
 'Quality 2': 'Quality 2'}

In [63]:
plines = list(pline_map.keys())
plines

['Material',
 'Body Shop',
 'Paint Shop',
 'Trim 1',
 'Trim 2',
 'Door Line',
 'Engine Line',
 'Chassis 1 & 2',
 'Sunroof Deck',
 'Chassis 3',
 'Chassis 4',
 'Final 1',
 'Final 2',
 'Rolls',
 'Quality 1',
 'Quality 2']

In [64]:
df_model_seq.dropna(axis=0, subset=['production_line'], inplace=True)

In [66]:
df_model_seq.dropna(axis=0, subset=['group_total_cid'], inplace=True)

In [67]:
df_model_seq[df_model_seq.isin([np.nan, np.inf, -np.inf]).any(1)]

Unnamed: 0,workdate,crew,production_line,group_total_cid,lost_hours,actual_hours,paa_hours,absences_unplanned,absences_late,absences_noshow,...,kp_residual_20,year,quarter,month,week,day,day_of_week,day_of_year,nth_kday,next_day_delta


In [68]:
exog_cols = df_model_seq.columns.tolist()
remove_cols = ['workdate', 'crew', 'production_line', 'absences_unplanned',
               'precip', 'snowfall', 'snow_depth', 'temp_avg', 'temp_max',
               'temp_min', 'fog', 'temp_delta']
exog_cols = [x for x in exog_cols if x not in remove_cols]
exog_cols

['group_total_cid',
 'lost_hours',
 'actual_hours',
 'paa_hours',
 'absences_late',
 'absences_noshow',
 'absences_any',
 'mean_absence_pct',
 'mean_experience',
 'absences_planned',
 'home_canvasses',
 'absences_fmla',
 'peia_count',
 'tpt_count',
 'streak_1',
 'streak_2',
 'streak_3',
 'streak_4_plus',
 'cluster',
 'group_total',
 'tpt_unplanned',
 'tpt_extra',
 'absences_unplanned_rolling_sum_5',
 'absences_unplanned_rolling_median_5',
 'absences_unplanned_rolling_sum_12',
 'absences_unplanned_rolling_median_12',
 'absences_unplanned_rolling_sum_20',
 'absences_unplanned_rolling_median_20',
 'actual_hours_rolling_mean_20',
 'actual_hours_rolling_median_20',
 'lost_hours_rolling_mean_20',
 'lost_hours_rolling_median_20',
 'kp_residual_5',
 'kp_residual_12',
 'kp_residual_20',
 'year',
 'quarter',
 'month',
 'week',
 'day',
 'day_of_week',
 'day_of_year',
 'nth_kday',
 'next_day_delta']

In [69]:
def model_predict(npreds, model, df, split_index):
    test_len = df.shape[0] - split_index
    predictions = []
    for t in range(test_len):
        offset = split_index + t
        yhat = model.predict(offset, offset+npreds-1).tolist()[0]
        # print('\nIteration: %d of %d, Next Predicted: %f, Expected: %f' % (t+1, test_len, yhat))
        # print('All Predictions: %s' % yhat)
        predictions.append((df['workdate'].iloc[offset], int(max(round(yhat), 0))))
    return predictions

In [70]:
pred_target = 'absences_unplanned'
predict_start_date = '2019-01-01'
predict_end_date = '2019-08-31'
npreds = 1
features = exog_cols
p = 1
d = 0
q = 0
ntop = 5
ar_pattern = "^ar.L"
minimum_rows = 20

In [71]:
dfp = pd.DataFrame()
for crew, pline in itertools.product(crews, plines):
    print("\nCrew %s, Production Line %s" % (crew, pline))
    # subset the frame
    df_sub = df_model_seq.copy(deep=True)
    df_sub = df_sub[df_sub['workdate'] <= predict_end_date]
    df_sub = df_sub[(df_sub['crew'] == crew) & (df_sub['production_line'] == pline)]
    df_sub = df_sub.iloc[1:].reset_index(drop=True)
    nrows = df_sub.shape[0]
    print("Rows: %d" % nrows)
    if nrows >= minimum_rows:
        try:
            # fit the model
            model_sm = SARIMAX(df_sub[pred_target],
                               df_sub[features],
                               order=(p, d, q),
                               simple_differencing=True)
            model_fit = model_sm.fit(method='powell')
            split_index = np.where(df_sub['workdate'] >= predict_start_date)[0].tolist()[0]
            preds = model_predict(npreds, model_fit, df_sub, split_index)
            # create dataframe
            df = pd.DataFrame(preds, columns=['workdate', 'predicted'])
            df['crew'] = crew
            df['production_line'] = pline
            # feature importances
            features_html = model_fit.summary().tables[1].as_html()
            df_feat = pd.read_html(features_html)[0].iloc[1:, :]
            df_feat.columns = ['feature', 'coef', 'std err', 'Z', 'P>|z|', 'ci_low', 'ci_high']
            df_feat = df_feat[df_feat.feature != 'sigma2']
            ar_filter = df_feat['feature'].str.contains(ar_pattern)
            df_feat = df_feat[~ar_filter]
            df_feat['Z'] = df_feat['Z'].astype('float')
            df_feat['Z_abs'] = abs(df_feat['Z'])
            df_feat.sort_values(by=['Z_abs'], ascending=False, inplace=True)
            features_ntop = df_feat['feature'].head(ntop).tolist()
            for i in range(ntop):
                fname = 'model_feature' + str(i+1)
                df[fname] = features_ntop[i]
            # store predictions in the data frame
            dfp = pd.concat([dfp, df])
        except:
            print("Could not fit model")
    else:
        print("Zero Rows Found")


Crew A, Production Line Material
Rows: 519
Optimization terminated successfully.
         Current function value: 2.484728
         Iterations: 3
         Function evaluations: 1262

Crew A, Production Line Body Shop
Rows: 520
Optimization terminated successfully.
         Current function value: 2.141189
         Iterations: 1
         Function evaluations: 427

Crew A, Production Line Paint Shop
Rows: 520
Optimization terminated successfully.
         Current function value: 2.571251
         Iterations: 1
         Function evaluations: 419

Crew A, Production Line Trim 1
Rows: 519
Optimization terminated successfully.
         Current function value: 2.582556
         Iterations: 1
         Function evaluations: 420

Crew A, Production Line Trim 2
Rows: 518
Optimization terminated successfully.
         Current function value: 2.392275
         Iterations: 1
         Function evaluations: 443

Crew A, Production Line Door Line
Rows: 519
Optimization terminated successfully.
       

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)



Crew A, Production Line Sunroof Deck
Rows: 518
Optimization terminated successfully.
         Current function value: 1.947838
         Iterations: 1
         Function evaluations: 425

Crew A, Production Line Chassis 3
Rows: 518
Optimization terminated successfully.
         Current function value: 2.360161
         Iterations: 1
         Function evaluations: 439

Crew A, Production Line Chassis 4
Rows: 518
Optimization terminated successfully.
         Current function value: 2.340765
         Iterations: 5
         Function evaluations: 2121

Crew A, Production Line Final 1
Rows: 518
Optimization terminated successfully.
         Current function value: 2.547590
         Iterations: 6
         Function evaluations: 2532

Crew A, Production Line Final 2
Rows: 518
Optimization terminated successfully.
         Current function value: 2.040805
         Iterations: 1
         Function evaluations: 423

Crew A, Production Line Rolls
Rows: 519
Optimization terminated successfully.
     

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)



Crew B, Production Line Rolls
Rows: 528
Optimization terminated successfully.
         Current function value: 1.608572
         Iterations: 1
         Function evaluations: 428


  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)



Crew B, Production Line Quality 1
Rows: 529
Optimization terminated successfully.
         Current function value: 2.010866
         Iterations: 1
         Function evaluations: 444

Crew B, Production Line Quality 2
Rows: 521
Optimization terminated successfully.
         Current function value: 1.917075
         Iterations: 1
         Function evaluations: 424

Crew C, Production Line Material
Rows: 517
Optimization terminated successfully.
         Current function value: 2.773988
         Iterations: 8
         Function evaluations: 3401

Crew C, Production Line Body Shop
Rows: 518
Optimization terminated successfully.
         Current function value: 2.430313
         Iterations: 5
         Function evaluations: 2115

Crew C, Production Line Paint Shop
Rows: 518
Optimization terminated successfully.
         Current function value: 2.789635
         Iterations: 1
         Function evaluations: 421

Crew C, Production Line Trim 1
Rows: 515
Optimization terminated successfully.
   

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)



Crew C, Production Line Chassis 3
Rows: 513
Optimization terminated successfully.
         Current function value: 2.454837
         Iterations: 1
         Function evaluations: 420

Crew C, Production Line Chassis 4
Rows: 514
Optimization terminated successfully.
         Current function value: 2.327354
         Iterations: 7
         Function evaluations: 2968

Crew C, Production Line Final 1
Rows: 516
Optimization terminated successfully.
         Current function value: 2.428036
         Iterations: 1
         Function evaluations: 423

Crew C, Production Line Final 2
Rows: 511
Optimization terminated successfully.
         Current function value: 2.234003
         Iterations: 1
         Function evaluations: 422

Crew C, Production Line Rolls
Rows: 516
Optimization terminated successfully.
         Current function value: 1.738654
         Iterations: 1
         Function evaluations: 424


  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)



Crew C, Production Line Quality 1
Rows: 518
Optimization terminated successfully.
         Current function value: 2.135041
         Iterations: 1
         Function evaluations: 424

Crew C, Production Line Quality 2
Rows: 510
Optimization terminated successfully.
         Current function value: 2.083699
         Iterations: 1
         Function evaluations: 434


In [72]:
dfp.shape

(6368, 9)

In [73]:
output_path = 'E:/HR-Analytics/data_test'
output_path

'E:/HR-Analytics/data_test'

In [74]:
pred_file = '_'.join(['abs', plant_id, 'peak_predictions_20190911.csv'])
pred_path = '/'.join([output_path, pred_file])
dfp.to_csv(pred_path, index=False)

### End of Notebook