Working notes
* Site: https://www.soa.org/resources/experience-studies/2017/2009-13-indiv-life-ins-mort-exp/
* Appendices: https://www.soa.org/globalassets/assets/files/research/exp-study/2009-13-indiv-life-ins-mort-exp-appendix.xlsx

Note: a zip of the CSV data is also available at the site.

This work is based on reproducing Appendix G.  My old data selection did not quite reproduce Appendix G.  The totals across products in the published appendix do not agree with its rows, so I'm not going to sweat reproducing those totals here.

# Initialize libraries

In [1]:
import os
import sys
import re
import pandas as pd
import numpy as np
from pathlib import Path

# ILECTools may not be installed as a package.  When no entry on sys.path already
# points at an ILECTools directory, add this machine's checkout so that its
# functions can be imported.

dir_home = str(Path.home())
if all(Path(entry).name != 'ILECTools' for entry in sys.path):
    sys.path.append(dir_home + '/dev/ILECTools')  # where the checkout lives on this machine

# Value columns that get aggregated
valcols = [
    'policy_actual',
    'policy_2015vbt',
    'amount_actual',
    'amount_2015vbt',
]

# Explanatory variables used in the model
exogvars = [
    'uw',
    'face_amount_band',
    'observation_year',
    'dur_band1',
    'ia_band1',
    'gender',
    'insurance_plan',
    'ltp',
    'iy_band1',
]


from importlib import reload
from ilectools import glmtools
# Re-execute the already-imported glmtools module -- presumably so edits made to it
# while the kernel is running are picked up without a restart.
glmtools = reload(glmtools)
from ilectools.glmtools import GLMRun

# Utility functions

In [2]:
def add_cols(df):
    '''Add derived banding columns to the dataframe and return it.

    The frame is modified in place and the same object is returned.  Each
    derived column is built only when its source column is present:

    * ``ltp``       from ``soa_anticipated_level_term_period`` (always recomputed)
    * ``dur_band1`` from ``duration``   (skipped if ``dur_band1`` already exists)
    * ``ia_band1``  from ``issue_age``  (skipped if ``ia_band1`` already exists)
    * ``iy_band1``  from ``issue_year`` (skipped if ``iy_band1`` already exists)

    A frame that lacks a source column simply does not get the corresponding
    derived column; no error is raised.
    '''

    # I don't like 'Not Level Term' and "N/A (Not Term)" both being in there, should probably have consolidated them
    # level term period for this purpose: '20 yr anticipated' and 'N/A (Not Term)' share one level,
    # and the ' anticipated' suffix is dropped from the remaining labels.
    ltp_source = 'soa_anticipated_level_term_period'
    if ltp_source in df:
        merged_level = '20 yr or N/A (Not Term)'
        df['ltp'] = (
            df[ltp_source]
            .replace({'20 yr anticipated': merged_level,
                      'N/A (Not Term)': merged_level})
            .str.replace(' anticipated', '')
        )

    # duration bands in the original paper
    if 'dur_band1' not in df and 'duration' in df:
        df['dur_band1'] = 'n/a'  # default; kept only by rows that match no band below
        for band, durs in {
            '01': [1],
            '02': [2],
            '03': [3],
            '04-05': [4, 5],
            '06-10': range(6, 11),
            '11-15': range(11, 16),
            '16-20': range(16, 21),
            '21-25': range(21, 26)}.items():
            df.loc[df.duration.isin(durs), 'dur_band1'] = band
        df.loc[df.duration > 25, 'dur_band1'] = '26-150'

    # five-year issue age bands
    if 'ia_band1' not in df and 'issue_age' in df:
        # default; rows matched by none of the assignments below (18-24, and any
        # missing ages) keep this label
        df['ia_band1'] = '18-24'
        for a in range(25, 95, 5):
            df.loc[df.issue_age.between(a, a + 4), 'ia_band1'] = f'{a}-{a+4}'
        df.loc[df.issue_age > 94, 'ia_band1'] = '95+'
        # assigning through an all-False mask is a no-op, so no pre-check is needed
        df.loc[df.issue_age < 18, 'ia_band1'] = '00-18'

    # issue year bands
    if 'iy_band1' not in df and 'issue_year' in df:
        df['iy_band1'] = '1900-1989'  # default for anything not matched below
        df.loc[df.issue_year.between(1990, 1999), 'iy_band1'] = '1990-1999'
        df.loc[df.issue_year.between(2000, 2009), 'iy_band1'] = '2000-2009'
        df.loc[df.issue_year >= 2010, 'iy_band1'] = '2010-'

    return df

# Initialize data

In [3]:
# Turn off jedi in IPython's tab completer
%config Completer.use_jedi=False

In [4]:
%%time
data = add_cols(pd.read_parquet(dir_home + '/data/ilec/Data/Processed/2021_published/ILEC 2009-18 20210528_v2.parquet'))

# There happen to be some records with no exposure, they also have no tabular expecteds, and will hurt
# when the log of the tabular expecteds are taken later.
data= data[(data.amount_exposure>0) & (data.policy_exposure>0)]

# subset of data for exhibit g, the example used in 
data_exhibit_g = data[(data.soa_post_level_term_indicator != 'Post Level Term')
                      & (data.duration < 26) # select only
                      & (data.observation_year < 2014) # for consistency with the report, update if you like
                      & (data.issue_age > 17) # no juvies pls
                     ]

del data # why not free up a few gb of RAM
# tidiness
import gc
gc.collect()

CPU times: user 37.1 s, sys: 13.5 s, total: 50.6 s
Wall time: 47.6 s


35

# Models

In [5]:
# Render floats as comma-separated whole numbers and allow wide console output
pd.set_option('display.float_format', '{:,.0f}'.format)
pd.set_option('display.width', 255)

In [6]:
# The model will run better on a summary -- or: will run at all.
# Collapse to one row per combination of the model variables, summing the value columns.
# aggfunc is given by name: passing the numpy callable (np.sum) is deprecated in
# pandas 2.1+ and emits a FutureWarning; the aggregation performed is the same.
data_exhibit_g_summary = data_exhibit_g.pivot_table(index=exogvars, values=valcols, aggfunc='sum').reset_index()

In [7]:
%%time
# Build one GLMRun per basis ('policy' and 'amount'), keyed by basis name.
# For basis m the response is `{m}_actual` and `{m}_2015vbt` is passed as the offset column;
# both runs use the same summarized data and the same right-hand side.
# Terms written with Treatment(reference=...) name their baseline level explicitly; gender,
# observation_year and iy_band1 are left at the formula engine's default baseline.
# NOTE(review): the face_amount_band reference label starts with two spaces -- presumably
# that matches the label as stored in the data; confirm before editing it.
# NOTE(review): GLMRun appears to fit on construction (the recorded output logs
# "Fitting ... fit." once per model) -- confirm against ilectools.glmtools.
# get policy and amount models
models = {
    m:GLMRun(
        offsetcol=f'{m}_2015vbt'
      , formula = f"""{m}_actual ~
    + C(uw, Treatment(reference='N/2/1'))
    + C(dur_band1, Treatment(reference='06-10'))
    + C(face_amount_band, Treatment(reference='  100000-249999'))
    + C(ia_band1, Treatment(reference='40-44'))
    + C(gender)
    + C(observation_year)
    + C(insurance_plan, Treatment(reference='Term'))
    + C(ltp, Treatment(reference='20 yr or N/A (Not Term)'))
    + C(iy_band1)
    """
      , data = data_exhibit_g_summary
    )
    
    for m
    in ['policy', 'amount']
}

2022-05-01 14:14:40.127930 Fitting ...
2022-05-01 14:15:13.545355 ... fit.
2022-05-01 14:15:13.545453 Fitting ...
2022-05-01 14:15:51.400372 ... fit.
CPU times: user 6min 18s, sys: 1min 51s, total: 8min 10s
Wall time: 1min 11s


# Now finish the exhibits

In [10]:
# Work with the model fit on policy_actual in the cells below
m = models['policy']

In [68]:
import networkx as nx
import itertools
from functools import reduce

In [250]:
from ilectools.glmtools import results_summary_to_dataframe, getKV


# NOTE(review): in a top-to-bottom run of the visible notebook both names on the next line are
# unbound: `ilectools` is only imported in a later cell (`from ilectools import glmtools` does not
# bind `ilectools`), and `getCoef` is neither defined nor imported in any visible cell.
# The assignment attaches the free function `getCoef` to GLMRun as a method of the same name.
ilectools.glmtools.GLMRun.getCoef = getCoef

# Call the free function directly on the fitted model
z = getCoef(m)

In [251]:
# Scratch check: index level names of the first element returned by getCoef
z[0].index.names

FrozenList(['ia_band1'])

In [238]:
# Scratch check: the same index level names as a plain tuple
tuple(z[0].index.names)

('ia_band1',)

In [254]:
# Scratch frame: three rows (0..2) and two empty columns, used to poke at index attributes
jnk = pd.DataFrame(columns=['x', 'y'], index=range(3))

In [256]:
# Scratch check: number of dimensions reported by the scratch frame's index
jnk.index.ndim

1

In [260]:

# NOTE(review): uses the free function `getCoef`, which no visible cell defines or imports;
# the method form is kept in the trailing comment.
coefOrig = getCoef(m) #m.getCoef()

In [261]:
# NOTE(review): `getSubgraphs` is used as a bare name but is not imported in any visible cell;
# a later cell reaches it as `ilectools.glmtools.getSubgraphs`.
x = getSubgraphs(coefOrig)

In [266]:
# Key each table yielded by compareFactorsAE(m) by the name of its first index level
jnk = {table.index.names[0]: table for table in compareFactorsAE(m)}

In [268]:
# Display the insurance_plan table with every value formatted as a percentage
jnk['insurance_plan'].style.format('{:,.2%}')

Unnamed: 0_level_0,Factor,A/Table
insurance_plan,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,134.06%,158.84%
Perm,94.12%,121.00%
Term,100.00%,104.76%
UL,106.72%,120.73%
ULSG,102.03%,102.34%
VL,97.73%,107.18%
VLSG,96.87%,111.05%


In [269]:
# NOTE(review): the recorded output of this cell is
# "TypeError: reduce() of empty iterable with no initial value", so the call does not currently
# complete.  Either fix the cause or remove the cell so the notebook can run top to bottom.
m.factorAnalysisExhibit('insurance_plan')

TypeError: reduce() of empty iterable with no initial value

In [37]:
# NOTE(review): recorded output is "TypeError: reduce() of empty iterable with no initial value".
# This method call fails while the free-function call `compareFactorsAE(m)` in an earlier cell
# produced tables -- presumably the method and the function are different versions; confirm.
m.compareFactorsAE()

TypeError: reduce() of empty iterable with no initial value

In [12]:
# Coefficients via the GLMRun method; rebinds coefOrig, which an earlier cell also assigns
coefOrig = m.getCoef()

In [36]:
# Switch float display to 4-digit scientific notation, then show the third coefficient table
pd.set_option('display.float_format', '{:,.4e}'.format)
coefOrig[2]

 5 yr             3.0820e-02
10 yr             3.9954e-02
15 yr             9.6909e-02
25 yr             2.3994e-01
30 yr             3.4621e-02
Not Level Term   -1.3419e-01
Unknown           2.3679e-04
Name: coef, dtype: float64

In [25]:
import ilectools

In [31]:
# Materialize the first item yielded by getSubgraphs(coefOrig) as a list.
# NOTE(review): the recorded output is an empty list; the failing cells report a reduce() over
# an empty iterable, which may be related -- confirm in ilectools.glmtools.
list(list(ilectools.glmtools.getSubgraphs(coefOrig))[0])

[]

In [34]:
# NOTE(review): repeat of an identical earlier call; recorded output is again
# "TypeError: reduce() of empty iterable with no initial value".
m.compareFactorsAE()

TypeError: reduce() of empty iterable with no initial value

In [33]:
# NOTE(review): recorded output is "TypeError: reduce() of empty iterable with no initial value",
# the same error as the compareFactorsAE / factorAnalysisExhibit cells.
m.showFactorsVsTableBetweenCatgories(plot=False)

TypeError: reduce() of empty iterable with no initial value

# End