In [None]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import textwrap 
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")

%matplotlib inline
%load_ext line_profiler

np.set_printoptions(linewidth =  160)

def wdid(ob, ex=False):
    ''' what does object do?

    Print the public attribute names of ``ob`` (names that do not start with
    an underscore) as a space-separated list wrapped to 80 columns.

    Parameters
    ----------
    ob : object
        Any object, module or class to inspect.
    ex : bool, default False
        When True, also print a heading and the docstring for every attribute
        of ``ob`` whose name starts with a lowercase ASCII letter.
    '''
    print('\n'.join(textwrap.wrap(' '.join([i for i in dir(ob) if i[0] != '_']), 80)))
    if ex:
        # optional pause for something more advanced...
        # document the members of the object that was passed in
        for m in [i for i in dir(ob) if 'a' <= i[0] <= 'z']:
            print(f'\n\n{m}\n{"="*len(m)}\n')
            print(getattr(ob, m).__doc__)

# Manipulating Triangle Data in Pandas 1: WC Triangles  

Building a multi-dimensional IBNR model in numpy

### Load Data and Basic Cleansing

In [None]:
cas = pd.read_csv(r'http://www.casact.org/research/reserve_data/wkcomp_pos.csv')

In [None]:
cas.head()

In [None]:
# Let's make the pandas dataframe look more triangle like
triangle_frame = pd.pivot_table(cas[cas['DevelopmentYear']<=1997], 
                                values='CumPaidLoss_D', 
                                index=['GRNAME','AccidentYear'], 
                                columns='DevelopmentLag')

In [None]:
triangle_frame.head(30)

In [None]:
# get rid of zero triangles
triangle_frame = triangle_frame.groupby(level=0).filter(lambda x : np.nansum(x)  > 0)
triangle_frame.iloc[10:20, :]

In [None]:
triangle_frame.iloc[-10:, :]

### Age-to-age factors

In [None]:
triangle_frame.iloc[0:20, 1:] / triangle_frame.iloc[0:20, :-1]

In [None]:
triangle_frame.iloc[0:20, 1:].values / triangle_frame.iloc[0:20, :-1].values

In [None]:
triangle_frame.iloc[0:20, 1:].values / triangle_frame.iloc[0:20, :-1]

In [None]:
ata_df = triangle_frame.iloc[:, 1:].values / triangle_frame.iloc[:, :-1]
ata_df.head(20)

### LDFs and CDFs

In [None]:
ldf_df = ata_df.groupby(level=0).mean().fillna(1.)

In [None]:
ldf_df.head(20)

In [None]:
# cdfs need cumulative product in reverse...easy to reverse and re-reverse
cdf_df = ldf_df.iloc[:, ::-1].cumprod(axis=1).iloc[:, ::-1]
cdf_df[10] = 1.
cdf_df.head()

### Ultimates and IBNR

In [None]:
# pulling off the diagonal is a bit tricky: reverse the lag columns so the latest
# diagonal becomes the main diagonal, take it, then reverse again so it is ordered by lag
diag_df = triangle_frame.groupby(level=0).apply(lambda x : pd.Series(np.diagonal(x.values[:, ::-1])[::-1], index=range(1,11)))
diag_df.head(10)

In [None]:
# ultimate = latest diagonal x CDF (missing cells default to 0); IBNR = ultimate - latest diagonal
ult_df = (diag_df * cdf_df).fillna(0)
ibnr_df = ult_df - diag_df
ult_df.head(10)

In [None]:
ibnr_df.head()

### The business questions answered by our model

In [None]:
# complete in pandas

In [None]:
# Answer the business questions from the pandas frames built in the cells above
# (ibnr_df, ult_df, diag_df, cdf_df, ldf_df): rows are companies, columns are development lags,
# so column 1 of cdf_df is the 12-Ultimate CDF and column 1 of ldf_df is the 12-24 LDF.
print('How much IBNR does the entire industry need according to this model?')
print(np.nansum(ibnr_df.values).round(0))
print()
print('What is the average ultimate to paid ratio across the industry?')
print((np.nansum(ult_df.values)/np.nansum(diag_df.values)).round(3))
print()
print('Which company has the highest 12-Ultimate CDF?')
print(cdf_df[1].idxmax())
print()
print('Which company has the lowest 12-24 LDF?')
print(ldf_df[1].idxmin())
print()
print('What is the 95% confidence interval on the estimate of 12-Ultimate CDF?')
# both interval ends are read from the same sorted array
cdf_12_sorted = np.sort(cdf_df[1].values)
print((cdf_12_sorted[int(.025*len(cdf_12_sorted))], cdf_12_sorted[int(.975*len(cdf_12_sorted))]))

Performance test of the above code 

In [None]:
# start from cas
triangle_frame = pd.pivot_table(cas[cas['DevelopmentYear']<=1997], 
                                values='CumPaidLoss_D', 
                                index=['GRNAME','AccidentYear'], 
                                columns='DevelopmentLag')

In [None]:
def develop_np(triangle_frame):
    '''
    create latest ldfs, cdfs, diagonal, ultimate and ibnr ndarrays from
    input pandas dataframe:
    
        pd.pivot_table(cas[cas['DevelopmentYear']<=1997], 
                                values='CumPaidLoss_D', 
                                index=['GRNAME','AccidentYear'], 
                                columns='DevelopmentLag')

    John's code

    Parameters
    ----------
    triangle_frame : DataFrame
        Rows indexed by (company, accident year), one column per development
        lag. Every company must contribute the same number of accident-year
        rows so the frame can be reshaped into a stack of triangles.

    Returns
    -------
    tuple
        (triangle_array, triangle_sum, ldf_array, cdf_array, latest_diagonal,
        ultimate, ibnr). triangle_sum has one entry per company in the input;
        all other arrays only cover companies whose triangle sum is non-zero.
        cdf_array, latest_diagonal, ultimate and ibnr are ordered from the
        oldest lag (column 0, the tail) to the first lag (last column).
    '''
    
    # use reshape method to create a 3-D Matrix of triangles: company x accident year x lag.
    # The dimensions are read off the frame itself, so nothing outside the argument is needed.
    n_accident_years = triangle_frame.index.get_level_values(1).nunique()
    n_lags = triangle_frame.shape[1]
    triangle_array = np.array(triangle_frame, dtype=float).reshape(-1, n_accident_years, n_lags)
    
    # get rid of completely empty triangles
    triangle_sum = np.nansum(np.nansum(triangle_array, axis=1),axis=1)
    triangle_array = triangle_array[triangle_sum!=0,:,:]
    # zeros become missing so they cannot appear as a denominator below
    triangle_array[triangle_array==0]=np.nan

    # use slicing to create age-to-age factors
    ata_array = triangle_array[:,:-1,1:]/triangle_array[:,:-1,:-1]

    # create an array of LDFs, by taking simple averages of the age-to-age factors; default missing to 1
    ldf_array = np.nanmean(ata_array, axis=1)
    ldf_array[np.isnan(ldf_array)] = 1.0

    # create an array of CDFs with a tail factor from our LDFs
    cdf_array = ldf_array[:,::-1].cumprod(axis=1)[:,::-1]
    tail_factor = 1.0
    # append the tail, then reverse so column 0 is the tail and the last column is first-lag-to-ultimate
    cdf_array = np.append(cdf_array,np.expand_dims(np.repeat(tail_factor,cdf_array.shape[0]),1),axis=1)[:,::-1]

    # strip latest diagonal and develop 
    # (reversing accident years puts the latest diagonal on the main diagonal; the final
    # reversal orders it oldest accident year first, matching cdf_array)
    latest_diagonal = np.nan_to_num(np.diagonal(triangle_array[:,::-1,],axis1=1,axis2=2)[:, ::-1])
    ultimate = latest_diagonal * cdf_array
    ibnr = ultimate - latest_diagonal
    
    # return the interesting bits 
    return triangle_array, triangle_sum, ldf_array, cdf_array, latest_diagonal, ultimate, ibnr

In [None]:
%%prun -s "time" -l 20
# %%timeit
triangle_array, triangle_sum, ldf_array, cdf_array, latest_diagonal, ultimate, ibnr = develop_np(triangle_frame)

In [None]:
triangle_array, triangle_sum, ldf_array, cdf_array, latest_diagonal, ultimate, ibnr = develop_np(triangle_frame)

In [None]:
companies = np.array(triangle_frame.index.levels[0])[triangle_sum!=0]
print('How much IBNR does the entire industry need according to this model?')
print(np.sum(ibnr).round(0))
print()
print('What is the average ultimate to paid ratio across the industry?')
print((np.sum(ultimate)/np.sum(latest_diagonal)).round(3))
print()
print('Which company has the highest 12-Ultimate CDF?')
# develop_np returns cdf_array tail-first: column 0 is the 1.0 tail factor, the last column is 12-Ultimate
print(companies[np.argmax(cdf_array[:,-1])])
print()
print('Which company has the lowest 12-24 LDF?')
# ldf_array column 0 is lag 2 / lag 1, i.e. the 12-24 factor
print(companies[np.argmin(ldf_array[:,0])])
print()
print('What is the 95% confidence interval on the estimate of 12-Ultimate CDF?')
cdf_12_sorted = np.sort(cdf_array[:,-1])
print(cdf_12_sorted[int(.025*len(cdf_12_sorted))],
      cdf_12_sorted[int(.975*len(cdf_12_sorted))])

# Using Pandas

In [None]:
def develop_pd(triangle_frame):
    '''
    Same thing in pandas

    Parameters
    ----------
    triangle_frame : DataFrame
        Rows indexed by (company, accident year), one column per development
        lag, holding cumulative losses.

    Returns
    -------
    tuple of DataFrame
        (ldf_df, cdf_df, diag_df, ult_df, ibnr_df), each indexed by company
        with one column per development lag; ibnr_df has an extra 'Tot'
        column holding the row total.
    '''

    # drop companies whose whole triangle sums to zero
    triangle_frame1 = triangle_frame.groupby(level=0).filter(lambda x : np.nansum(x.values)  > 0)

    # ata factors, picks up index from second data frame 
    ata_df = triangle_frame1.iloc[:, 1:].values / triangle_frame1.iloc[:, :-1] 

    # ldfs with default 1 and a tail factor of 1 in the last development lag column
    # (the label is taken from the frame, so triangles of any size work)
    ldf_df = ata_df.groupby(level=0).mean().fillna(1.)
    tail_lag = triangle_frame1.columns[-1]
    ldf_df[tail_lag] = 1.0

    # cdfs: cumulative product taken from the tail backwards
    cdf_df = ldf_df.iloc[:, ::-1].cumprod(axis=1).iloc[:, ::-1]

    # diagonal: reversing the lag columns puts the latest diagonal on the main diagonal;
    # reversing the result orders it by lag
    diag_df = triangle_frame1.groupby(level=0).apply(lambda x : pd.Series(np.diagonal(x.values[:, ::-1])[::-1], index=x.columns))

    # ultimate and ibnr
    ult_df = (diag_df * cdf_df).fillna(0)
    ibnr_df = ult_df - diag_df
    ibnr_df['Tot'] = ibnr_df.sum(1)

    # return interesting bits 
    return ldf_df, cdf_df, diag_df, ult_df, ibnr_df

In [None]:
%%prun -s "time" -l 20
ldf_df, cdf_df, diag_df, ult_df, ibnr_df = develop_pd(triangle_frame)

In [None]:
ldf_df, cdf_df, diag_df, ult_df, ibnr_df = develop_pd(triangle_frame)

In [None]:
ibnr_df.sort_values('Tot', ascending=False).head(40).style

Check we get the same answer 

Look at the pieces

In [None]:
display(ibnr_df.head(20).style)
display(pd.DataFrame(ibnr[:,::-1]).head(20).style)

In [None]:
display(ult_df.head(10))
display(pd.DataFrame(ultimate).iloc[0:10, ::-1])

In [None]:
display(diag_df.head(10))
display(pd.DataFrame(latest_diagonal).iloc[:10, 10::-1])

In [None]:
display(cdf_df.head(10))
display(pd.DataFrame(cdf_array).iloc[0:10, ::-1])

In [None]:
display(ldf_df.head(10))
display(pd.DataFrame(ldf_array).head(10))

# SM Triangles

Load and develop all triangles in the CAS database.



In [None]:
N1 = pd.read_csv(r'http://www.mynl.com/RPM/masterdata.csv')

In [None]:
N1.head()

In [None]:
N1.describe().style

In [None]:
bit = N1.query(' Lag == 10 ')[['GRName', 'Line', 'UltIncLoss', 'EarnedPrem']]  # .head(1000).copy()

In [None]:
bit.groupby('GRName').agg({ 'EarnedPrem': sum } ).sort_values('EarnedPrem', ascending=False).head(20) 

In [None]:
ilist = ['GRName', 'Line'] 
ans = pd.concat([ bit.assign( **{x: 'total' for x in ilist[i:]} ).groupby(ilist).sum()
          for i in range(len(ilist)+1)]).sort_index()
ans['LR'] = ans.UltIncLoss / ans.EarnedPrem
ans.head(20)

In [None]:
ans[['EarnedPrem']].unstack(level=1, fill_value=0). \
    sort_values(('EarnedPrem', 'total'), ascending=False).head(20)

In [None]:
ans[['EarnedPrem', 'LR']].unstack(level=1, fill_value=0). \
    sort_values(('EarnedPrem', 'total'), ascending=False)[['LR']].head(10)

In [None]:
bit = N1.query(' Lag == 1 ')[['GRName', 'Line', 'PaidLoss', 'CaseIncLoss', 'UltIncLoss', 'EarnedPrem']] 
bit.head()

In [None]:
plt.figure(figsize=(12,12))
plt.plot(np.log(bit.PaidLoss), np.log(bit.UltIncLoss), 'x', alpha=0.1)

In [None]:
plt.figure(figsize=(12,12))
plt.plot(np.log(np.log(N1.PaidLoss)), np.log(np.log(N1.UltIncLoss)), 'x', alpha=0.05)

In [None]:
pd.unique(N1.Line) # , pd.unique(N1.GRName)

In [None]:
%timeit N1[ (N1.GRName == 'Alaska Nat Ins Co') & (N1.Line=='Comm Auto')].head(5)

In [None]:
%timeit N1.query(' GRName == "Alaska Nat Ins Co" and Line=="Comm Auto" ').head(5)

In [None]:
%timeit bit = N1.query(' AY + Lag <= 1999 ')

In [None]:
N2 = N1.set_index(keys=['GRName', 'Line', 'AY', 'Lag'])

In [None]:
%timeit N2.loc[("Alaska Nat Ins Co", "Comm Auto"), :].head(5)

In [None]:
N2.loc[ 'FM Global', :].head(5)

In [None]:
N2.loc[(slice(None), 'Comm Auto'), :].head(5)

In [None]:
N2.loc[(slice(None), slice(None), 1990), :].head(5)

In [None]:
N2.xs(('Canal Ins Co Grp', 'Comm Auto'), level=('GRName', 'Line')).head(3)

In [None]:
N2.xs('Comm Auto', level='Line').head(3)

In [None]:
# big_cos = list( N1.query(' Lag == 10 ').groupby('GRName')[['EarnedPrem']].sum().sort_values('EarnedPrem').tail(20).index ) 
big_cos = list( N1.query(' Lag == 10 ').groupby('GRName')[['EarnedPrem']].sum().nlargest(20, 'EarnedPrem').index ) 
big_cos

In [None]:
bit.loc[bit.GRName.isin(big_cos), :]

In [None]:
bit = N1.query(' AY + Lag <= 1998 ')[['GRName', 'Line', 'PaidLoss', 'CaseIncLoss', 'UltIncLoss', 'EarnedPrem', 'AY', 'Lag']] 
# just the big cos
bit = bit.loc[bit.GRName.isin(big_cos), :]
bit.head(20)

In [None]:
sfm = 'State Farm Mut Grp' 

In [None]:
G = pd.pivot_table(bit, values=['CaseIncLoss', 'PaidLoss'], index=['GRName', 'Line', 'AY'], columns='Lag')
G.head(20)

In [None]:
N1.columns

In [None]:
G = pd.pivot_table(N1.query(" AY+Lag <= 1998 and GRName=='State Farm Mut Grp' "), values=['PaidLoss', 'CaseIncLoss'], index=['GRName', 'Line', 'AY'], columns='Lag')
G.head(20)

In [None]:
def add_link_ratios_from_raw_data(N1, opt_filter=''):
    '''
    Add link ratios to loss triangles
    e.g. opt_filter = " and GRName=='State Farm Mut Grp' "

    Parameters
    ----------
    N1 : DataFrame
        Long-format data with columns GRName, Line, AY, Lag, PaidLoss and
        CaseIncLoss.
    opt_filter : str, default ''
        Extra clause appended to the query " AY+Lag <= 1998 ".

    Returns
    -------
    DataFrame
        The pivoted CaseIncLoss and PaidLoss triangles (rows GRName/Line/AY,
        columns by Lag) with two extra column groups, CaseIncLink and
        PaidLink, holding loss at the next lag divided by loss at this lag,
        labelled by the starting lag.
    '''
    G = pd.pivot_table(
            N1.query(" AY+Lag <= 1998 " + opt_filter ), 
            values=['PaidLoss', 'CaseIncLoss'], 
            index=['GRName', 'Line', 'AY'], 
            columns='Lag'
        )

    def _link_frame(loss_name, link_name):
        # select the triangle by label, so the result does not depend on the
        # number of lags or on the order of the column groups in G
        loss = G[loss_name]
        ratios = loss.iloc[:, 1:].values / loss.iloc[:, :-1].values
        return pd.DataFrame(ratios,
                            index=G.index,
                            columns=pd.MultiIndex.from_tuples([(link_name, lag) for lag in loss.columns[:-1]]))

    return pd.concat((G,
                      _link_frame('CaseIncLoss', 'CaseIncLink'),
                      _link_frame('PaidLoss', 'PaidLink')
                     ), axis=1)

In [None]:
G2 = add_link_ratios_from_raw_data(N1)

In [None]:
G2.xs(sfm, level=0).filter(regex='Paid').head(10)

In [None]:
G2.loc[sfm, 'PaidLink'].head(10)

In [None]:
# just the complete triangles: for 10 accident years x 10 lags a complete company/line block has
# exactly 180 missing cells (45 per loss triangle + 45 per link-ratio triangle, for paid and case incurred).
# groupby.filter keeps whole groups, so no boolean mask has to be aligned with the 3-level row index.
comp = G2.groupby(['GRName', 'Line']).filter(lambda x : x.isna().sum().sum() == 180)

In [None]:
G2.shape, comp.shape

In [None]:
def mask(n, size, kind):
    """ 
    mask for avg last n in a size x size triangle 
    """
    nyrs = size - 1
    if kind=='loss_den':
        ans = np.array([[1 if i + j < nyrs and i + j >= nyrs - n else 0 for i in range(size)] for j in range(size)])
    elif kind=='loss_num':
        ans = np.array([[1 if i > 0 and i + j < size and i + j >= size - n else 0 for i in range(size)] for j in range(size)])
    else:
        ans = np.array([[1 if i + j < nyrs and i + j >= nyrs - n else 0 for i in range(nyrs)] for j in range(size)])
    return ans

def make_links(x, avg_tuple=(3, 5, 10)):
    '''
    Compute paid and incurred average link ratios, weight and straight, 3, 5 and all year averages (2x2x3=12 sets)

    x is one company/line block with column groups CaseIncLink, PaidLink,
    CaseIncLoss and PaidLoss. The result has one row per (kind, method)
    pair, e.g. ('Inc', 'str 3') or ('Pd', 'wtd 10'), and one column per Lag 1..9.
    '''
    size = 10

    def straight(link_col, n):
        # sum of the masked link ratios divided by the number of cells the mask selects
        selector = mask(n, size, 'link')
        return np.nansum(x.loc[:, link_col].values * selector, 0) / np.nansum(selector, 0)

    def weighted(loss_col, n):
        # masked column sums of losses: later-lag sums over earlier-lag sums
        losses = x.loc[:, loss_col].values
        top = np.nansum((losses * mask(n, size, 'loss_num')), 0)[1:]
        bottom = np.nansum((losses * mask(n, size, 'loss_den')), 0)[:-1]
        return top / bottom

    rows = {}
    for kind, link_col in (('Inc', 'CaseIncLink'), ('Pd', 'PaidLink')):
        for n in avg_tuple:
            rows[(kind, f'str {n}')] = straight(link_col, n)
    for kind, loss_col in (('Inc', 'CaseIncLoss'), ('Pd', 'PaidLoss')):
        for n in avg_tuple:
            rows[(kind, f'wtd {n}')] = weighted(loss_col, n)

    return pd.DataFrame(rows, index=pd.Index(range(1,10), name='Lag')).T

# def make_links2(x, avg_tuple=(3, 5, 10)):
#     '''
#     Compute paid and incurred average link ratios, weight and straight, 3, 5 and all year averages (2x2x3=12 sets)
#     Use masked arrays and dictionary list comprehensions 
#     SLOWER but probably correct for incomplete triangles.... 
#     '''
#     return pd.DataFrame({ \
#         **{ (j, f'str {i}') : ma.masked_array(x.loc[:, j], mask(i, 10, 'link')).mean(0) \
#            for i in avg_tuple for j in ['CaseIncLink', 'PaidLink']}, \
#         **{ (j, f'wtd {i}') : ma.masked_array(x.loc[:, k], mask(i, 10, 'loss_num')).sum(0)[1:] / \
#            ma.masked_array(x.loc[:, k], mask2(i, 10, 'loss_den')).sum(0)[:-1] \
#            for i in avg_tuple for j, k in [('Inc', 'CaseIncLoss'), ('Pd', 'PaidLoss')]}, \
#         }, \
#         index=pd.Index(range(1,10), name='Lag')).T

In [None]:
links = comp.groupby(level=['GRName', 'Line']).apply(make_links)
links.index.names = ['GRName', 'Line', 'Kind', 'Method']

In [None]:
links.head(24)

In [None]:
links.xs(sfm, level=0).head()

In [None]:
%timeit comp.groupby(level=['GRName', 'Line']).apply(make_links)

In [None]:
# pull out SFM triangles
bit = links.loc[[sfm]].head(24)
bit

In [None]:
# NOTE(review): mbit is first assigned in the following cell; on a fresh kernel this cell
# raises NameError unless that cell has been run first
mbit.columns

In [None]:
bit = links.iloc[0:288*2, :]
mbit = bit.stack('Lag').reset_index()  
mbit.rename(mapper={0: 'FTU'}, inplace=True, axis=1)
mbit.head()

In [None]:
def plotit(b):
    '''Draw one seaborn facet grid of link ratios (FTU against Lag) for the rows in b.'''
    sns.relplot(data=b, kind='line', x='Lag', y='FTU', hue='Kind', style='Method', 
        style_order=['wtd 10', 'wtd 5', 'wtd 3', 'str 10', 'str 5', 'str 3'], 
        col='Line', row='GRName', 
        height=5, aspect=1.3)

# iterate over the groups explicitly: every company is plotted exactly once, regardless of
# how many times groupby.apply would call the function on the first group
for _, company_links in mbit.groupby('GRName'):
    plotit(company_links)

#  Bootstrapping 

In [None]:
comp.loc[[sfm]].filter(regex="Paid").head(10)

In [None]:
comp.head()

In [None]:
a, b, c = comp.index[0]

In [None]:
list(comp.index.get_level_values('AY').unique())

In [None]:
1997-1988

In [None]:
sfm

In [None]:
bit = comp.loc[[(sfm, 'Comm Auto')]]
bit

In [None]:
import re

def shorten(s):
    '''
    Abbreviate a company name for use in plot titles.

    Names shorter than 12 characters are returned unchanged. Otherwise common
    corporate words (Co, Ins, Grp, Exchange, Of, Inc, of) are dropped and a
    few long words are abbreviated; if the result is still longer than 12
    characters it is cut to the first four letters of its first three words.
    '''
    if len(s) < 12:
        return s
    # \b restricts the match to whole words, so "Insurance" or "Company" are left intact
    s = re.sub(r' (Co|Ins|Grp|Exchange|Of|Inc|of)\b', '', s)
    s = s.replace('Agricultural', 'Ag').replace('Exchange', 'Ex').replace('Associated', 'Assoc')
    if len(s) > 12:
        s = ' '.join([i[:4] for i in s.split(' ')][:3])
    return s

In [None]:
def pd_inc_plot(df, co_name='', line_name='', bins=201, dd=True, ax=None, legend=False):
    '''
    bootstrap from paid and incurred and create product distribution 

    df is a loss/link-ratio frame whose row index has levels GRName, Line and
    AY and whose columns are (group, lag) pairs; the groups read here are
    CaseIncLoss, PaidLoss, CaseIncLink and PaidLink.
    NOTE(review): this looks like `comp` / the output of
    add_link_ratios_from_raw_data rather than the `links` frame - confirm.

    co_name, line_name : company and line to plot; when co_name is '' both are
        taken from the first row of df (allows use with groupby)
    bins : number of break points passed to np.linspace for the histograms
    dd : when True, display the summary statistics table
    ax : None to draw on a new figure, otherwise an iterator of axes; the next
        axis is consumed
    legend : when True, add a legend to the axis

    Returns a DataFrame with columns 'inc' and 'pd' holding the bootstrapped
    values, or None (nothing is drawn) when the selected company/line has
    fewer than 10 rows.
    '''
    
    # allows use with groupby
    if co_name == '':
        co_name, line_name, _ = df.index[0]
   
    # accident years present in df; nyrs is the span between the first and last
    yrs = list(df.index.get_level_values('AY').unique())
    nyrs = yrs[-1] - yrs[0]
    
    # piece of interest
    bit = df.xs((co_name, line_name), level=('GRName', 'Line'))
    
    # fewer than 10 accident-year rows: give up without plotting (returns None)
    if len(bit) < 10:
        return
    
    # make kronecker products 
    # pull off most recent year losses 
    kpi = np.array(bit.loc[yrs[-1], ('CaseIncLoss', 1)])
    kpp = np.array(bit.loc[yrs[-1], ('PaidLoss', 1)])
    
    # and complete with link ratios 
    # step i multiplies every value so far by each link ratio at lag nyrs - i taken from
    # accident years yrs[0] through yrs[0]+i, so the arrays grow by that factor each pass
    for i in range(0, nyrs):
        kpp = np.kron(kpp, bit.loc[yrs[0]:yrs[0]+i, ('PaidLink', nyrs - i)])
        kpi = np.kron(kpi, bit.loc[yrs[0]:yrs[0]+i, ('CaseIncLink', nyrs - i)])

    ult = pd.DataFrame( {'inc' : kpi, 'pd' : kpp})
    # stats 
    # describe() without its first row (the count)
    d = ult.describe().iloc[1:, :]
    if dd:
        display(d)
    
    # draw on a fresh figure, or on the next axis from the supplied iterator
    if ax is None:
        f = plt.figure()
        a = f.gca()
    else:
        a = next(ax)
    
    # NOTE: this min/max based grid is overwritten a few lines below and has no effect
    bp = np.linspace(d.loc['min', :].min(), d.loc['max', :].max(), bins)
    mnn = d.loc['mean', :].min()
    mnx = d.loc['mean', :].max()
    sd = d.loc['std', : ].max()
    # break points actually used: smaller mean - 4 sd (floored at 0) to larger mean + 4 sd
    bp = np.linspace(max(0, mnn - 4*sd), mnx + 4*sd, bins)
    npd,  _, _ = a.hist(kpp, bins=bp, color='b', alpha=0.5, label='Paid')
    ninc, _, _ = a.hist(kpi, bins=bp, color='r', alpha=0.5, label='Incurred')
    # product of the two histograms, rescaled so it sums to the paid histogram total
    bay = ninc*npd / sum(ninc*npd) * sum(npd)
    # bin mid-points
    xs = (bp[1:]+bp[0:-1])/2
    a.plot(xs, bay, '-g', label='Posterior')
    if legend:
        a.legend(frameon=False)
    # title: MLE is the mid-point of the highest product bin (divided by 1e3); the CVs are
    # std/mean of the 'inc' and 'pd' columns in that order
    a.set(title='{:}/{:}\nMLE={:,.1f}, CV(I/Pd)={:.3f}/{:.3f}'.format(shorten(co_name), line_name, xs[bay.argmax()]/1e3, 
                                                                *(d.loc['std']/d.loc['mean']) ))
    return ult

In [None]:
lines = ['Comm Auto', 'PP Auto', 'Other Liab', 'Work Comp', 'Products Liab', 'Med Mal']

In [None]:
f, ax = plt.subplots(2, 3, figsize=(12,8))
ax = iter(ax.flatten())
for l in lines:
    ult = pd_inc_plot(comp, sfm, l, dd=False, ax=ax, legend=(l==lines[0]))
# tidy up 
for a in ax:
    f.delaxes(a)
plt.tight_layout()

In [None]:
def plot_all(df, line='', co='', threshold=250000):
    '''
    all lines for given co or all cos for given line 

    When line is given, one panel per company is laid out six to a row; when
    only co is given a 2 x 3 grid is used. A company/line block is plotted
    only if its total CaseIncLoss exceeds threshold. Unused axes are removed.
    '''
    if line == '' and co == '':
        return

    if line != '':
        bit = df.query(f' Line=="{line}" ')
        # each company contributes 10 rows; six panels per row, rounded up
        ncos = len(bit) / 10
        nr = int(np.ceil(ncos / 6))
        fig, axes = plt.subplots(nr, 6, figsize=(18, 2.4*nr))
    else:
        bit = df.query(f' GRName=="{co}" ')
        fig, axes = plt.subplots(2, 3, figsize=(12,6))
    ax = iter(axes.flatten())

    # only the first panel actually drawn gets a legend
    show_legend = True
    for members in bit.groupby(['GRName', 'Line']).groups.values():
        grp = bit.loc[members]
        if grp.CaseIncLoss.sum().sum() > threshold:
            ult = pd_inc_plot(grp, dd=False, ax=ax, legend=show_legend)
            show_legend = False

    # tidy up 
    for leftover in ax:
        fig.delaxes(leftover)
    plt.tight_layout()

In [None]:
[i for i in comp.index.get_level_values('GRName').unique() if i[:5] == 'Canal']

In [None]:
# threshold must be passed by keyword: the third positional parameter of plot_all is `co`
plot_all(comp, 'Comm Auto', threshold=100000)

In [None]:
# threshold must be passed by keyword: the third positional parameter of plot_all is `co`
plot_all(comp, 'PP Auto', threshold=100000)

In [None]:
# threshold must be passed by keyword: the third positional parameter of plot_all is `co`
plot_all(comp, 'Work Comp', threshold=1000000)

# Data For SciKit-Learn Intro

In [None]:
# Read in the CAS data: one CSV per line of business, stacked into a single frame
data_url = 'https://www.casact.org/research/reserve_data'
lobs = ['medmal','ppauto','wkcomp']
columns = ['GRCODE','GRNAME','AccidentYear','DevelopmentYear','DevelopmentLag'
           ,'IncurLoss', 'CumPaidLoss','BulkLoss','EarnedPremDIR'
           ,'EarnedPremCeded','EarnedPremNet', 'Single','PostedReserve97']
frames = []  # one DataFrame per line of business
for lob in lobs:
    file_url = f'{data_url}/{lob}_pos.csv'
    subset = pd.read_csv(file_url, names=columns, skiprows=1)
    subset['LOB'] = lob
    frames.append(subset)
data1 = pd.concat(frames)
# keep only the development years up to 1997
data = data1.query(" DevelopmentYear <= 1997 ").reset_index(drop=True)

In [None]:
# alternative: grow the frame inside the loop
# (DataFrame.append was removed in pandas 2.0; pd.concat of the running frame and the new
# subset is the equivalent call and keeps the sorted column order)
data_url = 'https://www.casact.org/research/reserve_data'
# Read in the data
lobs = ['medmal','ppauto','wkcomp']
data = pd.DataFrame()
columns = ['GRCODE','GRNAME','AccidentYear','DevelopmentYear','DevelopmentLag'
           ,'IncurLoss', 'CumPaidLoss','BulkLoss','EarnedPremDIR'
           ,'EarnedPremCeded','EarnedPremNet', 'Single','PostedReserve97']
for lob in lobs:
    file_url = f'{data_url}/{lob}_pos.csv'
    subset = pd.read_csv(file_url, names=columns, skiprows=1)
    subset['LOB'] = lob
    data = pd.concat((data, subset), sort=True)
data = data[data['DevelopmentYear']<=1997].reset_index()

In [None]:
data.head()

In [None]:
# original
def make_trg(data):
    '''
    Volume-weighted loss development factors by company and line of business.

    data is long-format with columns LOB, GRNAME, AccidentYear,
    DevelopmentYear, DevelopmentLag, IncurLoss and CumPaidLoss. Returns a
    DataFrame indexed by (GRNAME, LOB) with one column per pair of adjacent
    development lags, labelled like '1-2'; undefined factors default to 1.0.
    '''
    # Find the 19 largest companies by incurred loss (rows with DevelopmentYear 1997)
    # for each LOB; all remaining companies are pooled under the name 'Other' below
    aggregates = (data[data['DevelopmentYear']==1997].groupby(['LOB','GRNAME']) \
                                                 .sum()['IncurLoss']) \
                                                 .reset_index()
    top_20_by_lob = aggregates.iloc[aggregates.groupby('LOB')['IncurLoss'] \
                              .nlargest(19).index.levels[1]]
    data2 = data.merge(top_20_by_lob, how='left', on=['LOB','GRNAME'])
    # the last column is the merged IncurLoss; it is missing for companies outside the top group
    data2.loc[data2.iloc[:,-1].isna(),'GRNAME'] = 'Other'
    
    # Create Triangles
    triangles = pd.pivot_table(data2, index=['GRNAME','LOB','AccidentYear'],
                               columns='DevelopmentLag', values='CumPaidLoss',
                               aggfunc='sum')
    
    # Determine LDF Weights: a cell counts in the denominator only if the next lag is observed
    weight = np.array(~triangles.iloc[:,1:].isna())
    columns = [f'{triangles.columns[num]}-{triangles.columns[num+1]}'
               for num, item in enumerate(triangles.columns[:-1])]

    # Volume-weighted numerator and denominator
    # (GroupBy.sum takes no axis argument; it always sums within each group)
    numerator = (
        (triangles.iloc[:,1:]).reset_index() 
                                     .drop('AccidentYear',axis=1)
                                     .groupby(['GRNAME','LOB'])
                                     .sum())
    denominator = (
        (weight*triangles.iloc[:,:-1]).reset_index()
                                      .drop('AccidentYear',axis=1)
                                      .groupby(['GRNAME','LOB'])
                                      .sum())
    numerator.columns = denominator.columns = columns

    # Development Patterns
    ldf = (numerator/denominator).fillna(1.0)
    
    return ldf

In [None]:
%timeit ldf_orig = make_trg(data)

In [None]:
# alternatives, including original
def make_trg_2(data):
    '''
    Alternative implementation of make_trg: volume-weighted loss development
    factors by company and line of business.

    data is long-format with columns LOB, GRNAME, AccidentYear,
    DevelopmentYear, DevelopmentLag, IncurLoss and CumPaidLoss. Returns a
    DataFrame indexed by (GRNAME, LOB) whose columns are the starting
    development lags; undefined factors default to 1.0.

    Assumes every (GRNAME, LOB) block of the pivoted triangle has one row per
    accident year, in the same order, with the latest diagonal running from
    the first lag of the last accident year to the last lag of the first.
    '''
    # largest 19 companies by incurred loss (DevelopmentYear 1997) within each LOB;
    # everything else is pooled as 'Other'
    aggregates2 = data.query(' DevelopmentYear ==  1997 ').groupby(['LOB','GRNAME'])['IncurLoss'].sum() 
    top_20_by_lob = aggregates2.groupby(level='LOB').apply(lambda x : x.nlargest(19).reset_index(level=0, drop=True))
    
    data_alt2 = data.merge(top_20_by_lob.to_frame(), how='left', left_on=['LOB','GRNAME'], right_index=True)
    data_alt2.loc[data_alt2.loc[:,'IncurLoss_y'].isna(), 'GRNAME'] = 'Other'
    
    # create triangles; losses of the pooled 'Other' companies are summed, as in make_trg
    triangles = pd.pivot_table(data_alt2, index=['GRNAME','LOB','AccidentYear'],
                           columns='DevelopmentLag', values='CumPaidLoss',
                           aggfunc='sum')
    
    # Determine LDF Weights: 1 where the next lag is on or above the latest diagonal.
    # The pattern is sized from the triangle and repeated once per (GRNAME, LOB) block.
    n_lags = triangles.shape[1]
    n_accident_years = triangles.index.get_level_values('AccidentYear').nunique()
    w = np.array([[1 if i + j < n_lags - 1 else 0 for i in range(n_lags - 1)]
                  for j in range(n_accident_years)])
    weight = np.tile(w, (triangles.shape[0] // n_accident_years, 1))

    # Volume-weighted numerator and denominator; mask for denom only; values on num because want index from num 
    ldf = (triangles.iloc[:,1:].groupby(level=['GRNAME','LOB']).sum().values / \
           (weight*triangles.iloc[:,:-1]).groupby(level=['GRNAME','LOB']).sum()).fillna(1.0) 
    return ldf

In [None]:
%timeit ldf_alt = make_trg_2(data)

In [None]:
ldf_orig = make_trg(data)
ldf_alt = make_trg_2(data)
np.allclose(ldf_orig, ldf_alt)

In [None]:
ldf_alt.head(10)

# Performance

In [None]:
n = 100000000  # 100 million rows 
df = pd.DataFrame({
    'a': np.random.randn(n),
    'b': np.random.randn(n),
    'c': np.random.randn(n),
})
a =  np.random.randn(n)

In [None]:
%timeit r = np.sin(a - 1) + 1

In [None]:
%timeit r = np.sin(df['a'] - 1) + 1

In [None]:
%timeit r = np.sin(df['a'].values - 1) + 1

In [None]:
import numexpr

In [None]:
expr = 'sin(a - 1) + 1'

In [None]:
%timeit r = numexpr.evaluate(expr)

In [None]:
def dowork(a):
    '''Evaluate sin(a - 1) + 1 with numexpr; the expression refers to the parameter `a` by name.'''
    return numexpr.evaluate('sin(a - 1) + 1')

In [None]:
%timeit r = dowork(df['a'])

# Pandas Intro

## Functions We Will Discuss

* DataFrame
* head, tail, describe, info
* unique
* from csv, dictionary 
* loc, slices
* set_index, reset_index
* MultiIndex 
* loc, slices and xs
* query 
* pivot, stack and unstack
* melt
* **concat**, append, keys 
* pivot_table (crosstab)
* **merge** (indicator) and join
* groupby (.groups, .get_group, as_index)
* sum, mean, std etc. 
* aggregate
* transform (same size as input whiten)
* apply
* assign 
* plot

## Functions not covered but check out on your own
* map (series), applymap (dataframes) 
* evaluate 
* str
* dt
* style


# Seaborn Plotting 


In [None]:
x = np.random.randn(5,5); x

In [None]:
df0 = pd.DataFrame(x)
df0

In [None]:
df0.columns = list('abcde')

In [None]:
df0['class'] = list('αβββζ')
df0

In [None]:
df0.index.name = 'id'
df0.columns.name = 'var'
df0

In [None]:
# the builtin float is used as dtype: the np.float alias no longer exists in current NumPy
df = pd.DataFrame({'class': list('vwxxy'), 'subclass': list('aaabb'), 'a': np.random.randn(5), 'c': np.arange(5, dtype=float)}, index=pd.Index(range(5), name='idx'))

In [None]:
df

In [None]:
pd.concat( (df0, df), sort=True) 

In [None]:
df0.select_dtypes(np.number) / df.select_dtypes(np.number)

In [None]:
df.dtypes

In [None]:
# NOTE(review): the df built earlier in this section has columns class, subclass, a and c - no 'b'
# (and no 'x', which the set_index('x') cell that follows needs). These cells appear to rely on an
# earlier definition of df with x, a and b columns - confirm and restore it.
df['sinb'] = np.sin(df.b)

In [None]:
df.head()

In [None]:
df1 = df.set_index('x')
df1.columns.name = 'variable'
df1.head()

In [None]:
df1.corr()

In [None]:
df1.dtypes

In [None]:
df1.select_dtypes(object)

In [None]:
# stack df1 on top of itself (DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent)
pd.concat((df1, df1))

In [None]:
# the builtin float is used as dtype: the np.float alias no longer exists in current NumPy
df2 = pd.DataFrame({'y': list('lmnop'), 'a': np.random.randn(5), 'b': np.arange(5, dtype=float)}, index=pd.Index(list('abcjk'), name='x'))
df2

In [None]:
pd.concat((df1,df2), sort=True)

In [None]:
df3 = pd.concat((df1,df2), sort=True, keys=['df1', 'df2'], names=['src'])
df3

In [None]:
df3.b

In [None]:
df3['b']

In [None]:
df3.b.unique()

In [None]:
df3.index

In [None]:
df3.index.get_level_values(1).unique()

In [None]:
df3.loc['df1']

In [None]:
df3.loc[:, 'a']

In [None]:
df3.loc[:, 'a':'b']

In [None]:
df3[['a']]

In [None]:
df3.unstack()

In [None]:
df3.unstack(0)

In [None]:
df3

In [None]:
df3[df3.a < 0]

In [None]:
df3.loc[df3.a < 0]

In [None]:
df3.loc[df3.a < 0, :]

In [None]:
df3.query(' a < 0 ')

In [None]:
df3.loc['df1']

In [None]:
df3.loc['b']

In [None]:
df3

In [None]:
df3.loc[(slice(None), 'b'), :]

In [None]:
df3.loc[(slice(None), slice('b','d')), :]

In [None]:
df3.loc[(slice(None), 'b')], df3.loc[:, 'b']

In [None]:
df3.xs('b', level=1)

In [None]:
df3.xs('b', axis=1)

In [None]:
df3

In [None]:
df4 = df3.reset_index()
df4

In [None]:
df4.pivot(index='src', columns='b', values='a')

In [None]:
df4.pivot(index='src', columns='b', values=['a', 'sinb'])

In [None]:
df4.pivot_table(index=['src', 'x'], columns='b', values=['a', 'sinb'])

In [None]:
g3 = df3.groupby(level='x') 

In [None]:
g3.groups 

In [None]:
g3.get_group('a')

In [None]:
g4 = df4.groupby('x')
g4.groups

In [None]:
g4.get_group('a')

In [None]:
g3.sum()

In [None]:
g3.aggregate(sum)

In [None]:
g3.agg(sum)

In [None]:
g3.agg([sum, np.std, np.min, np.max, np.size])

In [None]:
g3.agg({'a' : [sum, np.std, np.min, np.max, np.size], 'b': [sum, np.std] })

In [None]:
g3.apply(lambda x : display(x))

In [None]:
g3.apply(lambda x : print(x.a * x.b))

In [None]:
y = g3.get_group('c')
y

In [None]:
pd.Series( (y.a * y.b).values, name='ab', index=[1,2])

In [None]:
g3.apply( lambda y : pd.Series((y.a * y.b).values))

In [None]:
g3.apply( lambda y : pd.DataFrame((y.a * y.b).values, index=pd.Index(range(10, 10+len(y)), name='idx'), columns=['ab']))  

In [None]:
np.vstack((np.array([1,2,3]),np.array([1,2,3])))

In [None]:
g3.get_group('a')

In [None]:
g3.apply( lambda y : pd.DataFrame(np.hstack([y.a, y.b, (y.a * y.b).values]) ).T)

In [None]:
df3

In [None]:
df = pd.DataFrame({ 'x': range(10), 'a':list('abcdefghij')})
# element-wise replacement of 'g' by 't': a Python conditional expression cannot branch on a
# whole Series, so keep the values where they differ from 'g' and substitute 't' elsewhere
df.assign(a = lambda x : x.a.where(x.a != 'g', 't'))

In [None]:
import scipy.stats as ss

In [None]:
s1, s2 = 0.2, .4
# lognormals with mean 1: scale = exp(-sigma^2 / 2), using each distribution's own sigma
fz1 = ss.lognorm(s1, scale=np.exp(-s1**2/2))
fz2 = ss.lognorm(s2, scale=np.exp(-s2**2/2))
xs = np.linspace(0,10, 101)

In [None]:
xs = np.linspace(0,10, 101)
x1, x2 = np.meshgrid(xs, xs)
z = fz1.cdf(x1) * fz2.cdf(x2)

plt.imshow(z, origin='lower')
plt.colorbar()

In [None]:
1 if True else 0

In [None]:
def pit(s1, s2, x_list, biv_den=True):
    '''
    s1, s2 = sigmas of lognormals of mean 1 
    x = line to plot

    For each x in x_list the line is parameterised by t in [0, 1] as
    (t * x, (1 - t) * x). Panel 1 plots the two cdfs against each other along
    the line; panel 2 plots both pdfs and their ratio against t. When biv_den
    is True two more panels show the log of the product of the two pdfs, on
    the original scale and on the probability scale.
    The figure title reports the last x in x_list, so x_list must not be empty.
    '''
    
    # mean-1 lognormals: scale = exp(-sigma^2 / 2), using each distribution's own sigma
    fz1 = ss.lognorm(s1, scale=np.exp(-s1**2/2))
    fz2 = ss.lognorm(s2, scale=np.exp(-s2**2/2))
    xs = np.linspace(0,10, 101)
    ts = np.linspace(0,1,101)
    
    n_plots = 4 if biv_den else 2
    plot_w = 10 if biv_den else 2
    plt.figure(figsize=(plot_w, 2.4))
    plt.subplot(1,n_plots,1)
    
    # cdf of one variable against cdf of the other along the line
    for x in x_list:
        y1 = fz1.cdf(ts * x)
        y2 = fz2.cdf((1 - ts) * x)
        plt.plot(y1, y2)

    # densities along the line and their ratio
    plt.subplot(1,n_plots,2)
    for x in x_list: 
        y1 = fz1.pdf(ts * x)
        y2 = fz2.pdf((1 - ts) * x)
        plt.plot(ts, y1, label='1')
        plt.plot(ts, y2, label='2')
        plt.plot(ts, y2/y1, label='2/1')
    plt.legend(frameon=False)
    plt.ylim(0, 10)

    if biv_den:
        plt.subplot(1,n_plots,3)
        # bivariate density 
        x1, x2 = np.meshgrid(xs, xs)
        z = fz1.pdf(x1) * fz2.pdf(x2)
        plt.imshow(np.log(z), origin='lower', extent=[0,10,0,10])
        plt.colorbar();

        plt.subplot(1,n_plots,4)
        # bivariate density evaluated at the quantiles of a uniform grid on [0, 1] x [0, 1]
        x1, x2 = np.meshgrid(ts, ts)
        z = fz1.pdf(fz1.isf(1-x1)) * fz2.pdf(fz2.isf(1-x2))
        plt.imshow(np.log(z), origin='lower', extent=[0,1,0,1])
        plt.colorbar();

    # raw f-string so the LaTeX backslashes are not treated as escape sequences
    plt.suptitle(rf'$\sigma_1={s1}, \sigma_2={s2}, x={x}$')
    plt.tight_layout()

In [None]:
for s1, s2 in zip((.3, .3, 5), (.3, 2, 5)):
    pit(s1, s2, [4.], True)

In [None]:
for s1, s2 in zip((.3, .3, 1, 1, 3, 3), (.3, .5, 1, 2, 3, 5)):
    pit(s1, s2, [1.], True)