In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML
%matplotlib inline
import textwrap 
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")


def wdid(ob):
    ''' what does object do?

    Prints every name returned by ``dir(ob)`` that does not start with an
    underscore, as one space-separated listing wrapped at 80 columns.
    '''
    public_names = [name for name in dir(ob) if not name.startswith('_')]
    print(textwrap.fill(' '.join(public_names), 80))
    # optional pause for something more advanced... 
#     for m in [ i for i in dir(np) if i[0] >= 'a' and i[0]<='z']:
#         print(f'\n\n{m}\n{"="*len(m)}\n')
#         print(np.__getattribute__(m).__doc__)

## Real world example

Building a multi-dimensional IBNR model in numpy

### Create Initial multi-dimensional array

In [None]:
import pandas as pd
cas = pd.read_csv(r'http://www.casact.org/research/reserve_data/wkcomp_pos.csv')
# Let's make the pandas dataframe look more triangle like
triangle_frame = pd.pivot_table(cas[cas['DevelopmentYear']<=1997], 
                                values='CumPaidLoss_D', 
                                index=['GRNAME','AccidentYear'], 
                                columns='DevelopmentLag')
# Let's use the reshape method to create a 3-D Matrix of triangles
triangle_array = np.array(triangle_frame).reshape(len(cas['GRNAME'].unique()),
                                                  len(cas['AccidentYear'].unique()),
                                                  len(cas['DevelopmentLag'].unique()))

In [None]:
cas.head()

In [None]:
triangle_frame.head(30)

In [None]:
triangle_array[-1,:,:]

In [None]:
# triangle_array is a set of 10x10 triangles, one for each of more than 100 companies.
triangle_array.shape

### Clean up missing and zero values

In [None]:
# Let's get rid of completely empty triangles
triangle_sum = np.nansum(np.nansum(triangle_array, axis=1),axis=1)
triangle_sum==0

In [None]:
triangle_array = triangle_array[triangle_sum!=0,:,:]
# let's turn 0's to nan - this will alleviate issues around dividing by zero
triangle_array[triangle_array==0]=np.nan

In [None]:
triangle_frame = triangle_frame.groupby(level=0).filter(lambda x : np.nansum(x)  > 0)
triangle_frame.iloc[10:20, :]

In [None]:
triangle_frame.iloc[-10:, :]

### Age-to-age factors

In [None]:
?display

In [None]:
# Let's use slicing to create age-to-age factors
%timeit ata_array = triangle_array[:,:-1,1:]/triangle_array[:,:-1,:-1]

In [None]:
display(pd.DataFrame(ata_array[1, :, :]))
ata_array

In [None]:
triangle_frame.iloc[0:20, :]

In [None]:
triangle_frame.iloc[0:20, 1:] / triangle_frame.iloc[0:20, :-1]

In [None]:
triangle_frame.iloc[0:20, 1:].values / triangle_frame.iloc[0:20, :-1].values

In [None]:
%timeit pd.DataFrame(triangle_frame.iloc[:, 1:].values / triangle_frame.iloc[:, :-1].values, index=triangle_frame.index, columns=range(1,10))

In [None]:
ata_df = pd.DataFrame(triangle_frame.iloc[:, 1:].values / triangle_frame.iloc[:, :-1].values, index=triangle_frame.index, columns=range(1,10))
ata_df.head(20)

In [None]:
# Let's default the completely blank age-to-age columns to 1.0
accident_periods = len(cas['DevelopmentLag'].unique())
ata_array_defaults = np.expand_dims(np.all(np.isnan(ata_array),axis=1),axis=1)
# pd.DataFrame(ata_array_defaults[:, 0 ,:])
ata_array_defaults, ata_array_defaults.shape

In [None]:
np.set_printoptions(linewidth =  160)

In [None]:
ata_array[0:3, :, :]

In [None]:
ata_array[np.repeat(ata_array_defaults,accident_periods-1,axis=1)]=1.0

In [None]:
ata_array[0:3, :, :]

### LDFs and CDFs

In [None]:
# Let's create an array of LDFs, by taking simple averages of the age-to-age factors.
ldf_array = np.nanmean(ata_array, axis=1)
ldf_array[np.isnan(ldf_array)]=1.0
# Let's create an array of CDFs with a tail factor from our LDFs
cdf_array = ldf_array[:,::-1].cumprod(axis=1)[:,::-1]
tail_factor = 1.0
cdf_array = np.append(cdf_array,np.expand_dims(np.repeat(tail_factor,cdf_array.shape[0]),1),axis=1)

In [None]:
pd.DataFrame(ldf_array).head(20)

In [None]:
ldf_df = ata_df.groupby(level=0).mean().fillna(1.)

In [None]:
ldf_df.head(20)

In [None]:
cdf_df = ldf_df.sort_index(axis=1, ascending=False).cumprod(axis=1).sort_index(axis=1)
cdf_df[10] = 1.
cdf_df.head()

In [None]:
pd.DataFrame(cdf_array).head(5)

In [None]:
cdf_df.shape, cdf_array.shape

In [None]:
cdf_df.head(20)

In [None]:
cdf_array[:20, :]

### Ultimates and IBNR

In [None]:
triangle_frame.head(30)

In [None]:
cdf_df.head(10)

In [None]:
diag_df = triangle_frame.groupby(level=0).apply(lambda x : pd.Series(np.diagonal(x.values[:, ::-1])[::-1], index=range(1,11)))
diag_df.head(10)

In [None]:
# ultimate = latest diagonal x CDF; cells that come out NaN are reported as 0
ult_df = (diag_df * cdf_df).fillna(0)
ibnr_df = ult_df - diag_df
# show the frame that was just built (there is no variable called `ult`)
ult_df.head(10)

In [None]:
ibnr_df.head()

In [None]:
latest_diagonal = np.nan_to_num(np.diagonal(triangle_array[:,::-1,],axis1=1,axis2=2)[:,::-1])
ultimate = latest_diagonal * cdf_array[:,::-1]
ibnr = ultimate - latest_diagonal
ibnr[:10,::-1]

In [None]:
ibnr.shape, ibnr_df.shape

### The business questions answered by our model

In [None]:
# company names lined up with the rows kept in the numpy arrays
# (triangles whose total is zero were dropped with the same boolean filter)
companies = np.array(triangle_frame.index.levels[0])[triangle_sum!=0]
print('How much IBNR does the entire industry need according to this model?')
print(np.sum(ibnr).round(0))
print()
print('What is the average ultimate to paid ratio across the industry?')
print((np.sum(ultimate)/np.sum(latest_diagonal)).round(3))
print()
print('Which company has the highest 12-Ultimate CDF?')
print(companies[np.argmax(cdf_array[:,0])])
print()
print('Which company has the lowest 12-24 LDF?')
# column 0 of ldf_array is the first development step (lag 1 -> lag 2), i.e. 12-24;
# column 1 would be the 24-36 factor
print(companies[np.argmin(ldf_array[:,0])])
print()
print('What is the 95% confidence interval on the estimate of 12-Ultimate CDF?')
# empirical 2.5% / 97.5% points of the 12-Ultimate CDF (column 0, as used above)
# across companies; BOTH bounds have to be read from the sorted values
cdf_12_ult_sorted = np.sort(cdf_array[:,0])
n_companies = len(cdf_12_ult_sorted)
print((cdf_12_ult_sorted[int(.025*n_companies)], cdf_12_ult_sorted[int(.975*n_companies)]))

Performance test of the above code 

In [None]:
# start from cas
triangle_frame = pd.pivot_table(cas[cas['DevelopmentYear']<=1997], 
                                values='CumPaidLoss_D', 
                                index=['GRNAME','AccidentYear'], 
                                columns='DevelopmentLag')

In [None]:
# simplify index to maximum possible extent... 
triangle_frame.index = [j for j in range(132) for i in range(10)]

In [456]:
%%timeit
#%%prun -s "time" -l 20
# %%timeit
triangle_array = np.array(triangle_frame).reshape(
    len(cas['GRNAME'].unique()),len(cas['AccidentYear'].unique()),len(cas['DevelopmentLag'].unique()))
triangle_sum = np.nansum(np.nansum(triangle_array, axis=1),axis=1)
triangle_array = triangle_array[triangle_sum!=0,:,:]
triangle_array[triangle_array==0]=np.nan

ata_array = triangle_array[:,:-1,1:]/triangle_array[:,:-1,:-1]
accident_periods = len(cas['DevelopmentLag'].unique())
ata_array_defaults = np.expand_dims(np.all(np.isnan(ata_array),axis=1),axis=1)
ata_array[np.repeat(ata_array_defaults,accident_periods-1,axis=1)]=1.0

ldf_array = np.nanmean(ata_array, axis=1)
ldf_array[np.isnan(ldf_array)]=1.0

cdf_array = ldf_array[:,::-1].cumprod(axis=1)[:,::-1]
tail_factor = 1.0
cdf_array = np.append(cdf_array,np.expand_dims(np.repeat(tail_factor,cdf_array.shape[0]),1),axis=1)[:,::-1]

latest_diagonal = np.nan_to_num(np.diagonal(triangle_array[:,::-1,],axis1=1,axis2=2)[:, ::-1])
ultimate = latest_diagonal * cdf_array
ibnr = ultimate - latest_diagonal

1.56 ms ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Using Pandas

In [428]:
triangle_frame1 = triangle_frame.groupby(level=0).filter(lambda x : np.nansum(x)  > 0)
triangle_frame1.head()

Unnamed: 0_level_0,DevelopmentLag,1,2,3,4,5,6,7,8,9,10
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Agway Ins Co,1988,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agway Ins Co,1989,0.0,0.0,0.0,0.0,0.0,0.0,23.0,23.0,31.0,
Agway Ins Co,1990,0.0,2.0,2.0,2.0,2.0,16.0,16.0,23.0,,
Agway Ins Co,1991,8.0,17.0,25.0,31.0,26.0,29.0,38.0,,,
Agway Ins Co,1992,0.0,0.0,0.0,0.0,0.0,0.0,,,,


In [431]:
ata_df = triangle_frame1.iloc[:, 1:].values / triangle_frame1.iloc[:, :-1] 
ata_df.iloc[10:20, :]

Unnamed: 0_level_0,DevelopmentLag,1,2,3,4,5,6,7,8,9
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alaska Nat Ins Co,1988,1.803922,1.221941,1.086264,1.037041,1.026627,1.014131,1.00732,1.006216,1.00174
Alaska Nat Ins Co,1989,1.921255,1.18967,1.073097,1.037085,1.014703,1.014126,1.006749,1.006133,
Alaska Nat Ins Co,1990,2.218639,1.276572,1.097087,1.034778,1.013287,1.011827,1.008957,,
Alaska Nat Ins Co,1991,2.254197,1.222632,1.09924,1.042958,1.028682,1.015861,,,
Alaska Nat Ins Co,1992,2.059459,1.174339,1.049215,1.029666,1.014614,,,,
Alaska Nat Ins Co,1993,1.989665,1.180176,1.075695,1.054239,,,,,
Alaska Nat Ins Co,1994,2.048829,1.18976,1.084901,,,,,,
Alaska Nat Ins Co,1995,2.076105,1.212061,,,,,,,
Alaska Nat Ins Co,1996,1.950295,,,,,,,,
Alaska Nat Ins Co,1997,,,,,,,,,


In [430]:
ata_df = triangle_frame1.iloc[:, 1:].values / triangle_frame1.iloc[:, :-1] 
ata_df.iloc[10:20, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,1,2,3,4,5,6,7,8,9
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alaska Nat Ins Co,1988,1.803922,1.221941,1.086264,1.037041,1.026627,1.014131,1.00732,1.006216,1.00174
Alaska Nat Ins Co,1989,1.921255,1.18967,1.073097,1.037085,1.014703,1.014126,1.006749,1.006133,
Alaska Nat Ins Co,1990,2.218639,1.276572,1.097087,1.034778,1.013287,1.011827,1.008957,,
Alaska Nat Ins Co,1991,2.254197,1.222632,1.09924,1.042958,1.028682,1.015861,,,
Alaska Nat Ins Co,1992,2.059459,1.174339,1.049215,1.029666,1.014614,,,,
Alaska Nat Ins Co,1993,1.989665,1.180176,1.075695,1.054239,,,,,
Alaska Nat Ins Co,1994,2.048829,1.18976,1.084901,,,,,,
Alaska Nat Ins Co,1995,2.076105,1.212061,,,,,,,
Alaska Nat Ins Co,1996,1.950295,,,,,,,,
Alaska Nat Ins Co,1997,,,,,,,,,


In [439]:
%timeit diag_df = triangle_frame1.groupby(level=0).apply(lambda x : pd.Series(np.diagonal(x.values[:, ::-1])[::-1], index=range(1,11)))


28.4 ms ± 1.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [445]:
%timeit diag_df = triangle_frame1.groupby(level=0).apply(lambda x : pd.Series(np.diagonal(x.values[:, ::-1]), index=range(11,1,-1)))


27.6 ms ± 542 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [442]:
diag_df = triangle_frame1.groupby(level=0).apply(lambda x : pd.Series(np.diagonal(x.values[:, ::-1]), index=x.columns[::-1]))
diag_df.head()    

DevelopmentLag,10,9,8,7,6,5,4,3,2,1
GRNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Agway Ins Co,0.0,31.0,23.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0
Alaska Nat Ins Co,11513.0,14108.0,15882.0,22225.0,24299.0,22158.0,18631.0,18450.0,12870.0,7048.0
Alaska Timber Ins Exchange,6146.0,6921.0,9387.0,6242.0,4354.0,4610.0,5392.0,5559.0,4342.0,1778.0
Allstate Ins Co Grp,325322.0,273873.0,256788.0,239195.0,159496.0,87215.0,91077.0,87311.0,44916.0,691.0
American Contractors Ins Grp,0.0,0.0,689.0,555.0,414.0,835.0,264.0,764.0,3295.0,787.0


In [453]:
triangle_frame1.head(20)

Unnamed: 0_level_0,DevelopmentLag,1,2,3,4,5,6,7,8,9,10
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Agway Ins Co,1988,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agway Ins Co,1989,0.0,0.0,0.0,0.0,0.0,0.0,23.0,23.0,31.0,
Agway Ins Co,1990,0.0,2.0,2.0,2.0,2.0,16.0,16.0,23.0,,
Agway Ins Co,1991,8.0,17.0,25.0,31.0,26.0,29.0,38.0,,,
Agway Ins Co,1992,0.0,0.0,0.0,0.0,0.0,0.0,,,,
Agway Ins Co,1993,0.0,0.0,0.0,0.0,0.0,,,,,
Agway Ins Co,1994,0.0,0.0,0.0,0.0,,,,,,
Agway Ins Co,1995,0.0,0.0,0.0,,,,,,,
Agway Ins Co,1996,0.0,0.0,,,,,,,,
Agway Ins Co,1997,0.0,,,,,,,,,


In [471]:
# %%timeit
# %%prun -l 20
# %%timeit
# remove zero rows
triangle_frame1 = triangle_frame.groupby(level=0).filter(lambda x : np.nansum(x)  > 0)

# ata factors, picks up index from second data frame 
ata_df = triangle_frame1.iloc[:, 1:].values / triangle_frame1.iloc[:, :-1] 

# ldfs with default 1 and tail factor.... 
ldf_df = ata_df.groupby(level=0).mean().fillna(1.)
ldf_df[10] = 1.0

# cdfs and add 1 for oldest year 
cdf_df = ldf_df.iloc[:, ::-1].cumprod(axis=1).iloc[:, ::-1]
# cdf_df[10] = 1.0

# diagonal
diag_df = triangle_frame1.groupby(level=0).apply(lambda x : pd.Series(np.diagonal(x.values[:, ::-1])[::-1], index=x.columns))

# ultimate and ibnr
ult_df = (diag_df * cdf_df).fillna(0)
ibnr_df = ult_df - diag_df
ibnr_df['Tot'] = ibnr_df.sum(1)

In [474]:
ibnr_df.sort_values('Tot', ascending=False).head(40).style

DevelopmentLag,1,2,3,4,5,6,7,8,9,10,Tot
GRNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Dowa Fire & Marine Ins Co Ltd Us Br,inf,81.886,33.9627,7.71429,0.0,0.0,0.0,0.0,0.0,0,inf
Dorinco Rein Co,inf,0.0,inf,inf,20.2002,336.654,24.2135,194.921,2.48115,0,inf
Rainier Ins Co,0.0,inf,0.0,937.991,20.721,30.3297,0.0,0.0,0.0,0,inf
Red Shield Ins Co,0.0,inf,0.0,150.368,34.9112,77.6159,113.191,3.90645,88.07,0,inf
Hyundai Marine & Fire Ins Co Ltd,inf,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0,inf
Homestead Ins Co,inf,inf,-0.0,-64.9147,0.0,0.0,0.0,0.0,0.0,0,inf
Pacific Ind Ins Co,inf,176.027,125.837,26.097,0.0,0.0,0.0,0.0,0.0,0,inf
British Amer Ins Co,inf,0.0,0.0,222.146,53.1504,4.48129,19.6794,12.2358,4.36983,0,inf
GA Resaurant Mut Captive Ins Co,inf,inf,inf,inf,0.0,0.0,0.0,0.0,0.0,0,inf
New Jersey Manufacturers Grp,106145.0,81059.0,60029.2,45643.1,31695.1,22571.2,14549.1,8154.31,3397.67,0,373243.0


In [473]:
(ibnr_df.iloc[:, 0:10] - ibnr[:, ::-1]).sum(1).sort_values(ascending=False)

GRNAME
Hyundai Marine & Fire Ins Co Ltd            inf
British Amer Ins Co                         inf
GA Resaurant Mut Captive Ins Co             inf
Dowa Fire & Marine Ins Co Ltd Us Br         inf
Dorinco Rein Co                             inf
Pacific Ind Ins Co                          inf
Rainier Ins Co                              inf
Red Shield Ins Co                           inf
Homestead Ins Co                            inf
Associated Industries Ins Co           0.000000
Associated Loggers Exch                0.000000
Employers Security Ins Co              0.000000
Erie Ins Exchange Grp                  0.000000
FFVA Mut Ins Co                        0.000000
FL Farm Bureau Grp                     0.000000
FM Global                              0.000000
Farm Bureau Grp                        0.000000
Farm Bureau Of MI Grp                  0.000000
Farmers Alliance Mut & Affiliates      0.000000
Farmers Automobile Grp                 0.000000
Farmers Ins Co of Flemington     

In [None]:
display(ibnr_df.head(20).style)
display(pd.DataFrame(ibnr[:,::-1]).head(20).style)

In [None]:
display(ult_df.head(10))
display(pd.DataFrame(ultimate).iloc[0:10, ::-1])

In [None]:
sorted((ult_df - ultimate[:, ::-1]).abs().sum(1))

In [None]:
display(diag_df.head(10))
display(pd.DataFrame(latest_diagonal).iloc[0:10, :])

In [None]:
display(cdf_df.head(10))
display(pd.DataFrame(cdf_array).iloc[0:10, ::-1])

In [None]:
display(ldf_df.head(10))
display(pd.DataFrame(ldf_array).head(10))

In [None]:
%load_ext line_profiler

In [None]:
%%prun -s "time" -l 20

triangle_frame1 = triangle_frame.groupby(level=0).filter(lambda x : np.nansum(x)  > 0)

# ata factors
ata_df = pd.DataFrame(triangle_frame1.iloc[:, 1:].values / \
    triangle_frame1.iloc[:, :-1].values, index=triangle_frame1.index, columns=range(1,10))

# ldfs with default 1
ldf_df = ata_df.groupby(level=0).mean().fillna(1.)

# cdfs
# cdf_df = ldf_df.sort_index(axis=1, ascending=False).cumprod(axis=1).sort_index(axis=1)
cdf_df = ldf_df.iloc[:, ::-1].cumprod(axis=1).iloc[:, ::-1]

# diagonal
diag_df = triangle_frame1.groupby(level=0).apply(
    lambda x : pd.Series(np.diagonal(x.values[:, ::-1])[::-1], index=range(1,11)))
# ultimate and ibnr
ult_df = (diag_df * cdf_df).fillna(0)
ibnr_df = ult_df - diag_df 

In [None]:
%lprun -f test2()

In [None]:
(ibnr_df - ibnr[:, ::-1]).sum(1).sort_values(ascending=False)

In [None]:
cas.columns

# SM Triangles

In [None]:
N1 = pd.read_csv(r'http://www.mynl.com/RPM/masterdata.csv')

In [None]:
plt.figure(figsize=(12,12))
plt.plot(np.log(np.log(N1.PaidLoss)), np.log(np.log(N1.UltIncLoss)), 'x', alpha=0.05)

In [None]:
N2 = N1.set_index(keys=['Line', 'GRName', 'AY', 'Lag'], inplace=False)

In [None]:
N2.head()

In [None]:
pd.unique(N1.Line) # , pd.unique(N1.GRName)

In [None]:
%timeit N1[ (N1.GRName == 'Alaska Nat Ins Co') & (N1.Line=='Comm Auto')].head(5)

In [None]:
%timeit N1.query(' GRName == "Alaska Nat Ins Co" and Line=="Comm Auto" ').head(5)

In [None]:
N2.loc[(slice(None), 'FM Global'), :].head(5)

In [None]:
N2.loc['Comm Auto', :].head(5)

In [None]:
N2.loc[(slice(None), slice(None), 1990), :].head(5)

In [None]:
N2.xs(('Canal Ins Co Grp', 'Comm Auto'), level=('GRName', 'Line')).head(3)

In [None]:
N2.xs('Comm Auto', level='Line').head(3)

In [None]:
N1.query(" GRName=='State Farm Mut Grp' and Line=='Comm Auto' ").head()
sfm = 'State Farm Mut Grp' 

In [None]:
# G = pd.pivot_table(N1.query(" GRName=='State Farm Mut Grp' "), values='PaidLoss', index=['GRName', 'Line', 'AY'], columns='Lag')
G = pd.pivot_table(N1, values='PaidLoss', index=['GRName', 'Line', 'AY'], columns='Lag')
G.head(20)

In [None]:
N1.columns

In [None]:
G = pd.pivot_table(N1.query(" AY+Lag <= 1998 and GRName=='State Farm Mut Grp' "), values=['PaidLoss', 'CaseIncLoss'], index=['GRName', 'Line', 'AY'], columns='Lag')
G.head(20)

In [None]:
def meth1(G):
    """Append link ratios to ``G``, computed on the raw numpy values.

    ``G`` is treated positionally as two ten-column blocks: columns 0-9 and
    columns 10 onward. Within each block every column is divided by the column
    to its left. The nine ratios of the first block are labelled
    ``('Inc', '2_1') ... ('Inc', '10_9')`` and those of the second block
    ``('Pd', '2_1') ... ('Pd', '10_9')``.
    NOTE(review): this presumes the incurred block comes first in ``G`` -- confirm
    against the column order of the pivot that produced it.

    Returns a new frame: ``G`` followed by the two blocks of ratios.
    """
    def link_frame(prefix, later, earlier):
        # one (prefix, 'i_i-1') label per development step
        labels = pd.MultiIndex.from_tuples([(prefix, f'{i}_{i-1}') for i in range(2, 11)])
        return pd.DataFrame(later.values / earlier.values, index=G.index, columns=labels)

    inc_links = link_frame('Inc', G.iloc[:, 1:10], G.iloc[:, 0:9])
    pd_links = link_frame('Pd', G.iloc[:, 11:], G.iloc[:, 10:-1])
    return pd.concat((G, inc_links, pd_links), axis=1)

In [None]:
G2 = meth1(G)


In [None]:
G2.loc['State Farm Mut Grp', 'Pd'].head(20)

In [None]:
# just the complete triangles 
comp = G2.loc[G2.groupby(['GRName', 'Line']).apply(lambda x : x.isna().sum().sum()) == 180, :]

In [None]:
bit = comp.loc[[sfm]]

In [None]:
bit

In [None]:
%timeit meth1(G)

In [None]:
def meth2(G):
    """Append link ratios to a copy of ``G`` one column at a time.

    ``G`` must have columns labelled ``1`` ... ``10``. For each ``i`` in 2..10 a
    column named ``'i_(i-1)'`` (e.g. ``'2_1'``) holding ``G[i] / G[i-1]`` is added.

    The work is done on a copy so the caller's frame is left untouched. The
    earlier version added the columns to ``G`` itself, so running it once
    (e.g. under ``%timeit``) changed the width of ``G`` for every later cell.

    Returns the widened copy.
    """
    out = G.copy()
    for i in range(2, 11):
        out[f'{i}_{i-1}'] = out[i] / out[i-1]
    return out

In [None]:
%timeit meth2(G)

In [None]:
def meth3(G):
    """Append link ratios to ``G`` using label-aligned DataFrame division.

    The frame without its first column is divided by the frame without its last
    column. Both slices are given the same labels ``'2_1', '3_2', ...`` so that
    pandas lines the columns up pairwise. The number of labels follows the width
    of ``G`` (nine labels for a ten-column triangle) rather than being fixed at nine.

    Returns a new frame: ``G`` followed by the ratio columns.
    """
    labels = [f'{i}_{i-1}' for i in range(2, G.shape[1] + 1)]
    later = G.iloc[:, 1:]
    earlier = G.iloc[:, :-1]
    # relabel the slices only; G keeps its own column index
    later.columns = labels
    earlier.columns = labels
    return pd.concat((G, later / earlier), axis=1)

In [None]:
%timeit meth3(G)

In [None]:
pd.DataFrame(G.iloc[:, 1:].values / G.iloc[:, :-1].values, index=G.index,columns=[f'{i}_{i-1}' for i in range(2,11)]).head()

In [None]:
pd.set_option('display.multi_sparse', False)

In [None]:
G2.loc[[sfm]].groupby(level=['GRName', 'Line']).apply(lambda x : display( pd.DataFrame({ f'str {i}' : np.nansum(x.loc[:, 'Inc'].values * mask(i, 10, 'link'), axis=0) / \
                                                                      np.nansum( mask(i, 10, 'link'), axis=0) for i in [3, 5, 10]})))

In [None]:
import numpy.ma as ma

In [None]:
x = comp.loc[(sfm, 'Comm Auto')]

In [None]:
np.tile(mask(2, 10, 'loss_num'), (3,1))

In [None]:
mask(2,10,'link')

In [None]:
ma.masked_array(x.loc[:, 'Inc'], 1-mask(2, 10, 'link')).mean(0)

In [None]:
def mask(n, size, kind):
    """
    0/1 selector for averaging the last n diagonals of a size x size triangle.

    A cell at (row, col) is 1 when row + col lies in a band of n diagonals:

    * kind == 'loss_den': size x size array, band is [size-1-n, size-1)
    * kind == 'loss_num': size x size array, band is [size-n, size) and
      column 0 is always 0
    * any other kind (e.g. 'link'): size x (size-1) array, band is [size-1-n, size-1)

    Returns a numpy array of 0s and 1s (1 = use this cell).
    """
    nyrs = size - 1
    if kind == 'loss_den':
        width, lo, hi, skip_first_col = size, nyrs - n, nyrs, False
    elif kind == 'loss_num':
        width, lo, hi, skip_first_col = size, size - n, size, True
    else:
        width, lo, hi, skip_first_col = nyrs, nyrs - n, nyrs, False

    def selected(row, col):
        if skip_first_col and col == 0:
            return 0
        return 1 if lo <= row + col < hi else 0

    return np.array([[selected(row, col) for col in range(width)] for row in range(size)])

def mask2(n, size, kind):
    """
    Inverted selector for the last n diagonals of a size x size triangle.

    Same bands as ``mask`` but with the flags flipped: a cell inside the band
    is 0 and every other cell is 1, which is the form ``numpy.ma`` expects
    (1 = masked out).

    * kind == 'loss_den': size x size array, band is [size-1-n, size-1)
    * kind == 'loss_num': size x size array, band is [size-n, size) and
      column 0 is always 1
    * any other kind (e.g. 'link'): size x (size-1) array, band is [size-1-n, size-1)
    """
    nyrs = size - 1
    if kind == 'loss_den':
        width, lo, hi, skip_first_col = size, nyrs - n, nyrs, False
    elif kind == 'loss_num':
        width, lo, hi, skip_first_col = size, size - n, size, True
    else:
        width, lo, hi, skip_first_col = nyrs, nyrs - n, nyrs, False

    grid = []
    for row in range(size):
        flags = []
        for col in range(width):
            in_band = lo <= row + col < hi and not (skip_first_col and col == 0)
            flags.append(0 if in_band else 1)
        grid.append(flags)
    return np.array(grid)

def make_links(x, avg_tuple=(3, 5, 10)):
    """
    Link-ratio averages for one 10 x 10 triangle, built with 0/1 masks and nansum.

    ``x`` needs the column blocks 'Inc' and 'Pd' (link ratios) and 'CaseIncLoss'
    and 'PaidLoss' (losses). For each n in ``avg_tuple`` it produces

    * 'str n': sum of the selected link ratios divided by the number of selected
      cells, per development step;
    * 'wtd n': sum of selected losses at the later lag divided by the sum of
      selected losses at the earlier lag.

    Returns a frame with one row per (block, average) and columns 1..9.
    """
    def straight(block, n):
        chosen = mask(n, 10, 'link')
        return np.nansum(x.loc[:, block].values * chosen, 0) / np.nansum(chosen, 0)

    def weighted(block, n):
        losses = x.loc[:, block].values
        later = np.nansum((losses * mask(n, 10, 'loss_num')), 0)[1:]
        earlier = np.nansum((losses * mask(n, 10, 'loss_den')), 0)[:-1]
        return later / earlier

    # insertion order fixes the row order of the result: straight averages
    # (Inc then Pd) first, then the weighted averages (Inc then Pd)
    factors = {}
    for label in ('Inc', 'Pd'):
        for n in avg_tuple:
            factors[(label, f'str {n}')] = straight(label, n)
    for label, loss_block in (('Inc', 'CaseIncLoss'), ('Pd', 'PaidLoss')):
        for n in avg_tuple:
            factors[(label, f'wtd {n}')] = weighted(loss_block, n)

    return pd.DataFrame(factors, index=range(1, 10)).T

def make_links2(x, avg_tuple=(3, 5, 10)):
    """
    Link-ratio averages for one 10 x 10 triangle, built with numpy masked arrays.

    Produces the same 'str n' and 'wtd n' labels as ``make_links`` but selects the
    cells with ``ma.masked_array`` and the inverted masks from ``mask2``.
    ``x`` needs the column blocks 'Inc', 'Pd', 'CaseIncLoss' and 'PaidLoss'.

    Returns a frame with one row per (block, average) and columns 1..9.
    """
    factors = {}

    # straight averages: mean of the unmasked link ratios per development step
    for n in avg_tuple:
        for label in ['Inc', 'Pd']:
            factors[(label, f'str {n}')] = ma.masked_array(x.loc[:, label], mask2(n, 10, 'link')).mean(0)

    # weighted averages: unmasked losses at the later lag over those at the earlier lag
    for n in avg_tuple:
        for label, loss_block in [('Inc', 'CaseIncLoss'), ('Pd', 'PaidLoss')]:
            later = ma.masked_array(x.loc[:, loss_block], mask2(n, 10, 'loss_num')).sum(0)[1:]
            earlier = ma.masked_array(x.loc[:, loss_block], mask2(n, 10, 'loss_den')).sum(0)[:-1]
            factors[(label, f'wtd {n}')] = later / earlier

    return pd.DataFrame(factors, index=range(1, 10)).T

In [None]:
links2 = comp.groupby(level=['GRName', 'Line']).apply(make_links2)

In [None]:
links = comp.groupby(level=['GRName', 'Line']).apply(make_links)

In [None]:
(links - links2).abs().sum(1).sum()

In [None]:
%timeit comp.groupby(level=['GRName', 'Line']).apply(make_links2)

In [None]:
%timeit comp.groupby(level=['GRName', 'Line']).apply(make_links)

In [None]:
links.loc[(sfm, 'Comm Auto')].T.plot()

In [None]:
links.head(30).groupby('GRName').apply(lambda x : display(x.xs('Comm Auto', level=1))) # loc[['Comm Auto']].T.plot())

In [None]:
links.to_csv('links.csv')

In [None]:
links.loc[(slice(None), 'Comm Auto'), :].head()

In [None]:
f, axs = plt.subplots(10, 6, figsize=(18,24))
axs = axs.flatten()
it = iter(axs)
# links.iloc[:300, :].groupby(['Line', 'GRName']).apply(lambda x : x.reset_index(level=[0,1], drop=True).T.plot(legend=None, ax=next(it), title=' '.join(x.name) ))
links.loc[(slice(None), 'Work Comp'), :].groupby(['GRName']).apply(lambda x : x.reset_index(level=[0], drop=True).T.plot(legend=None, ax=next(it), title=x.name) )
# for ax in it:
#     f.delaxes(ax)
plt.tight_layout()

In [None]:
big_cos = ['State Farm Mut Grp', 'Federal Ins Co Grp', 'Canal Ins Co Grp', 'Erie Ins Exchange Grp', 
           'Employers Mut Co Of Des Moines', 'New Jersey Manufacturers Grp', 'Pennsylvania Natl Ins Grp', 
           'Vanliner Ins Co', 'Lancer Ins Co', 'Protective Ins Grp', 'FL Farm Bureau Grp', 'Harco Natl Ins Co', 
           'Century-Natl Ins Co', 'NC Farm Bureau Ins Grp', 'National American Ins Co', 'Philadelphia Ind Ins Co & Aff', 
           'West Bend Mut Ins Grp', 'Church Mut Ins Co', 'Lumber Ins Cos', 'Farmers Automobile Grp', 'Grinnell Mut Grp']
                    

In [None]:
def known_ctrs_ex( line_name, curr_year):
    '''
    Build paid and case-incurred triangles, with link ratios, for one line.

    Rows of the notebook-level frame N1 are kept when AY + Lag <= curr_year + 1,
    Line == line_name and GRName is one of big_cos. They are pivoted to one row
    per (GRName, AY) and one column per (loss type, Lag). For each loss type the
    columns (<type>Link, 1..9) hold lag i+1 divided by lag i.

    Returns the pivoted frame with column levels named LossType and Lag,
    sorted by column label.
    '''
    keep = (N1.AY+N1.Lag <= curr_year+1) & (N1.Line==line_name) & (N1['GRName'].isin(big_cos))
    tri = pd.pivot_table(N1[keep],
                         values=['PaidLoss', 'CaseIncLoss'],
                         index=['GRName', 'AY'],
                         columns='Lag')
    # link ratio stored under the EARLIER lag: (type+'Link', i) = lag i+1 / lag i
    for loss_type in ['CaseIncLoss', 'PaidLoss']:
        for later_lag in range(2, 11):
            earlier_lag = later_lag - 1
            tri[(loss_type+'Link', earlier_lag)] = tri[(loss_type, later_lag)] / tri[(loss_type, earlier_lag)]
    tri.columns.names = ['LossType', 'Lag']
    return tri.sort_index(axis=1)

In [None]:
H = known_ctrs_ex('Comm Auto', 1997)

In [None]:
H.loc[('Vanliner Ins Co', 1988):('Vanliner Ins Co', 1998), :]

In [None]:
def pdIncPlot(H, co_name, bins=201, first_ay=1988, last_ay=1997):
    '''
    bootstrap from paid and incurred and create product distribution

    Starts from the lag-1 paid and case-incurred loss of accident year last_ay
    and, for each development step i, takes the Kronecker product with every
    link ratio observed for accident years first_ay .. last_ay - i. The result
    is one projected value per combination of observed link ratios.

    Displays T.describe() for both sets of projections, then draws both
    histograms on shared bins together with a 'post' curve: the bin-wise
    product of the two histograms, rescaled to the paid histogram's total
    count. The title reports the bin midpoint where that curve peaks.

    H         : frame as returned by known_ctrs_ex (rows (GRName, AY), columns
                (LossType, Lag) including the ...Link columns)
    co_name   : company to project
    bins      : number of bin edges passed to np.linspace
    first_ay  : oldest accident year in the triangle (default 1988)
    last_ay   : latest accident year in the triangle (default 1997)

    H must contain link columns for lags 1 .. last_ay - first_ay.
    Returns None; the output is the displayed table and the figure.
    '''
    kpi = np.array(H.loc[(co_name, last_ay), ('CaseIncLoss', 1)])
    kpp = np.array(H.loc[(co_name, last_ay), ('PaidLoss', 1)])
    for i in range(1, last_ay - first_ay + 1):
        # only accident years old enough to have reached lag i+1 have a link at lag i
        kpp = np.kron(kpp, H.loc[(co_name, first_ay):(co_name, last_ay-i), ('PaidLossLink', i)])
        kpi = np.kron(kpi, H.loc[(co_name, first_ay):(co_name, last_ay-i), ('CaseIncLossLink', i)])

    T = pd.DataFrame( {'inc' : kpi, 'pd' : kpp})
    display(T.describe())
    plt.figure()
    # shared bin edges so the two histograms can be multiplied bin by bin
    bp = np.linspace(0, 1.05*max(max(kpi), max( kpp)), bins)
    npd,  _, _ =plt.hist(kpp, bins=bp, color='b', alpha=0.5, label='pd')
    ninc, _, _ =plt.hist(kpi, bins=bp, color='r', alpha=0.5, label='inc')
    bay = ninc*npd / sum(ninc*npd) * sum(npd)
    xs = (bp[1:]+bp[0:-1])/2
    plt.plot(xs, bay, '-g', label='post')
    plt.legend()
    plt.title('Co: {:},  MLE = {:,.1f}'.format(co_name, xs[bay.argmax()]))

In [None]:
big_cos

In [None]:
pdIncPlot(H,   big_cos[-1])

# Performance

In [None]:
n = 100000000
df = pd.DataFrame({
    'a': np.random.randn(n),
    'b': np.random.randn(n),
    'c': np.random.randn(n),
})
a =  np.random.randn(n)

In [None]:
%timeit r = np.sin(a - 1) + 1

In [None]:
%timeit r = np.sin(df['a'] - 1) + 1

In [None]:
%timeit r = np.sin(df['a'].values - 1) + 1

In [None]:
import numexpr

In [None]:
expr = 'sin(a - 1) + 1'

In [None]:
%timeit r = numexpr.evaluate(expr)

In [None]:
def dowork(a):
    """Return sin(a - 1) + 1 evaluated by numexpr.

    The name ``a`` inside the expression string refers to this function's
    argument.
    """
    return numexpr.evaluate('sin(a - 1) + 1')

In [None]:
%timeit r = dowork(df['a'])

# Great Supply of Datasets!

In [None]:
test = pd.read_html('https://vincentarelbundock.github.io/Rdatasets/datasets.html', header=0, attrs={"class" : "dataframe"})[0]

In [None]:
test3 = pd.read_csv('http://www.mynl.com/RPM/Datasets.csv').iloc[:, 1:]  # first column is blank

In [None]:
test3.iloc[:, 1:].head()

In [None]:
test3.head()

In [None]:
def explore(f):
    """Tabulate the docstring of every attribute of ``f``.

    The first row is ``('type', type(f))``; it is followed by one row per name
    in ``dir(f)`` holding that attribute's ``__doc__``.
    Returns a DataFrame with columns 'Method' and 'Help'.
    """
    rows = [('type', type(f))]
    for attr_name in dir(f):
        rows.append((attr_name, getattr(f, attr_name).__doc__))
    return pd.DataFrame(rows, columns=['Method', 'Help'])

In [None]:
explore(list)

In [None]:
a = list(range(20))

In [None]:
# starred unpacking: u and v take the first two items, z the last one and
# w collects everything in between (a plain `u, v, z = a` fails on a 20-item list)
u, v, *w, z = a

In [None]:
u, v, w, z

In [None]:
N1.head(20)

In [None]:
G = pd.pivot_table(N1.query(" AY+Lag <= 1998 "), values=['PaidLoss', 'CaseIncLoss'], index=['GRName', 'Line', 'AY'], columns='Lag')

In [None]:
G.head(20)

In [None]:
dir(pd.MultiIndex)

In [None]:
G.index.get_level_values(0)

In [None]:
Nco = len(N1.GRName.unique())
Nli = len(N1.Line.unique())
NAY = len(N1.AY.unique())
Nco, Nli, NAY

In [None]:
max(map(len, N1.Line.unique()))

In [None]:
cont = np.zeros(3, dtype=[('co_name', 'S36'), ('line', 'S13', Nli), ('paid', 'f8', (NAY, NAY)), ('inc', 'f8', (NAY, NAY))]) 

In [None]:
cont

In [None]:
cont['co_name'] = sorted(N1.GRName.unique())

In [None]:
G.xs('Comm Auto', level=1).index.get_level_values(0).unique()

In [None]:
cont['paid'] = G['CaseIncLoss'].values.reshape(Nco, NAY, NAY)

In [None]:
cont['paid'] = np.random.rand(300).reshape((3,10,10))

In [None]:
cont

# Pandas Functions

* DataFrame
* head, tail, describe, summary 
* unique
* from csv, dictionary 
* loc, slices
* set_index, reset_index 
* MultiIndex 
* loc, slices and xs
* query 
* pivot, stack and unstack
* melt
* **concat**, append, keys 
* pivot_table (crosstab)
* **merge** (indicator) and join
* groupby (.groups, .get_group, as_index)
* sum, mean, std etc. 
* aggregate
* transform (result is the same size as the input, e.g. whitening)
* apply
* plot

## Not covered but check out on your own
* map (series), applymap (dataframes) 
* eval 
* str
* dt
* style


# Seaborn Plotting 


In [None]:
x = np.random.randn(5,5); x

In [None]:
df0 = pd.DataFrame(x)
df0

In [None]:
df0.columns = list('abcde')

In [None]:
df0['class'] = list('wxxxy')
df0

In [None]:
df0.index.name = 'id'
df0.columns.name = 'var'
df0

In [None]:
# np.float was only an alias of the builtin float and no longer exists in NumPy >= 1.24
df = pd.DataFrame({'class': list('vwxxy'), 'subclass': list('aaabb'), 'a': np.random.randn(5), 'c': np.arange(5, dtype=float)}, index=pd.Index(range(5), name='idx'))

In [None]:
df

In [None]:
pd.concat( (df0, df), sort=True) 

In [None]:
df0.select_dtypes(np.number) / df.select_dtypes(np.number)

In [None]:
df.dtypes

In [None]:
df['sinb'] = np.sin(df.b)

In [None]:
df.head()

In [None]:
df1 = df.set_index('x')
df1.columns.name = 'variable'
df1.head()

In [None]:
df1.corr()

In [None]:
df1.dtypes

In [None]:
df1.select_dtypes(object)

In [None]:
# DataFrame.append was removed in pandas 2.0; concat stacks the rows the same way
pd.concat((df1, df1))

In [None]:
# np.float was only an alias of the builtin float and no longer exists in NumPy >= 1.24
df2 = pd.DataFrame({'y': list('lmnop'), 'a': np.random.randn(5), 'b': np.arange(5, dtype=float)}, index=pd.Index(list('abcjk'), name='x'))
df2

In [None]:
pd.concat((df1,df2), sort=True)

In [None]:
df3 = pd.concat((df1,df2), sort=True, keys=['df1', 'df2'], names=['src'])
df3

In [None]:
df3.b

In [None]:
df3['b']

In [None]:
df3.b.unique()

In [None]:
df3.index

In [None]:
df3.index.get_level_values(1).unique()

In [None]:
df3.loc['df1']

In [None]:
df3.loc[:, 'a']

In [None]:
df3.loc[:, 'a':'b']

In [None]:
df3[['a']]

In [None]:
df3.unstack()

In [None]:
df3.unstack(0)

In [None]:
df3

In [None]:
df3[df3.a < 0]

In [None]:
df3.loc[df3.a < 0]

In [None]:
df3.loc[df3.a < 0, :]

In [None]:
df3.query(' a < 0 ')

In [None]:
df3.loc['df1']

In [None]:
df3.loc['b']

In [None]:
df3

In [None]:
df3.loc[(slice(None), 'b'), :]

In [None]:
df3.loc[(slice(None), slice('b','d')), :]

In [None]:
df3.loc[(slice(None), 'b')], df3.loc[:, 'b']

In [None]:
df3.xs('b', level=1)

In [None]:
df3.xs('b', axis=1)

In [None]:
df3

In [None]:
df4 = df3.reset_index()
df4

In [None]:
df4.pivot(index='src', columns='b', values='a')

In [None]:
df4.pivot(index='src', columns='b', values=['a', 'sinb'])

In [None]:
df4.pivot_table(index=['src', 'x'], columns='b', values=['a', 'sinb'])

In [None]:
g3 = df3.groupby(level='x') 

In [None]:
g3.groups 

In [None]:
g3.get_group('a')

In [None]:
g4 = df4.groupby('x')
g4.groups

In [None]:
g4.get_group('a')

In [None]:
g3.sum()

In [None]:
g3.aggregate(sum)

In [None]:
g3.agg(sum)

In [None]:
g3.agg([sum, np.std, np.min, np.max, np.size])

In [None]:
g3.agg({'a' : [sum, np.std, np.min, np.max, np.size], 'b': [sum, np.std] })

In [None]:
g3.apply(lambda x : display(x))

In [None]:
g3.apply(lambda x : print(x.a * x.b))

In [None]:
y = g3.get_group('c')
y

In [None]:
pd.Series( (y.a * y.b).values, name='ab', index=[1,2])

In [None]:
g3.apply( lambda y : pd.Series((y.a * y.b).values))

In [None]:
g3.apply( lambda y : pd.DataFrame((y.a * y.b).values, index=pd.Index(range(10, 10+len(y)), name='idx'), columns=['ab']))  

In [None]:
np.vstack((np.array([1,2,3]),np.array([1,2,3])))

In [None]:
g3.get_group('a')

In [None]:
g3.apply( lambda y : pd.DataFrame(np.hstack([y.a, y.b, (y.a * y.b).values]) ).T)

In [None]:
df3