In [187]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import textwrap 
from IPython import display
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("once")


def wdid(ob):
    ''' what does object do? 
    '''
    print('\n'.join(textwrap.wrap(' '.join([i for i in dir(ob) if i[0] != '_']), 80)))

![](https://upload.wikimedia.org/wikipedia/commons/thumb/1/1a/NumPy_logo.svg/775px-NumPy_logo.svg.png)

## The Scientific Python ecosystem is built on Numpy
![](https://gcpy.readthedocs.io/en/latest/_images/state_of_the_stack_2015.png)

## Real world example

Building a multi-dimensional IBNR model in numpy

### Create Initial multi-dimensional array

In [167]:
import pandas as pd
cas = pd.read_csv(r'http://www.casact.org/research/reserve_data/wkcomp_pos.csv')
# Let's make the pandas dataframe look more triangle like
triangle_frame = pd.pivot_table(cas[cas['DevelopmentYear']<=1997], 
                                values='CumPaidLoss_D', 
                                index=['GRNAME','AccidentYear'], 
                                columns='DevelopmentLag')
# Let's use the reshape method to create a 3-D Matrix of triangles
triangle_array = np.array(triangle_frame).reshape(len(cas['GRNAME'].unique()),
                                                  len(cas['AccidentYear'].unique()),
                                                  len(cas['DevelopmentLag'].unique()))

In [81]:
cas.head()

Unnamed: 0,GRCODE,GRNAME,AccidentYear,DevelopmentYear,DevelopmentLag,IncurLoss_D,CumPaidLoss_D,BulkLoss_D,EarnedPremDIR_D,EarnedPremCeded_D,EarnedPremNet_D,Single,PostedReserve97_D
0,86,Allstate Ins Co Grp,1988,1988,1,367404,70571,127737,400699,5957,394742,0,281872
1,86,Allstate Ins Co Grp,1988,1989,2,362988,155905,60173,400699,5957,394742,0,281872
2,86,Allstate Ins Co Grp,1988,1990,3,347288,220744,27763,400699,5957,394742,0,281872
3,86,Allstate Ins Co Grp,1988,1991,4,330648,251595,15280,400699,5957,394742,0,281872
4,86,Allstate Ins Co Grp,1988,1992,5,354690,274156,27689,400699,5957,394742,0,281872


In [80]:
triangle_frame.head(30)

Unnamed: 0_level_0,DevelopmentLag,1,2,3,4,5,6,7,8,9,10
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Agway Ins Co,1988,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agway Ins Co,1989,0.0,0.0,0.0,0.0,0.0,0.0,23.0,23.0,31.0,
Agway Ins Co,1990,0.0,2.0,2.0,2.0,2.0,16.0,16.0,23.0,,
Agway Ins Co,1991,8.0,17.0,25.0,31.0,26.0,29.0,38.0,,,
Agway Ins Co,1992,0.0,0.0,0.0,0.0,0.0,0.0,,,,
Agway Ins Co,1993,0.0,0.0,0.0,0.0,0.0,,,,,
Agway Ins Co,1994,0.0,0.0,0.0,0.0,,,,,,
Agway Ins Co,1995,0.0,0.0,0.0,,,,,,,
Agway Ins Co,1996,0.0,0.0,,,,,,,,
Agway Ins Co,1997,0.0,,,,,,,,,


In [133]:
triangle_array[-1,:,:]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., nan],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., nan, nan],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0., nan, nan, nan],
       [ 0.,  0.,  0.,  0.,  0.,  0., nan, nan, nan, nan],
       [ 0.,  0.,  0.,  0.,  0., nan, nan, nan, nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan, nan, nan, nan, nan],
       [ 0.,  0.,  0., nan, nan, nan, nan, nan, nan, nan],
       [ 0.,  0., nan, nan, nan, nan, nan, nan, nan, nan],
       [ 0., nan, nan, nan, nan, nan, nan, nan, nan, nan]])

In [132]:
# triangle array is a set 10x10 triangles for more than 100 companies.
triangle_array.shape

(132, 10, 10)

### Clean up missing and zero values

In [136]:
# Let's get rid of completely empty triangles
triangle_sum = np.nansum(np.nansum(triangle_array, axis=1),axis=1)
triangle_sum==0

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
        True, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False,

In [137]:
triangle_array = triangle_array[triangle_sum!=0,:,:]
# let's turn 0's to nan - this will alleviate issues around dividing by zero
triangle_array[triangle_array==0]=np.nan

In [168]:
triangle_frame = triangle_frame.groupby(level=0).filter(lambda x : np.nansum(x)  > 0)
triangle_frame.iloc[10:20, :]

Unnamed: 0_level_0,DevelopmentLag,1,2,3,4,5,6,7,8,9,10
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alaska Nat Ins Co,1988,4386.0,7912.0,9668.0,10502.0,10891.0,11181.0,11339.0,11422.0,11493.0,11513.0
Alaska Nat Ins Co,1989,5321.0,10223.0,12162.0,13051.0,13535.0,13734.0,13928.0,14022.0,14108.0,
Alaska Nat Ins Co,1990,4775.0,10594.0,13524.0,14837.0,15353.0,15557.0,15741.0,15882.0,,
Alaska Nat Ins Co,1991,6731.0,15173.0,18551.0,20392.0,21268.0,21878.0,22225.0,,,
Alaska Nat Ins Co,1992,9166.0,18877.0,22168.0,23259.0,23949.0,24299.0,,,,
Alaska Nat Ins Co,1993,8321.0,16556.0,19539.0,21018.0,22158.0,,,,,
Alaska Nat Ins Co,1994,7045.0,14434.0,17173.0,18631.0,,,,,,
Alaska Nat Ins Co,1995,7332.0,15222.0,18450.0,,,,,,,
Alaska Nat Ins Co,1996,6599.0,12870.0,,,,,,,,
Alaska Nat Ins Co,1997,7048.0,,,,,,,,,


In [169]:
triangle_frame.iloc[-10:, :]

Unnamed: 0_level_0,DevelopmentLag,1,2,3,4,5,6,7,8,9,10
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Yasuda Fire & Marine Ins Co Of Amer,1988,1773.0,3152.0,4097.0,4649.0,5004.0,5178.0,5247.0,5316.0,5333.0,5385.0
Yasuda Fire & Marine Ins Co Of Amer,1989,1652.0,4710.0,6715.0,7680.0,8611.0,8840.0,8969.0,9041.0,9080.0,
Yasuda Fire & Marine Ins Co Of Amer,1990,3527.0,9360.0,13075.0,15646.0,16894.0,17460.0,17658.0,17777.0,,
Yasuda Fire & Marine Ins Co Of Amer,1991,5977.0,14660.0,20553.0,23104.0,24857.0,25540.0,26003.0,,,
Yasuda Fire & Marine Ins Co Of Amer,1992,7568.0,15487.0,20504.0,23615.0,24746.0,26045.0,,,,
Yasuda Fire & Marine Ins Co Of Amer,1993,3723.0,7138.0,9212.0,10150.0,10637.0,,,,,
Yasuda Fire & Marine Ins Co Of Amer,1994,2690.0,5389.0,6533.0,7210.0,,,,,,
Yasuda Fire & Marine Ins Co Of Amer,1995,2454.0,4948.0,6349.0,,,,,,,
Yasuda Fire & Marine Ins Co Of Amer,1996,3443.0,6938.0,,,,,,,,
Yasuda Fire & Marine Ins Co Of Amer,1997,3850.0,,,,,,,,,


### Age-to-age factors

In [189]:
?display

[1;31mType:[0m        module
[1;31mString form:[0m <module 'IPython.display' from 'C:\\Users\\steve\\Anaconda3\\lib\\site-packages\\IPython\\display.py'>
[1;31mFile:[0m        c:\users\steve\anaconda3\lib\site-packages\ipython\display.py
[1;31mDocstring:[0m   Public API for display tools in IPython.


In [188]:
# Let's use slicing to create age-to-age factors
ata_array = triangle_array[:,:-1,1:]/triangle_array[:,:-1,:-1]
display(pd.DataFrame(ata_array[1, :, :]))
ata_array

  
  


TypeError: 'module' object is not callable

In [164]:
triangle_frame.iloc[0:20, :]

Unnamed: 0_level_0,DevelopmentLag,1,2,3,4,5,6,7,8,9,10
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Agway Ins Co,1988,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agway Ins Co,1989,0.0,0.0,0.0,0.0,0.0,0.0,23.0,23.0,31.0,0.0
Agway Ins Co,1990,0.0,2.0,2.0,2.0,2.0,16.0,16.0,23.0,0.0,0.0
Agway Ins Co,1991,8.0,17.0,25.0,31.0,26.0,29.0,38.0,0.0,0.0,0.0
Agway Ins Co,1992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agway Ins Co,1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agway Ins Co,1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agway Ins Co,1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agway Ins Co,1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agway Ins Co,1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [170]:
triangle_frame.iloc[0:20, 1:] / triangle_frame.iloc[0:20, :-1]

Unnamed: 0_level_0,DevelopmentLag,1,2,3,4,5,6,7,8,9,10
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Agway Ins Co,1988,,,,,,,,,,
Agway Ins Co,1989,,,,,,,1.0,1.0,1.0,
Agway Ins Co,1990,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
Agway Ins Co,1991,,1.0,1.0,1.0,1.0,1.0,1.0,,,
Agway Ins Co,1992,,,,,,,,,,
Agway Ins Co,1993,,,,,,,,,,
Agway Ins Co,1994,,,,,,,,,,
Agway Ins Co,1995,,,,,,,,,,
Agway Ins Co,1996,,,,,,,,,,
Agway Ins Co,1997,,,,,,,,,,


In [184]:
triangle_frame.iloc[0:20, 1:].values / triangle_frame.iloc[0:20, :-1].values

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


array([[0.        ,        nan,        nan,        nan,        nan,
               nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan,
               inf, 1.        , 1.34782609,        nan],
       [       inf, 1.        , 1.        , 1.        , 8.        ,
        1.        , 1.4375    ,        nan,        nan],
       [2.125     , 1.47058824, 1.24      , 0.83870968, 1.11538462,
        1.31034483,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan,        nan],


In [186]:
pd.DataFrame(triangle_frame.iloc[:, 1:].values / triangle_frame.iloc[:, :-1].values, index=triangle_frame.index, columns=range(1,10))

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,1,2,3,4,5,6,7,8,9
GRNAME,AccidentYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Agway Ins Co,1988,0.000000,,,,,,,,
Agway Ins Co,1989,,,,,,inf,1.000000,1.347826,
Agway Ins Co,1990,inf,1.000000,1.000000,1.000000,8.000000,1.000000,1.437500,,
Agway Ins Co,1991,2.125000,1.470588,1.240000,0.838710,1.115385,1.310345,,,
Agway Ins Co,1992,,,,,,,,,
Agway Ins Co,1993,,,,,,,,,
Agway Ins Co,1994,,,,,,,,,
Agway Ins Co,1995,,,,,,,,,
Agway Ins Co,1996,,,,,,,,,
Agway Ins Co,1997,,,,,,,,,


In [None]:
# Let's default the completely blank age-to-age columns with 1.0
accident_periods = len(cas['DevelopmentLag'].unique())
ata_array_defaults = np.expand_dims(np.all(np.isnan(ata_array),axis=1),axis=1)
pd.DataFrame(ata_array_defaults[:, 0 ,:])
# ata_array_defaults, ata_array_defaults.shape

In [None]:
ata_array[np.repeat(ata_array_defaults,accident_periods-1,axis=1)]=1.0

### LDFs and CDFs

In [None]:
# Let's create an array of LDFs, by taking simple averages of the age-to-age factors.
ldf_array = np.nanmean(ata_array, axis=1)
ldf_array[np.isnan(ldf_array)]=1.0
# Let's create an array of CDFs with a tail factor from our LDFs
cdf_array = ldf_array[:,::-1].cumprod(axis=1)[:,::-1]
tail_factor = 1.0
cdf_array = np.append(cdf_array,np.expand_dims(np.repeat(tail_factor,cdf_array.shape[0]),1),axis=1)

In [None]:
pd.DataFrame(ldf_array).head(20)

### Ultimates and IBNR

In [None]:
latest_diagonal = np.nan_to_num(np.diagonal(triangle_array[:,::-1,],axis1=1,axis2=2)[:,::-1])
ultimate = latest_diagonal * cdf_array[:,::-1]
ibnr = ultimate - latest_diagonal

### The business questions answered by our model

In [None]:
companies = np.array(triangle_frame.index.levels[0])[triangle_sum!=0]
print('How much IBNR does the entire industry need according to this model?')
print(np.sum(ibnr).round(0))
print()
print('What is the average ultimate to paid ratio across the industry?')
print((np.sum(ultimate)/np.sum(latest_diagonal)).round(3))
print()
print('Which company has the highest 12-Ultimate CDF?')
print(companies[np.argmax(cdf_array[:,0])])
print()
print('Which company has the lowest 12-24 LDF?')
print(companies[np.argmin(ldf_array[:,1])])
print()
print('What is the 95% confidence interval on the estimate of 12-Ultimate CDF?')
print((np.sort(cdf_array[:,1])[int(.025*len(cdf_array[:,1]))],cdf_array[:,1][int(.975*len(cdf_array[:,1]))]))

Performance test of the above code 

In [None]:
%%timeit
triangle_array = np.array(triangle_frame).reshape(len(cas['GRNAME'].unique()),len(cas['AccidentYear'].unique()),len(cas['DevelopmentLag'].unique()))
triangle_sum = np.nansum(np.nansum(triangle_array, axis=1),axis=1)
triangle_array = triangle_array[triangle_sum!=0,:,:]
triangle_array[triangle_array==0]=np.nan
ata_array = triangle_array[:,:-1,1:]/triangle_array[:,:-1,:-1]
accident_periods = len(cas['DevelopmentLag'].unique())
ata_array_defaults = np.expand_dims(np.all(np.isnan(ata_array),axis=1),axis=1)
ata_array[np.repeat(ata_array_defaults,accident_periods-1,axis=1)]=1.0
ldf_array = np.nanmean(ata_array, axis=1)
ldf_array[np.isnan(ldf_array)]=1.0
cdf_array = ldf_array[:,::-1].cumprod(axis=1)[:,::-1]
tail_factor = 1.0
cdf_array = np.append(cdf_array,np.expand_dims(np.repeat(tail_factor,cdf_array.shape[0]),1),axis=1)
latest_diagonal = np.nan_to_num(np.diagonal(triangle_array[:,::-1,],axis1=1,axis2=2)[:,::-1])
ultimate = latest_diagonal * cdf_array[:,::-1]
ibnr = ultimate - latest_diagonal

In [None]:
for m in [ i for i in dir(np) if i[0] >= 'a' and i[0]<='z']:
    print(m)

In [None]:
for m in [ i for i in dir(np.random) if i[0] >= 'a' and i[0]<='z']:
    print(m)

In [None]:
# optional pause for something more advanced... 
for m in [ i for i in dir(np) if i[0] >= 'a' and i[0]<='z']:
    print(f'\n\n{m}\n{"="*len(m)}\n')
    print(np.__getattribute__(m).__doc__)

In [None]:
df = pd.DataFrame([(i, getattr(x, i).__doc__) for i in dir(x)], columns=['Method', 'Help'])

In [None]:
cas.columns

# SM Triangles

In [None]:
N1 = pd.read_csv(r'http://www.mynl.com/RPM/masterdata.csv')

In [None]:
plt.figure(figsize=(12,12))
plt.plot(np.log(np.log(N1.PaidLoss)), np.log(np.log(N1.UltIncLoss)), 'x', alpha=0.05)

In [None]:
N2 = N1.set_index(keys=['Line', 'GRName', 'AY', 'Lag'], inplace=False)

In [None]:
N2.head()

In [None]:
pd.unique(N1.Line) # , pd.unique(N1.GRName)

In [None]:
%timeit N1[ (N1.GRName == 'Alaska Nat Ins Co') & (N1.Line=='Comm Auto')].head(5)

In [None]:
%timeit N1.query(' GRName == "Alaska Nat Ins Co" and Line=="Comm Auto" ').head(5)

In [None]:
N2.loc[(slice(None), 'FM Global'), :].head(5)

In [None]:
N2.loc['Comm Auto', :].head(5)

In [None]:
N2.loc[(slice(None), slice(None), 1990), :].head(5)

In [None]:
N2.xs(('Canal Ins Co Grp', 'Comm Auto'), level=('GRName', 'Line')).head(3)

In [None]:
N2.xs('Comm Auto', level='Line').head(3)

In [None]:
N1.query(" GRName=='State Farm Mut Grp' and Line=='Comm Auto' ").head()
sfm = 'State Farm Mut Grp' 

In [None]:
# G = pd.pivot_table(N1.query(" GRName=='State Farm Mut Grp' "), values='PaidLoss', index=['GRName', 'Line', 'AY'], columns='Lag')
G = pd.pivot_table(N1, values='PaidLoss', index=['GRName', 'Line', 'AY'], columns='Lag')
G.head(20)

In [None]:
N1.columns

In [None]:
G = pd.pivot_table(N1.query(" AY+Lag <= 1998 and GRName=='State Farm Mut Grp' "), values=['PaidLoss', 'CaseIncLoss'], index=['GRName', 'Line', 'AY'], columns='Lag')
G.head(20)

In [None]:
def meth1(G):
    return pd.concat((G, 
                      pd.DataFrame(G.iloc[:, 1:10].values / G.iloc[:, 0:9].values, index=G.index, columns=pd.MultiIndex.from_tuples([('Inc', f'{i}_{i-1}') for i in range(2,11)])),
                      pd.DataFrame(G.iloc[:, 11:].values / G.iloc[:, 10:-1].values, index=G.index, columns=pd.MultiIndex.from_tuples([('Pd', f'{i}_{i-1}') for i in range(2,11)]))
                     ), axis=1)

In [None]:
G2 = meth1(G)


In [None]:
G2.loc['State Farm Mut Grp', 'Pd'].head(20)

In [None]:
# just the complete triangles 
comp = G2.loc[G2.groupby(['GRName', 'Line']).apply(lambda x : x.isna().sum().sum()) == 180, :]

In [None]:
bit = comp.loc[[sfm]]

In [None]:
bit

In [None]:
%timeit meth1(G)

In [None]:
def meth2(G):
    for i in range(2,11):
        G[str(i) + '_' + str(i-1)] = G[i] / G[i-1]
    return G

In [None]:
%timeit meth2(G)

In [None]:
def meth3(G):
    temp1 = G.iloc[:, 1:]
    temp2 = G.iloc[:, :-1]
    temp1.columns = temp2.columns = [f'{i}_{i-1}' for i in range(2,11)]
    
    return pd.concat((G, temp1 / temp2), axis=1)

In [None]:
%timeit meth3(G)

In [None]:
pd.DataFrame(G.iloc[:, 1:].values / G.iloc[:, :-1].values, index=G.index,columns=[f'{i}_{i-1}' for i in range(2,11)]).head()

In [None]:
pd.set_option('display.multi_sparse', False)

In [None]:
G2.loc[[sfm]].groupby(level=['GRName', 'Line']).apply(lambda x : display( pd.DataFrame({ f'str {i}' : np.nansum(x.loc[:, 'Inc'].values * mask(i, 10, 'link'), axis=0) / \
                                                                      np.nansum( mask(i, 10, 'link'), axis=0) for i in [3, 5, 10]})))

In [None]:
import numpy.ma as ma

In [None]:
x = comp.loc[(sfm, 'Comm Auto')]

In [None]:
np.tile(mask(2, 10, 'loss_num'), (3,1))

In [None]:
mask(2,10,'link')

In [None]:
ma.masked_array(x.loc[:, 'Inc'], 1-mask(2, 10, 'link')).mean(0)

In [None]:
def mask(n, size, kind):
    """ 
    mask for avg last n in a size x size triangle 
    """
    nyrs = size - 1
    if kind=='loss_den':
        ans = np.array([[1 if i + j < nyrs and i + j >= nyrs - n else 0 for i in range(size)] for j in range(size)])
    elif kind=='loss_num':
        ans = np.array([[1 if i > 0 and i + j < size and i + j >= size - n else 0 for i in range(size)] for j in range(size)])
    else:
        ans = np.array([[1 if i + j < nyrs and i + j >= nyrs - n else 0 for i in range(nyrs)] for j in range(size)])
    return ans

def mask2(n, size, kind):
    """ 
    mask for avg last n in a size x size triangle 
    """
    nyrs = size - 1
    if kind=='loss_den':
        ans = np.array([[0 if i + j < nyrs and i + j >= nyrs - n else 1 for i in range(size)] for j in range(size)])
    elif kind=='loss_num':
        ans = np.array([[0 if i > 0 and i + j < size and i + j >= size - n else 1 for i in range(size)] for j in range(size)])
    else:
        ans = np.array([[0 if i + j < nyrs and i + j >= nyrs - n else 1 for i in range(nyrs)] for j in range(size)])
    return ans

def make_links(x, avg_tuple=(3, 5, 10)):
    return pd.DataFrame({ \
        **{ ('Inc', f'str {i}') : np.nansum(x.loc[:, 'Inc'].values * mask(i, 10, 'link'), 0) / np.nansum( mask(i, 10, 'link'), 0) for i in avg_tuple}, \
        **{ ('Pd', f'str {i}') :  np.nansum(x.loc[:, 'Pd'].values * mask(i, 10, 'link'), 0) /  np.nansum( mask(i, 10, 'link'), 0) for i in avg_tuple}, \
        **{ ('Inc', f'wtd {i}') : np.nansum((x.loc[:, 'CaseIncLoss'].values * mask(i, 10, 'loss_num')), 0)[1:] / \
                                  np.nansum((x.loc[:, 'CaseIncLoss'].values * mask(i, 10, 'loss_den')), 0)[:-1] for i in avg_tuple}, \
        **{ ('Pd', f'wtd {i}') :  np.nansum((x.loc[:, 'PaidLoss'].values * mask(i, 10, 'loss_num')), 0)[1:] / \
                                  np.nansum((x.loc[:, 'PaidLoss'].values * mask(i, 10, 'loss_den')), 0)[:-1] for i in avg_tuple}, \
        }, \
        index=range(1,10)).T

def make_links2(x, avg_tuple=(3, 5, 10)):
    return pd.DataFrame({ \
        **{ (j, f'str {i}') : ma.masked_array(x.loc[:, j], mask2(i, 10, 'link')).mean(0) for i in avg_tuple for j in ['Inc', 'Pd']}, \
        **{ (j, f'wtd {i}') : ma.masked_array(x.loc[:, k], mask2(i, 10, 'loss_num')).sum(0)[1:] / ma.masked_array(x.loc[:, k], mask2(i, 10, 'loss_den')).sum(0)[:-1] \
           for i in avg_tuple for j, k in [('Inc', 'CaseIncLoss'), ('Pd', 'PaidLoss')]}, \
        }, \
        index=range(1,10)).T

In [None]:
links2 = comp.groupby(level=['GRName', 'Line']).apply(make_links2)

In [None]:
links = comp.groupby(level=['GRName', 'Line']).apply(make_links)

In [None]:
(links - links2).abs().sum(1).sum()

In [None]:
%timeit comp.groupby(level=['GRName', 'Line']).apply(make_links2)

In [None]:
%timeit comp.groupby(level=['GRName', 'Line']).apply(make_links)

In [None]:
links.loc[(sfm, 'Comm Auto')].T.plot()

In [None]:
links.head(30).groupby('GRName').apply(lambda x : display(x.xs('Comm Auto', level=1))) # loc[['Comm Auto']].T.plot())

In [None]:
links.to_csv('links.csv')

In [None]:
links.loc[(slice(None), 'Comm Auto'), :].head()

In [None]:
f, axs = plt.subplots(10, 6, figsize=(18,24))
axs = axs.flatten()
it = iter(axs)
# links.iloc[:300, :].groupby(['Line', 'GRName']).apply(lambda x : x.reset_index(level=[0,1], drop=True).T.plot(legend=None, ax=next(it), title=' '.join(x.name) ))
links.loc[(slice(None), 'Work Comp'), :].groupby(['GRName']).apply(lambda x : x.reset_index(level=[0], drop=True).T.plot(legend=None, ax=next(it), title=x.name) )
# for ax in it:
#     f.delaxes(ax)
plt.tight_layout()

In [None]:
big_cos = ['State Farm Mut Grp', 'Federal Ins Co Grp', 'Canal Ins Co Grp', 'Erie Ins Exchange Grp', 
           'Employers Mut Co Of Des Moines', 'New Jersey Manufacturers Grp', 'Pennsylvania Natl Ins Grp', 
           'Vanliner Ins Co', 'Lancer Ins Co', 'Protective Ins Grp', 'FL Farm Bureau Grp', 'Harco Natl Ins Co', 
           'Century-Natl Ins Co', 'NC Farm Bureau Ins Grp', 'National American Ins Co', 'Philadelphia Ind Ins Co & Aff', 
           'West Bend Mut Ins Grp', 'Church Mut Ins Co', 'Lumber Ins Cos', 'Farmers Automobile Grp', 'Grinnell Mut Grp']
                    

In [None]:
def known_ctrs_ex( line_name, curr_year):
    '''
    add all company triangles for given line 
    '''
    G = pd.pivot_table(N1[ (N1.AY+N1.Lag <= curr_year+1) & (N1.Line==line_name) & (N1['GRName'].isin(big_cos))], 
                       values=['PaidLoss', 'CaseIncLoss'], 
                       index=['GRName', 'AY'], columns='Lag')
    ## add link ratios 
    for ls in ['CaseIncLoss', 'PaidLoss']:
        for i in range(2,11):
            G[(ls+'Link', i-1)] = G[(ls, i)] / G[(ls, i-1)]
    G.columns.names = ['LossType', 'Lag']
    G.sort_index(axis=1, inplace=True)
    return G

In [None]:
H = known_ctrs_ex('Comm Auto', 1997)

In [None]:
H.loc[('Vanliner Ins Co', 1988):('Vanliner Ins Co', 1998), :]

In [None]:
def pdIncPlot(H, co_name, bins=201):
    '''
    bootstrap from paid and incurred and create product distribution 
    '''
    v = {}
    kpi = np.array(H.loc[(co_name, 1997), ('CaseIncLoss', 1)])
    kpp = np.array(H.loc[(co_name, 1997), ('PaidLoss', 1)])
    for i in range(1,10):
        kpp = np.kron(kpp, H.loc[(co_name, 1988):(co_name, 1997-i), ('PaidLossLink', i)])
        kpi = np.kron(kpi, H.loc[(co_name, 1988):(co_name, 1997-i), ('CaseIncLossLink', i)])

    T = pd.DataFrame( {'inc' : kpi, 'pd' : kpp})
    display(T.describe())
    plt.figure()
    bp = np.linspace(0, 1.05*max(max(kpi), max( kpp)), bins)
    npd,  _, _ =plt.hist(kpp, bins=bp, color='b', alpha=0.5, label='pd')
    ninc, _, _ =plt.hist(kpi, bins=bp, color='r', alpha=0.5, label='inc')
    bay = ninc*npd / sum(ninc*npd) * sum(npd)
    xs = (bp[1:]+bp[0:-1])/2
    plt.plot(xs, bay, '-g', label='post')
    plt.legend()
    plt.title('Co: {:},  MLE = {:,.1f}'.format(co_name, xs[bay.argmax()]))

In [None]:
big_cos

In [None]:
pdIncPlot(H,   big_cos[-1])

# Performance

In [None]:
n = 100000000
df = pd.DataFrame({
    'a': np.random.randn(n),
    'b': np.random.randn(n),
    'c': np.random.randn(n),
})
a =  np.random.randn(n)

In [None]:
%timeit r = np.sin(a - 1) + 1

In [None]:
%timeit r = np.sin(df['a'] - 1) + 1

In [None]:
%timeit r = np.sin(df['a'].values - 1) + 1

In [None]:
import numexpr

In [None]:
expr = 'sin(a - 1) + 1'

In [None]:
%timeit r = numexpr.evaluate(expr)

In [None]:
def dowork(a):
    expr = 'sin(a - 1) + 1'
    return numexpr.evaluate(expr)

In [None]:
%timeit r = dowork(df['a'])

# Great Supply of Datasets!

In [None]:
test = pd.read_html('https://vincentarelbundock.github.io/Rdatasets/datasets.html', header=0, attrs={"class" : "dataframe"})[0]

In [None]:
test3 = pd.read_csv('http://www.mynl.com/RPM/Datasets.csv').iloc[:, 1:]  # first column is blank

In [None]:
test3.iloc[:, 1:].head()

In [None]:
test3.head()

In [None]:
def explore(f):
    return pd.DataFrame([('type', type(f))] + [(i, getattr(f, i).__doc__) for i in dir(f)], columns=['Method', 'Help'])

In [None]:
explore(list)

In [None]:
a = list(range(20))

In [None]:
u, v, z = a

In [None]:
u, v, w, z

In [None]:
N1.head(20)

In [None]:
G = pd.pivot_table(N1.query(" AY+Lag <= 1998 "), values=['PaidLoss', 'CaseIncLoss'], index=['GRName', 'Line', 'AY'], columns='Lag')

In [None]:
G.head(20)

In [None]:
dir(pd.MultiIndex)

In [None]:
G.index.get_level_values(0)

In [None]:
Nco = len(N1.GRName.unique())
Nli = len(N1.Line.unique())
NAY = len(N1.AY.unique())
Nco, Nli, NAY

In [None]:
max(map(len, N1.Line.unique()))

In [None]:
cont = np.zeros(3, dtype=[('co_name', 'S36'), ('line', 'S13', Nli), ('paid', 'f8', (NAY, NAY)), ('inc', 'f8', (NAY, NAY))]) 

In [None]:
cont

In [None]:
cont['co_name'] = sorted(N1.GRName.unique())

In [None]:
G.xs('Comm Auto', level=1).index.get_level_values(0).unique()

In [None]:
cont['paid'] = G['CaseIncLoss'].values.reshape(Nco, NAY, NAY)

In [None]:
cont['paid'] = np.random.rand(300).reshape((3,10,10))

In [None]:
cont

# Pandas Functions

* DataFrame
* head, tail, describe, summary 
* unique
* from csv, dictionary 
* loc, slices
* create_index, reset_index 
* MultiIndex 
* loc, slices and xs
* query 
* pivot, stack and unstack
* melt
* **concat**, append, keys 
* pivot_table (crosstab)
* **merge** (indicator) and join
* groupby (.groups, .get_group, as_index)
* sum, mean, std etc. 
* aggregate
* transform (same size as input whiten)
* apply
* plot

## Not covered but check out on your own
* map (series), applymap (dataframes) 
* evaluate 
* str
* dt
* style


# Seaborn Plotting 


In [95]:
x = np.random.randn(5,5); x

array([[ 0.40231208, -0.38541315,  2.12418725, -2.03435528, -0.20735257],
       [ 1.13856252, -1.53295787, -0.47587915, -0.01957986, -0.83399226],
       [-0.79281493, -0.46330648,  0.65058999, -1.10537578,  0.67477686],
       [ 0.40392809,  1.88290606,  0.3874182 , -0.02280603, -0.27023674],
       [-0.52403009, -1.70459097, -1.31178437,  0.23692334, -1.15206946]])

In [96]:
df0 = pd.DataFrame(x)
df0

Unnamed: 0,0,1,2,3,4
0,0.402312,-0.385413,2.124187,-2.034355,-0.207353
1,1.138563,-1.532958,-0.475879,-0.01958,-0.833992
2,-0.792815,-0.463306,0.65059,-1.105376,0.674777
3,0.403928,1.882906,0.387418,-0.022806,-0.270237
4,-0.52403,-1.704591,-1.311784,0.236923,-1.152069


In [97]:
df0.columns = list('abcde')

In [99]:
df0['class'] = list('wxxxy')
df0

Unnamed: 0,a,b,c,d,e,class
0,0.402312,-0.385413,2.124187,-2.034355,-0.207353,w
1,1.138563,-1.532958,-0.475879,-0.01958,-0.833992,x
2,-0.792815,-0.463306,0.65059,-1.105376,0.674777,x
3,0.403928,1.882906,0.387418,-0.022806,-0.270237,x
4,-0.52403,-1.704591,-1.311784,0.236923,-1.152069,y


In [100]:
df0.index.name = 'id'
df0.columns.name = 'var'
df0

var,a,b,c,d,e,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.402312,-0.385413,2.124187,-2.034355,-0.207353,w
1,1.138563,-1.532958,-0.475879,-0.01958,-0.833992,x
2,-0.792815,-0.463306,0.65059,-1.105376,0.674777,x
3,0.403928,1.882906,0.387418,-0.022806,-0.270237,x
4,-0.52403,-1.704591,-1.311784,0.236923,-1.152069,y


In [101]:
df = pd.DataFrame({'class': list('vwxxy'), 'subclass': list('aaabb'), 'a': np.random.randn(5), 'c': np.arange(5, dtype=np.float)}, index=pd.Index(range(5), name='idx'))

In [103]:
df

Unnamed: 0_level_0,class,subclass,a,c
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,v,a,1.004841,0.0
1,w,a,-0.676699,1.0
2,x,a,-1.073942,2.0
3,x,b,2.13113,3.0
4,y,b,0.022908,4.0


In [105]:
pd.concat( (df0, df), sort=True) 

Unnamed: 0,a,b,c,class,d,e,subclass
0,0.402312,-0.385413,2.124187,w,-2.034355,-0.207353,
1,1.138563,-1.532958,-0.475879,x,-0.01958,-0.833992,
2,-0.792815,-0.463306,0.65059,x,-1.105376,0.674777,
3,0.403928,1.882906,0.387418,x,-0.022806,-0.270237,
4,-0.52403,-1.704591,-1.311784,y,0.236923,-1.152069,
0,1.004841,,0.0,v,,,a
1,-0.676699,,1.0,w,,,a
2,-1.073942,,2.0,x,,,a
3,2.13113,,3.0,x,,,b
4,0.022908,,4.0,y,,,b


In [110]:
df0.select_dtypes(np.number) / df.select_dtypes(np.number)

Unnamed: 0_level_0,a,b,c,d,e
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.400374,,inf,,
1,-1.682525,,-0.475879,,
2,0.738229,,0.325295,,
3,0.189537,,0.129139,,
4,-22.875554,,-0.327946,,


In [633]:
df.dtypes

x     object
y     object
a    float64
b    float64
dtype: object

In [634]:
df['sinb'] = np.sin(df.b)

In [635]:
df.head()

Unnamed: 0_level_0,x,y,a,b,sinb
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,a,v,-0.500066,0.0,0.0
1,b,w,1.537605,1.0,0.841471
2,c,x,0.073138,2.0,0.909297
3,d,y,0.839512,3.0,0.14112
4,e,z,0.208504,4.0,-0.756802


In [636]:
df1 = df.set_index('x')
df1.columns.name = 'variable'
df1.head()

variable,y,a,b,sinb
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,v,-0.500066,0.0,0.0
b,w,1.537605,1.0,0.841471
c,x,0.073138,2.0,0.909297
d,y,0.839512,3.0,0.14112
e,z,0.208504,4.0,-0.756802


In [660]:
df1.corr()

variable,a,b,sinb
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.0,0.145689,0.389357
b,0.145689,1.0,-0.511991
sinb,0.389357,-0.511991,1.0


In [661]:
df1.dtypes

variable
y        object
a       float64
b       float64
sinb    float64
dtype: object

In [666]:
df1.select_dtypes(object)

variable,y
x,Unnamed: 1_level_1
a,v
b,w
c,x
d,y
e,z


In [667]:
df1.append(df1)

variable,y,a,b,sinb
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,v,-0.500066,0.0,0.0
b,w,1.537605,1.0,0.841471
c,x,0.073138,2.0,0.909297
d,y,0.839512,3.0,0.14112
e,z,0.208504,4.0,-0.756802
a,v,-0.500066,0.0,0.0
b,w,1.537605,1.0,0.841471
c,x,0.073138,2.0,0.909297
d,y,0.839512,3.0,0.14112
e,z,0.208504,4.0,-0.756802


In [735]:
df2 = pd.DataFrame({'y': list('lmnop'), 'a': np.random.randn(5), 'b': np.arange(5, dtype=np.float)}, index=pd.Index(list('abcjk'), name='x'))
df2

Unnamed: 0_level_0,y,a,b
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,l,0.291228,0.0
b,m,-2.2681,1.0
c,n,1.259754,2.0
j,o,-1.798738,3.0
k,p,0.025041,4.0


In [736]:
pd.concat((df1,df2), sort=True)

Unnamed: 0_level_0,a,b,sinb,y
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,-0.500066,0.0,0.0,v
b,1.537605,1.0,0.841471,w
c,0.073138,2.0,0.909297,x
d,0.839512,3.0,0.14112,y
e,0.208504,4.0,-0.756802,z
a,0.291228,0.0,,l
b,-2.2681,1.0,,m
c,1.259754,2.0,,n
j,-1.798738,3.0,,o
k,0.025041,4.0,,p


In [737]:
df3 = pd.concat((df1,df2), sort=True, keys=['df1', 'df2'], names=['src'])
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df1,b,1.537605,1.0,0.841471,w
df1,c,0.073138,2.0,0.909297,x
df1,d,0.839512,3.0,0.14112,y
df1,e,0.208504,4.0,-0.756802,z
df2,a,0.291228,0.0,,l
df2,b,-2.2681,1.0,,m
df2,c,1.259754,2.0,,n
df2,j,-1.798738,3.0,,o
df2,k,0.025041,4.0,,p


In [738]:
df3.b

src  x
df1  a    0.0
df1  b    1.0
df1  c    2.0
df1  d    3.0
df1  e    4.0
df2  a    0.0
df2  b    1.0
df2  c    2.0
df2  j    3.0
df2  k    4.0
Name: b, dtype: float64

In [739]:
df3['b']

src  x
df1  a    0.0
df1  b    1.0
df1  c    2.0
df1  d    3.0
df1  e    4.0
df2  a    0.0
df2  b    1.0
df2  c    2.0
df2  j    3.0
df2  k    4.0
Name: b, dtype: float64

In [740]:
df3.b.unique()

array([0., 1., 2., 3., 4.])

In [741]:
df3.index

MultiIndex(levels=[['df1', 'df2'], ['a', 'b', 'c', 'd', 'e', 'j', 'k']],
           labels=[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 0, 1, 2, 5, 6]],
           names=['src', 'x'])

In [746]:
df3.index.get_level_values(1).unique()

Index(['a', 'b', 'c', 'd', 'e', 'j', 'k'], dtype='object', name='x')

In [747]:
df3.loc['df1']

Unnamed: 0_level_0,a,b,sinb,y
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,-0.500066,0.0,0.0,v
b,1.537605,1.0,0.841471,w
c,0.073138,2.0,0.909297,x
d,0.839512,3.0,0.14112,y
e,0.208504,4.0,-0.756802,z


In [748]:
df3.loc[:, 'a']

src  x
df1  a   -0.500066
df1  b    1.537605
df1  c    0.073138
df1  d    0.839512
df1  e    0.208504
df2  a    0.291228
df2  b   -2.268100
df2  c    1.259754
df2  j   -1.798738
df2  k    0.025041
Name: a, dtype: float64

In [749]:
df3.loc[:, 'a':'b']

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1
df1,a,-0.500066,0.0
df1,b,1.537605,1.0
df1,c,0.073138,2.0
df1,d,0.839512,3.0
df1,e,0.208504,4.0
df2,a,0.291228,0.0
df2,b,-2.2681,1.0
df2,c,1.259754,2.0
df2,j,-1.798738,3.0
df2,k,0.025041,4.0


In [750]:
df3[['a']]

Unnamed: 0_level_0,Unnamed: 1_level_0,a
src,x,Unnamed: 2_level_1
df1,a,-0.500066
df1,b,1.537605
df1,c,0.073138
df1,d,0.839512
df1,e,0.208504
df2,a,0.291228
df2,b,-2.2681
df2,c,1.259754
df2,j,-1.798738
df2,k,0.025041


In [751]:
df3.unstack()

Unnamed: 0_level_0,a,a,a,a,a,a,a,b,b,b,...,sinb,sinb,sinb,y,y,y,y,y,y,y
x,a,b,c,d,e,j,k,a,b,c,...,e,j,k,a,b,c,d,e,j,k
src,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
df1,-0.500066,1.537605,0.073138,0.839512,0.208504,,,0.0,1.0,2.0,...,-0.756802,,,v,w,x,y,z,,
df2,0.291228,-2.2681,1.259754,,,-1.798738,0.025041,0.0,1.0,2.0,...,,,,l,m,n,,,o,p


In [752]:
df3.unstack(0)

Unnamed: 0_level_0,a,a,b,b,sinb,sinb,y,y
src,df1,df2,df1,df2,df1,df2,df1,df2
x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
a,-0.500066,0.291228,0.0,0.0,0.0,,v,l
b,1.537605,-2.2681,1.0,1.0,0.841471,,w,m
c,0.073138,1.259754,2.0,2.0,0.909297,,x,n
d,0.839512,,3.0,,0.14112,,y,
e,0.208504,,4.0,,-0.756802,,z,
j,,-1.798738,,3.0,,,,o
k,,0.025041,,4.0,,,,p


In [753]:
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df1,b,1.537605,1.0,0.841471,w
df1,c,0.073138,2.0,0.909297,x
df1,d,0.839512,3.0,0.14112,y
df1,e,0.208504,4.0,-0.756802,z
df2,a,0.291228,0.0,,l
df2,b,-2.2681,1.0,,m
df2,c,1.259754,2.0,,n
df2,j,-1.798738,3.0,,o
df2,k,0.025041,4.0,,p


In [754]:
df3[df3.a < 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df2,b,-2.2681,1.0,,m
df2,j,-1.798738,3.0,,o


In [755]:
df3.loc[df3.a < 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df2,b,-2.2681,1.0,,m
df2,j,-1.798738,3.0,,o


In [756]:
df3.loc[df3.a < 0, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df2,b,-2.2681,1.0,,m
df2,j,-1.798738,3.0,,o


In [757]:
df3.query(' a < 0 ')

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df2,b,-2.2681,1.0,,m
df2,j,-1.798738,3.0,,o


In [758]:
df3.loc['df1']

Unnamed: 0_level_0,a,b,sinb,y
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,-0.500066,0.0,0.0,v
b,1.537605,1.0,0.841471,w
c,0.073138,2.0,0.909297,x
d,0.839512,3.0,0.14112,y
e,0.208504,4.0,-0.756802,z


In [759]:
df3.loc['b']

KeyError: 'the label [b] is not in the [index]'

In [760]:
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df1,b,1.537605,1.0,0.841471,w
df1,c,0.073138,2.0,0.909297,x
df1,d,0.839512,3.0,0.14112,y
df1,e,0.208504,4.0,-0.756802,z
df2,a,0.291228,0.0,,l
df2,b,-2.2681,1.0,,m
df2,c,1.259754,2.0,,n
df2,j,-1.798738,3.0,,o
df2,k,0.025041,4.0,,p


In [761]:
df3.loc[(slice(None), 'b'), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,b,1.537605,1.0,0.841471,w
df2,b,-2.2681,1.0,,m


In [762]:
df3.loc[(slice(None), slice('b','d')), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,b,1.537605,1.0,0.841471,w
df1,c,0.073138,2.0,0.909297,x
df1,d,0.839512,3.0,0.14112,y
df2,b,-2.2681,1.0,,m
df2,c,1.259754,2.0,,n


In [766]:
df3.loc[(slice(None), 'b')], df3.loc[:, 'b']

(src  x
 df1  a    0.0
 df1  b    1.0
 df1  c    2.0
 df1  d    3.0
 df1  e    4.0
 df2  a    0.0
 df2  b    1.0
 df2  c    2.0
 df2  j    3.0
 df2  k    4.0
 Name: b, dtype: float64, src  x
 df1  a    0.0
 df1  b    1.0
 df1  c    2.0
 df1  d    3.0
 df1  e    4.0
 df2  a    0.0
 df2  b    1.0
 df2  c    2.0
 df2  j    3.0
 df2  k    4.0
 Name: b, dtype: float64)

In [767]:
df3.xs('b', level=1)

Unnamed: 0_level_0,a,b,sinb,y
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
df1,1.537605,1.0,0.841471,w
df2,-2.2681,1.0,,m


In [768]:
df3.xs('b', axis=1)

src  x
df1  a    0.0
df1  b    1.0
df1  c    2.0
df1  d    3.0
df1  e    4.0
df2  a    0.0
df2  b    1.0
df2  c    2.0
df2  j    3.0
df2  k    4.0
Name: b, dtype: float64

In [769]:
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df1,b,1.537605,1.0,0.841471,w
df1,c,0.073138,2.0,0.909297,x
df1,d,0.839512,3.0,0.14112,y
df1,e,0.208504,4.0,-0.756802,z
df2,a,0.291228,0.0,,l
df2,b,-2.2681,1.0,,m
df2,c,1.259754,2.0,,n
df2,j,-1.798738,3.0,,o
df2,k,0.025041,4.0,,p


In [770]:
df4 = df3.reset_index()
df4

Unnamed: 0,src,x,a,b,sinb,y
0,df1,a,-0.500066,0.0,0.0,v
1,df1,b,1.537605,1.0,0.841471,w
2,df1,c,0.073138,2.0,0.909297,x
3,df1,d,0.839512,3.0,0.14112,y
4,df1,e,0.208504,4.0,-0.756802,z
5,df2,a,0.291228,0.0,,l
6,df2,b,-2.2681,1.0,,m
7,df2,c,1.259754,2.0,,n
8,df2,j,-1.798738,3.0,,o
9,df2,k,0.025041,4.0,,p


In [771]:
df4.pivot(index='src', columns='b', values='a')

b,0.0,1.0,2.0,3.0,4.0
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,-0.500066,1.537605,0.073138,0.839512,0.208504
df2,0.291228,-2.2681,1.259754,-1.798738,0.025041


In [772]:
df4.pivot(index='src', columns='b', values=['a', 'sinb'])

Unnamed: 0_level_0,a,a,a,a,a,sinb,sinb,sinb,sinb,sinb
b,0.0,1.0,2.0,3.0,4.0,0.0,1.0,2.0,3.0,4.0
src,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
df1,-0.500066,1.537605,0.073138,0.839512,0.208504,0.0,0.841471,0.909297,0.14112,-0.756802
df2,0.291228,-2.2681,1.259754,-1.798738,0.025041,,,,,


In [773]:
df4.pivot_table(index=['src', 'x'], columns='b', values=['a', 'sinb'])

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,a,a,sinb,sinb,sinb,sinb,sinb
Unnamed: 0_level_1,b,0.0,1.0,2.0,3.0,4.0,0.0,1.0,2.0,3.0,4.0
src,x,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
df1,a,-0.500066,,,,,0.0,,,,
df1,b,,1.537605,,,,,0.841471,,,
df1,c,,,0.073138,,,,,0.909297,,
df1,d,,,,0.839512,,,,,0.14112,
df1,e,,,,,0.208504,,,,,-0.756802
df2,a,0.291228,,,,,,,,,
df2,b,,-2.2681,,,,,,,,
df2,c,,,1.259754,,,,,,,
df2,j,,,,-1.798738,,,,,,
df2,k,,,,,0.025041,,,,,


In [795]:
g3 = df3.groupby(level='x') 

In [796]:
g3.groups 

{'a': MultiIndex(levels=[['df1', 'df2'], ['a', 'b', 'c', 'd', 'e', 'j', 'k']],
            labels=[[0, 1], [0, 0]],
            names=['src', 'x']),
 'b': MultiIndex(levels=[['df1', 'df2'], ['a', 'b', 'c', 'd', 'e', 'j', 'k']],
            labels=[[0, 1], [1, 1]],
            names=['src', 'x']),
 'c': MultiIndex(levels=[['df1', 'df2'], ['a', 'b', 'c', 'd', 'e', 'j', 'k']],
            labels=[[0, 1], [2, 2]],
            names=['src', 'x']),
 'd': MultiIndex(levels=[['df1', 'df2'], ['a', 'b', 'c', 'd', 'e', 'j', 'k']],
            labels=[[0], [3]],
            names=['src', 'x']),
 'e': MultiIndex(levels=[['df1', 'df2'], ['a', 'b', 'c', 'd', 'e', 'j', 'k']],
            labels=[[0], [4]],
            names=['src', 'x']),
 'j': MultiIndex(levels=[['df1', 'df2'], ['a', 'b', 'c', 'd', 'e', 'j', 'k']],
            labels=[[1], [5]],
            names=['src', 'x']),
 'k': MultiIndex(levels=[['df1', 'df2'], ['a', 'b', 'c', 'd', 'e', 'j', 'k']],
            labels=[[1], [6]],
            na

In [797]:
g3.get_group('a')

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df2,a,0.291228,0.0,,l


In [798]:
g4 = df4.groupby('x')
g4.groups

{'a': Int64Index([0, 5], dtype='int64'),
 'b': Int64Index([1, 6], dtype='int64'),
 'c': Int64Index([2, 7], dtype='int64'),
 'd': Int64Index([3], dtype='int64'),
 'e': Int64Index([4], dtype='int64'),
 'j': Int64Index([8], dtype='int64'),
 'k': Int64Index([9], dtype='int64')}

In [800]:
g4.get_group('a')

Unnamed: 0,src,x,a,b,sinb,y
0,df1,a,-0.500066,0.0,0.0,v
5,df2,a,0.291228,0.0,,l


In [801]:
g3.sum()

Unnamed: 0_level_0,a,b,sinb
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,-0.208838,0.0,0.0
b,-0.730495,2.0,0.841471
c,1.332892,4.0,0.909297
d,0.839512,3.0,0.14112
e,0.208504,4.0,-0.756802
j,-1.798738,3.0,0.0
k,0.025041,4.0,0.0


In [802]:
g3.aggregate(sum)

Unnamed: 0_level_0,a,b,sinb
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,-0.208838,0.0,0.0
b,-0.730495,2.0,0.841471
c,1.332892,4.0,0.909297
d,0.839512,3.0,0.14112
e,0.208504,4.0,-0.756802
j,-1.798738,3.0,0.0
k,0.025041,4.0,0.0


In [803]:
g3.agg(sum)

Unnamed: 0_level_0,a,b,sinb
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,-0.208838,0.0,0.0
b,-0.730495,2.0,0.841471
c,1.332892,4.0,0.909297
d,0.839512,3.0,0.14112
e,0.208504,4.0,-0.756802
j,-1.798738,3.0,0.0
k,0.025041,4.0,0.0


In [807]:
g3.agg([sum, np.std, np.min, np.max, np.size])

Unnamed: 0_level_0,a,a,a,a,a,b,b,b,b,b,sinb,sinb,sinb,sinb,sinb
Unnamed: 0_level_1,sum,std,amin,amax,size,sum,std,amin,amax,size,sum,std,amin,amax,size
x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
a,-0.208838,0.55953,-0.500066,0.291228,2.0,0.0,0.0,0.0,0.0,2.0,0.0,,0.0,0.0,2.0
b,-0.730495,2.69104,-2.2681,1.537605,2.0,2.0,0.0,1.0,1.0,2.0,0.841471,,0.841471,0.841471,2.0
c,1.332892,0.839064,0.073138,1.259754,2.0,4.0,0.0,2.0,2.0,2.0,0.909297,,0.909297,0.909297,2.0
d,0.839512,,0.839512,0.839512,1.0,3.0,,3.0,3.0,1.0,0.14112,,0.14112,0.14112,1.0
e,0.208504,,0.208504,0.208504,1.0,4.0,,4.0,4.0,1.0,-0.756802,,-0.756802,-0.756802,1.0
j,-1.798738,,-1.798738,-1.798738,1.0,3.0,,3.0,3.0,1.0,0.0,,,,1.0
k,0.025041,,0.025041,0.025041,1.0,4.0,,4.0,4.0,1.0,0.0,,,,1.0


In [808]:
g3.agg({'a' : [sum, np.std, np.min, np.max, np.size], 'b': [sum, np.std] })

Unnamed: 0_level_0,a,a,a,a,a,b,b
Unnamed: 0_level_1,sum,std,amin,amax,size,sum,std
x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,-0.208838,0.55953,-0.500066,0.291228,2.0,0.0,0.0
b,-0.730495,2.69104,-2.2681,1.537605,2.0,2.0,0.0
c,1.332892,0.839064,0.073138,1.259754,2.0,4.0,0.0
d,0.839512,,0.839512,0.839512,1.0,3.0,
e,0.208504,,0.208504,0.208504,1.0,4.0,
j,-1.798738,,-1.798738,-1.798738,1.0,3.0,
k,0.025041,,0.025041,0.025041,1.0,4.0,


In [813]:
g3.apply(lambda x : display(x))

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df2,a,0.291228,0.0,,l


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,b,1.537605,1.0,0.841471,w
df2,b,-2.2681,1.0,,m


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,c,0.073138,2.0,0.909297,x
df2,c,1.259754,2.0,,n


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,d,0.839512,3.0,0.14112,y


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,e,0.208504,4.0,-0.756802,z


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df2,j,-1.798738,3.0,,o


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df2,k,0.025041,4.0,,p


In [818]:
g3.apply(lambda x : print(x.a * x.b))

src  x
df1  a   -0.0
df2  a    0.0
dtype: float64
src  x
df1  b    1.537605
df2  b   -2.268100
dtype: float64
src  x
df1  c    0.146276
df2  c    2.519509
dtype: float64
src  x
df1  d    2.518535
dtype: float64
src  x
df1  e    0.834018
dtype: float64
src  x
df2  j   -5.396215
dtype: float64
src  x
df2  k    0.100165
dtype: float64


In [844]:
y = g3.get_group('c')
y

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,c,0.073138,2.0,0.909297,x
df2,c,1.259754,2.0,,n


In [833]:
pd.Series( (y.a * y.b).values, name='ab', index=[1,2])

1   -0.0
2    0.0
Name: ab, dtype: float64

In [838]:
g3.apply( lambda y : pd.Series((y.a * y.b).values))

x   
a  0   -0.000000
a  1    0.000000
b  0    1.537605
b  1   -2.268100
c  0    0.146276
c  1    2.519509
d  0    2.518535
e  0    0.834018
j  0   -5.396215
k  0    0.100165
dtype: float64

In [846]:
g3.apply( lambda y : pd.DataFrame((y.a * y.b).values, index=pd.Index(range(10, 10+len(y)), name='idx'), columns=['ab']))  

Unnamed: 0_level_0,Unnamed: 1_level_0,ab
x,idx,Unnamed: 2_level_1
a,10,-0.0
a,11,0.0
b,10,1.537605
b,11,-2.2681
c,10,0.146276
c,11,2.519509
d,10,2.518535
e,10,0.834018
j,10,-5.396215
k,10,0.100165


In [851]:
np.vstack((np.array([1,2,3]),np.array([1,2,3])))

array([[1, 2, 3],
       [1, 2, 3]])

In [854]:
g3.get_group('a')

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df2,a,0.291228,0.0,,l


In [857]:
g3.apply( lambda y : pd.DataFrame(np.hstack([y.a, y.b, (y.a * y.b).values]) ).T)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,0,-0.500066,0.291228,0.0,0.0,-0.0,0.0
b,0,1.537605,-2.2681,1.0,1.0,1.537605,-2.2681
c,0,0.073138,1.259754,2.0,2.0,0.146276,2.519509
d,0,0.839512,3.0,2.518535,,,
e,0,0.208504,4.0,0.834018,,,
j,0,-1.798738,3.0,-5.396215,,,
k,0,0.025041,4.0,0.100165,,,


In [842]:
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,sinb,y
src,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,a,-0.500066,0.0,0.0,v
df1,b,1.537605,1.0,0.841471,w
df1,c,0.073138,2.0,0.909297,x
df1,d,0.839512,3.0,0.14112,y
df1,e,0.208504,4.0,-0.756802,z
df2,a,0.291228,0.0,,l
df2,b,-2.2681,1.0,,m
df2,c,1.259754,2.0,,n
df2,j,-1.798738,3.0,,o
df2,k,0.025041,4.0,,p
