<a href="https://colab.research.google.com/github/2020147544/Advances_in_Financial_Engineering/blob/main/chapter4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample Weights

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

## Concurrency of labels

In [2]:
def mpNumCoEvents(closeIdx,t1,molecule):
    '''
    Compute the number of concurrent events per bar.

    Input:
    ` closeIdx: the sorted index of the bars (closeIdx[-1] is used as the end of still-open events)
    ` t1: a pd Series whose index is the start of each event and whose values are the end of each event
      (missing values mark events that have not closed yet)
    ` molecule[0]: the date of the first event on which the weight will be computed
    ` molecule[-1]: the date of the last event on which the weight will be computed
    Any event that starts before t1[molecule].max() impacts the count.

    Output: a pd Series indexed by the bars in [molecule[0], t1[molecule].max()]
    holding the number of events that span each bar.
    '''
    #1) find events that span the period [molecule[0],molecule[-1]]
    t1=t1.fillna(closeIdx[-1]) # unclosed events still must impact other weights
    t1=t1[t1>=molecule[0]] # events that end at or after molecule[0]
    lastEnd=t1.loc[molecule].max() # latest end among the events in molecule
    t1=t1.loc[:lastEnd] # events that start at or before lastEnd
    #2) count events spanning a bar
    # scalar lookups avoid building an object array of timestamps
    first=closeIdx.searchsorted(t1.index[0])
    last=closeIdx.searchsorted(t1.max())
    count=pd.Series(0,index=closeIdx[first:last+1])
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for tIn,tOut in t1.items():count.loc[tIn:tOut]+=1.
    return count.loc[molecule[0]:lastEnd]


In [4]:
def mpSampleTW(t1,numCoEvents,molecule):
    '''
    Derive average uniqueness over the event's lifespan.

    Input:
    ` t1: a pd Series whose index is the start of each event and whose values are the end of each event
    ` numCoEvents: a pd Series with the number of concurrent events per bar
    ` molecule: the start dates of the events for which the weight is computed

    Output: a float pd Series indexed by molecule; each value is the mean of
    1/numCoEvents over the bars between the event's start and end (both included).
    '''
    # explicit dtype: a dtype-less empty Series defaults to object dtype in pandas>=2.0
    wght=pd.Series(index=molecule,dtype=float)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for tIn,tOut in t1.loc[wght.index].items():
        wght.loc[tIn]=(1./numCoEvents.loc[tIn:tOut]).mean()
    return wght

In [5]:
def getIndMatrix(barIx,t1):
    '''
    Build the indicator matrix of bars versus observations.

    Input:
    ` barIx: the index of bars
    ` t1: a pd Series defined by
      - an index containing the time at which the features are observed
      - a values array containing the time at which the label is determined

    Output: a binary matrix (rows=bars, columns=0..len(t1)-1) indicating what bars
    influence the label for each observation
    '''
    indM=pd.DataFrame(0,index=barIx,columns=range(t1.shape[0]))
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0;
    # the loop variables no longer shadow the t1 argument being iterated
    for i,(tIn,tOut) in enumerate(t1.items()):indM.loc[tIn:tOut,i]=1.
    return indM

In [8]:
def getAvgUniqueness(indM):
    '''
    Average uniqueness from indicator matrix.

    Input:
    ` indM: an indicator matrix with one row per bar and one column per observation

    Output: a pd Series, indexed by the columns of indM, with the mean of the
    strictly positive uniqueness values of each column
    '''
    concurrency=indM.sum(axis=1) # number of observations active on each bar
    uniqueness=indM.div(concurrency,axis=0) # each observation's share of the bar
    # keep only strictly positive entries; everything else becomes NaN and is
    # skipped by mean()
    positive=uniqueness.where(uniqueness>0)
    return positive.mean()

## Sequential Bootstrap

In [9]:
def seqBootstrap(indM,sLength=None):
    '''
    Generate a sample via sequential bootstrap

    Input:
    ` indM: an indicator matrix with one column per observation
    ` sLength: an optional sample length with a default value of as many draws as columns in indM

    Output: the list of column labels of indM sampled by sequential bootstrap
    (draws use np.random, so seed it beforehand for reproducible samples)
    '''
    if sLength is None:sLength=indM.shape[1]
    phi=[]
    while len(phi)<sLength:
        # average uniqueness each candidate would have if appended to the
        # current sample; the candidate is the last column of the reduced matrix
        avgU=pd.Series(
            {i:getAvgUniqueness(indM[phi+[i]]).iloc[-1] for i in indM},
            dtype=float, # a dtype-less empty Series is object dtype in pandas>=2.0
        )
        prob=avgU/avgU.sum() # draw prob
        phi+=[np.random.choice(indM.columns,p=prob.values)]
    return phi

## Return Attribution

In [10]:
def mpSampleW(t1,numCoEvents,close,molecule):
    '''
    Derive sample weight by return attribution.

    Input:
    ` t1: a pd Series whose index is the start of each event and whose values are the end of each event
    ` numCoEvents: a pd Series with the number of concurrent events per bar
    ` close: a pd Series of prices
    ` molecule: the start dates of the events for which the weight is computed

    Output: a float pd Series indexed by molecule with the absolute value of the
    sum, over the event's bars, of log-return divided by number of concurrent events
    '''
    ret=np.log(close).diff() # log-returns, so that they are additive
    # explicit dtype: a dtype-less empty Series defaults to object dtype in pandas>=2.0
    wght=pd.Series(index=molecule,dtype=float)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for tIn,tOut in t1.loc[wght.index].items():
        wght.loc[tIn]=(ret.loc[tIn:tOut]/numCoEvents.loc[tIn:tOut]).sum()
    return wght.abs()

## Time Decay

In [12]:
def getTimeDecay(tW,clfLastW=1.):
    '''
    Apply piecewise-linear decay to observed uniqueness (tW).

    The decay factor is linear in the cumulative sum of tW (taken in index
    order) and equals 1 at the last observation.
    ` clfLastW>=0: the line's intercept at zero cumulative weight is clfLastW
    ` clfLastW<0: the line crosses zero at a fraction -clfLastW of the total
      cumulative weight; factors below zero are set to 0

    Output: a pd Series of decay factors indexed like tW.sort_index().
    The intercept and slope of the line are printed as a side effect.
    '''
    cumW=tW.sort_index().cumsum()
    total=cumW.iloc[-1] # cumulative weight at the newest observation
    slope=(1.-clfLastW)/total if clfLastW>=0 else 1./((clfLastW+1)*total)
    const=1.-slope*total
    decay=const+slope*cumW
    decay=decay.mask(decay<0,0) # floor negative factors at zero
    print(const,slope)
    return decay