We use an Extended Kalman Filter (EKF) to fit the time series data to the equation:
\begin{align}
y=Ae^{Bx}
\end{align}

We use the Extended Kalman Filter so that the data can be fitted to an exponential directly, without first taking the log of the data.

We feed the time series data into the Kalman filter with varying Q and R parameters and receive an A and a B value of the fitted equation for each parameter combination.

The Q and R values used are stored in 'data/QR_Values.csv'

In [3]:
import pandas as pd
import numpy as np

from ipywidgets import IntProgress

In [4]:
# Chunk index: its 'Chunk' column provides the names of the per-chunk CSV files
# that the processing cells below load one at a time.
chunkMapDF = pd.read_csv('data/ChunkMap.csv')
# Kalman filter settings: one row per configuration; the processing cells read
# the 'label', 'Q1' and 'Q2' columns of each row.
qrVals = pd.read_csv('data/QR_Values.csv')

In [5]:
# Kalman Filters need very specific parameters. The only values we change per KF are the Q values.
# As such, we only need to provide X, Y, and two Q values to get the parameters necessary
def kParameters(X, Y, QA, QB):
    """Build the parameter dictionary consumed by ``kalmanFilter``.

    Parameters
    ----------
    X : sequence
        Time values of the series. Accepted for a uniform call signature but
        not used when building the parameters.
    Y : sequence
        Observed values; stored unchanged under ``'Z'``.
    QA, QB : float
        Process-noise variances for the two state components (A and B); they
        form the diagonal of ``Q``.

    Returns
    -------
    dict
        Keys ``'random_state'``, ``'F'``, ``'Z'``, ``'X0'``, ``'P0'``, ``'R'``,
        ``'Q'`` and ``'w'``.
    """
    # Seeded generator: every call yields the same P0 and the same noise sample.
    random_state = np.random.RandomState(0)
    F = np.eye(2)  # Transition matrix (identity matrix)
    Z = Y  # Observations
    X0 = [[4.], [.0023]]  # Initial guess of the state [A, B] as a column vector
    # Initial covariance (the confidence in our prediction). The element-wise
    # product with the identity zeroes the off-diagonal terms, leaving a diagonal
    # matrix with entries close to 100.
    P0 = (np.eye(2) + random_state.randn(2, 2)*.1)*100*np.eye(2)
    R = 1  # Measurement-noise variance
    Q = [[QA, 0.], [0., QB]]  # Process-noise covariance
    # Draw the process-noise sample from the seeded generator rather than the
    # global numpy RNG, so the result is reproducible and calling this function
    # does not disturb the global random stream.
    w = random_state.multivariate_normal([0., 0.], Q)
    return {'random_state':random_state, 'F':F, 'Z':Z, 'X0':X0, 'P0':P0, 'R':R, 'Q':Q, 'w':w}

# Calculates the A and B values from Ae^Bx for a specific X/Y dataset, based on parameters
# The extended version fits an exponential directly, while non-extended fits linearly and can be
# converted back to the exponential model.
def kalmanFilter(X, Y, parameters, extended=True):
    """Estimate the two-component state [A, B] by filtering the series (X, Y).

    Parameters
    ----------
    X, Y : sequence of float
        Time values and the matching observations, paired element by element.
    parameters : dict
        Must provide ``'P0'`` (2x2 initial covariance), ``'X0'`` (2x1 initial
        state), ``'Q'`` (2x2 process-noise covariance) and ``'R'`` (scalar
        measurement-noise variance). The dictionary is not modified.
    extended : bool, default True
        If True the measurement model is ``A * exp(B * x)`` and is linearised
        around the current estimate at every step; if False the model is the
        linear ``A + B * x``.

    Returns
    -------
    list
        ``[A, B]`` after the update for the last observation has been applied,
        or ``['TooShort', 'TooShort']`` when there are no observations.
    """
    Q = np.asarray(parameters['Q'], dtype=float)
    R = parameters['R']
    Pk = np.asarray(parameters['P0'], dtype=float)
    Xk = np.asarray(parameters['X0'], dtype=float).reshape(2, 1)

    processed = False
    for time, actual in zip(X, Y):
        processed = True
        # Predict: the transition is the identity, so only the covariance grows.
        Pk = Pk + Q

        if extended:
            growth = np.exp(Xk[1, 0]*time)
            # Jacobian of A*exp(B*x) with respect to [A, B]
            Hk = np.array([growth, growth*Xk[0, 0]*time])
            Zk = Xk[0, 0]*growth
        else:
            Hk = np.array([1., time])
            # Predicted measurement only: measurement noise is accounted for
            # through R in the innovation variance, not by sampling it here.
            Zk = float(np.dot(Hk, Xk[:, 0]))

        Yk = actual - Zk  # residual (innovation)
        PHt = np.dot(Pk, Hk)
        Sk = float(np.dot(Hk, PHt)) + R  # innovation variance
        Kk = PHt/Sk  # Kalman gain
        Xk = Xk + (Kk*Yk).reshape(2, 1)
        Pk = np.dot(np.eye(2) - np.outer(Kk, Hk), Pk)

    if not processed:
        # No observation at all, so there is nothing to fit.
        return ['TooShort', 'TooShort']
    # Return the estimate that includes the final measurement update.
    return [float(Xk[0, 0]), float(Xk[1, 0])]

### Unscaled Version

In [6]:
# Fit every contest of every (unscaled) chunk with each Q configuration and collect
# one result row per contest in `results`.
chunkNames = chunkMapDF['Chunk'].unique()

chunkBar = IntProgress(min=0, max=len(chunkNames))
contestBar = IntProgress(min=0, max=10000)  # max is reset per chunk below

print("Chunks:")
display(chunkBar)

print("Contests:")
display(contestBar)

# The Q configurations do not change between contests, so read them once here
# instead of re-iterating the DataFrame for every contest.
qrSettings = [(qrObj['label'], float(qrObj['Q1']), float(qrObj['Q2']))
              for _, qrObj in qrVals.iterrows()]

results = []

for chunkName in chunkNames:
    print(chunkName)
    # Ordered from the largest MinutesRemaining to the smallest.
    chunkDF = pd.read_csv('data/Chunks/'+chunkName+'.csv').drop(columns=['Unnamed: 0']).sort_values(by=['MinutesRemaining'], ascending=False)

    contests = chunkDF['ContestId'].unique()
    contestBar.value = 0
    contestBar.max = len(contests)
    for cid in contests:
        cSeriesDF = chunkDF[chunkDF['ContestId']==cid]
        # Keep only observations with at least 240 minutes remaining. A boolean
        # mask is used instead of slicing with [:-n], because that slice is
        # empty when n == 0 and would drop the whole series of any contest that
        # has no rows inside the final 240 minutes.
        keep = cSeriesDF['MinutesRemaining'] >= 240

        # X: minutes elapsed since the earliest observation of the contest.
        X = (cSeriesDF['MinutesRemaining'].max() - cSeriesDF['MinutesRemaining'])[keep]
        # Y: running total of entries.
        Y = cSeriesDF['Entries'].cumsum()[keep]
        X, Y = list(X), list(Y)

        contest = {'ContestId':cid}
        for label, q1, q2 in qrSettings:
            P = kParameters(X, Y, q1, q2)
            val = kalmanFilter(X, Y, P)

            contest['A'+label] = val[0]
            contest['B'+label] = val[1]
        results.append(contest)
        contestBar.value += 1
    chunkBar.value += 1


In [207]:
# Turn the list of per-contest result dictionaries into a table (one row per
# contest) and write it to disk.
# NOTE(review): the "Scaled Version" save cell at the end of this notebook writes
# to this same path, so whichever of the two runs last overwrites the other --
# confirm whether the two result sets should go to separate files.
rDF = pd.DataFrame(results)
rDF.to_csv('data/KF_Values.csv')

### Scaled Version

In [7]:
# Fit every contest of every scaled chunk with each Q configuration and collect
# one result row per contest in `results`.
chunkNames = chunkMapDF['Chunk'].unique()

chunkBar = IntProgress(min=0, max=len(chunkNames))
contestBar = IntProgress(min=0, max=100)  # max is reset per chunk below

print("Chunks:")
display(chunkBar)

print("Contests:")
display(contestBar)

# The Q configurations do not change between contests, so read them once here
# instead of re-iterating the DataFrame for every contest.
qrSettings = [(qrObj['label'], float(qrObj['Q1']), float(qrObj['Q2']))
              for _, qrObj in qrVals.iterrows()]

results = []

for chunkName in chunkNames:
    print(chunkName)
    chunkDF = pd.read_csv('data/Chunks_Scaled/'+chunkName+'.csv')

    contests = chunkDF['ContestId'].unique()
    contestBar.value = 0
    contestBar.max = len(contests)
    for cid in contests:
        cSeriesDF = chunkDF[chunkDF['ContestId']==cid]
        # Keep only the rows flagged by the boolean 'Before4HoursOut' column.
        cSeriesDF = cSeriesDF[cSeriesDF['Before4HoursOut']]

        # NOTE(review): rows are used in file order; presumably the scaled chunk
        # files are already ordered by time -- confirm.
        X = list(cSeriesDF['TimeScaled'])
        Y = list(cSeriesDF['EntriesScaled'])

        contest = {'ContestId':cid}
        for label, q1, q2 in qrSettings:
            P = kParameters(X, Y, q1, q2)
            val = kalmanFilter(X, Y, P)

            contest['A'+label] = val[0]
            contest['B'+label] = val[1]
        results.append(contest)
        contestBar.value += 1
    chunkBar.value += 1


Chunks:


IntProgress(value=0, max=65)

Contests:


IntProgress(value=0)

In [13]:
# Turn the list of per-contest result dictionaries into a table (one row per
# contest) and write it to disk.
# NOTE(review): this is the same path the "Unscaled Version" save cell writes to,
# so the scaled results replace the unscaled ones on disk -- confirm whether the
# two result sets should go to separate files.
rDF = pd.DataFrame(results)
rDF.to_csv('data/KF_Values.csv')