We use Weighted Least Squares (WLS) to fit the time series data to the equation:
\begin{align}
y=Ae^{Bx}
\end{align}

We feed the time series data into the WLS with varying forgetting factors and receive an A and a B value of the modeled equation for each forgetting factor.

The Forgetting Factors are generated as follows:

In [1]:
# Table of forgetting factors; each entry pairs a label ('f1'..'f15') with its F value.
# f1 is the F = 1 case.
Fs = [{'label': 'f1', 'F': 1.}]

# f2..f10: coarse grid 0.1, 0.2, ..., 0.9 (rounded to hide float noise such as 0.30000000000000004).
coarse_origin = 0.
coarse_step = .1
Fs.extend(
    {'label': 'f' + str(step + 1), 'F': round(coarse_origin + coarse_step * step, 2)}
    for step in range(1, 10)
)

# f11..f15: fine grid 0.99, 0.999, ..., 0.999999 -- every pass appends one more 9.
fine_value = .9
fine_step = .9
for digits in range(2, 7):
    fine_step = fine_step * .1
    fine_value = fine_value + fine_step
    Fs.append({'label': 'f' + str(digits + 9), 'F': round(fine_value, digits)})

Control whether to run on scaled or unscaled data with the following boolean:

In [5]:
# When True, the run reads the 'data/Chunks_Scaled/' files and fits the TimeScaled/EntriesScaled
# columns; when False, it reads 'data/Chunks/' and fits the raw MinutesRemaining/Entries columns.
scaled = False

In [7]:
import pandas as pd
import numpy as np
from ipywidgets import IntProgress
from matplotlib import pyplot as plt

In [10]:
# Load the chunk map; the unique values of its 'Chunk' column name the chunk CSV files processed below.
chunkMapDF = pd.read_csv('data/ChunkMap.csv')

In [11]:
# Calculates the (log-space) A and B values of Ae^Bx for a specific X/Y dataset, based on a forgetting factor.
# The WLS behaves slightly differently for scaled data: there the forgetting factor is applied to the
# distance between the sample's time and the final time (100) instead of to the sample's position.
def leastSquare(X, Y, F, SCALED_DATA=False):
    """Fit ``Y ~ c0 + c1 * X`` by weighted least squares and return ``[c0, c1]``.

    Args:
        X: Sequence of sample positions (1-D).
        Y: Sequence of observations, same length as ``X``.
        F: Forgetting factor used to build the per-sample weights.
        SCALED_DATA: Selects the weighting scheme.
            * ``False``: the i-th sample (0-based, in the order given) gets
              weight ``F ** (i + 2)``, built by repeated multiplication.
              With ``F < 1`` earlier samples therefore weigh more.
            * ``True``: a sample at position ``x`` gets weight
              ``F ** (100 - x)``, i.e. we 'forget' by distance from 100
              (the contest ending) rather than by sample count.

    Returns:
        A length-2 ``numpy.ndarray`` ``[c0, c1]`` (intercept, slope). When the
        normal equations are rank deficient (e.g. a single sample, or all
        ``X`` equal) the minimum-norm least-squares solution is returned.
    """
    x = np.asarray(X, dtype=float)
    y = np.asarray(Y, dtype=float)

    if SCALED_DATA:
        weights = F ** (100 - x)
    else:
        # cumprod multiplies sequentially, reproducing F*F, F*F*F, ... exactly
        # as a running product would; the leading plain F is dropped.
        weights = np.cumprod(np.full(len(x) + 1, F, dtype=float))[1:]

    design_t = np.vstack((np.ones(len(x)), x))  # rows: constant term, x
    weighted_t = design_t * weights             # H^T W

    # Normal equations (H^T W H) theta = H^T W Y, built once.
    normal = weighted_t @ design_t.T
    rhs = weighted_t @ y

    # An exact `det == 0` test misses systems that are singular only up to
    # rounding error, in which case `solve` returns meaningless huge values.
    # Use a tolerance-based rank test instead. Non-finite input skips the rank
    # test (SVD would raise) and falls through to `solve`.
    rank_deficient = (
        np.isfinite(normal).all()
        and np.linalg.matrix_rank(normal) < normal.shape[0]
    )
    if rank_deficient:
        # lstsq accepts singular matrices; rcond=None selects the documented
        # machine-precision cutoff and avoids the legacy-default FutureWarning.
        return np.linalg.lstsq(normal, rhs, rcond=None)[0]
    return np.linalg.solve(normal, rhs)

def raiseAB(A, B):
    """Map log-space fit parameters back to ``y = A * e^(B x)`` form.

    Only ``A`` is exponentiated; ``B`` is passed through unchanged.
    Returns the pair ``(exp(A), B)``.
    """
    amplitude = np.exp(A)
    return amplitude, B

def processRegr(X, Y, F, SCALED_DATA=False):
    """Fit ``y = A * e^(B x)`` to the data and return ``(A, B)``.

    The fit is done in log space: ``leastSquare`` regresses ``log(Y)`` on ``X``
    with forgetting factor ``F``, and ``raiseAB`` exponentiates the intercept.

    Args:
        X: Sequence of sample positions.
        Y: Sequence of observations; passed through ``np.log``.
            NOTE(review): a zero in ``Y`` becomes ``-inf`` after the log and
            will propagate into the result -- confirm callers never pass
            non-positive values.
        F: Forgetting factor forwarded to ``leastSquare``.
        SCALED_DATA: Forwarded to ``leastSquare`` to select the weighting scheme.

    Returns:
        Tuple ``(A, B)`` of the fitted exponential's parameters.
    """
    A, B = leastSquare(X, np.log(Y), F, SCALED_DATA=SCALED_DATA)
    # leastSquare always returns numeric coefficients, so no sentinel check is
    # needed before converting back from log space.
    return raiseAB(A, B)


In [17]:
# Fit every contest of every chunk with every forgetting factor and collect one result row per contest.
chunkNames = chunkMapDF['Chunk'].unique()

# Progress bars: one over chunks, one over the contests of the current chunk
# (its max is reset per chunk below).
chunkBar = IntProgress(min=0, max=len(chunkNames))
contestBar = IntProgress(min=0, max=10000)

print("Chunks:")
display(chunkBar)

print("Contests:")
display(contestBar)

results = []

for chunkName in chunkNames:
    print(chunkName)
    if scaled:
        chunkDF = pd.read_csv('data/Chunks_Scaled/'+chunkName+'.csv')
    else:
        chunkDF = pd.read_csv('data/Chunks/'+chunkName+'.csv')

    contests = chunkDF['ContestId'].unique()
    contestBar.value = 0
    contestBar.max = len(contests)
    for cid in contests:
        cSeriesDF = chunkDF[chunkDF['ContestId'] == cid]
        if scaled:
            # Scaled files carry a precomputed flag for the samples to keep.
            cSeriesDF = cSeriesDF[cSeriesDF['Before4HoursOut']]
            X = list(cSeriesDF['TimeScaled'])
            Y = list(cSeriesDF['EntriesScaled'])
        else:
            # Oldest sample first (largest MinutesRemaining).
            cSeriesDF = cSeriesDF.sort_values(by=['MinutesRemaining'], ascending=False)
            # Keep only samples at least 240 minutes out. A boolean mask is used
            # instead of slicing off the last n rows, because `[:-n]` with n == 0
            # is `[:0]` and would discard every sample of a contest that has no
            # rows inside the final 240 minutes.
            keptDF = cSeriesDF[cSeriesDF['MinutesRemaining'] >= 240]
            # X: minutes elapsed since the contest's first sample; Y: cumulative entries.
            X = list(cSeriesDF['MinutesRemaining'].max() - keptDF['MinutesRemaining'])
            Y = list(keptDF['Entries'].cumsum())

        contest = {'ContestId': cid}

        for fEntry in Fs:
            fName = fEntry['label']
            fValue = fEntry['F']
            # Contests with no usable samples are marked instead of fitted.
            A, B = 'TooShort', 'TooShort'
            if len(X) >= 1:
                A, B = processRegr(X, Y, fValue, SCALED_DATA=scaled)
            contest[fName+'A'], contest[fName+'B'] = A, B
        results.append(contest)
        contestBar.value += 1
    chunkBar.value += 1


Chunks:


IntProgress(value=0, max=65)

Contests:


IntProgress(value=0, max=3)

In [18]:
# Build one row per contest, indexed by ContestId, holding the '<label>A' / '<label>B'
# values collected in `results` for every forgetting factor.
resultsDF = pd.DataFrame(results).set_index('ContestId')
# Write the table to data/LR_Values.csv.
resultsDF.to_csv('data/LR_Values.csv')