In [1]:
import pandas as pd
import numpy as np

from ipywidgets import IntProgress


In [2]:
# Chunk index: its 'Chunk' column supplies the chunk file names iterated over below.
chunkMapDF = pd.read_csv('data/ChunkMap.csv')
# Noise-setting grid: each row provides a 'label' plus the 'Q1'/'Q2' values passed to kParameters.
# NOTE(review): this file is read from the working directory, not from 'data/' — confirm the path.
qrVals = pd.read_csv('QR_Values.csv')

In [3]:
def kParameters(X, Y, QA, QB):
    """Build the parameter dictionary consumed by ``kalmanFilter``.

    Parameters
    ----------
    X : sequence
        Time values of the series. Accepted for symmetry with
        ``kalmanFilter`` but not used here.
    Y : sequence
        Observations; stored unchanged under the ``'Z'`` key.
    QA, QB : float
        Diagonal entries of the process-noise covariance ``Q``.

    Returns
    -------
    dict
        Keys ``'random_state'``, ``'F'``, ``'Z'``, ``'X0'``, ``'P0'``,
        ``'R'``, ``'Q'`` and ``'w'``.

    Notes
    -----
    Every random quantity is drawn from the locally seeded ``RandomState(0)``,
    so two calls with the same arguments return identical values and the
    global ``np.random`` stream is left untouched.
    """
    random_state = np.random.RandomState(0)
    F = np.eye(2)  # Transition matrix (identity)
    Z = Y  # Observations
    X0 = [[4.], [.0023]]  # Initial state guess as a 2x1 column; hard-coded starting values
    # Initial covariance: the element-wise product with eye(2) keeps only the
    # (randomly perturbed) diagonal, scaled by 100.
    P0 = (np.eye(2) + random_state.randn(2, 2)*.1)*100*np.eye(2)
    R = 1  # Observation-noise term
    Q = [[QA, 0.], [0., QB]]  # Process-noise covariance (diagonal)
    # Draw the process-noise sample from the seeded generator rather than the
    # global one so the result is reproducible across calls.
    w = random_state.multivariate_normal([0., 0.], Q)
    return {'random_state':random_state, 'F':F, 'Z':Z, 'X0':X0, 'P0':P0, 'R':R, 'Q':Q, 'w':w}

def kalmanFilter(X, Y, parameters, extended=True):
    """Run a two-state Kalman filter over a series and return the last recorded state.

    The state is a 2x1 column ``[[a], [b]]``. With ``extended=True`` the
    observation model is ``a * exp(b * t)`` and is linearised at the current
    state; otherwise the model is ``a + b * t`` plus a normal draw with scale
    ``parameters['R']``.

    Parameters
    ----------
    X : sequence
        Time values, indexed positionally.
    Y : sequence
        Observed values, indexed in step with ``X``.
    parameters : dict
        Must provide ``'P0'``, ``'X0'``, ``'Q'`` and ``'R'``.
    extended : bool, default True
        Selects the exponential (linearised) model over the linear one.

    Returns
    -------
    list
        ``[a, b]`` as recorded at the final step, or
        ``['TooShort', 'TooShort']`` when ``X`` is empty.

    Notes
    -----
    NOTE(review): the state is recorded *before* the update for that step is
    applied, so the returned pair does not include the correction from the
    last observation — confirm this lag is intended.
    """
    covariance = parameters['P0']
    state = parameters['X0']
    latest = None  # [a, b] captured at the most recent step; None until one step has run

    for idx in range(len(X)):
        t = X[idx]
        observed = Y[idx]

        # Prediction step: the transition is the identity, so only the
        # covariance changes (inflated by the process noise Q).
        covariance = covariance + parameters["Q"]

        a = state[0][0]
        b = state[1][0]
        if extended:
            growth = np.exp(b*t)
            # Partial derivatives of a*exp(b*t) with respect to a and b.
            H = [growth, growth*(a*t)]
            predicted = a*growth
        else:
            H = [1., t]
            predicted = np.dot(H, state) + np.random.normal(loc=0., scale=parameters['R'])

        residual = observed - predicted
        H_col = [[H[0]], [H[1]]]  # H as a column for the matrix products below
        innovation_var = np.dot(np.dot(H, covariance), H_col)[0] + parameters['R']
        gain = np.dot(covariance, H_col)*(1/innovation_var)

        # Record the state as it stood entering this step, then apply the update.
        latest = [a, b]
        updated_state = state + gain*residual
        covariance = np.dot(np.eye(2) - np.outer(gain, H), covariance)
        state = updated_state

    if latest is None:
        # No observations were processed at all.
        return ['TooShort', 'TooShort']
    return latest

In [None]:
# Fit the Kalman filter to every contest in every chunk, once per row of qrVals.
chunkNames = chunkMapDF['Chunk'].unique()[:]

# Two progress bars: chunks overall, and contests within the current chunk.
chunkBar = IntProgress(min=0, max=len(chunkNames))
contestBar = IntProgress(min=0, max=10000)

print("Chunks:")
display(chunkBar)

print("Contests:")
display(contestBar)

results = []  # one dict per contest: ContestId plus an 'A<label>'/'B<label>' pair per qrVals row

for chunkName in chunkNames:
    print(chunkName)
    # Sort so each contest's rows run from the largest MinutesRemaining to the smallest.
    chunkDF = pd.read_csv('data/Chunks/'+chunkName+'.csv').drop(columns=['Unnamed: 0']).sort_values(by=['MinutesRemaining'], ascending=False)
    
    contests = chunkDF['ContestId'].unique()[:]
    contestBar.value=0
    contestBar.max = len(contests)
    for cid in contests:
        cSeriesDF = chunkDF[chunkDF['ContestId']==cid]
        # Keep only rows with at least 240 minutes remaining. A boolean mask is
        # used instead of a negative slice because `[:-0]` selects nothing, which
        # would discard the whole series whenever no row falls under 240 minutes.
        keep = cSeriesDF['MinutesRemaining'] >= 240

        # Time axis: minutes elapsed since the contest's largest MinutesRemaining.
        X = cSeriesDF['MinutesRemaining'].max() - cSeriesDF['MinutesRemaining'][keep]
        # Running total of entries, computed over the full series and then trimmed.
        Y = cSeriesDF['Entries'].cumsum()[keep]
        X, Y = list(X), list(Y)        
        contest = {'ContestId':cid}
        for _, qrObj in qrVals.iterrows():
            label = qrObj['label']
            P = kParameters(X, Y, float(qrObj['Q1']), float(qrObj['Q2']))
            val = kalmanFilter(X, Y, P)  
            
            contest['A'+label] = val[0]
            contest['B'+label] = val[1]
        results.append(contest)
        contestBar.value+=1
    chunkBar.value+=1


Chunks:


IntProgress(value=0, max=65)

Contests:


IntProgress(value=0, max=10000)

chunk1




chunk2




chunk3
chunk4
chunk5
chunk6
chunk7
chunk8
chunk9
chunk10
chunk11
chunk12
chunk13
chunk14
chunk15
chunk16
chunk17
chunk18
chunk19
chunk20
chunk21
chunk22
chunk23
chunk24
chunk25
chunk26
chunk27
chunk28
chunk29
chunk30
chunk31
chunk32
chunk33
chunk34
chunk35
chunk36


In [207]:
# Turn the list of per-contest dicts into a table: one row per contest, one column per dict key.
rDF = pd.DataFrame(results)

In [211]:
# Preview the first five rows of the fitted values.
rDF.head(n=5)

Unnamed: 0,Av1,Av10,Av2,Av3,Av4,Av5,Av6,Av7,Av8,Av9,...,Bv10,Bv2,Bv3,Bv4,Bv5,Bv6,Bv7,Bv8,Bv9,ContestId
0,3.69333,16.2849,3.72745,20.3159,8.83348,5.79256,12.6975,,31.8214,36.55,...,0.00013292,0.000153986,0.000129761,0.000141659,0.000147688,0.000136475,,0.00012335,0.00012137,55826033
1,3.80067,33.7846,3.85889,44.2676,14.5629,8.52903,24.4516,3.01842,74.0928,99.5069,...,0.000161099,0.000227511,0.000152826,0.000186858,0.000203234,0.000170995,0.00023503,0.00013706,0.000128033,56610037
2,1.00035,0.00883208,1.00031,4.12504e-16,0.988483,0.998862,0.881227,1.00268,1.18507e-19,4.3087299999999996e-21,...,0.00043565,0.000231829,0.00175842,0.000232342,0.000231891,0.000237291,0.000231727,0.00210986,0.00225268,56751298
3,1.09548,5.62037,1.10574,6.59058,2.87978,1.97619,4.42187,1.00399,2.19328e-09,3.1816300000000005e-18,...,0.000221068,0.000297001,0.000213631,0.000252297,0.000269883,0.000232269,0.000301509,0.00123285,0.00218331,56859642
4,1.00055,7.70277e-10,1.00053,1.28382e-12,1.08517,1.01239,1.31506,1.00268,3.15993e-15,1.62217e-15,...,0.00152566,0.000289738,0.00190242,0.000284955,0.000289044,0.000273639,0.000289611,0.00225621,0.00229548,57008091


In [209]:
# Number of contests whose row has no missing value in any column.
rDF.dropna().shape[0]

638723

In [210]:
# Persist the fitted values. With pandas' default settings the row index is
# written too, as an unnamed first column of the CSV.
rDF.to_csv('data/KF_Values.csv')

In [9]:
# Peek at one scaled chunk to check its columns before processing all of them.
# NOTE(review): the file name here is 'Chunk1.csv', while the chunk names printed
# by the loops are lower-case ('chunk1') — confirm on a case-sensitive filesystem.
d = pd.read_csv('data/Chunks_Scaled/Chunk1.csv')
d.head(1)

Unnamed: 0,ContestId,EntriesScaled,TimeScaled,Before4HoursOut
0,55826033,0.000486,0.0,True


In [12]:
# Same fit as on the raw chunks, but on the pre-scaled series: each contest is
# filtered to its rows flagged Before4HoursOut and fitted once per qrVals row.
chunkNames = chunkMapDF['Chunk'].unique()[:]

chunkBar = IntProgress(min=0, max=len(chunkNames))
contestBar = IntProgress(min=0, max=100)

# Show each progress bar under its caption.
for caption, bar in (("Chunks:", chunkBar), ("Contests:", contestBar)):
    print(caption)
    display(bar)

results = []

for chunkName in chunkNames:
    print(chunkName)
    scaledDF = pd.read_csv('data/Chunks_Scaled/'+chunkName+'.csv')

    contestIds = scaledDF['ContestId'].unique()[:]
    # Restart the inner bar for this chunk.
    contestBar.value = 0
    contestBar.max = len(contestIds)

    for cid in contestIds:
        # Rows of this contest, restricted to those flagged Before4HoursOut.
        contestRows = scaledDF[scaledDF['ContestId'] == cid]
        earlyRows = contestRows[contestRows['Before4HoursOut']]

        times = list(earlyRows['TimeScaled'])
        entries = list(earlyRows['EntriesScaled'])

        row = {'ContestId': cid}
        # One filter run per noise setting; results are stored under A<label> / B<label>.
        for label, q1, q2 in zip(qrVals['label'], qrVals['Q1'], qrVals['Q2']):
            settings = kParameters(times, entries, float(q1), float(q2))
            fitted = kalmanFilter(times, entries, settings)
            row['A'+label] = fitted[0]
            row['B'+label] = fitted[1]

        results.append(row)
        contestBar.value += 1
    chunkBar.value += 1


Chunks:


IntProgress(value=0, max=65)

Contests:


IntProgress(value=0)

chunk1
chunk2
chunk3
chunk4
chunk5
chunk6




chunk7
chunk8
chunk9
chunk10
chunk11
chunk12
chunk13
chunk14
chunk15
chunk16
chunk17
chunk18
chunk19
chunk20
chunk21
chunk22
chunk23
chunk24
chunk25
chunk26
chunk27
chunk28
chunk29
chunk30
chunk31
chunk32
chunk33
chunk34
chunk35
chunk36
chunk37
chunk38
chunk39
chunk40
chunk41
chunk42
chunk43
chunk44
chunk45
chunk46
chunk47
chunk48
chunk49
chunk50
chunk51
chunk52
chunk53
chunk54
chunk55




chunk56
chunk57
chunk58
chunk59
chunk60
chunk61
chunk62
chunk63
chunk64
chunk65


In [13]:
# Turn the scaled-run list of per-contest dicts into a table (one row per contest).
rDF = pd.DataFrame(results)

In [17]:
# Number of contests whose row has no missing value in any column.
rDF.dropna().shape[0]

629290

In [18]:
# Persist the scaled-run values. With pandas' default settings the row index is
# written too, as an unnamed first column of the CSV.
rDF.to_csv('data/KF_Scaled_Values.csv')