In [42]:
import pandas as pd
import numpy as np
import os

In [43]:
def getSeriesDF(cid, fileName):
    """Load one contest's rows from a chunk CSV and add two derived columns.

    Parameters
    ----------
    cid : value compared against the ``ContestId`` column.
    fileName : str
        Base name (no extension) of a CSV under ``data/Chunks/``.

    Returns
    -------
    pandas.DataFrame
        The rows of the chunk whose ``ContestId`` equals ``cid`` (with the
        ``Unnamed: 0`` column removed), plus:

        * ``InvertedTime``  - max(``SecondsRemaining``) minus ``SecondsRemaining``.
        * ``SummedEntries`` - cumulative sum of ``Entries`` accumulated from
          the last row towards the first.
    """
    chunk = pd.read_csv('data/Chunks/' + fileName + ".csv")
    chunk = chunk.drop(columns=['Unnamed: 0'])
    contestRows = chunk.loc[chunk['ContestId'] == cid]

    remaining = contestRows['SecondsRemaining']
    contestRows = contestRows.assign(InvertedTime=remaining.max() - remaining)

    # Reverse, accumulate, reverse back: each row holds the total of its own
    # Entries plus those of every row after it.
    entriesFromEnd = contestRows['Entries'][::-1].cumsum()[::-1]
    contestRows = contestRows.assign(SummedEntries=entriesFromEnd)
    return contestRows
def setupSeries(df):
    """Return a two-column (X, Y) frame ordered by X.

    ``X`` is taken from ``df['InvertedTime']`` and ``Y`` from
    ``df['SummedEntries']``. The input frame is not modified; the original
    index labels are kept on the sorted rows.
    """
    xy = df[["InvertedTime", "SummedEntries"]].rename(
        columns={"InvertedTime": "X", "SummedEntries": "Y"}
    )
    return xy.sort_values(by=['X'])
def defaultParameters(df):
    """Build the default parameter dictionary consumed by ``kalmanFilter``.

    All random quantities are drawn from a single ``RandomState`` seeded with
    0, so two calls with the same ``df`` return identical values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Y`` column (the observations).

    Returns
    -------
    dict with keys
        ``random_state`` - the seeded generator (already advanced by the draws below),
        ``F``  - 2x2 identity transition matrix,
        ``Z``  - ``df['Y']`` as a plain list,
        ``X0`` - initial state guess as a 2x1 nested list,
        ``P0`` - initial 2x2 diagonal covariance,
        ``R``  - scalar measurement-noise value,
        ``Q``  - 2x2 diagonal process-noise covariance (nested list),
        ``w``  - one sample from N(0, Q).
    """
    random_state = np.random.RandomState(0)
    F = np.eye(2)  # Transition matrix (identity)

    Z = list(df['Y'])  # Observations
    X0 = [[4.], [.0023]]  # Initial parameter guess -- TODO: still needs justified values
    # Multiplying by np.eye(2) element-wise zeroes the off-diagonal terms, so
    # P0 is a diagonal matrix of roughly 100 +/- 10 per entry.
    P0 = (np.eye(2) + random_state.randn(2, 2)*.1)*100*np.eye(2)
    R = random_state.rand()*5.05
    Q = [[np.power(9., 62), 0.], [0., np.power(6., 65)]]
    # Draw from the seeded generator (not the global np.random) so that 'w'
    # is reproducible like P0 and R. It is drawn last, which leaves the
    # values of P0 and R unchanged.
    w = random_state.multivariate_normal([0., 0.], Q)
    return {'random_state':random_state, 'F':F, 'Z':Z, 'X0':X0, 'P0':P0, 'R':R, 'Q':Q, 'w':w}
def kalmanFilter(series, parameters, extended=True):
    """Run a two-parameter (extended) Kalman filter over ``series``.

    Parameters
    ----------
    series : pandas.DataFrame
        Rows are read positionally; each row must provide ``X`` (time) and
        ``Y`` (observed value).
    parameters : dict
        Uses the keys ``P0`` (initial 2x2 covariance), ``X0`` (initial 2x1
        state), ``Q`` (added to the covariance at every step) and ``R``
        (scalar measurement noise).
    extended : bool
        ``True``  - measurement model ``x0 * exp(x1 * t)``, linearised around
        the current state.
        ``False`` - linear model ``x0 + x1 * t``; the predicted measurement
        additionally gets a random normal perturbation with scale ``R`` drawn
        from the global ``np.random`` (so this branch is not deterministic).

    Returns
    -------
    list of [x0, x1]
        One entry per row, holding the state as it was *before* that row's
        update was applied.
        NOTE(review): as a consequence the state produced by the final
        update is not part of the returned list - confirm this is intended.
    """
    history = []

    covariance = parameters['P0']
    state = parameters['X0']
    for row in range(len(series)):
        observed = np.array(series.iloc[row]['Y'])  # current actual value
        t = np.array(series.iloc[row]['X'])  # current time
        # Predict step: the transition is the identity, so only the process
        # noise is added to the covariance.
        covariance = covariance + parameters["Q"]

        if extended:
            growth = np.exp(state[1][0]*t)
            # Partial derivatives of x0*exp(x1*t) with respect to x0 and x1.
            jacobian = [growth, growth*(state[0][0]*t)]
            predicted = state[0][0]*np.exp(state[1][0]*t)
        else:
            jacobian = [1., t]
            # NOTE(review): noise is added to the *prediction* here; the
            # original author flagged this line for reconsideration.
            predicted = np.dot(jacobian, state) + np.random.normal(loc=0., scale=parameters['R'])

        residual = observed - predicted
        jacobianT = [[jacobian[0]], [jacobian[1]]]
        innovationVar = np.dot(np.dot(jacobian, covariance), jacobianT)[0]+parameters['R']
        gain = np.dot(covariance, jacobianT)*(1/innovationVar)

        # Record the pre-update state, then apply the update.
        history.append([state[0][0], state[1][0]])
        state = state + gain*residual
        covariance = np.dot(np.eye(2)-np.outer(gain, jacobian), covariance)
    return history

def timeString():
    """Return the current local time as ``year-month-day-hour-minute``.

    The fields are joined with ``-`` and are not zero-padded
    (e.g. ``2020-1-5-9-3``).
    """
    stamp = datetime.datetime.now()
    fields = (stamp.year, stamp.month, stamp.day, stamp.hour, stamp.minute)
    return "-".join(str(field) for field in fields)

In [5]:
# Collect the CSV files found in data/results/.
# os.listdir() returns entries in arbitrary order; the list is sorted so that
# positional access (e.g. fList[0]) picks the same file on every run/platform.
dirList = os.listdir("data/results/")
fList = sorted(name for name in dirList if name.endswith('.csv'))

In [30]:
# Load the first listed results file, drop the 'Unnamed: 0' column and
# key the frame by ContestId.
kalmanResultsDF = (
    pd.read_csv('data/results/' + fList[0])
    .drop(columns=['Unnamed: 0'])
    .set_index('ContestId')
)

In [44]:
import datetime
from ipywidgets import IntProgress

def getTimeDif(startTime, endTime):
    """Return the elapsed time from ``startTime`` to ``endTime`` in minutes.

    Both arguments are strings. Their last four characters are discarded
    (presumably a fractional-seconds or zone suffix - confirm against the
    data) and the remainder is parsed as ``YYYY-MM-DD HH:MM:SS``.

    Raises ``ValueError`` if either trimmed string does not match the format.
    """
    stampFormat = '%Y-%m-%d %H:%M:%S'
    begin, finish = (
        datetime.datetime.strptime(stamp[:-4], stampFormat)
        for stamp in (startTime, endTime)
    )
    elapsed = finish - begin
    return elapsed.total_seconds() / 60

def getDuration(df):
    """Return the contest length in minutes for one record.

    ``df`` is anything indexable by the keys ``ContestStartDatetimeEST`` and
    ``ContestEndDatetimeEST`` (e.g. a DataFrame row); both values are passed
    to ``getTimeDif``.
    """
    startStamp = df['ContestStartDatetimeEST']
    endStamp = df['ContestEndDatetimeEST']
    return getTimeDif(startStamp, endStamp)

def getAllDurations(df, showBar=False):
    """Compute ``getDuration`` for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Iterated row by row with ``iterrows``.
    showBar : bool
        When true, an ``IntProgress`` widget sized to ``len(df)`` is displayed
        and advanced by one for each row.

    Returns
    -------
    list
        One duration per row, in row order.
    """
    progress = None
    if showBar:
        progress = IntProgress(min=0, max=len(df))
        display(progress)

    durations = []
    for _, contest in df.iterrows():
        if progress is not None:
            progress.value += 1
        durations.append(getDuration(contest))
    return durations

In [38]:
# Load the working data keyed by ContestId and keep only the two columns used
# downstream. (An 'ActualEntries' copy of 'Entries' used to be created here,
# but the column selection dropped it immediately, so it was dead work.)
contests = pd.read_csv('data/WorkingData.csv').set_index('ContestId')
contests = contests[['Entries', 'Duration']]

In [60]:
# Attach the Kalman results to the contest data; the right join keeps every
# contest present in kalmanResultsDF.
# NOTE(review): this rebinds `contests`, so re-running the cell merges an
# already-merged frame - re-run the loading cell first.
contests = contests.merge(kalmanResultsDF, on='ContestId', how='right')

In [96]:
# Preview the first five merged rows (head() defaults to n=5).
contests.head()

Unnamed: 0_level_0,Entries,Duration,Xf,label
ContestId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
56776765,29726.0,0.0,"[11.594546265679089, 0.0010912673212833738]",v4
56776771,200.0,0.0,"[1.3753119073654418, 0.000703526183774105]",v4
56779283,7.0,90.0,"[0.9554694297179177, 0.0007013052708854483]",v4
56779285,134.0,90.0,"[1.14749658089893, 0.0010996846268103446]",v4
56780720,142.0,90.0,"[2.499054577653936, 0.0009116698005067062]",v4


In [64]:
def AeBx(A, B, x):
    """Evaluate the exponential model ``A * exp(B * x)``."""
    growth = np.exp(B*x)
    return A*growth
def Xf_to_AB(Xf):
    """Parse a stringified two-element list such as ``"[11.59, 0.00109]"``.

    Parameters
    ----------
    Xf : str
        Text of the form ``[A, B]``. Surrounding whitespace and any spacing
        around the comma are tolerated.

    Returns
    -------
    list of float
        ``[A, B]``. Elements after the second, if any, are ignored.

    Raises
    ------
    ValueError
        If one of the first two elements is not a valid float.
    IndexError
        If fewer than two comma-separated elements are present.
    """
    # Split on the bare comma: float() already ignores surrounding blanks, so
    # "[1,2]", "[1, 2]" and "[1 ,  2]" all parse the same way.
    parts = Xf.strip()[1:-1].split(',')
    return [float(parts[0]), float(parts[1])]

In [117]:
# Sanity check of the model: 10 * exp(0.0001 * 40000) = 10 * e**4.
AeBx(A=10, B=.0001, x=40000)

545.9815003314424

In [104]:
# Evaluate each contest's fitted curve A*exp(B*x) at x = Duration and show the
# result next to the actual entries. The GuessVal column exists only on the
# displayed copy; `contests` itself is not changed.
# NOTE(review): getTimeDif returns minutes while the filter's time axis is
# built from 'SecondsRemaining' - confirm 'Duration' uses the unit the
# coefficients in 'Xf' were fitted with.
results = [
    AeBx(*Xf_to_AB(row['Xf']), row['Duration'])
    for _, row in contests.iterrows()
]
contests.assign(GuessVal=results)

Unnamed: 0_level_0,Entries,Duration,Xf,label,GuessVal
ContestId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
56776765,29726.0,0.0,"[11.594546265679089, 0.0010912673212833738]",v4,11.594546
56776771,200.0,0.0,"[1.3753119073654418, 0.000703526183774105]",v4,1.375312
56779283,7.0,90.0,"[0.9554694297179177, 0.0007013052708854483]",v4,1.017720
56779285,134.0,90.0,"[1.14749658089893, 0.0010996846268103446]",v4,1.266876
56780720,142.0,90.0,"[2.499054577653936, 0.0009116698005067062]",v4,2.712750
56780721,142.0,90.0,"[4.7540987445029455, 0.0007718225005282453]",v4,5.096078
56780722,35.0,90.0,"[1.9644825384989988, 0.0006528070212149298]",v4,2.083359
56804513,35671.0,0.0,"[122.433939151377, 0.0010341340513713771]",v4,122.433939
56804550,396.0,0.0,"[5.901560902032439, 0.0007551773929510637]",v4,5.901561
56804551,73.0,0.0,"[3.1183726455460214, 0.0005639369021338479]",v4,3.118373


In [92]:
# `c` was never defined at notebook scope, so this cell raised NameError on a
# fresh kernel. The recorded output (11.594546265679089) equals the first
# row's A coefficient, so the first row is used here - verify that this is the
# contest that was meant.
c = contests.iloc[0]
AB = Xf_to_AB(c['Xf'])
AB[0]


11.594546265679089