In [235]:
%matplotlib inline

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt

### Read the training data

In [2]:
train_df = pd.read_csv('data/train.csv', index_col='row_id', dtype={"place_id": str})

According to the discussions in the forum, the time is most likely measured in minutes, so let's extract some temporal features that might be useful.

### Add temporal features

In [3]:
train_df['hour_of_day'] = (train_df['time'] / 60) % 24
train_df['day_of_week'] = (train_df['time'] / 60 / 24) % 7 # The offset shouldn't matter

### Group by place id

In [18]:
gb = train_df.groupby('place_id')
place_dfs = [gb.get_group(x) for x in gb.groups]

We could convert these new columns to ints, but leaving them as they are now might come in handy.

### Filter out places with low number of check-ins

In [93]:
filtered_place_dfs = [x for x in place_dfs if x.size >= 10]
len(filtered_place_dfs)

108144

### Compute Gaussian per place id

In [192]:
def get_gauss(df):
    X = df[['x', 'y', 'hour_of_day']].values
    if X.shape[0] <= 5:
        return None
    mean = np.mean(X, axis = 0)
    S = 1/X.shape[0] * np.dot((X - mean).T,(X - mean))
    return (df['place_id'].unique()[0], multivariate_normal(mean, S))
    
dists = [get_gauss(place_dfs[i]) for i in range(len(place_dfs))]

In [193]:
dists_dict = dict([d for d in dists if d != None])

### Assign Gaussians to grid

In [236]:
def as_str(x,y):
    return str(round(abs(x), 1)) + str(round(abs(y), 1))

In [237]:
the_grid = {as_str(x,y):[] for x in np.arange(0,10.1,0.1) for y in np.arange(0,10.1,0.1)}
devs = 1
for place in dists_dict.keys():
    dist = dists_dict[place]
    min_x = dist.mean[0] - devs * dist.cov[0][0]
    max_x = dist.mean[0] + devs * dist.cov[0][0]
    min_y = dist.mean[1] - devs * dist.cov[1][1]
    max_y = dist.mean[1] + devs * dist.cov[1][1]
    for x in np.arange(min_x, max_x, 0.1):
        for y in np.arange(min_y, max_y, 0.1):
            if max_y < 10.0 and max_x < 10.0 and min_x >= 0 and min_y >= 0:
                the_grid[as_str(x,y)].append(place)

In [253]:
def predict(x, y, h, grid, dists_dict):
    candidates = grid[as_str(x, y)]
    scored = [(p, dists_dict[p].pdf([x,y,h])) for p in candidates]
    return sorted(scored, key=lambda x: -x[1])
        
def format(sorted_list):
    return ' '.join([s[0] for s in sorted_list[:3]])

format(predict(8.3078, 7.0407, 16.12, the_grid, dists_dict))

'3108247878 5274648504 8056933091'

### Make predictions

In [254]:
test_df = pd.read_csv('data/test.csv', index_col='row_id')

In [256]:
test_df['hour_of_day'] = (test_df['time'] / 60) % 24

In [257]:
test_df

Unnamed: 0_level_0,x,y,accuracy,time,hour_of_day
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.1675,1.3608,107,930883,10.716667
1,7.3909,2.5301,35,893017,3.616667
2,8.0978,2.3473,62,976933,10.216667
3,0.9990,1.0591,62,907285,1.416667
4,0.6670,9.7254,40,914399,23.983333
5,0.1771,0.0022,161,814077,7.950000
6,5.5299,4.6581,2,930759,8.650000
7,8.6021,3.1744,4,862115,16.583333
8,4.2250,6.3435,162,787391,19.183333
9,0.6489,6.2611,39,793166,19.433333


In [267]:
test_df["place_id"] = test_df.apply(lambda r: format(predict(r["x"], r["y"], r["hour_of_day"], the_grid, dists_dict)), axis=1)

In [268]:
!rm submission.csv
test_df[["place_id"]].to_csv("submission.csv", index=True)