In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
import math

### Read the training data

In [2]:
train_df = pd.read_csv('data/train.csv', dtype={"place_id": str})

According to the discussions in the forum, the time is most likely measured in minutes, so let's extract some temporal features that might be useful.

### Add temporal features

In [3]:
train_df['hour_of_day'] = (train_df['time'] / 60) % 24
train_df['day_of_week'] = (train_df['time'] / 60 / 24) % 7 # The offset shouldn't matter

### Group by place id

In [4]:
gb = train_df.groupby('place_id')
place_dfs = {x:gb.get_group(x) for x in gb.groups}

### Filter out places with low number of check-ins

In [5]:
place_dfs_filtered = {k:place_dfs[k] for k in place_dfs.keys() if place_dfs[k].shape[0] >= 10}
len(place_dfs_filtered)

107189

### Compute Gaussian per place id based on (x,y) coords

In [6]:
def get_gauss(df, cols):
    X = df[cols].values
    mean = np.mean(X, axis = 0)
    S = 1/X.shape[0] * np.dot((X - mean).T,(X - mean))
    return (mean, S)

In [7]:
gausss = {p:get_gauss(place_dfs_filtered[p], ['x', 'y']) for p in place_dfs_filtered.keys()}

### Build the grid and assign places to each cell

In [8]:
def gridify(x): return int(round(min(max(float(x), 0.0), 9.9), 1) * 10)

In [9]:
def grid_str(x, y): return str(gridify(x)) + 'x' + str(gridify(y))

In [10]:
def grid_range(min_x, max_x): return np.arange(gridify(min_x), gridify(max_x) + 1, 1)

In [11]:
the_grid = {grid_str(x,y):[] for x in np.arange(0,10,0.1) for y in np.arange(0,10,0.1)}

In [12]:
deviations = 1.0
for place in gausss.keys():
    mean = gausss[place][0]
    S = gausss[place][1]
    min_x = mean[0] - deviations * S[0][0]
    max_x = mean[0] + deviations * S[0][0]
    min_y = mean[1] - deviations * S[1][1]
    max_y = mean[1] + deviations * S[1][1]
    for x in grid_range(min_x, max_x):
        for y in grid_range(min_y, max_y):
            the_grid[grid_str(x/10.0,y/10.0)].append(place)

### Some examples to validate the grid construction

In [17]:
the_grid[grid_str(5.90, 9.71)]

['7396606135',
 '9345928370',
 '1202174131',
 '8812223743',
 '4389214681',
 '4436576431',
 '7812440412',
 '7787555129',
 '5834556066',
 '5468513105',
 '6819069525',
 '8676578772',
 '5179610484',
 '2127493647',
 '2958153071',
 '2131338217',
 '7707356316',
 '4345595792',
 '2921317722',
 '7361785084',
 '8166141888',
 '6186310668',
 '8447149812',
 '2767943404',
 '9250717868',
 '6899880657',
 '5822153901',
 '7638415617',
 '2235013688',
 '1396024553',
 '9810305662',
 '1541783326',
 '6876611921',
 '6764973134',
 '4333143859',
 '7129952781',
 '1915587512',
 '6579888064',
 '9828111352',
 '7930096261',
 '8206434529',
 '4109490262',
 '3862195903',
 '9700198309',
 '6253132900',
 '8273116638',
 '2085279618',
 '2868026451',
 '5448696641',
 '5005915268',
 '9190010373',
 '1829727440',
 '5970467097',
 '6230476329',
 '6604722320',
 '1807952336',
 '2967548636',
 '2495692883',
 '3951099526',
 '6497749560',
 '1167622483',
 '6833060534',
 '5657611893',
 '9130838562',
 '1491534683',
 '1278722489',
 '96478856

In [21]:
place_dfs_filtered['9130838562'][['x', 'y']]

Unnamed: 0,x,y
218955,2.6857,9.7130
294230,3.4975,9.7143
353969,3.7660,9.6991
749249,4.6828,9.7319
847809,4.2385,9.7274
901542,5.5014,9.7189
1076722,4.4909,9.7205
2454391,1.7527,9.7294
3124188,3.0296,9.7067
3888542,4.6041,9.7137


### Define the models here

In [44]:
# The models should have a 'fit' method that takes 
# a dict {"place_id":DataFrame_of_check_ins} as a param
# and a 'predict' method that takes a vector representing a 
# check in and returnes the ranked list of top 3 as 
# expected for the submission

class GdaModel:
    def fit(self, places, dfs, cols):
        self.places = places
        nof_check_ins = [dfs[p].shape[0] for p in places]
        total = sum(nof_check_ins)
        self.prior = np.array(nof_check_ins)/total
        
        params = [get_gauss(dfs[p],cols) for p in places]
        self.gaussians = [multivariate_normal(g[0], g[1]) for g in params]
        
    def predict(self, check_in):
        pdfs = [g.pdf(check_in) for g in self.gaussians]
        probs = list(np.array(pdfs) * np.array(self.prior))
        top = sorted(zip(places, probs), key=lambda x: -x[1])[:3]
        return ' '.join([t[0] for t in top])

### Test it

In [75]:
places = the_grid[grid_str(5.90, 9.71)]
gda = GdaModel()
gda.fit(places, place_dfs_filtered, ['x', 'y', 'hour_of_day'])
params = [get_gauss(place_dfs_filtered[p],['x', 'y', 'hour_of_day']) for p in places]
multivariate_normal(params[0][0], params[0][1]).pdf([5.4, 9.8, 10.0])
gda.predict([[5.4, 9.71, 10.0]])

'7099936234 2085279618 9062035227'

In [76]:
place_dfs_filtered['7099936234']

Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour_of_day,day_of_week
1019,1019,6.0703,9.7183,51,372186,7099936234,11.100000,6.462500
10038,10038,6.0438,9.7181,60,92432,7099936234,4.533333,1.188889
21459,21459,6.0383,9.7142,64,481352,7099936234,6.533333,5.272222
22551,22551,6.0344,9.6984,63,452675,7099936234,8.583333,6.357639
28278,28278,6.0224,9.6939,65,373080,7099936234,2.000000,0.083333
77579,77579,6.0182,9.7057,67,757847,7099936234,6.783333,1.282639
81018,81018,6.0450,9.7203,106,694550,7099936234,7.833333,6.326389
97541,97541,6.0660,9.7178,308,494370,7099936234,7.500000,0.312500
105287,105287,6.0605,9.7126,66,315518,7099936234,2.633333,2.109722
108848,108848,6.0682,9.7275,57,543436,7099936234,9.266667,6.386111


### Train the models for each cell in the grid

In [45]:
models_grid = {}
for k in the_grid.keys():
    places = the_grid[k]
    gda = GdaModel()
    gda.fit(places, place_dfs_filtered, ['x', 'y', 'hour_of_day'])
    models_grid[k] = gda

### Some examples to validate the model

In [125]:
def predict(x, y, h):
    key = grid_str(x,y)
    return ' '.join([t[0] for t in sorted([(p[0], p[1]*p[2].pdf([x,y,h]))for p in zip(models_grid[key].places, models_grid[key].prior, models_grid[key].gaussians)], key=lambda x: -x[1])[:3]])
predict(1.33, 6.51, 10.716667)

'5581938781 4879377862 7768371665'

In [120]:
place_dfs_filtered['7768371665']

Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour_of_day,day_of_week
2746,2746,0.9184,6.5192,107,568363,7768371665,16.716667,2.696528
7342,7342,0.9532,6.5205,73,457665,7768371665,19.750000,2.822917
25192,25192,0.7035,6.5217,10,554505,7768371665,1.750000,0.072917
30179,30179,1.1290,6.5058,67,505337,7768371665,22.283333,0.928472
40434,40434,0.9677,6.5202,69,550825,7768371665,12.416667,4.517361
63366,63366,0.9395,6.5107,59,387018,7768371665,18.300000,2.762500
81685,81685,0.9798,6.5104,3,124984,7768371665,19.066667,2.794444
119726,119726,0.9699,6.5206,13,741638,7768371665,0.633333,4.026389
165716,165716,0.9450,6.5343,3,195388,7768371665,16.466667,2.686111
200409,200409,0.9554,6.5220,3,156336,7768371665,13.600000,3.566667


### Make predictions

In [127]:
test_df = pd.read_csv('data/test.csv')

In [128]:
test_df['hour_of_day'] = (test_df['time'] / 60) % 24

In [129]:
test_df["place_id"] = test_df.apply(lambda r: predict(r['x'], r['y'], r['hour_of_day']), axis=1)

In [130]:
!rm submission.csv
test_df[["row_id","place_id"]].to_csv("submission.csv", index=False)