In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import math

### Read the training data

In [2]:
# Load the training check-ins. place_id is read as a string rather than a
# number (presumably so ids are handled as opaque labels -- confirm).
train_path = 'data/train.csv'
train_df = pd.read_csv(train_path, dtype={"place_id": str})

According to the forum discussions, `time` is most likely measured in minutes, so let's extract some temporal features (hour of day and day of week) that might be useful.

### Add temporal features

In [3]:
# Derive cyclic time features, treating `time` as a count of minutes.
# minutes -> hours, folded into a 24-hour cycle.
train_df['hour_of_day'] = train_df['time'].div(60).mod(24)
# minutes -> hours -> days, folded into a 7-day cycle.
# The offset shouldn't matter.
train_df['day_of_week'] = train_df['time'].div(60).div(24).mod(7)

### Group by place id

In [4]:
# Split the training frame into one sub-frame per place_id.
gb = train_df.groupby('place_id')
# Iterating a GroupBy yields (group key, group frame) pairs.
place_dfs = {place_id: checkins for place_id, checkins in gb}

### Filter out places with fewer than 5 check-ins

In [5]:
# Keep only the places that have at least 5 check-ins (rows).
place_dfs_filtered = {
    place_id: checkins
    for place_id, checkins in place_dfs.items()
    if len(checkins) >= 5
}
# Show how many places survive the filter.
len(place_dfs_filtered)

107688

### Compute Gaussian per place id based on (x,y) coords

In [6]:
def get_gauss(df, cols):
    """Fit a Gaussian (mean vector and covariance matrix) to selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the observations, one per row.
    cols : list of str
        Names of the numeric columns to use as the Gaussian's dimensions.

    Returns
    -------
    tuple
        ``(mean, S)`` where ``mean`` is the per-column mean and ``S`` is the
        covariance matrix normalised by the number of rows (not ``n - 1``).

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    X = df[cols].values
    n_rows = X.shape[0]
    if n_rows == 0:
        raise ValueError("get_gauss requires at least one row")
    mean = np.mean(X, axis=0)
    centered = X - mean
    # Divide by a float: the former `1/n * ...` form evaluates `1/n` to 0
    # under integer division (Python 2), silently zeroing the covariance.
    S = np.dot(centered.T, centered) / float(n_rows)
    return (mean, S)

In [7]:
# Fit an (x, y) Gaussian for every place that passed the check-in filter.
gausss = {
    place_id: get_gauss(checkins, ['x', 'y'])
    for place_id, checkins in place_dfs_filtered.items()
}

### Build the grid and assign places to each cell

In [8]:
def gridify(x):
    """Map a coordinate to its integer grid index.

    The value is converted to float, clamped to [0.0, 9.9], rounded to one
    decimal place and scaled by 10, giving an index between 0 and 99.
    """
    clamped = min(max(float(x), 0.0), 9.9)
    return int(round(clamped, 1) * 10)

In [9]:
def grid_str(x, y):
    """Build the key of the grid cell for (x, y): the two grid indices joined by 'x'."""
    x_index = str(gridify(x))
    y_index = str(gridify(y))
    return x_index + 'x' + y_index

In [10]:
def grid_range(min_x, max_x):
    """Return every grid index from gridify(min_x) through gridify(max_x), inclusive."""
    first = gridify(min_x)
    last = gridify(max_x)
    return np.arange(first, last + 1, 1)

In [11]:
# One empty place list per grid cell; the axis steps through 0.0, 0.1, ...
# (end value 10 excluded) and each step is turned into a key by grid_str.
cell_axis = np.arange(0, 10, 0.1)
places_grid = {
    grid_str(cell_x, cell_y): []
    for cell_x in cell_axis
    for cell_y in cell_axis
}

In [12]:
# Register every place in each grid cell touched by its Gaussian footprint:
# the axis-aligned box mean +/- `deviations` standard deviations in x and y.
# NOTE: this appends to the lists in places_grid, so re-running the cell
# without rebuilding places_grid adds each place a second time.
deviations = 1.0
for place, (mean, S) in gausss.items():
    # The diagonal of the covariance matrix holds variances; take the square
    # root so the half-width is a standard deviation in coordinate units
    # (the variance itself was previously used as the half-width).
    std_x = np.sqrt(S[0][0])
    std_y = np.sqrt(S[1][1])
    min_x = mean[0] - deviations * std_x
    max_x = mean[0] + deviations * std_x
    min_y = mean[1] - deviations * std_y
    max_y = mean[1] + deviations * std_y
    for x in grid_range(min_x, max_x):
        for y in grid_range(min_y, max_y):
            # grid_range yields grid indices; divide by 10 to get back to
            # coordinates before asking grid_str for the cell key.
            places_grid[grid_str(x/10.0, y/10.0)].append(place)

In [20]:
# Write one training CSV per grid cell, holding every check-in of the places
# registered in that cell. `indices` remembers the sequence number assigned
# to each cell key.
indices = {}
index = 0
for cell, cell_places in places_grid.items():
    print("Processing " + cell + "(" + str(index) + ")")
    in_cell = train_df['place_id'].isin(cell_places)
    out_path = "data/grid_data/train/train_" + str(index) + "_" + cell + ".csv"
    train_df[in_cell].to_csv(out_path, index=False)
    indices[cell] = index
    index += 1

Processing 56x48(0)
Processing 79x50(1)
Processing 15x69(2)
Processing 76x39(3)
Processing 85x4(4)
Processing 63x53(5)
Processing 2x94(6)
Processing 8x6(7)
Processing 82x54(8)
Processing 36x84(9)
Processing 30x94(10)
Processing 16x20(11)
Processing 47x6(12)
Processing 79x18(13)
Processing 46x96(14)
Processing 25x53(15)
Processing 72x59(16)
Processing 68x55(17)
Processing 47x9(18)
Processing 21x2(19)
Processing 22x4(20)
Processing 75x30(21)
Processing 96x22(22)
Processing 11x37(23)
Processing 82x76(24)
Processing 6x11(25)
Processing 73x15(26)
Processing 41x48(27)
Processing 7x24(28)
Processing 49x86(29)
Processing 36x66(30)
Processing 97x95(31)
Processing 70x3(32)
Processing 36x88(33)
Processing 67x74(34)
Processing 6x99(35)
Processing 77x22(36)
Processing 28x14(37)
Processing 52x79(38)
Processing 72x27(39)
Processing 84x49(40)
Processing 23x13(41)
Processing 48x55(42)
Processing 78x56(43)
Processing 56x29(44)
Processing 85x14(45)
Processing 84x73(46)
Processing 96x68(47)
Processing 27x

In [21]:
# Load the test check-ins with pandas' default dtype inference.
test_path = 'data/test.csv'
test_df = pd.read_csv(test_path)

In [23]:
# Derive the same cyclic time features as for the training set, treating
# `time` as a count of minutes.
# minutes -> hours, folded into a 24-hour cycle.
test_df['hour_of_day'] = test_df['time'].div(60).mod(24)
# minutes -> hours -> days, folded into a 7-day cycle.
# The offset shouldn't matter.
test_df['day_of_week'] = test_df['time'].div(60).div(24).mod(7)

In [25]:
# Tag each test row with the key of the grid cell its (x, y) position maps to.
test_df['grid_location'] = [
    grid_str(x, y) for x, y in zip(test_df['x'], test_df['y'])
]

In [None]:
# Write one test CSV per grid cell, reusing the sequence numbers stored in
# `indices` by the training export so both files for a cell share a number.
test_columns = ['row_id','x', 'y', 'accuracy', 'time', 'hour_of_day', 'day_of_week']
for p in places_grid.keys():
    cell_index = str(indices[p])
    print("Processing " + p + "(" + cell_index + ")")
    in_cell = test_df['grid_location'] == p
    current_df = test_df.loc[in_cell, test_columns]
    current_df.to_csv("data/grid_data/test/test_" + cell_index + "_" + p + ".csv", index=False)

Processing 56x48(0)
Processing 79x50(1)
Processing 15x69(2)
Processing 76x39(3)
Processing 85x4(4)
Processing 63x53(5)
Processing 2x94(6)
Processing 8x6(7)
Processing 82x54(8)
Processing 36x84(9)
Processing 30x94(10)
Processing 16x20(11)
Processing 47x6(12)
Processing 79x18(13)
Processing 46x96(14)
Processing 25x53(15)
Processing 72x59(16)
Processing 68x55(17)
Processing 47x9(18)
Processing 21x2(19)
Processing 22x4(20)
Processing 75x30(21)
Processing 96x22(22)
Processing 11x37(23)
Processing 82x76(24)
Processing 6x11(25)
Processing 73x15(26)
Processing 41x48(27)
Processing 7x24(28)
Processing 49x86(29)
Processing 36x66(30)
Processing 97x95(31)
Processing 70x3(32)
Processing 36x88(33)
Processing 67x74(34)
Processing 6x99(35)
Processing 77x22(36)
Processing 28x14(37)
Processing 52x79(38)
Processing 72x27(39)
Processing 84x49(40)
Processing 23x13(41)
Processing 48x55(42)
Processing 78x56(43)
Processing 56x29(44)
Processing 85x14(45)
Processing 84x73(46)
Processing 96x68(47)
Processing 27x