In [173]:
import warnings
warnings.filterwarnings('ignore')

In [174]:
import numpy as np
import pandas as pd
import os.path, os
import datetime
import time
import sys
import functools
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from bayes_opt import BayesianOptimization
import argparse
import uuid
import json

In [175]:
def map_k_precision(truthvalues, predictions):
    z = (predictions == truthvalues[:, None]).astype(np.float32)
    weights = 1./(np.arange(predictions.shape[1], dtype=np.float32) + 1.)
    z = z * weights[None, :]
    return float(np.mean(np.sum(z, axis=1)))



def f_generator(dataframe):
    mintue = dataframe['time'] % 60
    dataframe['hour'] = dataframe['time'] // 60
    dataframe['weekday'] = dataframe['hour'] // 24
    dataframe['month'] = dataframe['weekday'] // 30
    dataframe['year'] = (dataframe['weekday'] // 365 + 1)
    dataframe['hour'] = ((dataframe['hour'] % 24 + 1) + mintue / 60.0)
    dataframe['weekday'] = (dataframe['weekday'] % 7 + 1)
    dataframe['month'] = (dataframe['month'] % 12 + 1)
#     dataframe['log10acc'] = np.log10(dataframe['accuracy'].values)
    return dataframe


def f_preprocessor(dataframe,
                   w_x=500,
                   w_y=1000,
                   w_hour=4,
#                    w_log10acc=15,
                   w_weekday=3,
                   w_month=2,
                   w_year=8):
    
    dataframe["x"] = dataframe["x"] * w_x
    dataframe["y"] = dataframe["y"] * w_y
    dataframe["year"] = dataframe["year"] * w_year
    dataframe["hour"] = dataframe["hour"] * w_hour
    dataframe["weekday"] = dataframe["weekday"] * w_weekday
    dataframe["month"] = dataframe["month"] * w_month
    dataframe['log10acc'] = np.log10(dataframe['accuracy'].values) * w_log10acc  
    dataframe.drop(['time', 'accuracy'], axis=1, inplace=True)
    return dataframe
    

def slicer_xy(df_train, df_test, slicer_param):
    for param in slicer_param:
        x_min, x_max, y_min, y_max = param["x_min"], param["x_max"], param["y_min"], param["y_max"]
        x_border_augment, y_border_augment = param["x_border_augment"], param["y_border_augment"]
        df_cell_train = df_train[(df_train['x'] >= x_min-x_border_augment)
                         & (df_train['x'] < x_max+x_border_augment)
                         & (df_train['y'] >= y_min-y_border_augment)
                         & (df_train['y'] < y_max+y_border_augment)].copy()

        df_cell_test = df_test[(df_test['x'] >= x_min)
                           & (df_test['x'] < x_max)
                           & (df_test['y'] >= y_min)
                           & (df_test['y'] < y_max)].copy()

        yield df_cell_train, df_cell_test
    
def slicer_param_generator_xy(**kwargs):
    size = kwargs["size"]
    x_step = kwargs["x_step"]
    y_step = kwargs["y_step"]
    x_border_augment = x_step * 0.2
    y_border_augment = y_step * 0.2
    
    cells = (int)(size/x_step)
    for i in range(cells):
        start_time_row = time.time()
        x_min = x_step * i
        x_max = x_step * (i+1)
        x_min = round(x_min, 4)
        x_max = round(x_max, 4)
        if x_max == size:
            x_max = x_max + 0.001

        for j in range((int)(size/y_step)):
            y_min = y_step * j
            y_max = y_step * (j+1)
            y_min = round(y_min, 4)
            y_max = round(y_max, 4)
            if y_max == size:
                y_max = y_max + 0.001
            yield {"x_min": x_min , 
                   "x_max" : x_max,
                   "y_min" : y_min,
                   "y_max" : y_max,
                   "x_border_augment": x_border_augment,
                   "y_border_augment": y_border_augment}

def do_predict(df_cell_train, df_cell_test, th, n_neighbors):

    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    row_ids = df_cell_test.index
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values.astype(float)

    X_test = df_cell_test.values.astype(float)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=int(n_neighbors),
                               weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(
        np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids


def ml_pipeline(slicer, th=5, n_neighbors=26):
    cnt = 0
    for cell_train, cell_test in slicer:
        pred_labels, row_id = do_predict(f_preprocessor(cell_train),
                                         f_preprocessor(cell_test),
                                         th, n_neighbors)
        yield pred_labels, row_id
        if cnt % 10 == 0:
            print(cnt)
        cnt+=1

def wraper_for_optimization():
    pass
    

Load data
--

In [176]:
now_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")


print('Loading data ...')
df_train = pd.read_csv('/opt/devs/train_facebook.csv',
                       usecols=[
                           'row_id', 'x', 'y', 'accuracy', 'time', 'place_id'],
                       index_col=0)
df_train.sort_values('time', inplace=True)
df_test = pd.read_csv('/opt/devs/test_facebook.csv',
                      usecols=['row_id', 'x', 'y', 'accuracy', 'time'],
                      index_col=0)

Loading data ...


Init
--

In [177]:
param = {"size": 10, "x_step": 1.0, "y_step" : 0.5}
th = 5 
n_neighbors = 26
output_path = "/opt/devs/facebook_eugene.tsv"

Feature generation
--

In [178]:
df_train = f_generator(df_train)
df_test = f_generator(df_test)

Cross validation
--

In [179]:
ninety_percent_mark = int(df_train.shape[0]*0.7)

df_validation_test = df_train[ninety_percent_mark:]
truthvalues = df_validation_test.place_id.values

df_validation_test.drop(["place_id"], axis=1, inplace=True)

df_test2 = df_validation_test.copy()
df_test2['pred1'] = -99
df_test2['pred2'] = -99
df_test2['pred3'] = -99
df_test2.drop([column for column in df_test2.columns if "pred" not in column], axis=1, inplace=True)


df_validation_train = df_train[:ninety_percent_mark]

In [180]:
slicer_param_gen = slicer_param_generator_xy(**param)
slicer_gen = slicer_xy(df_validation_train, df_validation_test, slicer_param_gen)

In [181]:
pipeline = ml_pipeline(slicer_gen)

In [182]:
for pred_labels, row_ids in pipeline:
    for ids, labs in zip(row_ids, pred_labels):
        df_test2.loc[ids] = np.array(labs)
        
predictions = df_test2[['pred1', 'pred2', 'pred3']].as_matrix()
mapk = map_k_precision(truthvalues, predictions)
print(mapk)

NameError: name 'w_log10acc' is not defined

Generate submission
--

In [None]:
slicer_param_gen = slicer_param_generator_xy(**param)
slicer_gen = slicer_xy(df_validation_train, df_validation_test, slicer_param_gen)

In [None]:
pipeline = ml_pipeline(slicer_gen)

In [None]:
with open(output_path) as f:
    f.write('row_id,place_id\n')
    for pred_labels, row_ids in pipeline:
        for ids, labs in zip(row_ids, pred_labels):
            fh.write("{0},{1}\n".format(ids, ' '.join([str(x) for x in labs])))