In [1852]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from sklearn.mixture import GMM
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
# Raise the pandas display limits (80 columns / 100 rows) for frames shown in cell outputs.
pd.set_option('display.max_columns', 80) 
pd.set_option('display.max_rows', 100) 
%matplotlib inline
import warnings
# NOTE(review): this suppresses every warning for the whole session, including
# numeric RuntimeWarnings and library deprecation notices raised by later cells.
warnings.filterwarnings("ignore")

In [1853]:
# The two merchant tables are ';'-separated; the transactions file is read with
# the default separator. Paths are relative to the notebook's working directory.
train = pd.read_csv("../data/merchants_train.csv", sep=';')
test = pd.read_csv("../data/merchants_test.csv", sep=';')
data = pd.read_csv("../data/transactions.csv")

In [1854]:
# Index all three frames by merchant id so later cells can align them with .loc.
train = train.set_index('merchant_id')
test = test.set_index('merchant_id')
# Assigning the column as the index keeps `merchant_id` available as a regular
# column too; later code reads both data.index and data.merchant_id.
data.index = data.merchant_id

In [1855]:
def get_time(pair):
    """Absolute gap in seconds between two colon-separated clock strings.

    ``pair`` unpacks into two strings; the first three ':'-separated fields of
    each are read as hours, minutes and seconds. A raw difference of 80000 s or
    more in either direction is shifted by one day (86400 s) before the
    absolute value is taken, so readings on either side of midnight come out
    close together.
    """
    first_text, second_text = pair
    first_fields, second_fields = first_text.split(':'), second_text.split(':')
    first_nums = list(map(int, first_fields))
    second_nums = list(map(int, second_fields))

    first_sec = 3600 * first_nums[0] + 60 * first_nums[1] + first_nums[2]
    second_sec = 3600 * second_nums[0] + 60 * second_nums[1] + second_nums[2]

    delta = first_sec - second_sec
    if delta >= 80000:
        # e.g. 23:50 vs 00:10 -> treat as 20 minutes apart, not ~24 hours.
        delta -= 86400
    elif delta <= -80000:
        delta += 86400
    return abs(delta)
def preprocessing(data, train):
    """Add location-based features to the transactions and de-duplicate them.

    Parameters
    ----------
    data : DataFrame
        Transactions indexed by merchant id, with columns ``merchant_id``,
        ``latitude``, ``longitude``, ``real_transaction_dttm`` and
        ``record_date``. The caller's frame is left untouched; all work happens
        on a copy.
    train : DataFrame
        Indexed by merchant id with ``latitude`` / ``longitude`` columns that
        the transaction coordinates are compared against.

    Returns
    -------
    DataFrame
        One row per distinct (merchant, latitude, longitude) inside the train
        bounding box (padded by 1 on each side), with the added columns:

        * ``dist`` - (latitude, longitude) tuple;
        * ``popularity`` / ``mean_target`` - number and mean target of train
          rows at exactly this location; for train merchants the merchant's own
          rows are excluded, so the row's own label does not leak in;
        * ``description`` - (merchant_id, latitude, longitude) tuple;
        * ``freq3`` - how many rows shared this description before dedup;
        * ``freq`` - how many de-duplicated rows share this location;
        * ``time`` - ``get_time`` of (real_transaction_dttm, record_date).
    """
    # Private copy: the original version added columns to the caller's frame
    # and then assigned onto filtered slices of it (chained assignment).
    data = data.copy()

    data['dist'] = [tuple(i) for i in data[['latitude', 'longitude']].values]

    # Rows of train merchants, selected once instead of three times.
    train_rows = data.loc[train.index]
    # True when the row is within 0.002 of its merchant's train coordinates on
    # every column of the difference (min over booleans == all).
    # NOTE(review): `y` is paired with `train_rows` by position below; this
    # assumes the index-aligned subtraction keeps the row order of
    # `train_rows` - confirm for the pandas version in use.
    y = ((train_rows[['latitude', 'longitude']] - train).apply(abs) < 0.002).min(axis=1)

    d = {}   # location -> targets of all train rows at that location
    d2 = {}  # merchant -> location -> targets of that merchant's rows there
    for i, m, target in zip(train_rows['dist'], train_rows.merchant_id, y):
        d.setdefault(i, []).append(target)
        d2.setdefault(m, {}).setdefault(i, []).append(target)

    a = []
    b = []
    for i, m in data[['dist', 'merchant_id']].values:
        if m in train.index:
            # Leave the merchant's own rows out of the statistics.
            others = len(d[i]) - len(d2[m][i])
            a.append(others)
            if others:
                b.append((np.sum(d[i]) - np.sum(d2[m][i])) / others)
            else:
                # Previously an implicit 0/0 -> NaN hidden by the global
                # warnings filter; made explicit here.
                b.append(np.nan)
        else:
            if i not in d:
                a.append(0)
                b.append(np.nan)
            else:
                a.append(len(d[i]))
                b.append(np.mean(d[i]))

    data['popularity'] = a
    data['mean_target'] = b

    # Keep only transactions strictly inside the train bounding box padded by 1.
    min_lat = train.latitude.min() - 1
    min_lon = train.longitude.min() - 1
    max_lat = train.latitude.max() + 1
    max_lon = train.longitude.max() + 1

    index = ((data.latitude > min_lat) & (data.latitude < max_lat)
             & (data.longitude > min_lon) & (data.longitude < max_lon))
    data = data[index].copy()

    data['description'] = [tuple(i) for i in data[['merchant_id', 'latitude', 'longitude']].values]
    data['freq3'] = data.description.map(data.description.groupby(data.description).apply(len))

    # Drop repeated (merchant, latitude, longitude) rows, keeping the first.
    index = np.invert(data.description.duplicated())
    data = data[index].copy()

    dist_to_freq = data.groupby(data.dist).apply(len)
    data['freq'] = data.dist.map(dist_to_freq)
    data['time'] = [get_time(i) for i in data[['real_transaction_dttm', 'record_date']].values]

    return data

In [1856]:
# Rebinds `data` to the processed frame: the raw transactions are no longer
# reachable under this name, so re-running this cell alone would preprocess
# already-processed data.
data = preprocessing(data, train)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x1b2e34080>>
Traceback (most recent call last):
  File "//anaconda/lib/python3.5/site-packages/xgboost-0.4-py3.5.egg/xgboost/core.py", line 324, in __del__
    _check_call(_LIB.XGDMatrixFree(self.handle))
AttributeError: 'DMatrix' object has no attribute 'handle'


In [1857]:
# Transactions of train / test merchants (data is indexed by merchant id).
data_train = data.loc[train.index]
data_test = data.loc[test.index]
# Row-level target: True when the transaction is within 0.002 of its merchant's
# train coordinates on every column of the difference (min over booleans == all).
y = ((data_train[['latitude', 'longitude']] - train).apply(abs) < 0.002).min(axis=1)

In [1858]:
def get_prediction(data, index,  fun):
    """Apply ``fun`` to each merchant's rows and return the chosen coordinates.

    ``fun`` receives the sub-frame of one ``merchant_id`` group; its results
    are stacked per merchant, the merchants listed in ``index`` are selected,
    and only the ``latitude`` / ``longitude`` columns are returned.
    """
    per_merchant = data.groupby('merchant_id').apply(fun)
    requested = per_merchant.loc[index]
    return requested[['latitude', 'longitude']]

def get_error(p, y, tol=0.002):
    """Share of rows of ``y`` whose prediction is within ``tol`` on both axes.

    Despite the name, this is a hit rate (higher is better), not an error.

    Parameters
    ----------
    p : DataFrame
        Predictions with ``latitude`` / ``longitude`` columns; rows are
        selected with ``p.loc[y.index]``.
    y : DataFrame
        Reference coordinates with ``latitude`` / ``longitude`` columns.
    tol : float, default 0.002
        Maximum absolute difference allowed on each coordinate. The default
        reproduces the previously hard-coded threshold.

    Returns
    -------
    float
        Mean of the per-row "latitude close AND longitude close" indicator.
    """
    deviation = (p.loc[y.index] - y).apply(abs)
    close = deviation < tol
    return (close.latitude & close.longitude).mean()

def algo(x):
    """Pick the row of ``x`` closest to the centre of its trimmed point cloud.

    Starting from all rows, the rows farthest from the current centre (the
    mean of the kept coordinates, distance = mean absolute difference over
    latitude and longitude) are dropped repeatedly. Trimming stops when at most
    three rows are kept or when the next trim would leave fewer than three.
    The returned row is the first one whose distance to the final centre
    equals the smallest distance among the kept rows.

    Trimming also stops if a pass removes nothing. That only happens when the
    distances are NaN (a NaN coordinate among the kept rows), which previously
    made the loop spin forever. NaN rows are not filtered here, so in that case
    the first row is returned; callers should drop NaN coordinates beforehand.

    Parameters
    ----------
    x : DataFrame
        Non-empty frame with ``latitude`` and ``longitude`` columns.

    Returns
    -------
    Series
        The selected row of ``x``.
    """
    coord = x[['latitude', 'longitude']].values
    keep = np.ones(len(x), dtype=bool)
    while keep.sum() > 3:
        dist = np.abs(coord - coord[keep].mean(axis=0)).mean(axis=1)
        # Every kept row sharing the current maximum distance is dropped at once.
        trimmed = keep & (dist != dist[keep].max())
        remaining = trimmed.sum()
        if remaining < 3 or remaining == keep.sum():
            break
        keep = trimmed
    dist = np.abs(coord - coord[keep].mean(axis=0)).mean(axis=1)
    chosen = np.argmax(dist == dist[keep].min())
    return x.iloc[chosen]

def algo2(x):
    """Run ``algo`` on the largest DBSCAN cluster of a merchant's points.

    Points are clustered on (latitude, longitude) with ``eps=0.005`` and
    ``min_samples=2``. The most populated non-noise cluster is passed to
    ``algo`` (ties go to the smallest label); when every point is labelled as
    noise (-1), all rows are passed on.

    Parameters
    ----------
    x : DataFrame
        Frame with ``latitude`` and ``longitude`` columns.

    Returns
    -------
    Series
        The row selected by ``algo``.
    """
    coord = x[['latitude', 'longitude']].values
    labels = DBSCAN(eps=0.005, min_samples=2).fit_predict(coord)

    # Select the cluster by label rather than by its position in a filtered
    # list, so the result does not depend on labels being 0..k-1.
    cluster_ids = [c for c in np.unique(labels) if c != -1]
    if cluster_ids:
        sizes = [(labels == c).sum() for c in cluster_ids]
        c_max = cluster_ids[int(np.argmax(sizes))]
    else:
        c_max = -1
    return algo(x.iloc[labels == c_max])

def algo3(x):
    """Return the row of ``x`` nearest to the per-coordinate median point.

    The reference point is (median latitude, median longitude); nearness is the
    squared Euclidean distance in coordinate units, and the first row wins ties.
    """
    center = [x.latitude.median(), x.longitude.median()]
    offsets = (x[['latitude', 'longitude']] - center).values
    squared = offsets ** 2
    nearest = np.argmin(squared[:, 0] + squared[:, 1])
    return x.iloc[nearest]





In [1859]:
# Train and test merchant ids stacked into one array; used to select predictions.
index = np.hstack([train.index, test.index])

In [1860]:
# Baseline 1: median-based pick (algo3) per merchant, skipping rows whose
# mean_target is exactly 0 (rows with NaN mean_target pass the `!= 0` filter).
p1 = get_prediction(data[(data.mean_target != 0)], index, algo3)
# get_error returns the share of train merchants hit within tolerance (higher is better).
get_error(p1, train)

0.31224930576982413

In [1861]:
# Baseline 2: same row filter as above, but the pick comes from the DBSCAN-based algo2.
p2 = get_prediction(data[data.mean_target != 0], index, algo2)
# get_error returns the share of train merchants hit within tolerance (higher is better).
get_error(p2, train)

0.34588090095649493

In [1862]:
def get_distribution(thr, d, some_data):
    """Share of a merchant's known points lying near each row of ``some_data``.

    Parameters
    ----------
    thr : float
        Threshold on the *squared* Euclidean distance in coordinate units.
    d : dict
        Maps merchant id to an ``(n, 2)`` array of (latitude, longitude) points.
    some_data : DataFrame
        Frame with ``merchant_id``, ``latitude`` and ``longitude`` columns.

    Returns
    -------
    list of float
        One value per row, in row order: the fraction of ``d[merchant]`` points
        whose squared distance to the row is below ``thr``, or NaN when the
        merchant is not a key of ``d``.
    """
    distribution = []
    for merch, lat, lon in some_data[['merchant_id', 'latitude', 'longitude']].values:
        if merch in d:
            sq_dist = ((d[merch] - [lat, lon]) ** 2).sum(axis=1)
            distribution.append((sq_dist < thr).mean())
        else:
            distribution.append(np.nan)
    return distribution

In [1871]:
# Three-component mixture over all transaction coordinates; create_X uses its
# per-row score as the `gmm` feature. The seed is fixed (241, as in the CV
# split and the classifier) so the feature is identical across kernel restarts.
gmm = GMM(n_components=3, random_state=241)
gmm.fit(data[['latitude', 'longitude']])

def create_X(some_data, y, is_train=True):
    """Build the per-transaction feature matrix used by the classifier.

    Reads the notebook globals ``p1``, ``p2``, ``train``, ``data`` and ``gmm``,
    so those cells must have run first.

    Parameters
    ----------
    some_data : DataFrame
        Preprocessed transactions indexed by merchant id (``data_train`` or
        ``data_test``).
    y : Series
        Row-level target; only used when ``is_train`` is True.
    is_train : bool, default True
        When True, ``y`` is subtracted from ``freq2``. Presumably this removes
        the row's match with its own merchant from the count - confirm against
        how ``y`` is built.

    Returns
    -------
    DataFrame
        Features indexed like ``some_data``:

        * ``f1`` / ``f2`` - squared distance to the ``p1`` / ``p2`` prediction
          (index-aligned subtraction);
        * ``num`` - number of rows sharing the index value;
        * ``freq``, ``freq3``, ``popularity``, ``mean_target`` (NaN -> 0),
          ``time`` - copied from ``some_data``;
        * ``freq2`` - number of ``train`` rows within 0.002 of the transaction
          on both coordinates;
        * ``gmm`` - score of the coordinates under the global mixture model;
        * ``distribution1`` / ``distribution2`` - share of the merchant's
          points in ``data`` within squared distance 1e-4 / 1e-5.
    """
    X = pd.DataFrame(index=some_data.index)
    X['f1'] = ((some_data - p1.loc[some_data.index.unique()])[['latitude', 'longitude']].values**2).sum(axis=1)
    X['f2'] = ((some_data - p2.loc[some_data.index.unique()])[['latitude', 'longitude']].values**2).sum(axis=1)

    X['num'] = X.groupby(X.index).apply(len)
    X['freq'] = some_data.freq

    # NOTE(review): comparing against `train.values` assumes train holds exactly
    # the two coordinate columns in (latitude, longitude) order - confirm.
    freq2 = [(abs(i - train.values) < 0.002).min(axis=1).sum() for i in some_data[['latitude', 'longitude']].values]
    if is_train:
        freq2 = freq2 - y
    X['freq2'] = freq2
    X['freq3'] = some_data.freq3
    # merchant id -> array of that merchant's coordinates in the global `data`.
    d = data.groupby(data.merchant_id).apply(lambda x: x[['latitude', 'longitude']].values).to_dict()

    X['popularity'] = some_data.popularity
    X['mean_target'] = some_data.mean_target.fillna(0)
    X['time'] = some_data.time
    X['gmm'] = gmm.score(some_data[['latitude', 'longitude']].fillna(0))
    X['distribution1'] = get_distribution(0.0001, d, some_data)
    X['distribution2'] = get_distribution(0.00001, d, some_data)

    # Removed the trailing lat/lat2/lon/lon2 assignments: they were never used
    # and read an undefined global `sells`, raising NameError on a fresh kernel.
    return X

In [1872]:
# Feature matrix for the transactions of train merchants (is_train defaults to True).
X = create_X(data_train, y)

In [1865]:
def create_cv(X):
    """Build 5 cross-validation folds that never split one index value.

    The unique index values of ``X`` are shuffled into 5 folds (seed 241) and
    each fold is mapped back to the positional row numbers of ``X``.

    Returns
    -------
    list
        One ``[train_positions, validation_positions]`` pair of integer arrays
        per fold.
    """
    row_position = pd.Series(range(len(X)), index=X.index)
    groups = X.index.unique()
    splitter = KFold(len(groups), n_folds=5, shuffle=True, random_state=241)
    return [
        [row_position.loc[groups[fit_part]].values,
         row_position.loc[groups[val_part]].values]
        for fit_part, val_part in splitter
    ]

In [1869]:
# Build the merchant-grouped folds here: `new_cv` was only ever defined in
# hidden kernel state, so this cell failed after Restart & Run All.
new_cv = create_cv(X)

xgtrain = xgboost.DMatrix(X, label=y)
params = {'objective':'binary:logistic', 
          'eta':0.03,
          'booster':'gbtree',
          'max_depth':8,
          'nthread':8, 
          'seed':0, 
          'eval_metric':'auc'}
# Cross-validated AUC with early stopping after 50 rounds without improvement;
# the last row of the result holds the final round's statistics.
lalka = xgboost.cv(params=list(params.items()), 
              early_stopping_rounds=50, 
              verbose_eval=10,
              dtrain=xgtrain,
                folds=new_cv,
              num_boost_round=10000)
lalka[-1:]

[0]	train-auc:0.969277+0.00188403	test-auc:0.959058+0.00607326
[10]	train-auc:0.975063+0.00204021	test-auc:0.964065+0.00622694
[20]	train-auc:0.977502+0.00168051	test-auc:0.965612+0.0062401
[30]	train-auc:0.978827+0.00135423	test-auc:0.96674+0.00634517
[40]	train-auc:0.980219+0.00118823	test-auc:0.967155+0.00637933
[50]	train-auc:0.981434+0.00126834	test-auc:0.967475+0.00644608
[60]	train-auc:0.982483+0.00112425	test-auc:0.967801+0.00647531
[70]	train-auc:0.983217+0.00107589	test-auc:0.967978+0.00654308
[80]	train-auc:0.983885+0.000982076	test-auc:0.968108+0.00658276
[90]	train-auc:0.984538+0.000955323	test-auc:0.968146+0.00669641
[100]	train-auc:0.985078+0.000926082	test-auc:0.968242+0.00667581
[110]	train-auc:0.985656+0.000883004	test-auc:0.968377+0.0068923
[120]	train-auc:0.986148+0.000884198	test-auc:0.968484+0.00688335
[130]	train-auc:0.98661+0.000831595	test-auc:0.968616+0.00689336
[140]	train-auc:0.987096+0.000811861	test-auc:0.968731+0.00683428
[150]	train-auc:0.987529+0.000711

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
250,0.969236,0.00712,0.990026,0.000528


In [None]:
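# Scratch note (never executed): 0.969001 227 -- presumably the CV test-auc and the boosting round from an earlier run; unverified.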

In [1876]:
model = xgboost.XGBClassifier(n_estimators=220, max_depth=8, seed=241, learning_rate=0.03)
# Out-of-fold probabilities: each row is scored by a model that did not see its fold.
proba = pd.Series(index=X.index, dtype=float)
for t, v in new_cv:
    model.fit(X.iloc[t], y.iloc[t])
    proba.iloc[v] = model.predict_proba(X.iloc[v])[:,1]
# Row position of the highest-probability transaction per merchant. After
# reset_index the labels are row positions, and idxmax returns the label in
# every pandas version (argmax returns a within-group position in newer ones).
ii = proba.reset_index().groupby(proba.index).apply(lambda x: x[0].idxmax()).values
# Share of train merchants whose selected transaction is within tolerance.
get_error(data_train.iloc[ii], train)

0.37673557543967912

In [1877]:
# With is_train=False create_X never reads `y`; it is passed only to fill the positional argument.
X_test = create_X(data_test, y, is_train=False)

In [1878]:
# Refit the classifier on all train rows, then score every test transaction.
model.fit(X, y)
# Positive-class probability per test row, indexed like X_test.
proba_test = pd.Series(model.predict_proba(X_test)[:,1], index = X_test.index)

In [1879]:
# Row position of the highest-probability transaction per test merchant. After
# reset_index the labels are row positions, and idxmax returns the label in
# every pandas version (argmax returns a within-group position in newer ones).
ii = proba_test.reset_index().groupby(proba_test.index).apply(lambda x: x[0].idxmax()).values
# Write the selected coordinates, one row per merchant, as a ';'-separated file.
data_test.iloc[ii][['latitude', 'longitude']].to_csv('B2.csv', sep=';')