In [1]:
import pandas as pd
import numpy as np
from itertools import product
from sklearn.linear_model import LinearRegression
import xgboost
%matplotlib inline

In [2]:
# Raw inputs: the transaction log and the labelled (train) customers table.
transactions, customers_gender = map(
    pd.read_csv,
    (
        '~/data/sberbank/transactions.csv',
        '~/data/sberbank/customers_gender_train.csv',
    ),
)

In [3]:
# Every customer / MCC code that occurs in the raw transaction log.
all_cuses = transactions.customer_id.unique()
all_mcc = transactions.mcc_code.unique()
# Customers that appear in the transactions but not in the labelled table
# (presumably the ones a prediction has to be submitted for -- confirm).
# Sorted so that the order (which later fixes the row order of the output
# file) does not depend on set iteration order.
cuses_test = sorted(
    set(all_cuses.tolist()).difference(customers_gender.customer_id.unique().tolist())
)

In [4]:
# Keep only rows with a negative amount (presumably spending -- confirm
# against the data dictionary); .copy() so the new column below is added to
# an independent frame.
transactions = transactions.loc[transactions.amount < 0].copy()
# The first whitespace-separated token of tr_datetime is the integer day number.
transactions['day'] = transactions.tr_datetime.str.split().str[0].astype(int)

In [5]:
def rmse(x, y):
    """Return the root-mean-square error between `x` and `y`.

    Both arguments must support element-wise subtraction (e.g. numpy arrays
    or a numpy array and a scalar).
    """
    residual = x - y
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)
def create_tmp(maxday):
    """Log-scaled total amount per (customer_id, mcc_code) for one 30-day window.

    Reads the module-level `transactions` frame, keeps the rows whose `day`
    lies in the half-open window (maxday - 30, maxday], sums `amount` per
    (customer_id, mcc_code) and returns log(1 - total) as a Series indexed by
    that pair. Pairs with no rows in the window are absent from the result.

    The transform is applied to the whole Series at once with np.log1p
    rather than through a per-element Python lambda.
    """
    in_window = (transactions.day > maxday - 30) & (transactions.day <= maxday)
    totals = transactions[in_window].groupby(['customer_id', 'mcc_code'])['amount'].sum()
    # Totals are <= 0 when only negative amounts are present, so -totals is
    # the magnitude and log1p(-totals) == log(1 - totals).
    return np.log1p(-totals)
def create_series():
    """Build the dense (customer, mcc) x window matrix of log-scaled amounts.

    Uses the module-level `all_cuses`, `all_mcc`, `transactions` and
    `create_tmp`.

    Returns
    -------
    index : pandas.MultiIndex
        Every (customer_id, mcc_code) combination of `all_cuses` x `all_mcc`,
        customer-major.
    M : numpy.ndarray of float, shape (len(index), n_windows)
        Column i holds `create_tmp(max_day - 30 * i)`, so column 0 is the most
        recent 30-day window; pairs without data in a window are 0.

    The index is built directly with MultiIndex.from_product and the matrix is
    taken straight from the frame; the previous version materialised a Python
    list of every pair and round-tripped the matrix through nested lists.
    """
    pairs = pd.MultiIndex.from_product(
        [all_cuses, all_mcc], names=['customer_id', 'mcc_code']
    )
    X = pd.DataFrame(index=pairs)
    # Window end days: the last observed day, then steps of 30 days back while > 30.
    for i, last_day in enumerate(range(transactions.day.max(), 30, -30)):
        # Assignment aligns on the (customer_id, mcc_code) index; missing pairs become NaN.
        X[i] = create_tmp(last_day)
    return X.index, X.fillna(0).values.astype(float)

In [6]:
# index: MultiIndex of all (customer_id, mcc_code) pairs; M: one row per pair,
# one column per 30-day window (column 0 = most recent window).
index, M = create_series()

In [7]:
def regression(x):
    """Extrapolate a least-squares line through `x` one step before its start.

    Fits y = slope * t + intercept to the values of `x` at t = 0 .. len(x) - 1
    and returns intercept - slope, i.e. the fitted line evaluated at t = -1.
    (In this notebook position 0 is the most recent window, so t = -1 is one
    window ahead.)

    Parameters
    ----------
    x : 1-D array-like of numbers. Plain sequences are accepted; they are
        converted with np.asarray (previously a list raised AttributeError).

    Returns
    -------
    0 when `x` is empty, all zeros, or has fewer than two points; otherwise
    the extrapolated value as a numpy float.
    """
    x = np.asarray(x, dtype=float)
    n = len(x)
    # An empty input also lands here: it has no non-zero entries.
    if np.count_nonzero(x) == 0:
        return 0
    if n < 2:
        return 0
    # Design matrix with columns [t, 1].
    A = np.vstack([np.arange(0, n), np.ones(n)]).T
    # Normal equations: coef = (A^T A)^-1 A^T x  ->  [slope, intercept].
    coef = np.linalg.inv(A.T.dot(A)).dot(A.T).dot(x)
    return coef.dot([-1, 1])
def create_short(X):
    """Return the rows of `X` that belong to the customers in `cuses_test`.

    `X` must have a `customer_id` column. Rows come back grouped in the order
    of the module-level `cuses_test` list, with `customer_id` restored as the
    first column and a fresh integer index.
    """
    by_customer = X.set_index('customer_id')
    return by_customer.loc[cuses_test].reset_index()

In [8]:
# One row per (customer_id, mcc_code) pair, indexed by customer_id so that a
# per-customer value can be broadcast onto every row of M.
minmonth = pd.DataFrame(index=index).reset_index().set_index('customer_id')
# Per customer: 15 minus the 30-day bucket of the first remaining transaction.
# (15, 24 and 30 presumably encode the number of windows and the window
# alignment of this data set -- TODO confirm.)
# reindex (not .loc) so customers without any remaining transaction become NaN
# instead of raising KeyError; they are set to 0 by the fillna below.
tmp = 15 - ((transactions.day.groupby(by=transactions.customer_id).min() + 24) // 30).reindex(all_cuses)
minmonth['maxtime'] = tmp
minmonth = minmonth.fillna(0)
# Integer array aligned row-for-row with M.
maxtime = minmonth.maxtime.astype(int).values

In [9]:
# Customer-by-customer correlation of the full spending histories.
# Each (customer, mcc) row contributes its series without the two most recent
# windows (columns 2 onward of M); the rows of one customer are then joined
# into a single vector (presumably concatenated by np.hstack -- confirm) and
# np.corrcoef is taken across customers.
# NOTE(review): both `tmp` and `corr` are reassigned in the following cell
# before anything visible reads them, so this cell looks like dead (and
# expensive) computation -- confirm and consider removing it.
tmp = pd.DataFrame(index=index).reset_index()
tmp['series'] = [M[i, 2:] for i in range(len(M))]
tmp = tmp.series.groupby(tmp.customer_id).apply(np.hstack).apply(list)
corr = np.corrcoef(np.array(list(tmp.values)))

In [10]:
# Neighbour-smoothed copy of M: every (customer, mcc) series is replaced by the
# correlation-weighted average of the same mcc series of similar customers.

# Activity level of each (customer, mcc) pair: mean over all columns of M
# except the two most recent windows.
Arr = pd.DataFrame(M[:, 2:].mean(axis=1), index=index).reset_index()
# One profile per customer (groupby sorts by customer_id), one entry per mcc code.
tmp = Arr[0].groupby(Arr.customer_id).apply(list)
A = np.array(list(tmp))
# Customers with an all-zero profile get a constant non-zero one. The row width
# is taken from A itself instead of the previously hard-coded 184.
# NOTE(review): a constant row still has (near-)zero variance, so its
# correlations stay NaN or numerically meaningless -- confirm the intent.
A[(A == 0).all(axis=1)] = 0.001
corr = np.corrcoef(A)
corr_pandas = pd.DataFrame(corr, index=tmp.index, columns=tmp.index).fillna(0)

# s[user][code] -> that pair's full row of M.
s = {}
for i, (user, code) in enumerate(index.values):
    s.setdefault(user, {})[code] = M[i]

# The (up to) 15 most correlated customers of each user, the user excluded.
neighbors = {}
for user in corr_pandas.index:
    neighbors[user] = corr_pandas.loc[user].sort_values(ascending=False)[:15].index.difference([user])

amount = Arr[[0]].groupby(Arr.customer_id).sum()
# customer_id -> row/column position in `corr`.
ind = dict(zip(amount.index, range(len(amount))))

# The neighbour weights and their sum depend only on the user, not on the mcc
# code, so they are computed once per user instead of once per pair.
# Weights come from the raw `corr` (NaN not filled), as before.
neighbor_weights = {}
for user, near in neighbors.items():
    pairs = [(user2, corr[ind[user], ind[user2]]) for user2 in near]
    den = 0
    for _, weight in pairs:
        den += weight
    neighbor_weights[user] = (pairs, den)

s_dash = []
for user, code in index.values:
    pairs, den = neighbor_weights[user]
    num = 0
    for user2, weight in pairs:
        num += weight * s[user2][code]
    s_dash.append(list(num / den))
M2 = np.array(s_dash)

In [11]:
def create_X(index, M, M2, time):
    """Build the feature frame for predicting window `time - 1` from windows `time` onward.

    Reads the module-level `maxtime` array (one entry per row of M) and the
    `regression` helper.

    Parameters
    ----------
    index : MultiIndex whose levels become the `customer_id` and `mcc_code`
        columns; its rows align with the rows of `M` and `M2`.
    M : 2-D array, one row per index entry, one column per window.
    M2 : array of the same shape as `M` (the neighbour-smoothed series in
        this notebook).
    time : int >= 0, first column used as history. `mean_weighted` only has
        matching shapes for time <= 2.

    Returns
    -------
    DataFrame with `customer_id`, `mcc_code`, the target `amount`
    (column `time - 1` of M), per-row history features, the per-mcc mean of
    each of those features (`*_mean_over_mcc`) and features derived from `M2`.

    For time == 0 there is no column before the history, so `amount` is NaN;
    previously `M[:, -1]` silently wrapped around to the last column.
    """
    n_rows, n_months = M.shape
    history = M[:, time:]

    X = pd.DataFrame(index=index).reset_index()

    if time > 0:
        X['amount'] = M[:, time - 1]
    else:
        X['amount'] = np.nan
    X['mean'] = history.mean(axis=1)
    X['last_month'] = M[:, time]
    X['mean_4month'] = M[:, time : time + 4].mean(axis=1)
    X['mean_8month'] = M[:, time : time + 8].mean(axis=1)
    X['is_zero'] = (history == 0).all(axis=1)
    # "flex" features stop the history at a per-row column bound taken from maxtime.
    X['flex_regression'] = [regression(M[i, time : maxtime[i] - 1]) for i in range(n_rows)]
    X['regression'] = [regression(M[i, time:]) for i in range(n_rows)]
    # max(...) keeps at least one column in the slice so the mean is defined.
    X['flex_mean'] = [M[i, time : max(maxtime[i] - 1, time + 1)].mean() for i in range(n_rows)]

    # Attach the per-mcc average of every feature built so far.
    col_names = X.columns.difference(['amount', 'customer_id', 'mcc_code'])
    renamed = {name: name + '_mean_over_mcc' for name in col_names}
    mcc_means = X[col_names].groupby(X.mcc_code).mean().rename(columns=renamed).reset_index()
    X = pd.merge(X, mcc_means, how='left', on='mcc_code')

    # Weighted mean with weights halving at each step away from column `time`,
    # normalised to sum to 1.
    decay = 0.5 ** np.arange(n_months - 2)
    decay = decay / decay.sum()
    X['mean_weighted'] = M[:, time : n_months + time - 2].dot(decay)

    X['regression2'] = [regression(M2[i, time:]) for i in range(n_rows)]
    X['mean_8month2'] = M2[:, time : time + 8].mean(axis=1)
    X['mean2'] = M2[:, time:].mean(axis=1)

    return X

In [12]:
# X0: features from windows 0.. (used for prediction);
# X1: features from windows 1.., with window 0 as the target (used for training).
X0, X1 = (create_X(index, M, M2, offset) for offset in (0, 1))

In [13]:
# Restrict the prediction-time features to the customers listed in cuses_test.
X0_short = create_short(X0)

In [14]:
# Feature matrices keep mcc_code as a feature; only the target and the
# customer identifier are removed.
Xtrain, ytrain = X1.drop(columns=['amount', 'customer_id']), X1['amount']
Xtest, ytest = X0_short.drop(columns=['amount', 'customer_id']), X0_short['amount']

In [15]:
# Three gradient-boosted regressors differing only in (n_estimators, max_depth).
model1, model2, model3 = (
    xgboost.XGBRegressor(n_estimators=n_trees, max_depth=depth, seed=241, nthread=8)
    for n_trees, depth in ((65, 5), (74, 6), (56, 7))
)

In [16]:
# Fit all three models on the training frame, then score the test customers.
for estimator in (model1, model2, model3):
    estimator.fit(Xtrain, ytrain)
p1, p2, p3 = (estimator.predict(Xtest) for estimator in (model1, model2, model3))

In [17]:
# Blend the three prediction vectors with fixed weights 2 : 7 : 7.
p = np.stack((p1, p2, p3))
a = np.asarray([2, 7, 7])
pred = np.dot(a, p) / np.sum(a)

In [18]:
# Undo the log(1 + amount) transform of the target: volume = e**pred - 1.
# NOTE(review): this adds non-feature columns to Xtest in place, so the
# prediction cell cannot be re-run on Xtest afterwards without rebuilding it.
Xtest['volume'] = np.power(np.e, pred) - 1
Xtest['customer_id'] = X0_short['customer_id']

In [19]:
# Write the submission: one row per (customer_id, mcc_code) with the predicted volume.
Xtest.loc[:, ['customer_id', 'mcc_code', 'volume']].to_csv('C.csv', index=False)