In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import xgboost
%matplotlib inline

In [2]:
transactions = pd.read_csv('~/data/sberbank/transactions.csv')

In [3]:
shift = 500
train_transactions = transactions[transactions.amount < 0].copy()
train_transactions['day'] = train_transactions.tr_datetime.apply(lambda dt: dt.split()[0]).astype(int)

In [4]:
train_grid = pd.DataFrame(columns=train_transactions.mcc_code.unique(), 
                          index=train_transactions.day.unique())
train_grid = train_grid.unstack().reset_index().dropna(axis=1)
train_grid.columns = ['mcc_code', 'day']

In [5]:
train_transactions = pd.merge(train_grid,
                              train_transactions.groupby(['day', 'mcc_code'])[['amount']].sum().reset_index(),
                              how='left').fillna(0)

In [6]:
train_transactions.loc[train_transactions.index, 'amount'] = np.log(-train_transactions.amount + shift)
train_transactions['week_day'] = train_transactions.day % 7
train_transactions = train_transactions.sort_values(['mcc_code', 'day'])

In [7]:
def regression(x):
    clf = LinearRegression()
    clf.fit(x.day.values.reshape(len(x), 1), x.amount)
    return clf.predict((np.arange(1, 31) + x.day.max()).reshape(30, 1))

In [8]:
tmp = train_transactions[train_transactions.day < 186].groupby('mcc_code')['amount'].mean()
code_map = tmp.sort_values().reset_index().reset_index().set_index('mcc_code')['index']

In [9]:
class LinearReg():
    def fit(self, X):
        codes = X.mcc_code.unique()
        days = np.arange(7)
        self.d = {}
        index2 = [X.week_day == i for i in range(7)]
        for code in codes:
            self.d[code] = {}
            index1 = X.mcc_code == code
            for day in days:
                index = index1 & index2[day]
                self.d[code][day] = LinearRegression()
                x = X[index]
                self.d[code][day].fit(x.day.values.reshape(len(x), 1), x.amount)
    def predict(self, X):
        prediction = []
        for day, code, weekday in X[['day', 'mcc_code', 'week_day']].values:
            prediction.append(self.d[code][weekday].predict(day)[0])
        return prediction
    
class LinearReg2():
    def fit(self, X):
        codes = X.mcc_code.unique()
        days = [True, False]
        self.d = {}
        index2 = [X.weekend == i for i in days]
        for code in codes:
            self.d[code] = {}
            index1 = X.mcc_code == code
            for day in days:
                index = index1 & index2[day]
                self.d[code][day] = LinearRegression()
                x = X[index]
                self.d[code][day].fit(x.day.values.reshape(len(x), 1), x.amount)
    def predict(self, X):
        prediction = []
        for day, code, weekday in X[['day', 'mcc_code', 'weekend']].values:
            prediction.append(self.d[code][weekday].predict(day)[0])
        return prediction
    
a = [0.93]**np.arange(50, 0, -1)
a = a / a.sum()

a2 = [0.7]**np.arange(15, 0, -1)
a2 = a2 / a2.sum()

def create_weighted_mean(series):
    return a.dot(series[-50:])
def create_weighted_mean2(series):
    return a2.dot(series[-15:])

In [10]:
M = np.array(list(train_transactions.groupby('mcc_code').amount.apply(list).values))

In [11]:
def mean_error(s):
    e = [[] for i in range(30)]
    n = len(s)
    for i in range(1, 61):
        m = np.mean(s[:n - i])
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            e[j - 1].append((m - s[n - i + j - 1])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def weighted_mean_error(s):
    e = [[] for i in range(30)]
    n = len(s)
    for i in range(1, 61):
        m = create_weighted_mean(s[:n - i])
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            e[j - 1].append((m - s[n - i + j - 1])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def mean_last_year_error(s):
    e = [[] for i in range(30)]
    s = s[-182-45:]
    n = len(s)
    for i in range(1, 61):
        m = np.mean(s[:n - i])
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            e[j - 1].append((m - s[n - i + j - 1])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def mean_last_month_error(s):
    e = [[] for i in range(30)]
    n = len(s)
    for i in range(1, 61):
        m = np.mean(s[n - i - 30:n - i])
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            e[j - 1].append((m - s[n - i + j - 1])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def mean_last_week_error(s):
    e = [[] for i in range(30)]
    n = len(s)
    for i in range(1, 61):
        m = np.mean(s[n - i - 7:n - i])
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            e[j - 1].append((m - s[n - i + j - 1])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def regression_error(s):
    e = [[] for i in range(30)]
    n = len(s)
    model = LinearRegression()
    for i in range(1, 61):
        l = len(s[:n - i])
        model.fit(np.arange(0, l).reshape(l, 1), s[:n - i])
        m = model.predict(np.arange(l, l + 30).reshape(30, 1))
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            e[j - 1].append((m[j - 1] - s[n - i + j - 1])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def regression_year_error(s):
    e = [[] for i in range(30)]
    n = len(s)
    model = LinearRegression()
    for i in range(1, 61):
        l = len(s[max(0, n - i - 182):n - i])
        model.fit(np.arange(0, l).reshape(l, 1), s[max(0, n - i - 182):n - i])
        m = model.predict(np.arange(l, l + 30).reshape(30, 1))
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            e[j - 1].append((m[j - 1] - s[n - i + j - 1])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def regression_month_error(s):
    e = [[] for i in range(30)]
    n = len(s)
    model = LinearRegression()
    for i in range(1, 61):
        l = len(s[n - i - 30:n - i])
        model.fit(np.arange(0, l).reshape(l, 1), s[n - i - 30:n - i])
        m = model.predict(np.arange(l, l + 30).reshape(30, 1))
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            e[j - 1].append((m[j - 1] - s[n - i + j - 1])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def regression_very_good_error(s):
    e = [[] for i in range(30)]
    n = len(s)
    models = [LinearRegression() for i in range(7)]
    for i in range(1, 61):
        l = len(s[:n - i])
        for k in range(7):
            y = s[:n - i][k::7]
            X = np.arange(0, l)[k::7]
            X = X.reshape(len(X), 1)
            models[k].fit(X, y)
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            time = n - i + j - 1
            e[j - 1].append((models[time % 7].predict(time)[0] - s[time])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def week_amount_mean_error(s):
    e = [[] for i in range(30)]
    n = len(s)
    for i in range(1, 61):
        l = len(s[:n - i])
        m = [np.mean(s[:n - i][k::7]) for k in range(7)]
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            time = n - i + j - 1
            e[j - 1].append((m[time % 7] - s[time])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 
def week_amount_mean_year_error(s):
    e = [[] for i in range(30)]
    s = s[-182-45:]
    n = len(s)
    for i in range(1, 61):
        l = len(s[:n - i])
        m = [np.mean(s[:n - i][k::7]) for k in range(7)]
        left = max(1, i - 29)
        right = min(i, 30)
        for j in range(left, right + 1):
            time = n - i + j - 1
            e[j - 1].append((m[time % 7] - s[time])**2)
    for i in range(30):
        e[i] = np.mean(e[i])
    return e 

In [12]:
def create_month_data(max_day, data):
    if max_day == data.day.max():
        index = (data.day > max_day - 30) & (data.day <= max_day)
        X = data[index].copy()
        X.day += 30
        X.amount = np.nan
    else:
        index = (data.day > max_day) & (data.day <= max_day + 30)
        X = data[index].copy()
    X['week_day'] = X.day % 7
    X['weekend'] = abs(X.week_day - 1.5) < 1
    data['weekend'] = abs(data.week_day - 1.5) < 1
    data = data[data.day <= max_day]
    data['zeros'] = data.amount == np.log(shift)
    story = data
    tmp = story.groupby(['mcc_code', 'week_day']).amount.mean().reset_index()
    tmp.columns = list(tmp.columns[:-1]) + ['week_amount_mean']
    X = pd.merge(X, tmp, how='left')

    
    story = data[data.day > max_day - 182]
    tmp = story.groupby(['mcc_code', 'week_day']).amount.mean().reset_index()
    tmp.columns = list(tmp.columns[:-1]) + ['week_amount_mean_year']
    X = pd.merge(X, tmp, how='left')
    
    std = data.groupby(['mcc_code', 'week_day'])['amount'].mean().reset_index().groupby(['mcc_code'])['amount'].std()
    X['t'] = X.day - max_day
    
    X = X.sort_values(['mcc_code', 'day']).set_index('mcc_code')
    
    
    X['zeros'] = data.zeros.groupby(story.mcc_code).mean()
    
    
    story = data
    X['mean_all'] = story.amount.groupby(by=story.mcc_code).mean()
    X['weighted_mean'] = story.amount.groupby(by=story.mcc_code).apply(create_weighted_mean)
    
    story = data[data.day > max_day - 182]
    X['mean_last_year'] = story.amount.groupby(by=story.mcc_code).mean()
    story = data[data.day > max_day - 30]
    X['mean_last_month'] = story.amount.groupby(by=story.mcc_code).mean()
    story = data[data.day > max_day - 7]
    X['mean_last_week'] = story.amount.groupby(by=story.mcc_code).mean()
    X['last_day'] = data[data.day == max_day].set_index('mcc_code').amount
    story = data
    X['regression'] = np.concatenate(story.groupby(story.mcc_code).apply(regression).values)
    story = data[data.day > max_day - 182]
    X['regression_year'] = np.concatenate(story.groupby(story.mcc_code).apply(regression).values)
    story = data[data.day > max_day - 30]
    X['regression_month'] = np.concatenate(story.groupby(story.mcc_code).apply(regression).values)
    X['diff_mean_yaer_mean_weekday_year']  = X['mean_all'] - X['week_amount_mean']
    good_regress= LinearReg()
    good_regress.fit(data)
    X['regression_very_good'] = good_regress.predict(X.reset_index())
    X['good'] = 0.3 * X.week_amount_mean_year + 0.45 * X.mean_last_month + 0.25 * X.regression_very_good
    M = np.array(list(data.groupby('mcc_code').amount.apply(list).values))
    
    w1 = np.hstack([week_amount_mean_error(M[i]) for i in range(len(M))])
    w2 = np.hstack([regression_very_good_error(M[i]) for i in range(len(M))])
    w3 = np.hstack([weighted_mean_error(M[i]) for i in range(len(M))])
    w4 = np.hstack([mean_last_year_error(M[i]) for i in range(len(M))])
    w1 = (1 / w1)**4
    w2 = (1 / w2)**4
    w3 = (1 / w3)**4
    w4 = (1 / w4)**4
    tmp = w1 * X.week_amount_mean +  w2 * X.regression_very_good + w3 * X.weighted_mean + w4 * X.mean_last_year
    X['very_very_good'] = tmp / (w1 + w2 + w3 + w4)
    
    X['diff_mean_all_mean_last_year'] = X['mean_all'] - X['mean_last_year']
    X['diff_last_day_regression_very_good'] = X['last_day'] - X['regression_very_good']
    
    return X

In [13]:
def rmse(x, y):
    return np.sqrt(np.mean((x - y)**2))

In [14]:
def create_train_test2(data, max_day1, max_day2):
    test = create_month_data(max_day2, data)
    train = pd.concat([create_month_data(i, data) for i in range(max_day1, max_day2 - 29, 30)])
    train = train.reset_index()
    test = test.reset_index()
    train.mcc_code = train.mcc_code.map(code_map)
    test.mcc_code = test.mcc_code.map(code_map)
    return (train.drop(['day', 'amount', 'week_day', 'mean_all'], axis=1), 
            test.drop(['day', 'amount', 'week_day', 'mean_all'], axis=1), 
            train.amount, 
            test.amount)

In [15]:
Xtrain, Xtest, ytrain,_ = create_train_test2(train_transactions, 186, train_transactions.day.max())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
model = xgboost.XGBRegressor(max_depth=3, n_estimators=283, seed=0, nthread=8, learning_rate=0.05)
model.fit(Xtrain, ytrain)
pred = model.predict(Xtest)

In [17]:
code_map_invert = code_map.reset_index()['mcc_code']
Xtest.mcc_code = Xtest.mcc_code.map(code_map_invert)

In [18]:
Xtest['volume'] = pred
Xtest['day'] = Xtest.t +  train_transactions.day.max()
test_transactions = pd.DataFrame(columns=train_transactions.mcc_code.unique(), 
                                 index=np.arange(1, 31) + train_transactions.day.max())
test_transactions = test_transactions.unstack().reset_index().dropna(axis=1)
test_transactions.columns = ['mcc_code', 'day']
test_transactions = pd.merge(test_transactions, Xtest[['mcc_code', 'day', 'volume']])
test_transactions.volume = np.e ** test_transactions.volume - shift

In [19]:
test_transactions2 = test_transactions.copy()
tmp = np.log(test_transactions[test_transactions.mcc_code == 6211].volume + 500) + 1.5
test_transactions2.loc[test_transactions.mcc_code == 6211, 'volume'] = np.e**tmp - 500
tmp = np.log(test_transactions[test_transactions.mcc_code == 4722].volume + 500) - 1
test_transactions2.loc[test_transactions.mcc_code == 4722, 'volume'] = np.e**tmp - 500
tmp = np.log(test_transactions[test_transactions.mcc_code == 3501].volume + 500) - 0.5
test_transactions2.loc[test_transactions.mcc_code == 3501, 'volume'] = np.e**tmp - 500

In [20]:
test_transactions2[['mcc_code', 'day', 'volume']].to_csv('B.csv', index=False)