# Базовое решение для задачи C

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LinearRegression
from scipy import sparse

Читаем входные файлы с данными

In [None]:
# Load the raw inputs from CSV files in the working directory.
# Later cells read the `customer_id`, `mcc_code`, `amount` and `tr_datetime`
# columns of `transactions` and the `customer_id` column of `customers_gender`.
transactions = pd.read_csv('transactions.csv')
customers_gender = pd.read_csv('customers_gender_train.csv')

Берём расходные транзакции и формируем тестовую выборку

In [None]:
# Keep only expense rows (negative amounts) as an independent copy, so that the
# columns added below do not touch the original `transactions` frame.
is_expense = transactions['amount'] < 0
train_transactions = transactions.loc[is_expense].copy()

# `tr_datetime` is split on whitespace and its first token is parsed as an
# integer day number.
day_token = train_transactions['tr_datetime'].map(lambda stamp: stamp.split()[0])
train_transactions['day'] = day_token.astype(int)

In [None]:
# MCC codes seen in expense transactions; both grids pair customers with them.
expense_mcc_codes = train_transactions['mcc_code'].unique()

# Customers that appear in the transactions but not in the labelled
# `customers_gender` file make up the test population.
known_customers = transactions['customer_id'].unique()
unlabelled_customers = set(known_customers.tolist()).difference(
    customers_gender['customer_id'].unique()
)

# An all-NaN frame (customers as columns, MCC codes as index) is unstacked into
# one row per (customer, MCC) pair; `dropna(axis=1)` then removes the NaN value
# column, leaving only the two label columns.
test_transactions = (
    pd.DataFrame(columns=unlabelled_customers, index=expense_mcc_codes)
    .unstack()
    .reset_index()
    .dropna(axis=1)
)

# Same construction over every customer found in the transactions.
train_grid = (
    pd.DataFrame(columns=known_customers, index=expense_mcc_codes)
    .unstack()
    .reset_index()
    .dropna(axis=1)
)

Добавляем признаки, относящиеся к дате.

In [None]:
# Period columns and the number of days each period spans: 30-day "months"
# and 365-day "years".
period_lengths = (('month_num', 30), ('year_num', 365))

# Both grids receive the same constant period labels, derived from the last
# observed expense day plus 15.
horizon_day = train_transactions['day'].max() + 15
for tr_table in (train_grid, test_transactions):
    tr_table.columns = ['customer_id', 'mcc_code']
    for period_column, period_days in period_lengths:
        tr_table[period_column] = horizon_day // period_days

# Per-transaction period numbers computed from each row's own day.
for period_column, period_days in period_lengths:
    train_transactions[period_column] = train_transactions['day'] // period_days

In [None]:
# Total expense amount per (year, month, customer, MCC).
monthly_spend = (
    train_transactions
    .groupby(['year_num', 'month_num', 'customer_id', 'mcc_code'])[['amount']]
    .sum()
    .reset_index()
)

# Left-join the totals onto the customer x MCC grid using the columns the two
# frames share; grid rows without a matching total get amount 0.
# NOTE(review): `train_grid` carries one constant (year_num, month_num) pair, so
# only the totals of that single period survive this join.
train_transactions = pd.merge(
    train_grid,
    monthly_spend,
    on=['customer_id', 'mcc_code', 'month_num', 'year_num'],
    how='left',
).fillna(0)

In [None]:
# Build lagged spend features amount_1 .. amount_4: the amounts are re-labelled
# `month_shift` months later and joined back on the period / customer / MCC
# keys; rows without a match get 0.
# NOTE(review): at this point `train_transactions` holds a single constant
# `month_num` (it was built from the grid), and `test_transactions` carries the
# same constant, so a copy shifted by 1..4 months cannot match any row and every
# amount_N column appears to end up all zeros. `year_num` is also left
# unshifted. Confirm whether the lags were meant to come from the full monthly
# history.
lag_keys = ['year_num', 'month_num', 'customer_id', 'mcc_code']
for month_shift in range(1, 5):
    lag_column = 'amount_{0}'.format(month_shift)

    train_shift = train_transactions[lag_keys + ['amount']].rename(columns={'amount': lag_column})
    train_shift['month_num'] += month_shift

    train_transactions = train_transactions.merge(train_shift, on=lag_keys, how='left').fillna(0)
    test_transactions = test_transactions.merge(train_shift, on=lag_keys, how='left').fillna(0)

In [None]:
# Differences between consecutive lags: diff_1_2, diff_2_3 and diff_3_4, added
# to the train and test tables alike.
for shift in range(1, 4):
    newer_lag = 'amount_{}'.format(shift)
    older_lag = 'amount_{}'.format(shift + 1)
    diff_column = 'diff_{}_{}'.format(shift, shift + 1)
    for frame in (train_transactions, test_transactions):
        frame[diff_column] = frame[newer_lag] - frame[older_lag]

In [None]:
# Hash the identifier columns of every training row into a sparse matrix with
# 100000 columns. Each row is handed to the hasher as a sequence of strings
# (`input_type='string'`).
# NOTE(review): the values are hashed without a column prefix, so equal strings
# coming from different columns (e.g. a customer_id and an mcc_code with the
# same digits) map to the same hashed feature — confirm this is acceptable.
hasher = FeatureHasher(n_features=100000, input_type='string')
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
train_sparse = hasher.fit_transform(
    train_transactions[['year_num', 'month_num', 'customer_id', 'mcc_code']].astype(str).values
)

In [None]:
# Hash the same identifier columns of the test rows with the hasher created
# for the training data, so both matrices share one feature space.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
test_sparse = hasher.transform(
    test_transactions[['year_num', 'month_num', 'customer_id', 'mcc_code']].astype(str).values
)

In [None]:
# Dense numeric features appended to the right of the hashed identifiers: the
# four lagged amounts and the three lag-to-lag differences, each transformed
# as log(|x| + 1).
# Fix: the list used to name 'amount_3' twice and never used 'amount_4', so one
# column was duplicated and the fourth lag was missing from the model input.
lag_feature_columns = ['amount_1', 'amount_2', 'amount_3', 'amount_4',
                       'diff_1_2', 'diff_2_3', 'diff_3_4']

# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
train_sparse = sparse.hstack([
    train_sparse,
    np.log(np.abs(train_transactions[lag_feature_columns]) + 1).values,
])

test_sparse = sparse.hstack([
    test_sparse,
    np.log(np.abs(test_transactions[lag_feature_columns]) + 1).values,
])

Обучаемся на всех данных

In [None]:
# Offset added before taking the logarithm, so that a zero amount maps to
# log(1) = 0; the prediction cell subtracts the same offset after exponentiating.
shift = 1

# Regression target: log of the (sign-flipped) amount plus the offset.
log_spend = np.log(shift - train_transactions['amount'])
clf = LinearRegression().fit(train_sparse, log_spend)

Предсказываем объём трат для тестовых данных и создаём файл с ответом

In [None]:
# Undo the log transform of the target: exponentiate the predictions and
# subtract the offset that was added before the logarithm.
predicted_log_volume = clf.predict(test_sparse)
test_transactions['volume'] = np.power(np.e, predicted_log_volume) - shift

# Write the submission file with one row per (customer, MCC) pair.
submission = test_transactions[['customer_id', 'mcc_code', 'volume']]
submission.to_csv('baseline_c.csv', index=False)