# ALFABANK CAMPUS

# Card transactions prediction

Дарья Сергеева (telegram @visna_dp)

## Описание задачи
Требуется предсказать следующие 10 транзакций клиента (MCC-коды) на основании списка его предыдущих транзакций. Результат оценивается по метрике map@10.

https://www.kaggle.com/competitions/alfabank-campus/overview

## Библиотеки

In [None]:
import numpy as np
import pandas as pd
from collections import Counter

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

import warnings
warnings.filterwarnings("ignore")

In [None]:
#GPU count and name
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-836903c1-b042-272b-e3b2-b6b6856cba37)


In [None]:
# Подключаем google диск, где у нас хранятся данные
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Функции

In [None]:
# функция расчета метрики map@k
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])


## Загрузка и предобработка данных

In [None]:
df_train = pd.read_csv('./drive/MyDrive/SF_DS/data_alfa/df_train.csv', sep=';')
df_test = pd.read_csv('./drive/MyDrive/SF_DS/data_alfa/df_test.csv', sep=';')
df_train.head()

Unnamed: 0,Id,Data,Target
0,0,"4814,4814,6010,6011,4814,6011,6011,4814,6011,6...",4814481448144814541148144814481448144814
1,1,"6011,6011,6011,6011,6011,6011,6011,4814,4814,4...",4814601148146011481448146011481460114814
2,2,"8021,6011,6011,6010,4829,4814,6011,6011,6011,6...",6011601160104829482960106011601148146011
3,3,"4814,6011,4814,4814,4814,6011,6011,5691,5691,5...",6011601160106011601148144814601148144814
4,4,"4814,4814,4814,4814,4814,4814,5946,4814,4814,6...",5499601148144829520054115499591254115912


In [None]:
df_test.head()

Unnamed: 0,Id,Data
0,0,"4814,4814,6011,6011,6010,6011,6011,4814,6011,4..."
1,1,"6010,6011,6010,5411,5411,5977,6011,6010,5411,6..."
2,2,"4814,6011,5251,6011,7832,5641,5814,4829,5311,6..."
3,3,"6011,4722,4722,4722,4814,6011,6011,4829,6011,6..."
4,4,"4814,4814,4814,6011,4814,4814,4814,4814,4814,4..."


In [None]:
# Преобразуем данные в список
df_train['Data'] = df_train.Data.apply(lambda s: list(map(int, s.split(','))))
df_train['Target'] = df_train.Target.apply(lambda s: list(map(int, s.split(','))))
df_test['Data'] = df_test.Data.apply(lambda s: list(map(int, s.split(','))))

In [None]:
# Создадим признак для количества кодов в списке
df_train['Count'] = df_train['Data'].apply(len)
df_train['Count'].describe()

count     7033.000000
mean       473.322906
std        811.665063
min         40.000000
25%        180.000000
50%        336.000000
75%        570.000000
max      21101.000000
Name: Count, dtype: float64

In [None]:
df_test['Count'] = df_test['Data'].apply(len)
df_test['Count'].describe()

count     7033.000000
mean       476.756150
std       1341.916565
min         40.000000
25%        177.000000
50%        344.000000
75%        582.000000
max      88771.000000
Name: Count, dtype: float64

Распределение длин последовательностей MCC примерно одинаковое для выборок train и test. Среднее значение около 475, минимальное 40.

In [None]:
# Обрежем данные до длины 500 и преобразуем в строку
limit_tranc = 500
df_train['Data_trunc'] = df_train['Data'].apply(
    lambda x: x[-limit_tranc-1:-1] if len(x) > limit_tranc else x
)
df_test['Data_trunc'] = df_test['Data'].apply(
    lambda x: x[-limit_tranc-1:-1] if len(x) > limit_tranc else x
)
df_train['Data_trunc'] = df_train['Data_trunc'].astype(str).str.replace(',', '')
df_test['Data_trunc'] = df_test['Data_trunc'].astype(str).str.replace(',', '')

## Модель RNN с LSTM

In [None]:
# Токенизация данных
X_train = df_train['Data_trunc']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
n_codes = len(tokenizer.word_index)
print('Количество уникальных кодов MCC:', n_codes)

Количество уникальных кодов MCC: 184


In [None]:
# Функция разбивает каждую последовательность кодов на 40 (feature) + 10 (target) со сдвигом
def create_features_targets(X_seq, length_feature=40, length_target=10):
    features = []
    targets = []
    for X in X_seq:
        for i in range(0, len(X) - (length_feature + length_target)):
            feature = X[i:i + length_feature]
            target = X[i + length_feature:i + length_feature + length_target]
            features.append(feature)
            targets.append(target)
    return features, targets

In [None]:
# Функция для преобразования последовательности в вектор one hot
def vectorize_sequences_oh(sequences, dimension):
    results = np.zeros((len(sequences), dimension + 1))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

In [None]:
# Создадим список features (длиной 40) и targets (длиной 10) для обучения из train
length = 40
features, targets = create_features_targets(X_train_seq, length_feature=length)
print("Общее количество последовательностей:", len(targets))

Общее количество последовательностей: 1934373


In [None]:
X_train_pad = pad_sequences(features)
X_train_pad.shape

(1934373, 40)

In [None]:
y_train_oh = vectorize_sequences_oh(targets, n_codes)
y_train_oh.shape


(1934373, 185)

In [None]:
# создаем модель LSTM
model_lstm = Sequential()
model_lstm.add(Embedding(n_codes, 128, input_length=length))
model_lstm.add(SpatialDropout1D(0.5))
model_lstm.add(LSTM(40, return_sequences=True))
model_lstm.add(LSTM(40))
model_lstm.add(Dense(y_train_oh.shape[1], activation='sigmoid'))

In [None]:
model_lstm.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'AUC'])

In [None]:
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 128)           23552     
                                                                 
 spatial_dropout1d (Spatial  (None, 40, 128)           0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 40, 40)            27040     
                                                                 
 lstm_1 (LSTM)               (None, 40)                12960     
                                                                 
 dense (Dense)               (None, 185)               7585      
                                                                 
Total params: 71137 (277.88 KB)
Trainable params: 71137 (277.88 KB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [None]:
history_lstm = model_lstm.fit(X_train_pad,
                              y_train_oh,
                              epochs=20,
                              batch_size=512,
                              validation_split=0.2
                              )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
model_lstm.save('./drive/MyDrive/SF_DS/data_alfa/model_lstm.h5')

In [None]:
del X_train
del X_train_pad
del X_train_seq
del features
del targets
del y_train_oh

In [None]:
# Загружаем обученную модель
#model_lstm = load_model('./drive/MyDrive/SF_DS/data_alfa/model_lstm.h5')

In [None]:
# Дообучим модель на данных из test
length = 40
X_test = df_test['Data_trunc']
X_test_seq = tokenizer.texts_to_sequences(X_test)
features, targets = create_features_targets(X_test_seq, length_feature=length)

print("Общее количество последовательностей:", len(targets))

Общее количество последовательностей: 1943768


In [None]:
X_test_pad = pad_sequences(features)
X_test_pad.shape

(1943768, 40)

In [None]:
y_test_oh = vectorize_sequences_oh(targets, n_codes)
y_test_oh.shape

(1943768, 185)

In [None]:
history_lstm = model_lstm.fit(X_test_pad,
                              y_test_oh,
                              epochs=20,
                              batch_size=512,
                              validation_split=0.2
                              )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
model_lstm.save('./drive/MyDrive/SF_DS/data_alfa/model_lstm.h5')

In [None]:
del X_test
del X_test_pad
del X_test_seq
del features
del targets
del y_test_oh

## Оценка модели

In [None]:
# Функция для предсказания 10 транзакций по убыванию вероятности
def predict_next(X_seq):
    y_pred = model_lstm.predict(X_seq)

    # Создадим массив для хранения результатов
    result = []

    for i in range(y_pred.shape[0]):
        # Отсортируем индексы по убыванию предсказанной вероятности
        top_indices = list(np.argsort(-y_pred[i]))
        # Возьмем первые 10 значений
        top_indices = top_indices[:10]
        result.append(top_indices)
    return result

In [None]:
# Выделим 40 последних транзакций для предсказания
df_train['Last'] = df_train['Data'].apply(
    lambda x: x[-length-1:-1] if len(x) > length else x
)
df_train['Last'] = df_train['Last'].astype(str).str.replace(',', '')
X_train = df_train['Last']
X_train_seq = tokenizer.texts_to_sequences(X_train)

In [None]:
# Делаем предсказание на train
# Предсказываем последовательность уникальных кодов
y_pred = predict_next(X_train_seq)
y_pred = tokenizer.sequences_to_texts(y_pred)
df_train['Predicted'] = y_pred
df_train['Predicted'] = df_train.Predicted.apply(lambda s: list(map(int, s.split())))



In [None]:
# Расчитываем метрику для предсказания Predicted
mapk(df_train['Target'], df_train['Predicted'])

0.3332854017003739

## Submission

In [None]:
# Выделим последние 40 транзакций для предсказания на test
df_test['Last'] = df_test['Data'].apply(
    lambda x: x[-length-1:-1] if len(x) > length else x
)
df_test['Last'] = df_test['Last'].astype(str).str.replace(',', '')
X_test = df_test['Last']
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
# Предсказываем последовательность уникальных кодов
y_pred = predict_next(X_test_seq)
y_pred = tokenizer.sequences_to_texts(y_pred)
df_test['Predicted'] = y_pred



In [None]:
# Выгружаем submission уникальных кодов
submission_rnn = df_test[['Id', 'Predicted']]
submission_rnn.to_csv('submission_rnn.csv', index=False)
