In [None]:
from scipy.linalg import hankel

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Seed every RNG the notebook draws from so that a fresh-kernel run is repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)  # silently ignored by PyTorch when CUDA is unavailable

# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (a hard-coded "cuda" makes every `.to(device)` fail on
# machines without a GPU).
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def MakeSet(ser, lzc, fwd):
    """Cut a 1-D series into sliding-window inputs and look-ahead targets.

    For every start offset ``i`` in ``range(len(ser) - lzc - 2 * fwd)``:

    * ``X[i]`` has shape ``(lzc, fwd + 1)`` with ``X[i][j, k] == ser[i + j + k]``,
      i.e. ``lzc`` overlapping windows of length ``fwd + 1`` that together cover
      ``ser[i : i + lzc + fwd]``;
    * ``y[i]`` holds the ``fwd`` values that follow immediately,
      ``ser[i + lzc + fwd : i + lzc + 2 * fwd]``.

    The samples are gathered with index arithmetic, so memory grows with the
    size of the output rather than with ``len(ser) ** 2`` as it would when
    materialising a full Hankel matrix of the series.

    Args:
        ser: 1-D array-like series (flattened if it has more dimensions).
        lzc: number of rows in each input sample; coerced to ``int``.
        fwd: number of target values per sample (each input row has
            ``fwd + 1`` values); coerced to ``int``.

    Returns:
        Tuple ``(X, y)`` with shapes ``(m, lzc, fwd + 1)`` and ``(m, fwd)``,
        where ``m = max(len(ser) - lzc - 2 * fwd, 0)``. A series that is too
        short yields empty arrays with those same trailing dimensions.
    """
    ser = np.asarray(ser).ravel()
    # Values read from a DataFrame may arrive as floats; sizes must be ints.
    lzc, fwd = int(lzc), int(fwd)
    n_samples = max(len(ser) - lzc - 2 * fwd, 0)

    start = np.arange(n_samples)
    row = np.arange(lzc)
    col = np.arange(fwd + 1)

    # Broadcasting start + row + col gives, for each sample, the index grid
    # i + j + k into the series.
    X = ser[start[:, None, None] + row[None, :, None] + col[None, None, :]]
    # Targets start right after the last value covered by the matching X[i].
    y = ser[start[:, None] + (lzc + fwd) + np.arange(fwd)[None, :]]
    return X, y


def F1metr(x_pred, x_real):  # classes: 1 - positive, 0 - negative
    """Compute the F1 score and the accuracy (in percent) of binary predictions.

    Args:
        x_pred: array-like of predicted labels; cast to ``int``.
        x_real: array-like of reference labels, same shape as ``x_pred``;
            cast to ``int``.

    Returns:
        Tuple ``(f1, accuracy)``.

        * ``f1`` is the usual harmonic mean of precision and recall when there
          is at least one true positive. Without any true positive it is
          ``0.0`` if the two arrays differ anywhere and ``1.0`` if they are
          identical (e.g. both all-negative, or both empty).
        * ``accuracy`` is ``100 * (tp + tn) / (tp + tn + fp + fn)``, or ``0.0``
          when no element is labelled 0 or 1 in both arrays.
    """
    x_pred = np.asarray(x_pred).astype(int)
    x_real = np.asarray(x_real).astype(int)

    pred_pos, pred_neg = x_pred == 1, x_pred == 0
    real_pos, real_neg = x_real == 1, x_real == 0

    tp = int(np.count_nonzero(pred_pos & real_pos))
    tn = int(np.count_nonzero(pred_neg & real_neg))
    fp = int(np.count_nonzero(pred_pos & real_neg))
    fn = int(np.count_nonzero(pred_neg & real_pos))

    if tp:
        precision, recall = tp / (tp + fp), tp / (tp + fn)
        f1 = 2 * precision * recall / (precision + recall)
    elif np.any(x_pred != x_real):
        # Compare element-wise: summing the signed differences would let a
        # false positive (+1) cancel a false negative (-1) and report a
        # perfect score for a completely wrong prediction.
        f1 = 0.
    else:
        f1 = 1.

    total = tp + tn + fp + fn
    if total:
        accuracy = (tp + tn) / total * 100
    else:
        accuracy = 0.
    return f1, accuracy


class LSTMModel(nn.Module):
    """Two stacked unidirectional LSTMs followed by a two-layer dense head.

    Input is batch-first, ``(batch, seq_len, input_size)``. Only the last time
    step of the second LSTM feeds the head, and the final sigmoid keeps every
    one of the ``output_size`` outputs in ``(0, 1)``.

    Args:
        input_size: number of features per time step.
        hidden_size: width of both LSTM layers and of the hidden dense layer.
        output_size: number of outputs per sample.
        dropout: dropout probability applied between the two LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size

        # Registration order determines the state_dict key order and the order
        # in which initial weights are drawn from the RNG.
        recurrent_kwargs = dict(hidden_size=hidden_size, batch_first=True, bidirectional=False)
        self.lstm1 = nn.LSTM(input_size=input_size, **recurrent_kwargs)
        self.dropout = nn.Dropout(p=dropout)
        self.lstm2 = nn.LSTM(input_size=hidden_size, **recurrent_kwargs)
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.tanh = nn.Tanh()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map ``(batch, seq_len, input_size)`` to ``(batch, output_size)``."""
        first_seq, _ = self.lstm1(x)
        second_seq, _ = self.lstm2(self.dropout(first_seq))
        last_step = second_seq[:, -1, :]  # keep only the final time step
        hidden = self.tanh(self.fc1(last_step))
        return self.sigmoid(self.fc2(hidden))


In [None]:
# Transaction table, indexed by (client, date).
df = pd.read_csv(
    "processed/transact_18_22.csv",
    index_col=["client", "date"],
)

# Per-client table indexed by client; judging by the file name it holds
# Lempel-Ziv compression values -- TODO confirm against the preprocessing step.
compression = pd.read_csv(
    "processed/transact_18_22_lempel_ziv_compression.csv",
    index_col="client",
)

In [None]:
# fwd: look-ahead length handed to MakeSet; split: number of leading samples
# used for training, the remainder is used for testing.
fwd, split = 7, 120

# Number of rows (clients) in the compression table.
n = len(compression.index)

In [None]:
# df.loc[224]["code"].values

In [None]:
# compression.loc[224]["survival"]

In [None]:
# Columns that are modelled and scored separately for every client.
highlab = ['survival', 'socialization', 'self_realization']

# One result row per client. The 'id' column keeps the literal placeholder
# 'id' until at least one column of that client has been scored.
res = pd.DataFrame(columns=['id'] + highlab)
res['id'] = ['id'] * n

users = compression.index.to_list()

for i, user_id in enumerate(tqdm(users, desc="Processing users")):
    for sssr in highlab:
        trans = df.loc[user_id][sssr].values

        # Window length: the client's value from `compression`, clamped to
        # [14, 30]. Coerce to int because a float coming out of pandas cannot
        # be used as a window size.
        lzc = int(min(max(compression.loc[user_id, sssr], 14), 30))

        X, y = MakeSet(trans, lzc, fwd)

        # Need `split` training samples plus at least two test samples.
        # Checking before any tensor or model is built avoids training a model
        # whose result would be discarded.
        if len(X) <= split or len(y) - split <= 1:
            print("Skipped ", user_id, sssr)
            continue

        X_train_t = torch.tensor(X[:split], dtype=torch.float32, device=device)
        y_train_t = torch.tensor(y[:split], dtype=torch.float32, device=device)
        X_test_t = torch.tensor(X[split:], dtype=torch.float32, device=device)
        # Test targets are only compared on the host, so keep them there.
        y_test_np = np.asarray(y[split:], dtype=np.float32)

        # Sequence length (dim 1) doubles as the hidden width; dim 2 is the
        # per-step feature count.
        model = LSTMModel(
            input_size=X_train_t.shape[2],
            hidden_size=X_train_t.shape[1],
            output_size=y_train_t.shape[1],
        ).to(device)

        criterion = torch.nn.BCELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Single pass over the training samples, one sample per step.
        model.train()
        batch_size = 1
        train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=batch_size, shuffle=True)
        for xb, yb in train_loader:
            optimizer.zero_grad()
            y_pred = model(xb)
            loss = criterion(y_pred, yb)
            loss.backward()
            optimizer.step()

        # Score every test sample on its own: threshold the outputs at 0.5 and
        # compute that sample's F1.
        model.eval()
        f1 = []
        with torch.no_grad():
            for j in range(len(y_test_np)):
                y_pred = model(X_test_t[j:j + 1])
                y_pred_bin = (y_pred > 0.5).int().cpu().numpy()[0]
                f1_score_val, _ = F1metr(y_pred_bin, y_test_np[j])
                f1.append(f1_score_val)

        # Stored metric: share of test samples whose F1 exceeds 0.75.
        res.iloc[i, 0] = user_id
        res.loc[i, sssr] = np.count_nonzero(np.array(f1) > 0.75) / len(f1)

        del model
        torch.cuda.empty_cache()

    # Rewrite the results file after every client so partial progress survives
    # an interrupted run.
    res.to_csv('results/res_test.csv', index=False)