In [1]:
import gc
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pickle

In [None]:
# Pickle file that the loading cell reads; it is indexed there with the keys
# 'train', 'valid' and 'test'.
DATA_PATH = './data/transactions_train_valid_test_splits.pickle'

In [None]:
# Load the raw splits. NOTE: pickle.load can execute arbitrary code, so
# DATA_PATH must only ever point at a trusted file.
with open(DATA_PATH, 'rb') as f:
    data = pickle.load(f)
train_data, valid_data, test_data = data['train'], data['valid'], data['test']

# Give the two "?"-suffixed columns plain names, editing each split in place.
for split_frame in (train_data, valid_data, test_data):
    split_frame.rename(columns={"Errors?":"Errors", "Is Fraud?": "Fraud"}, inplace=True)

In [None]:
def casting_dataframe(df):
    """Cast the raw transaction columns to compact dtypes and add date-derived columns.

    The frame is modified in place and the same object is returned.

    Reads "Year", "Month", "Day", "Time", "User", "Card", "Amount", "Errors",
    "IsFraud_target" and "MCC"; adds "Date", "Dow", "Hour", "Minute" and
    "Outcome"; drops "Time", "Fraud" and "Zip".
    """
    # Build the "Y-M-D HH:MM" text from the still-uncast parts; it is parsed below.
    df["Date"] = (
        df["Year"].astype(str) + "-" + df["Month"].astype(str) + "-" + df["Day"].astype(str)
        + " " + df["Time"]
    )

    # Plain dtype casts, one (column, dtype) pair per row.
    for column, dtype in (
        ("User", np.int16),
        ("Card", str),
        ("Year", np.int16),
        ("Month", np.int8),
        ("Day", np.int8),
    ):
        df[column] = df[column].astype(dtype)

    # "$12.34" -> 12.34
    df["Amount"] = df["Amount"].str.strip("$").astype(np.float32)
    # Missing error text becomes its own "Empty" category.
    df["Errors"] = df["Errors"].fillna("Empty")
    df["IsFraud_target"] = df["IsFraud_target"].astype(np.int8)
    df["MCC"] = df["MCC"].astype(str)

    df["Date"] = pd.to_datetime(df["Date"], format='%Y-%m-%d %H:%M')
    df["Dow"] = df["Date"].dt.dayofweek.astype(np.int8)

    # "HH:MM" -> hour and minute components.
    clock_parts = df["Time"].str.split(":")
    df["Hour"] = clock_parts.str[0].astype(np.int8)
    df["Minute"] = clock_parts.str[1].astype(np.int8)

    # 1 for negative amounts, 0 otherwise.
    df["Outcome"] = (df["Amount"] < 0).astype(np.int8)

    df.drop(columns=["Time", "Fraud", "Zip"], inplace=True)
    return df

In [None]:
# Run the dtype casts / date-derived columns over every split.
train_data, valid_data, test_data = [
    casting_dataframe(split_frame)
    for split_frame in (train_data, valid_data, test_data)
]

In [None]:
# Natural cycle lengths of the calendar columns encoded in this notebook.
_CALENDAR_PERIODS = {"Month": 12, "Day": 31, "Dow": 7, "Hour": 24, "Minute": 60}


def cyclic_preprocessing(X, col, period=None):
    """
    Apply sin and cos transformation over cyclic variable

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the column to encode.
    col : str
        Name of the cyclic column in ``X``.
    period : number, optional
        Length of one full cycle. When omitted, the known calendar period is
        used for "Month", "Day", "Dow", "Hour" and "Minute"; for any other
        column the number of distinct values in ``X[col]`` is used.

    Returns
    -------
    tuple of pandas.Series
        ``(sin, cos)`` of ``2 * pi * X[col] / period``, both float32.
    """
    x = X[col]
    if period is None:
        # A fixed period keeps the encoding identical across frames: counting
        # distinct values would shrink the cycle whenever a frame happens to
        # miss some hours/days/months.
        period = _CALENDAR_PERIODS.get(col)
    if period is None:
        # Data-driven fallback for columns without a known period.
        period = len(x.unique())
    angle = 2 * np.pi * x / period
    return np.sin(angle).astype(np.float32), np.cos(angle).astype(np.float32)

In [None]:
def preprocces_cyclic_features(X, features):
    """Add ``<name>_sin`` and ``<name>_cos`` columns to ``X`` for every name in ``features``.

    Each pair comes from ``cyclic_preprocessing(X, name)``. ``X`` is modified
    in place and returned.
    """
    for name in features:
        sin_part, cos_part = cyclic_preprocessing(X, name)
        X[name + '_sin'] = sin_part
        X[name + '_cos'] = cos_part
    return X

In [None]:
def _matches_user_mode(df, column):
    """Return an int8 Series that is 1 where ``column`` equals the user's most frequent value."""
    most_common = df.groupby(by='User')[column].agg(lambda values: values.value_counts().index[0])
    # Broadcast each user's most frequent value back onto that user's rows.
    return (df[column] == df["User"].map(most_common)).astype(np.int8)


def preprocessing(df, something):
    """Derive per-user features and drop the raw columns they were built from.

    The frame is sorted and modified in place, and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Date", "User", "Errors", "Merchant Name",
        "Merchant City", "Merchant State", "Year", "Month", "Day", "Dow",
        "Hour" and "Minute". "Date" has to be datetime-like and
        "Merchant Name" numeric, because both are differenced.
    something : dict
        ``something["Errors"]`` lists the error labels that are collapsed into
        the single label "Rare".

    Returns
    -------
    pandas.DataFrame
        ``df`` with "Time_diff", "is_diff_merchant", "is_diff_merchant_city",
        "is_diff_merchant_state" and the cyclic ``*_sin`` / ``*_cos`` columns
        added, and the raw calendar / merchant columns removed.
    """
    ## Sort values by date so per-user diffs compare consecutive transactions
    df.sort_values(by='Date', inplace=True, ignore_index=True)

    # Seconds since the user's previous transaction (0 for the first one).
    # total_seconds() keeps whole days; Timedelta.seconds would drop them and
    # wrap every gap of a day or more back into the 0..86399 range.
    gap = df.groupby(by='User')["Date"].diff().fillna(pd.Timedelta(seconds=0))
    df["Time_diff"] = gap.dt.total_seconds().astype(np.int64)

    # Fill NA in merchant state. Assigned back rather than a chained
    # `df[col].fillna(..., inplace=True)`, which does not update `df` under
    # pandas copy-on-write.
    df["Merchant State"] = df["Merchant State"].fillna("ONLINE")

    # Collapse rare error labels into one category
    rare_errors = something["Errors"]
    df.loc[df["Errors"].isin(rare_errors), "Errors"] = "Rare"

    # 1 when the merchant differs from the user's previous transaction
    merchant_step = df.groupby('User')["Merchant Name"].diff()
    df["is_diff_merchant"] = (merchant_step.fillna(0) != 0).astype(np.int8)

    # NOTE(review): despite the "is_diff" names, these two flags are 1 when the
    # city / state EQUALS the user's most frequent one. The values are kept as
    # they were; only the per-user loop was replaced by a vectorised lookup,
    # which also yields int8 instead of float64.
    df["is_diff_merchant_city"] = _matches_user_mode(df, "Merchant City")
    df["is_diff_merchant_state"] = _matches_user_mode(df, "Merchant State")

    # Prepare cyclic feature
    cyclic_features = ["Month", "Day", "Dow", "Hour", "Minute"]
    df = preprocces_cyclic_features(df, cyclic_features)

    df.drop(columns=["Year", "Merchant Name", "Merchant City", "Merchant State",
                    "Month", "Day", "Dow", "Hour", "Minute"], inplace=True)
    return df

In [None]:
# Total number of training rows per error label (summed over both target classes).
error_totals = pd.crosstab(train_data.Errors, train_data.IsFraud_target).sum(axis=1)

# Labels seen fewer than 2500 times are the ones later collapsed into "Rare".
indexes_with_low_proba = error_totals[error_totals < 2500].index.tolist()

something = {"Errors": indexes_with_low_proba}

In [None]:
# Feature engineering for every split, using the rare-error list built from train.
train_data, valid_data, test_data = [
    preprocessing(split_frame, something)
    for split_frame in (train_data, valid_data, test_data)
]

In [None]:
# Persist the three preprocessed splits in one pickle, keyed by split name.
preprocessed_splits = {"train":train_data, "valid":valid_data, "test":test_data}
with open("data/preprocessed.pickle", "wb") as file:
    pickle.dump(preprocessed_splits, file)

In [2]:
# Columns that are treated as categorical and label-encoded further down.
categorical_var = [
    "Card",
    "Use Chip",
    "MCC",
    "Errors",
    "Outcome",
    "is_diff_merchant",
    "is_diff_merchant_city",
    "is_diff_merchant_state",
]

In [3]:
def categorical_to_unique(df, categorical_var):
    """Turn each listed column into strings of the form "<column name> <value>".

    Prefixing with the column name keeps equal raw values from different
    columns apart. ``df`` is modified in place and returned.
    """
    # Stringify every listed column first, then prefix them one by one.
    df[categorical_var] = df[categorical_var].astype(str)
    for column in categorical_var:
        prefixed = column + " " + df[column]
        df[column] = prefixed
    return df

In [4]:
def unique_labels(df, categorical_var):
    """Build a ``label -> integer index`` dictionary over the listed columns.

    Distinct values are numbered consecutively, column by column, in order of
    first appearance within each column. A label that occurs under more than
    one column keeps the index of its last occurrence.
    """
    label_to_index = {}
    position = 0
    for column in categorical_var:
        for label in df[column].unique().tolist():
            label_to_index[label] = position
            position += 1
    return label_to_index

In [5]:
def mapping_column(df, columns, mapping_labels):
    """Replace the values of each listed column through ``mapping_labels``.

    Values that are not keys of ``mapping_labels`` become NaN (the behaviour of
    ``Series.map`` with a dict). ``df`` is modified in place and returned.
    """
    for name in columns:
        encoded = df[name].map(mapping_labels)
        df[name] = encoded
    return df

In [16]:
# Reload the preprocessed splits. NOTE: pickle.load can execute arbitrary
# code, so this file must come from a trusted source.
with open("data/preprocessed.pickle", 'rb') as f:
    data = pickle.load(f)
train_data, valid_data, test_data = data['train'], data['valid'], data['test']

In [17]:
# Prefix every categorical value with its column name, in all three splits.
train_data, valid_data, test_data = [
    categorical_to_unique(split_frame, categorical_var)
    for split_frame in (train_data, valid_data, test_data)
]

In [8]:
# The label -> index mapping is built from the training split only; the same
# mapping is then applied to valid and test.
mapping_labels = unique_labels(train_data, categorical_var)

In [18]:
# Encode the categorical columns of every split with the train-derived mapping.
train_data, valid_data, test_data = [
    mapping_column(split_frame, categorical_var, mapping_labels)
    for split_frame in (train_data, valid_data, test_data)
]

In [20]:
# Preview the first rows of the test split.
test_data.head()

Unnamed: 0,User,Card,Amount,Use Chip,MCC,Errors,IsFraud_target,Date,Outcome,Time_diff,...,Month_sin,Month_cos,Day_sin,Day_cos,Dow_sin,Dow_cos,Hour_sin,Hour_cos,Minute_sin,Minute_cos
0,1683,3,66.169998,9,23,121,0,1991-07-01 12:36:00,129,0,...,-0.5,-0.866025,0.201299,0.97953,0.0,1.0,1.224647e-16,-1.0,-0.587785,-0.809017
1,1683,3,65.0,9,23,121,0,1991-07-01 12:47:00,129,660,...,-0.5,-0.866025,0.201299,0.97953,0.0,1.0,1.224647e-16,-1.0,-0.978148,0.207912
2,1683,3,-65.0,9,23,121,0,1991-07-01 12:55:00,130,480,...,-0.5,-0.866025,0.201299,0.97953,0.0,1.0,1.224647e-16,-1.0,-0.5,0.866025
3,1683,3,63.700001,9,22,121,0,1991-07-01 15:08:00,129,7980,...,-0.5,-0.866025,0.201299,0.97953,0.0,1.0,-0.7071068,-0.707107,0.743145,0.669131
4,1683,3,77.830002,9,29,121,0,1991-07-02 12:12:00,129,75840,...,-0.5,-0.866025,0.394356,0.918958,0.781832,0.62349,1.224647e-16,-1.0,0.951057,0.309017


In [19]:
# Write each encoded split to its own pickle file.
for target_path, split_frame in (
    ("data/train_encoded.pickle", train_data),
    ("data/valid_encoded.pickle", valid_data),
    ("data/test_encoded.pickle", test_data),
):
    with open(target_path, "wb") as file:
        pickle.dump(split_frame, file)

In [None]:
# import torch
# import torch.nn as nn

# example = torch.tensor([[0,1,2,3,4],[0,4,2,5,6]])
# emb = nn.Embedding(7, 32)
# res = emb(example)
# all_but_last_two_dims = res.size()[:-2]
# U = res.view(*all_but_last_two_dims, -1)

* batch_size = number of users
* seq_len = window for user
* embedding_dim = features

Ля вот как нормально собрать батчи?

В датасете формировать матрицу (seq_len, embedding_dim) для каждого юзера, где в `__getitem__` подавать индекс юзера. Там же формировать последовательность заданной длины seq_len из случайных наблюдений, но хотя бы с одним фродом у юзера.

Тогда даталоадер будет формировать батчи размерности (batch_size, seq_len, embedding_dim). В `collate_fn` паддить последовательности до максимальной длины.

Линейный слой нормально должен работать с размерностью batch_size, seq_len, embedding_dim.

Возможно, лосс придётся считать в цикле по элементам батча: `sum(loss(seq, target) for seq, target in batch)`.