# Предобработка данных

Здесь все доступные таблицы загружаются, очищаются от NaNов и удаляются строки с ошибочными данными.
Далее объединение основной таблицей с геотаблицей.

In [24]:
import numpy as np
import pandas as pd

In [25]:
df = pd.read_csv('dataset/train.csv', sep=';')

df.dropna(axis=0, how='any', inplace=True)

In [26]:
geo_df = pd.read_csv('dataset/geo_info.csv', sep=';')
df = df.join(geo_df, on='geo_id', lsuffix="bad_", how='left')

In [27]:
df.shape

(749999, 9)

Объединение основной таблицы с таблицей векторов (подмешиваются в MLP-слой)

In [28]:
from sklearn.preprocessing import StandardScaler

vectors_df = pd.read_csv('dataset/referer_vectors.csv', sep=';').set_index('referer')
scaled_features = StandardScaler().fit_transform(vectors_df.values)
vectors_df = pd.DataFrame(scaled_features, index=vectors_df.index, columns=vectors_df.columns)

In [29]:
df.shape

(749999, 9)

Добавление целевой переменной

In [30]:
labels = pd.read_csv('dataset/train_labels.csv', sep=';')
df = df.merge(labels, how='left', on='user_id')
df.dropna(axis=0, how='any', subset=['target'], inplace=True)

In [31]:
df.shape

(593442, 10)

Очистка и разделение колонок

In [32]:
for i, col in enumerate(df.columns):
    df[col].fillna(f'NaN_{i}', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(f'NaN_{i}', inplace=True)
  df[col].fillna(f'NaN_{i}', inplace=True)


In [33]:
df.shape

(593442, 10)

In [34]:
df.columns

Index(['request_ts', 'user_id', 'referer', 'geo_idbad_', 'user_agent',
       'geo_id', 'country_id', 'region_id', 'timezone_id', 'target'],
      dtype='object')

In [35]:
import ast
def extract_user_agent_data(df):
    new_df = df.copy()

    # Преобразуем строку в словарь с помощью ast.literal_eval
    new_df['user_agent'] = new_df['user_agent'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Извлекаем данные из словаря в отдельные столбцы
    for key in ['browser', 'browser_version', 'os', 'os_version']:
        new_df[key] = new_df['user_agent'].apply(lambda x: x.get(key) if isinstance(x, dict) else None)

    return new_df


df = extract_user_agent_data(df)

In [36]:
def split_referer(df):
    new_df = df.copy()
    
    new_df['ind_referer'] = df['referer']

    # Удаляем 'https://'
    new_df['referer'] = new_df['referer'].str.replace('https://', '', regex=True)

    # Разделяем значения по '/' и создаем новые столбцы
    split_df = new_df['referer'].str.split('/', expand=True)
    split_df = split_df.fillna(value='0')

    # Добавляем новые столбцы в исходный DataFrame
    new_df = new_df.join(split_df, lsuffix='original')

    return new_df


df = split_referer(df)
df = df.drop(['referer','user_agent', "geo_idbad_", "request_ts"], axis=1)

In [37]:
df[1].loc[df[1] == ""] = '0'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[1].loc[df[1] == ""] = '0'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1].loc[df[1] == ""] = '0'


In [38]:
df

Unnamed: 0,user_id,geo_id,country_id,region_id,timezone_id,target,browser,browser_version,os,os_version,ind_referer,0,1
0,fb858e8e0a2bec074450eaf94b627fd3,7854.0,c31b4e,30af3a,12119c5,0.0,Chrome Mobile,119.0.0,Android,10,https://9b48ee5/,9b48ee5,0
1,46a5f128fd569c764a92c2eaa788095e,NaN_5,NaN_6,NaN_7,NaN_8,0.0,Chrome Mobile,111.0.0,Android,10,https://9b48ee5/,9b48ee5,0
2,5a74e9ac53ffb21a20cce117c0ad77ba,7112.0,10d5df1,NaN_7,f34c29,0.0,Yandex Browser,20.12.5,Android,11,https://9634fd0/1409e548,9634fd0,1409e548
3,af735816ca19115431ae3d89518c8c91,5515.0,a144f8,NaN_7,cff524,0.0,Chrome Mobile,119.0.0,Android,10,https://9b48ee5/,9b48ee5,0
4,364f0ae0a3f29a685c4fb5bae6033b9a,2805.0,c31b4e,54fad0,e56e80,0.0,Yandex Browser,18.11.1,Android,4.4.4,https://9b48ee5/,9b48ee5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
593437,3543d64627ead3a519e3199834e2a148,8906.0,1234f1d,15a554,d903ee,0.0,Chrome Mobile,105.0.0,Android,11,https://6e5ed9f/1301418e,6e5ed9f,1301418e
593438,37df5ff1d739f61d442b164db6281e46,613.0,c31b4e,4b9ad7,e56e80,0.0,Chrome Mobile,94.0.4606,Android,11,https://635e50c/13795cfa,635e50c,13795cfa
593439,b28dbf7b0f2bcbdf01c52715573b93b5,6134.0,ac5671,84f51b,b017ac,0.0,Chrome Mobile,119.0.0,Android,10,https://635e50c/13795cfa,635e50c,13795cfa
593440,dd58499b49a3cbb2b5699ac812c6bbc6,NaN_5,NaN_6,NaN_7,NaN_8,1.0,Chrome Mobile,119.0.0,Android,10,https://6c16abb/,6c16abb,0


In [39]:
df.isna().sum()

user_id            0
geo_id             0
country_id         0
region_id          0
timezone_id        0
target             0
browser            0
browser_version    0
os                 0
os_version         0
ind_referer        0
0                  0
1                  0
dtype: int64

In [40]:
users = df['user_id'].unique().tolist()

In [41]:
len(users)

499999

In [42]:
df.columns

Index([        'user_id',          'geo_id',      'country_id',
             'region_id',     'timezone_id',          'target',
               'browser', 'browser_version',              'os',
            'os_version',     'ind_referer',                 0,
                       1],
      dtype='object')

In [43]:
import numpy as np
def flatten(xss):
    return [x for xs in xss for x in xs]

vocab = flatten([df[u].unique() for u in df.drop('user_id', axis=1).columns])

In [44]:
len(vocab)

273226

In [45]:
for _, row in df[df['user_id'] == users[20]].iterrows():
    print(row.tolist())

['c255b0aa42612bdf1b2e8088f40c20d0', 'NaN_5', 'NaN_6', 'NaN_7', 'NaN_8', 0.0, 'Yandex Browser', '23.9.9', 'Android', '10', 'https://9b48ee5/', '9b48ee5', '0']


Далее 2d-таблица превращается в 3d-тензор. Данные по каждому отдельному пользователю объединяются в отдельную таблицу. (n_users, max_history_size, n_features)

In [46]:
columns_in_request = len(df.columns) - 3  # minus user_id, ind_referer, target
max_requests_per_user = 8

In [None]:
user_requests = [[['0'] * columns_in_request] * max_requests_per_user] * len(users)
referer_requests = [[['0'] * 10] * max_requests_per_user] * len(users)
labels = [0] * len(users)

In [24]:
from pqdm.processes import pqdm

def process_user(userid_index):
    i, userid = userid_index
    j = 0
    user_data = [['0'] * columns_in_request] * max_requests_per_user
    referer_data = [['0'] * 10] * max_requests_per_user
    for _, row in df[df['user_id'] == userid].drop(['user_id', 'target'], axis=1).iterrows():
        if j == max_requests_per_user:
            break
        user_data[j] = row.drop('ind_referer').tolist()
        referer_data[j] = vectors_df[vectors_df.index.str.fullmatch(row['ind_referer'])].iloc[0].tolist()
        j += 1
    return i, user_data, referer_data, df[df['user_id'] == userid]['target'].iloc[0]

results = pqdm(enumerate(users), process_user, n_jobs=10)  # Adjust n_jobs based on your CPU cores

for i, user_data, referer_data, target in results:
    user_requests[i] = user_data
    referer_requests[i] = referer_data
    labels[i] = target

QUEUEING TASKS | : 0it [00:00, ?it/s]

PROCESSING TASKS | :   0%|          | 0/499999 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/499999 [00:00<?, ?it/s]

In [47]:
df.drop(['ind_referer', 'target'], axis=1, inplace=True)

In [48]:
columns_in_request

10

In [49]:
requests = np.array(user_requests)
referers = np.array(referer_requests)
labels = np.array(labels)

NameError: name 'user_requests' is not defined

In [50]:
print(requests.shape)
print(referers.shape)
print(labels.shape)

NameError: name 'requests' is not defined

# Сохранение и загрузка данных

In [83]:
np.save('sample_full.npy', requests, allow_pickle=True)
np.save('sample_full_referers.npy', referers, allow_pickle=True)
np.save('sample_full_labels.npy', labels, allow_pickle=True)

In [51]:
requests = np.load('sample_full.npy', allow_pickle=True)
referers = np.load('sample_full_referers.npy', allow_pickle=True)
labels = np.load('sample_full_labels.npy', allow_pickle=True)

requests = requests[:,:4,:]
referers = referers[:,:4,:]

In [52]:
requests.shape

(499999, 4, 10)

In [45]:
del user_requests
del referer_requests

NameError: name 'user_requests' is not defined

### Искусственное увеличение датасета. 
Изначально таблица каждого пользователя хранит данные последовательно, количество данных увеличивается путем перемешивания истории браузера и её копирования

In [53]:
rng = np.random.default_rng(seed=41)

In [54]:
shuffled_requests = rng.permutation(requests, axis=1)
requests = np.concatenate((shuffled_requests, requests), axis=0)
del shuffled_requests

In [55]:
shuffled_referers = rng.permutation(referers, axis=1)
referers = np.concatenate((shuffled_referers, referers), axis=0)
del shuffled_referers

In [56]:
rng = np.random.default_rng(seed=42)

In [57]:
shuffled_requests = rng.permutation(requests, axis=1)
requests = np.concatenate((shuffled_requests, requests), axis=0)
del shuffled_requests

In [58]:
shuffled_referers = rng.permutation(referers, axis=1)
referers = np.concatenate((shuffled_referers, referers), axis=0)
del shuffled_referers

In [86]:
labels = np.concatenate((labels, labels, labels, labels), axis=0)

# Объявление модели.
Модель - однослойный трансформер с одной головой, параллельно расположен MLP-блок, обрабатывающий данные векторов. Выводы трансформера и MLP конкатенируются и передаются в другой MLP-блок

In [99]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer, LayerNorm
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

class CustomTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab
        self.token_to_id = {token: idx for idx, token in enumerate(vocab)}
        self.pad_token = '<PAD>'
        self.pad_token_id = len(vocab)
        
    def encode(self, features):
        return np.array([[self.token_to_id.get(feature, self.pad_token_id) for feature in feature_set] for feature_set in features])

class TransformerBinaryClassifier(nn.Module):
    def __init__(self, n_features, vocab_size, d_model=24, nhead=1, num_layers=1, dim_feedforward=128, vectors_len = 40):
        super(TransformerBinaryClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, d_model)  # +1 for padding token
        
        encoder_layers = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout=0.4, activation='gelu')
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers, norm=LayerNorm(d_model))
        
        self.mlp = nn.Sequential(nn.Linear(vectors_len, dim_feedforward),
                                nn.ReLU(),
                                nn.BatchNorm1d(dim_feedforward),
                                nn.Dropout(0.3),
                                nn.Linear(dim_feedforward, vectors_len))
        
        self.fc = nn.Sequential(nn.Linear(d_model * n_features + vectors_len, dim_feedforward),
                                nn.ReLU(),
                                nn.BatchNorm1d(dim_feedforward),
                                nn.Dropout(0.3),
                                nn.Linear(dim_feedforward, 1))
        self.sigmoid = nn.Sigmoid()

    def forward(self, features, vectors):
        batch_size = len(features)
        
        # Convert features to tensor
        tokenized_features = torch.tensor(features)
        
        # Embedding
        embeddings = self.embedding(tokenized_features)
        
        # Reshape for transformer input
        embeddings = embeddings.permute(1, 0, 2).contiguous()
        
        # Transformer encoder
        transformer_out = self.transformer_encoder(embeddings)
        
        # Reshape back to (batch_size, d_model * n_features)
        transformer_out = transformer_out.permute(1, 0, 2).contiguous()
        transformer_out = transformer_out.view(batch_size, -1)
        
        vec = self.mlp(vectors)
        
        # Fully connected layer
        logits = self.fc(torch.cat((transformer_out, vec), dim=1))
        output = self.sigmoid(logits)
        
        return output

In [61]:
from tqdm import tqdm

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        
        for features, refs, labels in tqdm(train_loader, unit="batch", total=len(train_loader)):
            features, refs, labels = features.to(device), refs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(features, refs)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            loss.backward()
            optimizer.step()
        
            running_loss += loss.item() * features.size(0)
        
        scheduler.step()
        
        epoch_loss = running_loss / len(train_loader.dataset)
        
        model.eval()
        val_running_corrects = 0
        all_labels = []
        all_outputs = []
        
        with torch.no_grad():
            for features, refs, labels in tqdm(val_loader, unit="batch", total=len(val_loader)):
                features, refs, labels = features.to(device), refs.to(device), labels.to(device)
                outputs = model(features, refs)
                preds = (outputs > 0.5).int()
                val_running_corrects += (preds == labels.unsqueeze(1)).sum().item()
                all_labels.extend(labels.cpu().numpy())
                all_outputs.extend(outputs.cpu().numpy())
        
        val_acc = val_running_corrects / len(val_loader.dataset)
        val_roc_auc = roc_auc_score(all_labels, all_outputs)
        
        print(f'Epoch {epoch}/{num_epochs - 1}, Loss: {epoch_loss:.4f}, Val Acc: {val_acc:.4f}, LR: {scheduler.get_lr()}, Val ROC AUC: {val_roc_auc:.4f}')

# Создание векторов из таблиц и токенизация

In [63]:
requests_flat = requests.reshape((requests.shape[0], requests.shape[1] * requests.shape[2]))
requests_flat.shape

(1999996, 40)

In [64]:
referers_flat = referers.reshape((referers.shape[0], referers.shape[1] * referers.shape[2]))
referers_flat.shape

(1999996, 40)

In [65]:
# Initialize tokenizer
tokenizer = CustomTokenizer(vocab)

In [66]:
tokens = tokenizer.encode(requests_flat)

In [67]:
tokens.shape

(1999996, 40)

In [87]:
from torch.utils.data import TensorDataset

features = torch.tensor(tokens)
refs = torch.tensor(referers_flat.astype('float32'))
labels = torch.tensor(labels)

In [88]:
labels.shape

torch.Size([1999996])

In [89]:
# Train-test split
features_train, features_val, refs_train, refs_val, labels_train, labels_val = train_test_split(features, refs, labels, test_size=0.05, random_state=42)

# Create datasets and loaders
train_dataset = TensorDataset(features_train, refs_train, labels_train)
val_dataset = TensorDataset(features_val, refs_val, labels_val)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

# Обучение

In [100]:
# Define the model
n_features = 80
model = TransformerBinaryClassifier(40, len(vocab))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function, optimizer, and scheduler
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=10)

  tokenized_features = torch.tensor(features)
  1%|          | 86/14844 [00:01<04:16, 57.62batch/s] 


KeyboardInterrupt: 