In [None]:
import datatable as dt
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim.lr_scheduler import _LRScheduler
import janestreet
from tqdm import tqdm
import pandas as pd
import os

In [None]:
# datatable's fread is used here to speed up reading the CSV file;
# the result is converted straight to a pandas DataFrame.
data = dt.fread('/kaggle/input/jane-street-market-prediction/train.csv').to_pandas()

In [None]:
"""
Reduce Memory Usage by 50%
https://www.kaggle.com/tomwarrens/nan-values-depending-on-time-of-day
"""

## Reduce Memory

def reduce_memory_usage(df):
    """Shrink ``df`` in place by downcasting its columns, then return it.

    Per-column rules:

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest integer
      dtype *of the same signedness* that can hold the column's min and max
      (bounds are inclusive, so e.g. a max of exactly 127 still fits int8);
    * float columns wider than 32 bits are cast to ``float32`` when their
      min and max fit in float32 (float16 is not used; the original author
      disabled it for ``numpy.nanmean`` calculations);
    * every other dtype (bool, datetime, timedelta, complex and pandas
      extension dtypes such as ``category``) is left untouched.

    A column is never widened. The memory footprint before and after, and
    the relative saving, are printed.

    Args:
        df: DataFrame to compact. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    signed_targets = (np.int8, np.int16, np.int32)
    unsigned_targets = (np.uint8, np.uint16, np.uint32)

    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) have no
        # safe generic downcast, so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if kind not in ('i', 'u', 'f'):
            # bool, datetime, timedelta, complex: min/max comparisons against
            # numeric limits are meaningless (or raise) for these.
            continue

        # pandas min/max skip NaN; an empty or all-NaN column yields NaN,
        # which fails every comparison below and is therefore left as is.
        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            limits = np.finfo(np.float32)
            if col_type.itemsize > 4 and limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            continue

        candidates = signed_targets if kind == 'i' else unsigned_targets
        for target in candidates:
            if np.dtype(target).itemsize >= col_type.itemsize:
                break  # nothing narrower than the current dtype fits
            limits = np.iinfo(target)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    # Guard against a zero starting footprint.
    reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print(f"Reduced by {reduction} % ")
    return df

# Downcast the freshly loaded frame; reduce_memory_usage modifies it in place
# and returns the same object.
data = reduce_memory_usage(data)

In [None]:
# Discard the first 85 days of data and every row whose weight equals 0.
rows_to_drop = (data['weight'] == 0) | (data['date'] <= 85)
data = data.drop(index=data.index[rows_to_drop.to_numpy()])

# Columns that must never be fed to the model (targets and identifiers).
ignore_columns = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp','ts_id','date']
features = list(filter(lambda col: col not in ignore_columns, data.columns))

In [None]:
# 1. Fill missing values with the per-column mean.
# 2. Derive the binary target: action is 1 (i.e. buy) when resp > 0, else 0.
# 3. Drop the columns listed in ignore_columns.
column_means = data.mean()
data = (
    data.fillna(column_means)
        .assign(action=lambda frame: (frame['resp'] > 0).astype('int'))
        .drop(columns=ignore_columns)
)

In [None]:
class Timeseries_Dataset(Dataset):
    """Rolling-window view over a feature matrix and its targets.

    Item ``i`` is the window ``X[i : i + seq_len]`` paired with the target of
    the window's last row, ``y[i + seq_len - 1]``. A DataLoader built on this
    dataset therefore yields ``x`` batches with shape
    ``(batch_size, seq_len, n_features)``, suitable as input to sequence
    models.

    Args:
        X: feature array; the first axis is the row (time) axis.
        y: target array with the same number of rows as ``X``.
        seq_len: number of consecutive rows per window (>= 1).

    Raises:
        ValueError: if ``seq_len < 1`` or ``X`` and ``y`` differ in length.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray, seq_len: int = 32):
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
            )
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len

    def __len__(self):
        # Number of complete windows; 0 (never negative) when there are fewer
        # rows than seq_len.
        return max(0, len(self.X) - (self.seq_len - 1))

    def __getitem__(self, index):
        """Return ``{'x': float window, 'y': long target of the last row}``.

        Negative indices count from the end; anything outside the valid range
        raises IndexError instead of returning a truncated or empty window.
        """
        n_windows = len(self)
        if index < 0:
            index += n_windows
        if not 0 <= index < n_windows:
            raise IndexError(f"window index out of range for {n_windows} windows")

        end = index + self.seq_len
        # self.X / self.y are already tensors: slice them directly rather than
        # re-wrapping with torch.tensor(), which warns about copy-construction.
        # clone() keeps the returned window independent of the stored data.
        return {'x': self.X[index:end].clone(),
                'y': self.y[end - 1].long()}

In [None]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(q @ k^T / temperature) @ v``.

    Args:
        temperature: divisor applied to ``q`` before the dot product.
        attn_dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k`` / ``v``.

        The last two axes of each input are ``(length, depth)``; any leading
        axes are treated as batch axes, so both ``(batch, len, d)`` and
        ``(batch, head, len, d)`` layouts work.

        Args:
            q: queries, ``(..., len_q, d_k)``.
            k: keys, ``(..., len_k, d_k)``.
            v: values, ``(..., len_k, d_v)``.
            mask: optional tensor broadcastable to ``(..., len_q, len_k)``;
                positions where ``mask == 0`` receive (almost) zero weight.

        Returns:
            ``(output, attn)`` with shapes ``(..., len_q, d_v)`` and
            ``(..., len_q, len_k)``.
        """
        # Transpose the last two axes (not fixed axes 1 and 2) so inputs with
        # an extra head axis are handled as well.
        attn = torch.matmul(q / self.temperature, k.transpose(-2, -1))

        if mask is not None:
            attn = attn.masked_fill(mask == 0, -1e9)

        attn = self.dropout(F.softmax(attn, dim=-1))
        output = torch.matmul(attn, v)

        return output, attn

class MultiHeadAttention(nn.Module):
    """Multi-head attention with residual connection and layer norm.

    Queries, keys and values are linearly projected, split into ``n_head``
    heads, attended per head, concatenated, projected back to ``d_model`` and
    added to the original query input before layer normalization.

    NOTE: earlier revisions built ``k`` and ``v`` from the projected ``q`` and
    split heads with a flat ``view`` that mixed heads with time steps. That is
    fixed here. Parameter names and shapes are unchanged, so old checkpoints
    still load, but they were fitted to the old computation and should be
    retrained.

    Args:
        n_head: number of attention heads.
        d_model: width of the inputs and of the output.
        d_k: per-head width of queries and keys.
        d_v: per-head width of values.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)

        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, q, k, v):
        """Run multi-head attention.

        Args:
            q: ``(batch, len_q, d_model)`` queries (also the residual input).
            k: ``(batch, len_k, d_model)`` keys.
            v: ``(batch, len_k, d_model)`` values.

        Returns:
            ``(output, attn)``: ``output`` is ``(batch, len_q, d_model)``;
            ``attn`` is ``(batch * n_head, len_q, len_k)``, batch-major (all
            heads of sample 0 first, then sample 1, ...).
        """
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        batch_size, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)
        residual = q

        # Project, then expose the head axis: b x len x n_head x d.
        q = self.w_qs(q).view(batch_size, len_q, n_head, d_k)
        k = self.w_ks(k).view(batch_size, len_k, n_head, d_k)
        v = self.w_vs(v).view(batch_size, len_v, n_head, d_v)

        # Move heads next to the batch axis and fold them into it so that the
        # attention module sees ordinary 3-D input: (b * n_head) x len x d.
        q = q.transpose(1, 2).reshape(batch_size * n_head, len_q, d_k)
        k = k.transpose(1, 2).reshape(batch_size * n_head, len_k, d_k)
        v = v.transpose(1, 2).reshape(batch_size * n_head, len_v, d_v)

        out, attn = self.attention(q, k, v)

        # Undo the fold and concatenate the heads: b x len_q x (n_head * d_v).
        out = out.reshape(batch_size, n_head, len_q, d_v).transpose(1, 2)
        out = out.reshape(batch_size, len_q, n_head * d_v)

        out = self.dropout(self.fc(out))
        out = self.layer_norm(out + residual)

        return out, attn


class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block applied to the last axis of its input.

    Computes ``LayerNorm(x + Dropout(W2 · relu(W1 · x)))``.

    Args:
        d_in: width of the input and of the output.
        d_hid: width of the hidden layer.
        dropout: dropout probability applied to the second layer's output.
    """

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        # Expand to the hidden width, then project back to the input width.
        self.w_1 = nn.Linear(d_in, d_hid)
        self.w_2 = nn.Linear(d_hid, d_in)
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = F.relu(self.w_1(x))
        projected = self.dropout(self.w_2(hidden))
        # Residual connection around the two linear layers, then normalize.
        return self.layer_norm(projected + x)

class EncoderLayer(nn.Module):
    """Self-attention + feed-forward encoder with a 2-class output head.

    The input sequence goes through multi-head self-attention and a
    position-wise feed-forward block; the representation of the last time
    step is then projected to two logits.

    Args:
        d_model: width of the input features.
        d_inner: hidden width of the feed-forward block.
        n_head: number of attention heads.
        d_k: per-head width of queries and keys.
        d_v: per-head width of values.
        dropout: dropout probability passed to both sub-layers.
    """

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super().__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
        self.last_linear = nn.Linear(d_model, 2)

    def forward(self, enc_input):
        """Map ``(batch, seq_len, d_model)`` input to ``(batch, 2)`` logits."""
        # The same tensor serves as query, key and value (self-attention);
        # the attention weights are not needed downstream.
        enc_output, _ = self.slf_attn(enc_input, enc_input, enc_input)
        enc_output = self.pos_ffn(enc_output)
        # Only the final time step is classified, so slice before the linear
        # head instead of projecting every position and discarding the rest.
        return self.last_linear(enc_output[:, -1, :])

In [None]:
# Training hyper-parameters.
num_epochs = 10
batch_size = 4096
lr = 0.0005

# Model hyper-parameters.
d_inner = 89
n_head = 8
d_k = 45
d_v = 45

seq_dim = 32  # rolling-window length (rows per input sequence)
target_column = 'action'

feature_columns = data.columns[~data.columns.isin([target_column])]
# The model width must equal the number of input features, so derive it from
# the data instead of hard-coding it (131 for this dataset: weight + 130 features).
d_model = len(feature_columns)

# Chronological split: first 80% of the rows for training, last 20% for
# validation. The validation slice previously started at 20%, which made it
# overlap the training rows.
split_idx = int(len(data) * 0.8)
train, validation = data[:split_idx], data[split_idx:]
train_features, train_target = train[feature_columns], train[[target_column]]
validation_features, validation_target = validation[feature_columns], validation[[target_column]]

# .values hands the datasets plain numpy arrays.
train_dataset = Timeseries_Dataset(X=train_features.values, y=train_target.values, seq_len=seq_dim)
validation_dataset = Timeseries_Dataset(X=validation_features.values, y=validation_target.values, seq_len=seq_dim)

# shuffle=False keeps batches in row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Optional pre-trained checkpoint: when this file exists, training is skipped
# and the weights are loaded from it in the next cell.
weight = '/kaggle/input/weight-lstm/best_30.pth'

phase_training = not os.path.exists(weight)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = EncoderLayer(d_model, d_inner, n_head, d_k, d_v)
model = model.to(device)
if phase_training:
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
    print('Start model training ...')
    best_acc = 0.0
    # Early stopping: give up after `patience` consecutive epochs without a
    # validation-accuracy improvement.
    patience, trials = 100, 0
    for epoch in range(1, num_epochs + 1):
        model.train()
        for train_batch in train_loader:
            batch_x = train_batch['x'].to(device)
            # view(-1) flattens (batch, 1) targets to (batch,); unlike
            # squeeze() it keeps the batch axis when the batch has one sample.
            batch_y = train_batch['y'].to(device).view(-1)
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # `loss` is the loss of the last training batch of this epoch.
        print(f'Epoch {epoch} last training batch loss: {loss.item():2.2}')

        model.eval()
        correct, total = 0, 0
        # No gradients are needed for validation; skipping them saves memory.
        with torch.no_grad():
            for valid_batch in validation_loader:
                batch_x = valid_batch['x'].to(device)
                batch_y = valid_batch['y'].to(device).view(-1)
                # The model outputs two logits per sample; the index of the
                # larger one (0 or 1) is the predicted class.
                predicted = model(batch_x).argmax(dim=1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()  # tensor -> Python number

        acc = correct / total if total else 0.0

        print(f'Epoch: {epoch:3d}. Loss: {loss.item():.4f}. Acc.: {acc:2.2%}')

        if acc > best_acc:
            trials = 0
            best_acc = acc
            # Keep only the weights of the best epoch so far.
            torch.save(model.state_dict(), 'best.pth')
            print(f'Epoch {epoch} best model saved with accuracy: {best_acc:2.2%}')
        else:
            trials += 1
            if trials >= patience:
                print(f'Early stopping on epoch {epoch}')
                break
    print('Training Complete !!!')

In [None]:
# Restore the weights: the best epoch of the run above when this notebook
# trained the model, otherwise the pre-trained checkpoint.
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
checkpoint_path = 'best.pth' if phase_training else weight
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()

# Each test frame holds a single row, so its own column mean is NaN wherever
# the value is missing and cannot be used for imputation. Use the column means
# of the prepared training frame instead.
feature_means = data[feature_columns].mean()

X_test = None  # rolling window of the last seq_dim scored rows
env = janestreet.make_env()
env_iter = env.iter_test()
for (test_df, pred_df) in tqdm(env_iter):
    if test_df['weight'].item() > 0:
        row = (
            pd.DataFrame(test_df, columns=feature_columns)
            .fillna(feature_means)
            .to_numpy(dtype=np.float32)
        )
        if X_test is None:
            # No history yet: start the window as seq_dim copies of the first row.
            X_test = np.repeat(row, seq_dim, axis=0)
        # Slide the window: drop the oldest row, append the newest.
        X_test = np.concatenate([X_test[1:], row], axis=0)
        with torch.no_grad():
            logits = model(torch.tensor(X_test[np.newaxis, :], dtype=torch.float).to(device))
        # Index of the larger logit (0 or 1) is the predicted action.
        pred_df.action = logits.argmax(dim=1).cpu().numpy()
    else:
        # Rows with non-positive weight always get action 0.
        pred_df.action = 0
    env.predict(pred_df)