In [1]:
import numpy as np
from setting import api_dic_path
from log import ApiLog, InputData
from tqdm import tqdm

In [2]:
from torch.utils.data import Dataset
import torch
class ContextDataset(Dataset):
    """Sliding-window dataset pairing each context window with one target.

    Sample ``i`` is ``(data[s:s + window_size], tar[s + padding_length])``
    where ``s = i * stride`` is the window start inside its source sequence.
    With ``window_size == 2 * padding_length + 1`` the target is therefore the
    one aligned with the centre row of the window.

    Datasets joined with ``+`` remember where each source sequence starts, so
    no window ever straddles two sequences and
    ``len(a + b) == len(a) + len(b)``.

    Args:
        data: Input rows; must support ``len()`` and slicing (and
            ``torch.cat`` along dim 0 when ``+`` is used).
        tar: Targets, indexed in parallel with ``data``.
        padding_length: Offset from a window's first row to the row whose
            target is returned.
        window_size: Number of consecutive rows in each window.
        stride: Step, in rows, between the starts of consecutive windows.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is smaller than 1.
    """

    def __init__(self, data, tar, padding_length=50, window_size=50, stride=1):
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        self.data = data
        self.tar = tar
        self.padding_length = padding_length
        self.window_size = window_size
        self.stride = stride
        # Source sequences as (data_start, data_length, tar_start) triples.
        # None means "one sequence spanning all of self.data"; it is resolved
        # lazily so the dataset stays correct if self.data is reassigned.
        self._segments = None

    def _get_segments(self):
        """Return the list of (data_start, data_length, tar_start) triples."""
        if self._segments is None:
            return [(0, len(self.data), 0)]
        return self._segments

    def _window_count(self, length):
        """Return how many full windows fit in a sequence of ``length`` rows."""
        if length < self.window_size:
            # Too short for even one window; never report a negative count.
            return 0
        return (length - self.window_size) // self.stride + 1

    def __getitem__(self, index):
        """Return the ``(window, target)`` pair for sample ``index``.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        position = index + total if index < 0 else index
        if not 0 <= position < total:
            raise IndexError(
                f"index {index} is out of range for ContextDataset of length {total}"
            )
        # Linear scan: the number of segments equals the number of datasets
        # that were added together, which is expected to be small.
        for data_start, data_length, tar_start in self._get_segments():
            count = self._window_count(data_length)
            if position < count:
                offset = position * self.stride  # window start inside this sequence
                begin = data_start + offset
                window = self.data[begin:begin + self.window_size]
                return window, self.tar[tar_start + offset + self.padding_length]
            position -= count
        # Unreachable while len(self) agrees with the segment table.
        raise IndexError(f"index {index} could not be resolved")

    def __len__(self):
        """Return the total number of windows over all source sequences."""
        # For a single padded sequence with window_size == 2 * padding_length + 1
        # and stride == 1 this equals the length of the data without padding.
        return sum(self._window_count(length) for _, length, _ in self._get_segments())

    def __add__(self, other):
        """Concatenate two datasets without creating cross-sequence windows.

        The result uses this dataset's ``padding_length``, ``window_size`` and
        ``stride``.  Operands that are not ``ContextDataset`` instances fall
        back to the base ``Dataset`` concatenation.
        """
        if not isinstance(other, ContextDataset):
            return super().__add__(other)
        new_data = torch.cat((self.data, other.data), dim=0)
        new_tar = torch.cat((self.tar, other.tar), dim=0)
        merged = ContextDataset(new_data, new_tar, self.padding_length, self.window_size, self.stride)
        # Shift the right-hand operand's sequences past everything in self.
        data_shift, tar_shift = len(self.data), len(self.tar)
        merged._segments = list(self._get_segments()) + [
            (data_start + data_shift, data_length, tar_start + tar_shift)
            for data_start, data_length, tar_start in other._get_segments()
        ]
        return merged

In [5]:
class MyDataset:
    """Load every ``*.csv`` log under ``path`` into one ``ContextDataset``.

    Each file is parsed with ``ApiLog``, converted to ``(data, label)`` by
    ``InputData.get_train_data()`` and wrapped in a ``ContextDataset`` whose
    window covers ``padding_length`` rows on either side of a centre row.
    The per-file datasets are joined with ``+`` and stored in ``self.data``.

    Args:
        path: Directory object providing ``glob`` (e.g. ``pathlib.Path``).
        padding_length: Rows of context on each side of the centre row.
        stride: Step between the starts of consecutive windows.

    Raises:
        ValueError: If the token table is empty or no CSV file is found.
    """

    def __init__(self, path, padding_length=50, stride=1):
        self.path = path
        self.padding_length = padding_length
        self.stride = stride
        # NOTE(security): allow_pickle=True unpickles the file, which can run
        # arbitrary code -- only load token tables from a trusted source.
        self.api2token = np.load(api_dic_path, allow_pickle=True).item()
        self.model_dim = self.get_model_dim()
        # glob() yields files in filesystem order; sort for a reproducible
        # sample order across machines and runs.
        self.file_list = sorted(self.path.glob("*.csv"))
        self.file_num = len(self.file_list)
        self.data = self.get_dataset()

    def get_model_dim(self):
        """Return the length of the first token vector in ``api2token``.

        Raises:
            ValueError: If ``api2token`` has no entries.
        """
        try:
            first_vector = next(iter(self.api2token.values()))
        except StopIteration:
            raise ValueError("api2token is empty; cannot infer model_dim") from None
        return len(first_vector)

    def get_dataset(self):
        """Build one ``ContextDataset`` per file and join them with ``+``.

        Raises:
            ValueError: If ``file_list`` is empty.
        """
        if not self.file_list:
            raise ValueError(f"no *.csv files found in {self.path}")
        # One centre row plus padding_length rows of context on each side.
        window_size = 2 * self.padding_length + 1
        datalist = []
        for file in tqdm(self.file_list):
            content = ApiLog(file)
            input_data = InputData(content.df, self.api2token, self.model_dim)
            data, label = input_data.get_train_data()
            datalist.append(
                ContextDataset(
                    data,
                    label,
                    padding_length=self.padding_length,
                    window_size=window_size,
                    stride=self.stride,
                )
            )
        combined_dataset = datalist[0]
        for ds in datalist[1:]:
            combined_dataset = combined_dataset + ds
        return combined_dataset


In [None]:
from setting import nonrepro_path

# Scratch setup: build the (data, label) pair for a single log file so the
# windowing done by ContextDataset can be inspected by hand.
# glob() order is filesystem-dependent; sort so file_list[0] is reproducible.
file_list = sorted(nonrepro_path.glob("*.csv"))
content = ApiLog(file_list[0])
# NOTE(security): allow_pickle=True unpickles the file, which can run arbitrary
# code -- only load token tables from a trusted source.
api2token = np.load(api_dic_path, allow_pickle=True).item()
# Take the vector width from the token table (the same rule as
# MyDataset.get_model_dim) instead of hard-coding it.
model_dim = len(next(iter(api2token.values())))
input_data = InputData(content.df, api2token, model_dim)
data, label = input_data.get_train_data()

In [None]:
# Two separate dataset objects over the same tensors: 50 rows of context on
# either side of the centre row gives a 101-row window.
d1, d2 = (
    ContextDataset(data, label, padding_length=50, window_size=101, stride=1)
    for _ in range(2)
)

In [None]:
# d1 and d2 are built from identical arguments, so both counts should match.
tuple(len(ds) for ds in (d1, d2))

In [None]:
# Check what container get_train_data() returned; ContextDataset.__add__ passes
# it to torch.cat, so a torch.Tensor is presumably expected here.
type(data)

In [None]:
# Uses ContextDataset.__add__, which concatenates the data and target tensors
# of both operands into a new ContextDataset.
d3 = d1 + d2

In [None]:
# Sample count of the concatenated dataset, to compare with len(d1) and len(d2).
len(d3)

In [None]:
from torch.utils.data import DataLoader

# Draw the windows of d1 in random order, 32 per mini-batch.
loader = DataLoader(dataset=d1, shuffle=True, batch_size=32)

In [6]:
# nonrepro_path is also imported in an earlier cell; it is repeated here,
# presumably so this cell runs on its own -- confirm before deduplicating.
from setting import nonrepro_path
# Construction is eager: MyDataset.__init__ calls get_dataset(), which parses
# every *.csv under nonrepro_path before returning.
adataset = MyDataset(nonrepro_path, padding_length=50, stride=1)

100%|██████████| 5/5 [00:34<00:00,  6.81s/it]


In [10]:
# adataset.data is the combined ContextDataset, so [0] goes through its
# __getitem__ and yields a (window, target) pair.
adataset.data[0]

(tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00],
