In [20]:
import torch
from torch.utils.data import Dataset as TorchDataset
from data.interaction import Interaction
from torch.nn.utils import rnn as rnn_utils
import os
import pandas as pd
from root import DATASET_DIR, absolute, ROOT_DIR
from enum import Enum
from typing import Union, Dict, List
import numpy as np
import copy

class FeatType(Enum):
    Token = (0, "单个离散特征序列")
    TokenSeq = (1, "多个离散特征序列")
    Float = (2, "单个连续特征序列")
    FloatSeq = (3, "多个连续特征序列")
    
    @classmethod
    def from_code(cls, code:Union[str, int]):
        if isinstance(code, str): code = int(code)
        for feat_type in FeatType:
            if feat_type.value[0] == code:
                return feat_type
        return None

class Dataset(TorchDataset):
    def __init__(self, config) -> None:
        super().__init__()
        self.config = config
        
        self._get_preset()
        self._get_field_from_config()
        self._load_data(DATASET_DIR)
        self._data_processing()
        
        
    def _get_field_from_config(self):
        """初始化数据集的通用字段"""
        self.dataset_name = self.config['dataset']
        self.uid_field = self.config["USER_ID_FIELD"]
        self.iid_field = self.config["ITEM_ID_FIELD"]
        self.label_field = self.config["LABEL_FIELD"]
        
        self.split_rate = self.config['split_rate']
    
    def _get_preset(self):
        self.field2type:Dict[str, FeatType] = {}
        
    def _data_processing(self):
        self.feat_name_list = self._build_feat_name_list()
        
    def _load_data(self, data_dir):
        """加载数据集"""
        dataset_dir = os.path.join(data_dir, self.dataset_name)
        self._load_item_feat(dataset_dir, self.dataset_name)
        self._load_user_feat(dataset_dir, self.dataset_name)
        self._load_inter_feat(dataset_dir, self.dataset_name)
        
        
    def _load_feat(self, feat_dir, feat_name):
        path = os.path.join(feat_dir, feat_name)
        if not os.path.exists(path): raise FileNotFoundError(f"{path} not found")
        df = pd.read_csv(
            path, 
            header=0
        )
        
        new_columns = []
        for col in df.columns:
            name, dtype = col.split(":")
            dtype = FeatType.from_code(dtype)
            if dtype is None: raise ValueError(f"feat type {dtype} not found")
            new_columns.append(name)    
            self.field2type[name] = dtype
        df.columns = new_columns
        
        return df

        
    def _load_inter_feat(self, feat_dir, feat_prefix):
        feat_name = f"{feat_prefix}.inter"
        self.inter_feat = self._load_feat(feat_dir, feat_name)
        
    
    def _load_user_feat(self, feat_dir, feat_prefix):
        feat_name = f"{feat_prefix}.user"
        self.user_feat = self._load_feat(feat_dir, feat_name)
    
    def _load_item_feat(self, feat_dir, feat_prefix):
        feat_name = f"{feat_prefix}.item"
        self.item_feat = self._load_feat(feat_dir, feat_name)
        
    def _build_feat_name_list(self):
        feat_name_list = [
            feat_name
            for feat_name in ["inter_feat", "user_feat", "item_feat"]
            if getattr(self, feat_name, None) is not None
        ]
        return feat_name_list
    
    def _change_feat_format(self):
        for feat_name in self.feat_name_list:
            feat = getattr(self, feat_name)
            setattr(self, feat_name, self._dataframe_to_interaction(feat))
    
    def build(self):
        self._change_feat_format()
        dataset = self.split_by_ratio_without_eval(self.split_rate)
        return dataset
        
    def split_by_ratio_without_eval(self, train_ratio:float):
        """分割训练集和测试集"""
        assert 0 < train_ratio < 1
        total_cnt = self.__len__()
        split_ids = self._calcu_split_ids(total_cnt, [train_ratio, 1 - train_ratio])
        next_index = [
            range(start, end)
            for start, end in zip([0] + split_ids, split_ids + [total_cnt])
        ]
        next_df = [self.inter_feat[index] for index in next_index]
        next_ds = [self.copy(_) for _ in next_df]
        return next_ds
        
        
    def copy(self, new_inter_feat):
        """Given a new interaction feature, return a new :class:`Dataset` object,
        whose interaction feature is updated with ``new_inter_feat``, and all the other attributes the same.

        Args:
            new_inter_feat (Interaction): The new interaction feature need to be updated.

        Returns:
            :class:`~Dataset`: the new :class:`~Dataset` object, whose interaction feature has been updated.
        """
        nxt = copy.copy(self)
        nxt.inter_feat = new_inter_feat
        return nxt
        
        

    def _calcu_split_ids(self, tot, ratios):
        """Given split ratios, and total number, calculate the number of each part after splitting.

        Other than the first one, each part is rounded down.

        Args:
            tot (int): Total number.
            ratios (list): List of split ratios. No need to be normalized.

        Returns:
            list: Number of each part after splitting.
        """
        cnt = [int(ratios[i] * tot) for i in range(len(ratios))]
        cnt[0] = tot - sum(cnt[1:])
        for i in range(1, len(ratios)):
            if cnt[0] <= 1:
                break
            if 0 < ratios[-i] * tot < 1:
                cnt[-i] += 1
                cnt[0] -= 1
        split_ids = np.cumsum(cnt)[:-1]
        return list(split_ids)
        
    def _dataframe_to_interaction(self, data:pd.DataFrame):
        data_for_tensor = {}
        for col_name in data:
            assert isinstance(col_name, str)
            value = data[col_name].values
            ftype = self.field2type[col_name]
            if ftype == FeatType.Token:
                data_for_tensor[col_name] = torch.LongTensor(value)
            elif ftype == FeatType.Float:
                data_for_tensor[col_name] = torch.FloatTensor(value)
            else:
                raise NotImplementedError(f"feat type {ftype} not implemented")
        return Interaction(data_for_tensor)
    
    def __len__(self):
        return len(self.inter_feat)
    
                
config_test = {
    "split_rate": 0.2,
    "dataset": "wsdream-rt",
    "USER_ID_FIELD": "user_id",
    "ITEM_ID_FIELD": "item_id",
    "LABEL_FIELD": "rt"
}

data = Dataset(config_test)
train, test = data.build()
        


The batch_size of interaction: 394935
    user_id, torch.Size([394935]), cpu, torch.int64
    service_id, torch.Size([394935]), cpu, torch.int64
    rt, torch.Size([394935]), cpu, torch.float32


The batch_size of interaction: 1579740
    user_id, torch.Size([1579740]), cpu, torch.int64
    service_id, torch.Size([1579740]), cpu, torch.int64
    rt, torch.Size([1579740]), cpu, torch.float32




In [None]:
for item in data.columns:
    print(item)

In [3]:
# 把原始的wsdream数据转成原子形式
# https://recbole.io/cn/atomic_files.html

import pandas as pd
from root import ORIGINAL_DATASET_DIR, absolute, DATASET_DIR
import os
from enum import Enum
import numpy as np


class WSDreamDataType(Enum):
    TP_ONLY = (1, "wsdream-tp")
    RT_ONLY = (2, "wsdream-rt")
    TP_AND_RT = (3, "wsdream-all")

    @classmethod
    def from_code(cls, code:int):
        for wsdream_type in WSDreamDataType:
            if wsdream_type.value[0] == code:
                return wsdream_type
        return None

class BasicDataConvert:

    def load_user_data(self):
        raise NotImplementedError
    
    def loda_item_data(self):
        raise NotImplementedError
    
    def load_inter_data(self):
        raise NotImplementedError
    
    def fit(self):
        raise NotImplementedError
    
ALL_USER_FIELD = ["[User ID]", "[IP Address]", "[Country]", "[IP No.]", "[AS]", "[Latitude]", "[Longitude]"]
ALL_ITEM_FIELD = ["[Service ID]","[WSDL Address]","[Service Provider]","[IP Address]","[Country]","[IP No.]","[AS]","[Latitude]","[Longitude]"]
    
class WSDreamDataConvert(BasicDataConvert):
    
    def __init__(self, wsdream_type:WSDreamDataType) -> None:
        super().__init__()
        
        self.origin_user_field = ["[User ID]", "[Country]", "[AS]"]
        self.user_field = ["user_id", "country", "AS"]
        
        self.origin_item_field = ["[Service ID]", "[Country]", "[AS]"]
        self.item_field = ["service_id", "country", "AS",]
        
        self.inter_field = ["user_id", "service_id"]
        if wsdream_type == WSDreamDataType.RT_ONLY: self.inter_field.append("rt")
        elif wsdream_type == WSDreamDataType.TP_ONLY: self.inter_field.append("tp")
        else: self.inter_field.extend(["rt", "tp"])
            
        self.upath = os.path.join(ORIGINAL_DATASET_DIR, "userlist.txt")
        self.ipath = os.path.join(ORIGINAL_DATASET_DIR, "wslist.txt")
        self.rt_inter = os.path.join(ORIGINAL_DATASET_DIR, "rtMatrix.txt")
        self.tp_inter = os.path.join(ORIGINAL_DATASET_DIR, "tpMatrix.txt")
        
        self.wstype = wsdream_type
        self.output_dir = os.path.join(DATASET_DIR, wsdream_type.value[1])
        
        self.dataset_name = wsdream_type.value[1]
        
        self._load_data()
        
    def _load_data(self):
        self.user_data = self.load_user_data()
        self.item_data = self.loda_item_data()
        self.inter_data = self.load_inter_data()
        for name in ["[Country]", "[AS]"]:
            self._deal_categorical_feat(name)
        
    def _deal_categorical_feat(self, name:str):
        if self.item_data is None and self.user_data is None: self._load_data()
        feat_kinds = []
        if name in self.user_data: feat_kinds.extend(self.user_data[name].unique().tolist())
        if name in self.item_data: feat_kinds.extend(self.item_data[name].unique().tolist())
        feat_kinds = list(set(feat_kinds))
        map_ = {
            feat:idx for idx, feat in enumerate(feat_kinds)
        }
        if name in self.user_data: self.user_data.replace({name:map_}, inplace=True)
        if name in self.item_data: self.item_data.replace({name:map_}, inplace=True)

        
    def _feat_type_wrap(self, type_:str):
        feat_types = []
        if type_ == "user":
            feat_types = [0, 0, 0]
            return list(map(lambda x,y:f'{x}:{y}', self.user_field, feat_types))
        elif type_ == "item":
            feat_types = [0, 0, 0]
            return list(map(lambda x,y:f'{x}:{y}', self.item_field, feat_types))
        else:
            if self.wstype == WSDreamDataType.RT_ONLY or self.wstype == WSDreamDataType.TP_ONLY:
                feat_types = [0, 0, 2]
            else:
                feat_types = [0, 0, 2, 2]
            return list(map(lambda x,y:f'{x}:{y}', self.inter_field, feat_types))
    

    def load_inter_data(self) -> pd.DataFrame:
        rt_path, tp_path = None, None
        if self.wstype == WSDreamDataType.RT_ONLY: rt_path = self.rt_inter
        elif self.wstype == WSDreamDataType.TP_ONLY: tp_path = self.tp_inter
        else: rt_path, tp_path = self.rt_inter, self.tp_inter
        if rt_path and tp_path:
            rt_data = np.loadtxt(rt_path, dtype=np.float64)
            tp_data = np.loadtxt(tp_path, dtype=np.float64)
            rows, cols = np.nonzero(rt_data)
            inter_data = pd.DataFrame({self.inter_field[0]:rows, self.inter_field[1]:cols, self.inter_field[2]:rt_data[rows, cols], self.inter_field[3]: tp_data[rows, cols]})
        else:
            path = self.rt_inter if rt_path else self.tp_inter
            inter_data = np.loadtxt(path, dtype=np.float64)
            rows, cols = np.nonzero(inter_data)
            inter_data = pd.DataFrame({self.inter_field[0]:rows, self.inter_field[1]:cols, self.inter_field[2]:inter_data[rows, cols]})
        return inter_data

    def load_user_data(self) -> pd.DataFrame:
        return pd.read_csv(self.upath, sep="\t", header=0)[self.origin_user_field]
    
    def loda_item_data(self):
        return pd.read_csv(self.ipath, sep="\t", header=0)[self.origin_item_field]
    
    def _convert(self, type_:str):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        if type_ == "user":
            data = self.user_data
        elif type_ == "item":
            data = self.item_data
        else:
            data = self.inter_data
        data.columns = self._feat_type_wrap(type_)
        data.to_csv(os.path.join(self.output_dir, f"{self.dataset_name}.{type_}"), index=False)
        
    def fit(self):
        for type_ in ["user", "item", "inter"]:
            self._convert(type_)
        
    
wc = WSDreamDataConvert(wsdream_type=WSDreamDataType.TP_ONLY)
# wc.load_user_data()
wc.fit()

# data = pd.read_csv(os.path.join(DATASET_DIR, "rt.inter"), sep=",", header=0)


{'Spain': 0, 'New Zealand': 1, 'Peru': 2, 'Philippines': 3, 'India': 4, 'Uruguay': 5, 'Costa rica': 6, 'Slovakia': 7, 'Pakistan': 8, 'Slovenia': 9, 'Kuwait': 10, 'Saudi Arabia': 11, 'Argentina': 12, 'Austria': 13, 'Colombia': 14, 'Kazakhstan': 15, 'Venezuela': 16, 'Puerto Rico': 17, 'Brazil': 18, 'South Africa': 19, 'Korea, Republic of': 20, 'Turkey': 21, 'Croatia': 22, 'Finland': 23, 'Iceland': 24, 'Denmark': 25, 'France': 26, 'Italy': 27, 'United States': 28, 'Hong kong': 29, 'Mexico': 30, 'Ukraine': 31, 'United Arab Emirates': 32, 'Czech Republic': 33, 'Bahamas': 34, 'Hong Kong': 35, 'Japan': 36, 'Latvia': 37, 'United Kingdom': 38, 'Chile': 39, 'Netherlands': 40, 'Thailand': 41, 'Taiwan': 42, 'Romania': 43, 'Lithuania': 44, 'Cyprus': 45, 'Canada': 46, 'Israel': 47, 'Serbia and Montenegro': 48, 'Switzerland': 49, 'The Former Yugoslav Republic of Macedonia': 50, 'Guatemala': 51, 'Vietnam': 52, 'Bulgaria': 53, 'Egypt': 54, 'Oman': 55, 'Russian Federation': 56, 'Belgium': 57, 'Estonia':

In [12]:
data = wc.user_data
for idx in data:
    print(idx)
    print(type(idx))
    print(data[idx].values)
    print("==")



user_id:0
<class 'str'>
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 24

In [None]:
from data.interaction import Interaction

it = Interaction(data)

In [None]:
it.numpy()

In [None]:
# 使用独热编码（One-Hot Encoding）
import numpy as np
import pandas as pd

# 假设有一个离散特征 countries，包含三个不同的取值
countries = np.array(['USA', 'China', 'Germany'])

# 使用 NumPy 进行独热编码
one_hot = np.eye(len(countries))
print(one_hot)
# 使用 pandas 进行独热编码
one_hot_df = pd.get_dummies(countries)
print(one_hot_df)
# 使用类别编码（Label Encoding）
import torch

# 假设有一个离散特征 countries，包含三个不同的取值
countries = np.array(['USA', 'China', 'Germany'])

# 使用 PyTorch 进行类别编码
label_encoder = torch.unique(torch.from_numpy(countries), sorted=True, return_inverse=True)
label_encoded = label_encoder[1]
print(label_encoded)
