In [1]:
import torch
import torch.nn as nn
from load_datas import OurData, Dataloader

def de_dim(train_r_data, train_w_data, valid_r_data, valid_w_data, test_r_data, test_w_data):
    """Min-max normalise each feature column of the stacked requester states.

    The ``'s2'`` tensors of the three requester dicts are concatenated along
    dim 0 (train, then valid, then test) and every column is rescaled to
    ``(x - min) / (max - min)``.  A column whose values are all identical has
    no range to scale by and is mapped to 0, which avoids the 0/0 = NaN that
    a plain division would produce.

    Args:
        train_r_data, valid_r_data, test_r_data: mappings whose ``'s2'`` entry
            is a 2-D tensor; all three must share the same number of columns.
        train_w_data, valid_w_data, test_w_data: accepted for signature
            compatibility; not read by this function.

    Returns:
        The normalised, concatenated tensor.  ``torch.cat`` builds a new
        tensor, so the input ``'s2'`` tensors are left unmodified.
    """
    S = torch.cat([train_r_data['s2'], valid_r_data['s2'], test_r_data['s2']], dim=0)
    print(S.size())
    for i in range(S.size(1)):
        # Work on one float32 column at a time: keeps the extra memory small
        # and avoids doing the subtraction/division in a narrower dtype.
        column = S[:, i].float()
        low, high = column.min(), column.max()
        span = high - low
        if span == 0:
            # Constant column: nothing to scale by.
            S[:, i] = 0
        else:
            S[:, i] = (column - low) / span
    # Report the resulting per-column range.
    for i in range(S.size(1)):
        print(S[:,i].min(), S[:,i].max())
    return S

def get_features(data: "OurData"):
    """Turn one data split into requester-side and worker-side records.

    Reads ``data.workers`` (id -> quality value), ``data.projects``
    (id -> project dict) and ``data.entrys`` (iterable of entry dicts).
    Workers whose value is ``-1`` are dropped, and entries made by a dropped
    or unknown worker are skipped.

    For every kept entry one state vector is built, laid out as::

        [ quality of every kept worker / 100            (w_num values)
        | running mean reward per (worker, category)    (w_num * 7 values)
        | project category, project sub_category ]      (2 values)

    and stored as float16.  The running mean already includes the current
    entry; (worker, category) slots that have never been hit are 0.

    Returns:
        ``(requester_data, worker_data)``, two dicts with keys
        ``"s1"``, ``"a"``, ``"r"``, ``"s2"``:

        requester_data
            ``"a"``:  list[int], index of the entry's worker among kept workers.
            ``"r"``:  list, ``withdrawn * score`` of the entry.
            ``"s2"``: float16 tensor of shape (N, dim); shape (0, dim) when no
                      entry was kept.
            ``"s1"``: left as an empty list.  The previous state is simply the
                      preceding row of ``"s2"``, so it is not stored a second
                      time (to save memory).
        worker_data
            ``"a"``:  list, the entry's ``withdrawn`` value (the original notes
                      describe it as "whether the worker took the task").
            ``"r"``:  list, ``withdrawn * total_awards / entry_count`` of the
                      entry's project (described in the original notes as
                      "money").
            ``"s1"``, ``"s2"``: left as empty lists.
    """
    ws, ps, es = data.workers, data.projects, data.entrys
    requester_data = {"s1": [], "a": [], "r": [], "s2": []}
    worker_data = {"s1": [], "a": [], "r": [], "s2": []}

    # Keep only workers whose value is not the -1 marker.
    wws = {w_id: quality for w_id, quality in ws.items() if quality != -1}
    w_index2id = list(wws.keys())
    w_id2index = {w_id: idx for idx, w_id in enumerate(w_index2id)}

    w_num = len(wws)
    e_num = len(es)

    category_size = 7
    history_dim = w_num * category_size
    state_dim = w_num + history_dim + 2

    # Worker quality divided by 100 (presumably a 0-100 score mapped to [0, 1]).
    workers_quality = (torch.tensor([wws[w_id] for w_id in w_index2id]) / 100).to(torch.float32)

    # Running reward sum / hit count per (worker, category) slot.  These live
    # in tensors so they do not have to be re-converted from Python lists on
    # every entry.  The sum is float64 to match Python-float accumulation.
    history_sum = torch.zeros(history_dim, dtype=torch.float64)
    history_count = torch.zeros(history_dim, dtype=torch.float32)

    for k, e in enumerate(es):
        print("%d / %d"%(k, e_num), end='\r')
        w_id = str(e["worker_id"])
        if w_id not in w_id2index:
            continue
        action = w_id2index[w_id]
        reward = e["withdrawn"] * e['score']
        p = ps[str(e['project_id'])]

        # Update the worker's history for this project's category.
        # NOTE(review): assumes p['category'] lies in [0, category_size);
        # a larger value would land in another worker's slot - TODO confirm.
        updated_index = int(action * category_size + p['category'])
        history_sum[updated_index] += reward
        history_count[updated_index] += 1

        # Build the s2 state.  clamp(min=1) makes never-hit slots evaluate to
        # 0 / 1 = 0 instead of dividing by zero.
        mean_history = history_sum.to(torch.float32) / history_count.clamp(min=1)
        project_info = torch.tensor([p['category'], p['sub_category']], dtype=torch.float32)
        state = torch.cat([workers_quality, mean_history, project_info], 0).to(torch.float16)

        requester_data['s2'].append(state)
        requester_data['a'].append(action)
        requester_data['r'].append(reward)

        worker_data['a'].append(e["withdrawn"])
        worker_data['r'].append(e["withdrawn"] * p['total_awards'] / p['entry_count'])

    if requester_data['s2']:
        requester_data['s2'] = torch.stack(requester_data['s2'], dim=0)
    else:
        # torch.stack rejects an empty list; return a well-formed empty matrix.
        requester_data['s2'] = torch.zeros((0, state_dim), dtype=torch.float16)
    print()

    return requester_data, worker_data
       

# Load the three splits and build requester/worker records for each of them,
# in train -> valid -> test order.
dataloader = Dataloader()
train_data, valid_data, test_data = dataloader.get_datas()

(
    (train_r_data, train_w_data),
    (valid_r_data, valid_w_data),
    (test_r_data, test_w_data),
) = (get_features(split) for split in (train_data, valid_data, test_data))







151721 / 151722
19722 / 19723
19389 / 19390


In [3]:

# Stack the requester states of all three splits (train, valid, test) and
# min-max normalise every feature column to (x - min) / (max - min).
S = torch.cat([train_r_data['s2'], valid_r_data['s2'], test_r_data['s2']], dim=0)
print(S.size())
for i in range(S.size(1)):
    # One float32 column at a time: small temporary, and the arithmetic is
    # not carried out in S's narrower dtype.
    column = S[:, i].float()
    low, high = column.min(), column.max()
    span = high - low
    if span == 0:
        # Constant column (e.g. all zeros): dividing would give 0/0 = NaN,
        # which would then spread into the centred matrix and covariance.
        S[:, i] = 0
    else:
        S[:, i] = (column - low) / span

torch.Size([172477, 13226])


In [4]:
# Number of samples (rows), and the data with each column's mean removed.
n = S.shape[0]
S_centered = S - S.mean(dim=0)

In [None]:
# Persist the requester actions and rewards of every split as one flat tuple:
# (train a, train r, valid a, valid r, test a, test r).
torch.save(
    tuple(
        split[key]
        for split in (train_r_data, valid_r_data, test_r_data)
        for key in ('a', 'r')
    ),
    "all_ar.pt",
)

In [6]:
# Drop the per-split dicts.  S was built with torch.cat, and the cells below
# only use S, S_centered and n.
del train_r_data, train_w_data
del valid_r_data, valid_w_data
del test_r_data, test_w_data

In [7]:
# Drop the normalised matrix; the covariance cell below reads only
# S_centered and n.
del S

In [None]:
# Sample covariance of the centred features: (X^T X) / (n - 1).
cov_matrix = torch.mm(S_centered.transpose(0, 1), S_centered).div(n - 1)