In [1]:
import pandas as pd

# Root directory of the input data. Paths in the following cells are built by
# plain string concatenation, so the trailing slash is required.
data_path = "data/"

## read worker

In [2]:
# Load the per-worker quality table and preview its first rows.
# NOTE(review): the preview shows worker_quality == -1 for one worker --
# presumably a "no score" sentinel; confirm before treating it as a rating.
worker_data = pd.read_csv(data_path + "worker_quality.csv")
worker_data.head()

Unnamed: 0,worker_id,worker_quality
0,2392073,-1
1,2527235,73
2,2531356,73
3,2400287,75
4,2523170,71


In [3]:
# Build the worker_id -> worker_quality lookup (both coerced to int).
# Iterating the two columns directly avoids materialising a row Series through
# `.iloc[i]` twice per row, which is needlessly slow on larger frames.
# If a worker_id occurs more than once, the last row wins.
worker = {
    int(worker_id): int(quality)
    for worker_id, quality in zip(
        worker_data["worker_id"], worker_data["worker_quality"]
    )
}

In [4]:
# Sanity check: number of distinct worker ids in the lookup.
len(worker)

1807

## read project

In [5]:
# Load the project list. header=None means the file has no header row, so the
# columns are labelled 0 and 1; the project-loading cell reads column 0 as the
# project id and column 1 as the entry count.
project_list = pd.read_csv(data_path + "project_list.csv", header=None)
project_list.head()

Unnamed: 0,0,1
0,2908089,99
1,2909696,105
2,2909401,127
3,2909378,74
4,2909457,36


project_info结构：
key为project_id，value为：
* sub_category：项目子类别 (0到N的int)
* category：项目大类 (0到N的int)
* entry_count：项目entry的数目，相当于子任务 (int)
* entry_ids：每个entry的id (int)
* client_feedback：请求者的反馈分数 (float)
* average_score：所有entry的平均分 (float)
* total_awards：项目给出的报酬 (float)
* start_date：开始时间 (int，由原始时间字符串中的所有数字按顺序拼接而成)
* deadline：结束时间 (int，编码方式同start_date)


In [6]:
import json

def time_parse(t: str):
    """Collapse a timestamp string into a single integer built from its digits.

    Every character for which ``str.isdigit`` is true is kept, in its original
    order; everything else (separators, letters, ...) is dropped. The remaining
    digit string is parsed with ``int``.

    Raises:
        ValueError: if ``t`` contains no digit characters.
    """
    digits_only = "".join(filter(str.isdigit, t))
    return int(digits_only)

entry_path = data_path + "entry/"
project_path = data_path + "project/"

# Attributes copied verbatim from each project's JSON record. Defined once
# here instead of being rebuilt on every loop iteration.
chosen_attr = ["sub_category", "category", "entry_count",
               "client_feedback", "average_score", "total_awards",
               "start_date", "deadline", "entry_ids"]

# project_id (int) -> dict holding the attributes listed in chosen_attr.
project_info = {}

# Column 0 of project_list is the project id, column 1 its entry count.
# Iterating the columns directly replaces the chained `.iloc[i][k]` lookups.
for raw_id, raw_count in zip(project_list[0], project_list[1]):
    project_id, entry_count = int(raw_id), int(raw_count)

    # Skip projects that have no published entries.
    if entry_count <= 0:
        continue

    with open(project_path + "project_%d.txt" % project_id, "r") as f:
        # The JSON record is read from the first line of the file only;
        # readline() avoids loading any remaining lines into memory.
        project_data = json.loads(f.readline())

    # Attributes of this single project. Note that "entry_count" here comes
    # from the JSON record, not from the project_list column used above.
    p_dict = {attr: project_data[attr] for attr in chosen_attr}

    # Store both dates as integers (digits of the timestamp string).
    p_dict["start_date"] = time_parse(p_dict["start_date"])
    p_dict["deadline"] = time_parse(p_dict["deadline"])

    project_info[project_id] = p_dict

len(project_info)

2489

In [7]:
# 重新映射到[0,N]

# Re-map the raw category / sub_category codes onto dense indices 0..K-1.
#
# The distinct codes are sorted before indices are assigned. Set iteration
# order is an implementation detail, so sorting keeps the mapping stable from
# run to run and makes re-running this cell a no-op: codes that are already
# dense (0..K-1) map onto themselves.
st1 = sorted({info["category"] for info in project_info.values()})
st2 = sorted({info["sub_category"] for info in project_info.values()})

# raw code -> dense index
d_st1 = {code: idx for idx, code in enumerate(st1)}
d_st2 = {code: idx for idx, code in enumerate(st2)}

# Rewrite the codes in place on every project record.
for info in project_info.values():
    info["category"] = d_st1[info["category"]]
    info["sub_category"] = d_st2[info["sub_category"]]

# Show the original codes in index order (position i is the code mapped to i).
print(st1, st2)

[2, 3, 5, 6, 7, 9, 10] [2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 29, 30, 31, 33, 34, 35, 36, 37, 42, 100]


## read entry

entry_info结构：按照时间戳排序的dict，key为e_id，value：
* entry_id: entry的id (int)
* project_id：所属的project id (int)
* worker_id：完成这个任务的worker的id (int)
* score：完成这个任务的worker得到的分数，取该entry最后一次revision的score (int)
* entry_created_at：entry时间戳 (int)
* withdrawn：该entry是否已被撤回 (int, 0或1)

In [8]:
import json, os
# entry id (int) -> dict of selected entry attributes; filled by the loop
# further down in this cell.
entry_info = {}

# NOTE: identical to the time_parse defined in the project-loading cell; this
# redefinition is redundant when the notebook is run top to bottom.
def time_parse(t: str):
    """Collapse a timestamp string into a single integer built from its digits.

    Every character for which ``str.isdigit`` is true is kept, in its original
    order; everything else (separators, letters, ...) is dropped. The remaining
    digit string is parsed with ``int``.

    Raises:
        ValueError: if ``t`` contains no digit characters.
    """
    digits_only = "".join(filter(str.isdigit, t))
    return int(digits_only)

# os.listdir returns names in arbitrary order; sorting makes the load order
# (and therefore which record wins if an entry id repeats) reproducible.
entry_paths = sorted(os.listdir(entry_path))
for e_path in entry_paths:
    # Only files whose name contains "txt" are entry dumps.
    if "txt" not in e_path:
        continue
    # The project id is the second "_"-separated field of the file name.
    p_id = int(e_path.split("_")[1])

    with open(entry_path + e_path, "r") as f:
        # The JSON payload is read from the first line of the file only.
        entry_datas = json.loads(f.readline())['results']

    for entry_data in entry_datas:
        e_id = int(entry_data["id"])
        entry_info[e_id] = {
            "project_id": p_id,
            "entry_id": e_id,
            "worker_id": entry_data["author"],
            # Score attached to the most recent revision of the entry.
            "score": entry_data["revisions"][-1]["score"],
            "entry_created_at": time_parse(entry_data["entry_created_at"]),
            "withdrawn": int(entry_data["withdrawn"]),
        }

# The notebook describes entry_info as a dict ordered by timestamp, but the
# loading loop alone only yields file order. Sort explicitly by creation
# timestamp, using the entry id as a deterministic tie-breaker.
entry_info = dict(
    sorted(
        entry_info.items(),
        key=lambda item: (item[1]["entry_created_at"], item[0]),
    )
)

print(len(entry_info))

486483


In [9]:
# Persist the three lookup tables as JSON files in the working directory.
#
# * Mode "w" instead of "x": exclusive-create raises FileExistsError as soon
#   as the files exist, so the notebook could not be re-run from a fresh
#   kernel. The files are fully derived from the inputs, so overwriting is safe.
# * encoding="utf-8" is explicit because ensure_ascii=False writes non-ASCII
#   characters unescaped; relying on the platform default encoding could fail
#   or produce files that are not valid UTF-8 JSON.
# * JSON object keys are always strings, so the int ids used as dict keys are
#   written as strings and must be converted back by whoever loads the files.
for out_name, payload in (
    ("entrys.json", entry_info),
    ("projects.json", project_info),
    ("workers.json", worker),
):
    with open(out_name, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False)
