In [1]:
import pandas as pd
import glob
import os
from pathlib import Path
import pickle
import random
import numpy as np
from collections import defaultdict
import json
# Root directory of the dataset; the benign_*.csv and vandal_*.csv files are globbed from here.
root = Path('../data/Wiki/vews_dataset_v1.1')

##### 1. Load the raw data and extract attributes.

In [2]:
def _load_labeled_records(pattern, label):
    """Read every CSV under ``root`` whose name matches ``pattern``.

    Parameters
    ----------
    pattern : str
        Glob pattern relative to ``root`` (e.g. ``'benign_*.csv'``).
    label : int
        Label assigned to every username found in the matching files.

    Returns
    -------
    (list of DataFrame, dict)
        One frame per file, restricted to the ``username``, ``pagetitle`` and
        ``isReverted`` columns, plus a ``username -> label`` mapping.
    """
    frames, labels = [], {}
    # Glob order depends on the filesystem; sorting keeps the row order of the
    # concatenated table stable between runs.
    for filename in sorted(root.glob(pattern)):
        _editing_records = pd.read_csv(filename)
        frames.append(_editing_records[['username', 'pagetitle', 'isReverted']])
        labels.update(dict.fromkeys(_editing_records['username'].tolist(), label))
    return frames, labels


_benign_frames, _benign_labels = _load_labeled_records('benign_*.csv', 0)
_vandal_frames, _vandal_labels = _load_labeled_records('vandal_*.csv', 1)

_record_frames = _benign_frames + _vandal_frames
if not _record_frames:
    # Fail with the offending path instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        "no benign_*.csv or vandal_*.csv files found under {}".format(root))

# Benign labels are inserted first and vandal labels second, so a username that
# occurs in both kinds of file ends up labelled 1.
usr2Labels = {**_benign_labels, **_vandal_labels}

editingRecordsRaw = pd.concat(_record_frames, ignore_index=True)
editingRecordsRaw['isReverted'] = editingRecordsRaw['isReverted'].astype('int32')

In [9]:
# Number of users whose label is 0 (the label given to users read from benign_*.csv).
(np.array(list(usr2Labels.values())) == 0).sum()

16496

In [10]:
# Total number of labelled users (benign and vandal together).
len(usr2Labels)

33511

In [8]:
# (rows, columns) of the concatenated edit-record table.
editingRecordsRaw.shape

(756944, 3)

In [7]:
# Shapes of the distinct-username and distinct-pagetitle arrays, in that order.
tuple(pd.unique(editingRecordsRaw[_col]).shape for _col in ('username', 'pagetitle'))

((33511,), (216997,))

##### 2. Randomly select positive and negative samples 

In [22]:
# One row per distinct (username, pagetitle) pair; pairC is the number of raw
# edit records that fall into that pair.
editingRecordsRawPair = (
    editingRecordsRaw
    .groupby(['username', 'pagetitle'])
    .size()
    .to_frame('pairC')
)

# Out-degree of a user = number of distinct pages the user edited, i.e. the
# number of pair rows that share the username.
editingRecordsRawUserOutDeg = (
    editingRecordsRawPair
    .groupby(level='username')['pairC']
    .count()
    .rename('userOutDeg')
    .reset_index()
)

In [23]:
# username -> number of distinct pages edited. Zipping the two columns builds
# the mapping in one pass instead of materialising a row Series twice per user
# through .iloc[i].
user_2_outDeg = dict(zip(editingRecordsRawUserOutDeg['username'],
                         editingRecordsRawUserOutDeg['userOutDeg']))

In [24]:
# Per-user totals: Count is the number of edit records, revertedCount the sum
# of the isReverted flags, and revertRate their ratio.
editingRecordsRawRevertRate = (
    editingRecordsRaw
    .groupby('username')['isReverted']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'Count', 'sum': 'revertedCount'})
)
editingRecordsRawRevertRate['revertRate'] = (
    editingRecordsRawRevertRate['revertedCount'] / editingRecordsRawRevertRate['Count']
)
editingRecordsRawRevertRate = editingRecordsRawRevertRate.reset_index()

In [25]:
# username -> fraction of that user's edit records that were reverted. Built by
# zipping the two columns rather than indexing every row twice with .iloc[i].
user_2_revertRate = dict(zip(editingRecordsRawRevertRate['username'],
                             editingRecordsRawRevertRate['revertRate']))

##### Filter users by the number of distinct pages they edited

In [7]:
# Candidate pools filtered on both out-degree and revert rate.
# NOTE(review): the following cell reassigns normal_usrs and abnormal_usrs, so
# this selection only takes effect if that cell is skipped.
_benign_pool = {
    _usr
    for _usr, _label in usr2Labels.items()
    if _label == 0 and user_2_outDeg[_usr] > 1 and user_2_revertRate[_usr] < .01
}
_vandal_pool = {
    _usr
    for _usr, _label in usr2Labels.items()
    if _label == 1 and user_2_outDeg[_usr] >= 20 and user_2_revertRate[_usr] > .9
}
normal_usrs = list(_benign_pool)
abnormal_usrs = list(_vandal_pool)

In [52]:
# Candidate pools filtered on out-degree only: label-0 users who edited more
# than 8 distinct pages and label-1 users who edited at least 100.
_benign_pool = {
    _usr
    for _usr, _label in usr2Labels.items()
    if _label == 0 and user_2_outDeg[_usr] > 8
}
_vandal_pool = {
    _usr
    for _usr, _label in usr2Labels.items()
    if _label == 1 and user_2_outDeg[_usr] >= 100
}
normal_usrs = list(_benign_pool)
abnormal_usrs = list(_vandal_pool)

In [53]:
# Sizes of the normal and abnormal candidate pools after filtering.
len(normal_usrs), len(abnormal_usrs)

(3487, 30)

##### 3. Group the selected data by 'user' and 'page' and derive two new per-group metrics from 'isReverted': the edit count and the revert rate.

In [54]:
# Sampling parameters. random.sample raises ValueError when a pool holds fewer
# users than requested, so an undersized pool fails loudly here.
_SAMPLE_SEED = 42
_N_NORMAL = 3000
_N_ABNORMAL = 30

# A dedicated seeded generator makes the draw repeatable without touching the
# global `random` state. The pools were built from sets, whose iteration order
# can differ between interpreter runs, so they are sorted before sampling.
# sample() already draws without replacement in random order, which makes a
# separate shuffle of the pools unnecessary.
_rng = random.Random(_SAMPLE_SEED)
sel_normal_usrs = _rng.sample(sorted(normal_usrs, key=str), k=_N_NORMAL)
sel_abnormal_usrs = _rng.sample(sorted(abnormal_usrs, key=str), k=_N_ABNORMAL)

sel_usrs = sel_normal_usrs + sel_abnormal_usrs

# username -> label for the selected users only (0 = normal, 1 = abnormal).
usr2LabelsSel = {
    **dict.fromkeys(sel_normal_usrs, 0),
    **dict.fromkeys(sel_abnormal_usrs, 1),
}

In [55]:
# Per-(username, pagetitle) totals: editCount is the number of edit records in
# the pair, revertCount the sum of their isReverted flags, revertRate the ratio.
editingRecordsStatsRaw = (
    editingRecordsRaw
    .groupby(['username', 'pagetitle'])['isReverted']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'editCount', 'sum': 'revertCount'})
)
editingRecordsStatsRaw['revertRate'] = (
    editingRecordsStatsRaw['revertCount'] / editingRecordsStatsRaw['editCount']
)
editingRecordsStatsRaw = editingRecordsStatsRaw.reset_index()

In [56]:
# Display the per-(username, pagetitle) statistics table.
editingRecordsStatsRaw

Unnamed: 0,username,pagetitle,editCount,revertCount,revertRate
0,!LIKETHEPOP!,Ernest Shackleton,4,4,1.0
1,!LIKETHEPOP!,User talk:!LIKETHEPOP!,2,1,0.5
2,!bdpqbd! 2,Claire Summers,1,1,1.0
3,!bdpqbd! 2,Daylight (Needtobreathe album),1,1,1.0
4,"""Weird Nerd"" Yankovic",Ball-peen hammer,1,1,1.0
...,...,...,...,...,...
264571,현대증권 시즌1,Jajangmyeon,5,5,1.0
264572,현대증권 시즌1,User talk:현대증권 시즌1,2,0,0.0
264573,현대증권 시즌2,Beef,3,3,1.0
264574,현대증권 시즌2,Mocaccino,11,11,1.0


In [57]:
# Keep only the statistics rows whose user was drawn into the sample.
_is_selected_user = editingRecordsStatsRaw['username'].isin(sel_usrs)
editingRecordsStatsSel = editingRecordsStatsRaw[_is_selected_user]

In [58]:
# Display the statistics rows that belong to the selected users.
editingRecordsStatsSel

Unnamed: 0,username,pagetitle,editCount,revertCount,revertRate
14,*7Risa7*,Ariel (Once Upon a Time),2,0,0.0
15,*7Risa7*,Brady family,1,0,0.0
16,*7Risa7*,Dark Hollow,3,0,0.0
17,*7Risa7*,Deep Breath (Doctor Who),1,0,0.0
18,*7Risa7*,Down the Rabbit Hole (Once Upon a Time in Wond...,1,0,0.0
...,...,...,...,...,...
264246,^v^ FreeBird ^v^,Steve Phillips (musician),2,0,0.0
264247,^v^ FreeBird ^v^,The Devil Went Down to Georgia,2,0,0.0
264248,^v^ FreeBird ^v^,Twang,1,1,1.0
264249,^v^ FreeBird ^v^,User talk:^v^ FreeBird ^v^,1,0,0.0


##### 4. Reshape the dataset into the input format expected by OPERA.

In [59]:
# Take every distinct name exactly once, in order of first appearance in the
# table. Each forward/inverse pair is derived from the same list, so the two
# dicts are inverses by construction and the ids no longer depend on set
# iteration order, which can change from one interpreter run to the next.
_usr_names = editingRecordsStatsSel['username'].unique().tolist()
_page_titles = editingRecordsStatsSel['pagetitle'].unique().tolist()

# Users take ids 0 .. usr_num_base - 1; page ids continue from usr_num_base so
# that both node types share one id space.
usr_num_base = len(_usr_names)
id_2_usr = dict(enumerate(_usr_names))
usr_2_id = {_username: i for i, _username in id_2_usr.items()}
id_2_page = dict(enumerate(_page_titles, start=usr_num_base))
page_2_id = {_title: i for i, _title in id_2_page.items()}

In [60]:
# Sanity check on the id tables: usr_num_base is the number of users and the
# offset at which page ids start; both page tables should have the same size.
len(id_2_usr), usr_num_base, len(id_2_page), len(page_2_id)

(3030, 3030, 125392, 125392)

In [61]:
# Persist the four lookup tables, one JSON file per table.
for _json_path, _mapping in (
    ('../data/Wiki/id_2_usr.json', id_2_usr),
    ('../data/Wiki/usr_2_id.json', usr_2_id),
    ('../data/Wiki/id_2_page.json', id_2_page),
    ('../data/Wiki/page_2_id.json', page_2_id),
):
    with open(_json_path, 'w') as fp:
        json.dump(_mapping, fp)

In [63]:
# Write the user-page edge list in four variants, one line per
# (username, pagetitle) row of editingRecordsStatsSel:
#   usr_page.graph                    "<user id> <page id>"
#   usr_page_wtgraph_binary.txt       same edge followed by the constant weight 1
#   usr_page_wtgraph_count.txt        same edge followed by editCount
#   usr_page_wtgraph_revertRate.txt   same edge followed by revertRate
_edge_columns = ['username', 'pagetitle', 'editCount', 'revertRate']

# `with` closes all four files even when a row fails part-way through.
with open('../data/Wiki/usr_page.graph', "w") as f_graph, \
        open('../data/Wiki/usr_page_wtgraph_binary.txt', "w") as f_graphW_binary, \
        open('../data/Wiki/usr_page_wtgraph_count.txt', "w") as f_graphW_count, \
        open('../data/Wiki/usr_page_wtgraph_revertRate.txt', "w") as f_graphW_revert:
    # itertuples yields plain tuples in a single pass, which avoids building a
    # row Series four times per edge through .iloc[i].
    for _username, _pagetitle, _editCount, _revertRate in \
            editingRecordsStatsSel[_edge_columns].itertuples(index=False, name=None):
        try:
            _edge = str(usr_2_id[_username]) + " " + str(page_2_id[_pagetitle])
        except KeyError as err:
            # A missing id would leave the graph files incomplete; stop with the
            # offending row instead of printing it and silently truncating.
            raise KeyError("no node id for edge ({!r}, {!r})".format(
                _username, _pagetitle)) from err
        f_graph.write(_edge + "\n")
        f_graphW_binary.write(_edge + " " + "1" + "\n")
        f_graphW_count.write(_edge + " " + str(_editCount) + "\n")
        f_graphW_revert.write(_edge + " " + str(_revertRate) + "\n")

In [64]:
# Re-key the selected users' labels by node id. The mapping is rebuilt from the
# sampled user lists rather than from usr2LabelsSel itself, so running this cell
# a second time gives the same result instead of raising KeyError on keys that
# have already been converted to ids.
usr2LabelsSel = {usr_2_id[_usr]: 0 for _usr in sel_normal_usrs}
usr2LabelsSel.update({usr_2_id[_usr]: 1 for _usr in sel_abnormal_usrs})
with open('../data/Wiki/usr2LabelsSel.json', 'w') as fp:
    json.dump(usr2LabelsSel, fp)