In [5]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Directory prefix (with trailing slash) that the cells below prepend to the
# names of the pickle files they write and re-read.
BASEDIR = "/work3/s204163/wiki/data-batches/"
# Percentile handed to np.percentile in the filtering cells: entries whose count
# is at or above this percentile are put in the "discard" group.
cutoff_percentile = 98


    

This notebook filters the data by cutting off the top percentiles (users and articles whose counts are at or above the `cutoff_percentile`-th percentile) in order to remove outliers. Note that it also deletes all references to the removed data in the other files. 

In [6]:
# Split the users into a "keep" and a "discard" group by number of articles and
# write each group to its own pickle file under BASEDIR.
user_path = BASEDIR + "users.pickle"

# NOTE(review): pickle.load can execute arbitrary code while loading; only point
# this at files produced by this project's own pipeline.
with open(user_path, "rb") as f:
    users = pickle.load(f)

# Article count per user, keyed by the dict key of `users` (presumably identical
# to str(user.user_id) - TODO confirm) so the lookups below cannot raise KeyError.
user_n_articles = {user_id: len(user.article_ids) for user_id, user in users.items()}

# Divide users into two groups, based on number of articles:
#   keep    - users strictly below the `cutoff_percentile`-th percentile
#   discard - users at or above it (the outliers)
cutoff = np.percentile(list(user_n_articles.values()), cutoff_percentile)

group_keep = [user_id for user_id, n_articles in user_n_articles.items() if n_articles < cutoff]
group_discard = [user_id for user_id, n_articles in user_n_articles.items() if n_articles >= cutoff]

print("Cutoff: ", cutoff)
print("Group 1: ", len(group_keep))      # group 1 = kept users
print("Group 2: ", len(group_discard))   # group 2 = discarded users

user_keep = {user_id: users[user_id] for user_id in group_keep}
user_discard = {user_id: users[user_id] for user_id in group_discard}

# Persist both groups; the percentile is part of the file name so runs with
# different cutoffs do not overwrite each other.
with open(BASEDIR + f"users_keep_cutoff{cutoff_percentile}.pickle", "wb") as f:
    pickle.dump(user_keep, f)

with open(BASEDIR + f"users_discard_cutoff{cutoff_percentile}.pickle", "wb") as f:
    pickle.dump(user_discard, f)



Cutoff:  11.0
Group 1:  1390098
Group 2:  28922


In [7]:
# Split the articles into a "keep" and a "discard" group by number of users and
# write each group to its own pickle file under BASEDIR.
article_path = BASEDIR + "articles.pickle"

# NOTE(review): pickle.load can execute arbitrary code while loading; only point
# this at files produced by this project's own pipeline.
with open(article_path, "rb") as f:
    articles = pickle.load(f)

# User count per article, keyed by the dict key of `articles` (presumably
# identical to str(article.article_id) - TODO confirm) so the lookups below
# cannot raise KeyError.
article_n_users = {article_id: len(article.user_ids) for article_id, article in articles.items()}

# Divide articles into two groups, based on number of users:
#   keep    - articles strictly below the `cutoff_percentile`-th percentile
#   discard - articles at or above it (the outliers)
cutoff = np.percentile(list(article_n_users.values()), cutoff_percentile)

group_keep = [article_id for article_id, n_users in article_n_users.items() if n_users < cutoff]
group_discard = [article_id for article_id, n_users in article_n_users.items() if n_users >= cutoff]

print("Cutoff: ", cutoff)
print("Group 1: ", len(group_keep))      # group 1 = kept articles
print("Group 2: ", len(group_discard))   # group 2 = discarded articles

article_keep = {article_id: articles[article_id] for article_id in group_keep}
article_discard = {article_id: articles[article_id] for article_id in group_discard}

# Persist both groups; the percentile is part of the file name so runs with
# different cutoffs do not overwrite each other.
with open(BASEDIR + f"articles_keep_cutoff{cutoff_percentile}.pickle", "wb") as f:
    pickle.dump(article_keep, f)

with open(BASEDIR + f"articles_discard_cutoff{cutoff_percentile}.pickle", "wb") as f:
    pickle.dump(article_discard, f)
    


Cutoff:  1443.0600000000013
Group 1:  19649
Group 2:  401


In [8]:
# Re-read the "keep" groups from the pickle files for the current cutoff.
# From here on `users` and `articles` refer to the filtered tables.
users_keep_path = BASEDIR + f"users_keep_cutoff{cutoff_percentile}.pickle"
articles_keep_path = BASEDIR + f"articles_keep_cutoff{cutoff_percentile}.pickle"

with open(users_keep_path, "rb") as f:
    users = pickle.load(f)

with open(articles_keep_path, "rb") as f:
    articles = pickle.load(f)
    
    

In [9]:
from tqdm import tqdm

In [13]:
# Synchronise the filtered users and articles: remove every reference to an
# entry that is no longer present, then drop entries left with no references.
# Note that `user_ids` / `article_ids` are reassigned on the loaded objects.
print("Users: ", len(users))
print("Articles: ", len(articles))

# 1) Articles keep only the users that survived the user filter.
for article in tqdm(articles.values(), total=len(articles)):
    article.user_ids = [user_id for user_id in article.user_ids if user_id in users]

# Articles that still have at least one user. The keys of `articles` for those
# survivors are collected separately because user.article_ids is matched against
# the keys of `articles`, while `articles_sync` is keyed by article.article_id.
kept_article_keys = {key for key, article in articles.items() if len(article.user_ids) > 0}
articles_sync = {article.article_id: article for article in articles.values() if len(article.user_ids) > 0}

# 2) Users keep only the articles that survived step 1. Checking against the
#    surviving articles (rather than all of `articles`) ensures no user keeps a
#    reference to an article that was just dropped for having no users left.
for user in tqdm(users.values(), total=len(users)):
    user.article_ids = [article_id for article_id in user.article_ids if article_id in kept_article_keys]

users_sync = {user.user_id: user for user in users.values() if len(user.article_ids) > 0}

print("Users: ", len(users_sync))
print("Articles: ", len(articles_sync))

# Build each output path once so the written file and the printed path agree.
users_sync_path = BASEDIR + f"users_keep_cutoff{cutoff_percentile}_sync.pickle"
articles_sync_path = BASEDIR + f"articles_keep_cutoff{cutoff_percentile}_sync.pickle"

with open(users_sync_path, "wb") as f:
    pickle.dump(users_sync, f)

with open(articles_sync_path, "wb") as f:
    pickle.dump(articles_sync, f)

print(users_sync_path)
print(articles_sync_path)

Users:  1390098
Articles:  19649


100%|██████████| 19649/19649 [00:01<00:00, 19536.60it/s]
100%|██████████| 1390098/1390098 [00:01<00:00, 789934.03it/s]


Users:  982463
Articles:  17963
/work3/s204163/wiki/data-batches/users_keep_cutoff98_sync.pickle
/work3/s204163/wiki/data-batches/articles_keep_cutoff98_sync.pickle


In [18]:
# Sanity check
nodes = set()
for article in articles_sync.values():
    for user_id in article.user_ids:
        nodes.add(user_id)
print(len(nodes))

unique_articles = set()
for user in users_sync.values():
    for article_id in user.article_ids:
        unique_articles.add(article_id)
print(len(unique_articles))

982463
17963
