In [31]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [32]:
# Ids handed to get_data(); the first character selects the lookup type
# ('A' = article, 'U' = user) and the remainder is the actual identifier.
ARTICLE_ID = 'A37764'
# NOTE(review): the trailing '\n' becomes part of the looked-up user id.
# Presumably the keys stored in the user id map carry the same newline --
# confirm before stripping it here.
USER_ID = 'U165.146.213.31\n'

In [35]:
# Get all revisions containing a specific user or article
def get_data(object_id : str, N : int = None) -> None | list:
    
    data_type = object_id[0]
    object_id = object_id[1:]
    
    match data_type:
        case 'U':
            map_path = "/work3/s204163/wiki/data-batches/user_id_map.pickle"
            column = 'user_id'
            data_type = 'user'
        case 'A':
            map_path = "/work3/s204163/wiki/data-batches/article_id_map.pickle"
            object_id = int(object_id)
            column = 'article_id'
            data_type = 'article'
        case _:
            raise ValueError("Invalid data type, DATATYPE MUST BE SPECIFIED IN THE ID AS THE FIRST CHARACTER (U or A)")
            
    with open(map_path, 'rb') as f:
        mapping = pickle.load(f)
        
    print(f"Looking up object of type {data_type} with id: {object_id}")
    
    batch_ids = mapping.get(str(object_id), None) # Get all batches containing the object
    
    print(f"Found {len(batch_ids)} instances")
    
    if batch_ids == None:
        return None
    
    print(f"Looking up instances")
    revisions = []
    
    # Find all revisions in all the batches containing a reference to the object
    for batch_id in tqdm(batch_ids):
        batch_path = f"/work3/s204163/wiki/data-batches/batch{batch_id}.pickle"
        with open(batch_path, 'rb') as f:
            batch = pickle.load(f)
            
        # Find Article in batch
        revision_indexes = list(batch.index[batch[column] == object_id])
        for revision_index in revision_indexes:
            revision = batch.loc[revision_index]
            revisions.append(revision)
            
            if N:
                if len(revisions) >= N:
                    print(f"Found {len(revisions)} revisions")
                    return revisions
    
    print(f"Found {len(revisions)} revisions")
    return revisions

            
# Smoke test: fetch the configured article's revisions, stopping once 10 have
# been collected (second argument is the cap N).
res = get_data(ARTICLE_ID, 10)
        

Looking up object of type article with id: 37764
Found 2566 instances
Looking up instances


  0%|          | 0/2566 [00:00<?, ?it/s]

Found 10 revisions



