In [16]:
import json
import pandas as pd
from sseclient import SSEClient as EventSource
from sklearn.model_selection import train_test_split

In [15]:
# Capture a sample of the Wikimedia recent-change stream as newline-delimited
# JSON (one event per line) in data/raw_edits.json.
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
sampling_rate = 20  # percentage of events kept (compared against hash % 100)
counter = 0         # number of events written so far
N = 40_000          # stop once exactly this many events have been written
with open("data/raw_edits.json", "w") as file:
    for event in EventSource(url):
        # Only 'message' events are parsed; every other event type is skipped.
        if event.event != 'message':
            continue
        try:
            change = json.loads(event.data)
            # Look up the id inside the try so that an event without
            # meta.id is skipped instead of aborting the whole capture.
            change_id = change["meta"]["id"]
        except (ValueError, KeyError, TypeError):
            # ValueError: payload is not valid JSON.
            # KeyError / TypeError: payload has no meta.id or is not an object.
            continue

        # Keep roughly `sampling_rate` percent of events, bucketed by id hash.
        # NOTE: hash() of a str is salted per interpreter process, so which
        # events land in the sample differs between kernel sessions.
        if hash(change_id) % 100 >= sampling_rate:
            continue

        json.dump(change, file)
        file.write('\n')  # Write each change on a new line

        counter += 1
        if counter % 1000 == 0:
            print(f"Processed {counter}/{N} records...")

        # `>=` rather than `>`: the check runs after the write, so `>` would
        # store N + 1 records.
        if counter >= N:
            break

In [19]:
# Read the captured events back in; the file holds one JSON object per line.
raw_edits_path = "data/raw_edits.json"
raw_df = pd.read_json(raw_edits_path, lines=True)
raw_df.head()

Unnamed: 0,$schema,meta,id,type,namespace,title,title_url,comment,timestamp,user,...,parsedcomment,minor,patrolled,length,revision,log_id,log_type,log_action,log_params,log_action_comment
0,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Ca...,2656239000.0,categorize,14.0,Category:Little Bedwyn,https://commons.wikimedia.org/wiki/Category:Li...,[[:File:Sunflowers at Chisbury Manor Farm - ge...,2024-11-16 14:34:08,GeographBot,...,"<a href=""/wiki/File:Sunflowers_at_Chisbury_Man...",,,,,,,,,
1,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q870695...,2343572000.0,edit,0.0,Q87069579,https://www.wikidata.org/wiki/Q87069579,/* wbeditentity-update-languages-and-other-sho...,2024-11-16 14:34:10,Twofivesixbot,...,"‎<span dir=""auto""><span class=""autocomment"">Ch...",0.0,1.0,"{'old': 28985, 'new': 27368}","{'old': 2198076458, 'new': 2275533933}",,,,,
2,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q131199...,2343572000.0,edit,0.0,Q131199989,https://www.wikidata.org/wiki/Q131199989,/* wbsetclaim-update:2||1|1 */ [[Property:P31]...,2024-11-16 14:34:11,Borealex,...,"‎<span dir=""auto""><span class=""autocomment"">Из...",0.0,1.0,"{'old': 1315, 'new': 951}","{'old': 2275533853, 'new': 2275533946}",,,,,
3,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q115899...,2343572000.0,edit,0.0,Q115899409,https://www.wikidata.org/wiki/Q115899409,/* wbcreateclaim-create:1| */ [[Property:P1104...,2024-11-16 14:34:11,Vicarage,...,"‎<span dir=""auto""><span class=""autocomment"">Cr...",0.0,1.0,"{'old': 3710, 'new': 4070}","{'old': 2269527118, 'new': 2275533947}",,,,,
4,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/2011%E2...,1841691000.0,edit,0.0,2011–12 El Hierro eruption,https://en.wikipedia.org/wiki/2011%E2%80%9312_...,whitespace cleanup; convert special characters...,2024-11-16 14:34:10,Beland,...,whitespace cleanup; convert special characters...,0.0,,"{'old': 22423, 'new': 22416}","{'old': 1243962954, 'new': 1257763333}",,,,,


In [20]:
# Summarise the raw capture: row count, per-column non-null counts and dtypes.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40002 entries, 0 to 40001
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   $schema             40002 non-null  object        
 1   meta                40002 non-null  object        
 2   id                  39248 non-null  float64       
 3   type                40000 non-null  object        
 4   namespace           40000 non-null  float64       
 5   title               40000 non-null  object        
 6   title_url           40000 non-null  object        
 7   comment             40000 non-null  object        
 8   timestamp           40000 non-null  datetime64[ns]
 9   user                39999 non-null  object        
 10  bot                 40000 non-null  float64       
 11  notify_url          38143 non-null  object        
 12  server_url          40000 non-null  object        
 13  server_name         40000 non-null  object    

In [24]:
# Discard rows that have no value in the `bot` column, then re-check the
# column summary of the filtered frame.
df = raw_df.dropna(subset=["bot"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40000 entries, 0 to 40001
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   $schema             40000 non-null  object        
 1   meta                40000 non-null  object        
 2   id                  39248 non-null  float64       
 3   type                40000 non-null  object        
 4   namespace           40000 non-null  float64       
 5   title               40000 non-null  object        
 6   title_url           40000 non-null  object        
 7   comment             40000 non-null  object        
 8   timestamp           40000 non-null  datetime64[ns]
 9   user                39999 non-null  object        
 10  bot                 40000 non-null  float64       
 11  notify_url          38143 non-null  object        
 12  server_url          40000 non-null  object        
 13  server_name         40000 non-null  object        


In [25]:
# Count rows per distinct `bot` value to see how balanced the two groups are.
df["bot"].value_counts()

bot
0.0    26044
1.0    13956
Name: count, dtype: int64

In [34]:
# Hold out 20% of the rows for testing. Stratifying on `bot` keeps the share
# of each `bot` value the same in both splits; the fixed seed makes the split
# repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["bot"],
)
(len(train_df), len(test_df))

(32000, 8000)

In [37]:
# Persist both splits as newline-delimited JSON, one record per line.
for split_path, split_df in (
    ("data/train.json", train_df),
    ("data/test.json", test_df),
):
    split_df.to_json(split_path, orient="records", lines=True)