In [1]:
import json
import pandas as pd
from sseclient import SSEClient as EventSource
from sklearn.model_selection import train_test_split

In [None]:
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
sampling_rate = 20
counter = 0
N = 40_000
with open("data/raw_edits.json", "w") as file:
    for event in EventSource(url):
        if not event.event == 'message':
            continue
        try:
            change = json.loads(event.data)
        except (ValueError, KeyError):
            continue

        # Hashing by username to make sure all edits of the same user are taken
        user = change.get("user")
        if user and hash(user) % 100 < sampling_rate:
            json.dump(change, file)
            file.write('\n')  # Write each change on a new line
            
            counter += 1
            if counter % 1000 == 0:
                print(f"Processed {counter}/40000 records...")
                
            if counter > N:
                break

In [6]:
raw_df = pd.read_json("data/raw_edits.json", lines=True)
raw_df.head()

Unnamed: 0,$schema,meta,id,type,namespace,title,title_url,comment,timestamp,user,...,server_url,server_name,server_script_path,wiki,parsedcomment,notify_url,minor,length,revision,patrolled
0,/mediawiki/recentchange/1.0.0,{'uri': 'https://ru.wikiquote.org/wiki/%D0%A3%...,23934530.0,log,2,Участник:209.127.107.218,https://ru.wikiquote.org/wiki/%D0%A3%D1%87%D0%...,{{Blocked proxy}} <!-- 8AFD7D5E0F79 -->,2024-12-14 07:55:41,QBA-bot,...,https://ru.wikiquote.org,ru.wikiquote.org,/w,ruwikiquote,{{Blocked proxy}} &lt;!-- 8AFD7D5E0F79 --&gt;,,,,,
1,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,2683954000.0,log,6,File:Bürgermeisterwahl 2023 Malchow - René Put...,https://commons.wikimedia.org/wiki/File:B%C3%B...,per [[Commons:Deletion requests/File:Bürgermei...,2024-12-14 07:55:39,Krd,...,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"per <a href=""/wiki/Commons:Deletion_requests/F...",,,,,
2,/mediawiki/recentchange/1.0.0,{'uri': 'https://en.wikipedia.org/wiki/2025_Co...,1851732000.0,edit,0,2025 County Championship,https://en.wikipedia.org/wiki/2025_County_Cham...,[[WP:AES|←]]Removed redirect to [[County Champ...,2024-12-14 07:55:35,FieldOfWheat,...,https://en.wikipedia.org,en.wikipedia.org,/w,enwiki,"<a href=""/wiki/Wikipedia:AES"" class=""mw-redire...",https://en.wikipedia.org/w/index.php?diff=1263...,0.0,"{'old': 32, 'new': 53579}","{'old': 1261093459, 'new': 1263028105}",
3,/mediawiki/recentchange/1.0.0,{'uri': 'https://ru.wikiquote.org/wiki/%D0%A3%...,23934530.0,log,2,Участник:209.127.107.89,https://ru.wikiquote.org/wiki/%D0%A3%D1%87%D0%...,{{Blocked proxy}} <!-- E0471734C1B0 -->,2024-12-14 07:55:42,QBA-bot,...,https://ru.wikiquote.org,ru.wikiquote.org,/w,ruwikiquote,{{Blocked proxy}} &lt;!-- E0471734C1B0 --&gt;,,,,,
4,/mediawiki/recentchange/1.0.0,{'uri': 'https://commons.wikimedia.org/wiki/Fi...,2683954000.0,edit,6,File:Minster School ruins - geograph.org.uk - ...,https://commons.wikimedia.org/wiki/File:Minste...,/* wbeditentity-update-languages-and-other-sho...,2024-12-14 07:55:41,BotMultichillT,...,https://commons.wikimedia.org,commons.wikimedia.org,/w,commonswiki,"‎<span dir=""auto""><span class=""autocomment"">Ch...",https://commons.wikimedia.org/w/index.php?diff...,0.0,"{'old': 6842, 'new': 9564}","{'old': 939308420, 'new': 970056491}",1.0


In [7]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40001 entries, 0 to 40000
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   $schema             40001 non-null  object        
 1   meta                40001 non-null  object        
 2   id                  39079 non-null  float64       
 3   type                40001 non-null  object        
 4   namespace           40001 non-null  int64         
 5   title               40001 non-null  object        
 6   title_url           40001 non-null  object        
 7   comment             40001 non-null  object        
 8   timestamp           40001 non-null  datetime64[ns]
 9   user                40001 non-null  object        
 10  bot                 40001 non-null  bool          
 11  log_id              1991 non-null   float64       
 12  log_type            1991 non-null   object        
 13  log_action          1991 non-null   object    

In [8]:
df = raw_df[raw_df["bot"].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40001 entries, 0 to 40000
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   $schema             40001 non-null  object        
 1   meta                40001 non-null  object        
 2   id                  39079 non-null  float64       
 3   type                40001 non-null  object        
 4   namespace           40001 non-null  int64         
 5   title               40001 non-null  object        
 6   title_url           40001 non-null  object        
 7   comment             40001 non-null  object        
 8   timestamp           40001 non-null  datetime64[ns]
 9   user                40001 non-null  object        
 10  bot                 40001 non-null  bool          
 11  log_id              1991 non-null   float64       
 12  log_type            1991 non-null   object        
 13  log_action          1991 non-null   object    

In [9]:
df["bot"].value_counts()

bot
False    21691
True     18310
Name: count, dtype: int64

In [10]:
df.to_json("data/edits.json", orient="records", lines=True)