In [1]:
import os
import re
import json
import requests
from pathlib import Path
import datetime
import dateparser
import pandas as pd
import numpy as np
from IPython.core.display import HTML
from IPython.display import JSON

from bs4 import BeautifulSoup
from bs4.element import Tag

from utils import get_edit_pages_metadata, _update_date_column

## Initialization

In [2]:
def pretty_json_html(data):
    """Display ``data`` as indented JSON inside an HTML ``<pre>`` block.

    The JSON text is HTML-escaped before it is embedded, so values that
    contain ``<``, ``>`` or ``&`` are shown literally instead of being
    interpreted as markup by the notebook front end.

    Parameters
    ----------
    data : object
        Anything ``json.dumps`` can serialise.

    Returns
    -------
    None
        The rendered HTML is displayed as a side effect.
    """
    # Function-local import keeps this cell self-contained.
    from html import escape

    json_str = json.dumps(data, indent=4, ensure_ascii=False)
    # quote=False: quotes are harmless in element content and stay readable.
    html = f"<pre>{escape(json_str, quote=False)}</pre>"
    display(HTML(html))

In [3]:
# Main folder for data
folder_main = "../data"
Path(folder_main).mkdir(parents=True, exist_ok=True)
# Also create the folders that are written to later: the daily dumps
# ("path_save_pages_current" below) and the metadata JSON written at the end
# of this cell. Without them a first run on a fresh checkout fails.
Path(folder_main, "dumps").mkdir(parents=True, exist_ok=True)
Path("metadata").mkdir(parents=True, exist_ok=True)

today = str(datetime.datetime.today().date())
print(f'Current Datetime is {today}.')

# Explicit UTF-8: the page titles used to build the URLs below contain
# non-ASCII characters, so the platform default encoding must not be relied on.
with open('config_lang.json', encoding='utf-8') as f:
    config_lang = json.load(f)

# Extend every language entry of the config file with its output paths.
config = {}
for item in config_lang:
    lang_wiki = config_lang[item]['lang_wiki']
    config_temp = {
        # Snapshot of the current run (one file per language and day).
        "path_save_pages_current": f"{folder_main}/dumps/{lang_wiki}_pages_{today}.csv",
        # Accumulated pages over all runs.
        "path_save_pages": f"{folder_main}/{lang_wiki}_pages.csv",
        # NOTE: the key spelling ("corections") is used by later cells; keep it.
        "path_save_corections": f"{folder_main}/{lang_wiki}_corrections.csv"
    }
    config[item] = {**config_lang[item], **config_temp}

print()
print("Config:")
pretty_json_html(config)

print()
print("Languages:")
# Wiki language code -> language name.
language_dict = {v["lang_wiki"]: v["language"] for v in config.values()}
pretty_json_html(language_dict)

with open('metadata/languages.json', 'w', encoding='utf-8') as f:
    json.dump(language_dict, f, ensure_ascii=False, indent=4)

Current Datetime is 2025-06-18.

Config:



Languages:


In [4]:
# Translate the localized column labels of the scraped metadata frames into
# language-independent names (used with DataFrame.rename in later cells).
# Labels are listed as (diff label, history label) pairs; the language notes
# are a best guess from the label text.
rename_columns_pages_dataframe = {
    label: column
    for diff_label, history_label in (
        ("rozdíl", "historie"),        # Czech?
        ("Unterschied", "Versionen"),  # German?
        ("izmaiņas", "hronoloģija"),   # Latvian?
        ("erin", "ajal"),              # Estonian?
        ("διαφ.", "ιστορ."),           # Greek?
        ("diff", "cron"),              # Italian?
        ("razl", "zgod"),              # Slovene?
        ("skillnad", "historik"),      # Swedish?
        ("різн.", "історія"),          # Ukrainian?
    )
    for label, column in ((diff_label, "url_diff"), (history_label, "url_history"))
}
# Stand-alone history label without a diff label of its own.
rename_columns_pages_dataframe["hist"] = "url_history"

## Pages: Data

In [5]:
# Create an empty pages CSV per language if it does not exist yet
# (needed on the first run).
columns_pages = [
    'date_edit_wiki', 'date_update', 'lang', 'lang_code', 'url_diff', 'url_history', 'title',
    'url', 'author', 'tags', 'date_edit'
]

for key, value in config.items():
    if not os.path.isfile(value['path_save_pages']):
        # Write the header row: a CSV produced from a column-less frame has no
        # header at all, and pd.read_csv cannot parse such a file when the
        # history is read back in later cells.
        df_empty = pd.DataFrame(list(), columns=columns_pages)
        df_empty.to_csv(value['path_save_pages'], index=False)

In [6]:
# Test run for one language.
language = "ukrainian"
setup = config[language]

# Filtered "recent changes" listing of the wiki for the last 30 days.
url = (
    f"https://{setup['lang_wiki']}.wikipedia.org/w/index.php?hidebots=1&"
    "hidecategorization=1&"
    "hideWikibase=1&"
    "tagfilter=newcomer+task+copyedit&"
    "limit=5000&days=30&"
    f"title={setup['page_title']}&"
    "urlversion=2"
)

# Values passed to get_edit_pages_metadata together with the URL.
metadata_init = {
    "date_update": today,
    "lang": language,
    "lang_code": setup["lang_wiki"]
}

print(f"URL for {language} language: {url}")

metadata = get_edit_pages_metadata(url, metadata_init)
df_metadata = pd.DataFrame(metadata).rename(columns=rename_columns_pages_dataframe)

if not df_metadata.empty:
    # Return value is unused, so this presumably fills "date_edit" in place
    # -- TODO confirm against utils._update_date_column.
    _update_date_column(df_metadata, date_col_in="date_edit_wiki", date_col_out="date_edit")
    # dropna() removes both None and NaN; min()/max() of an empty Series give
    # NaN instead of raising like the builtin min()/max() on an empty list.
    dates = df_metadata["date_edit"].dropna()
    print("  final - # rows: {}, start-date: {}, end-date: {}".format(
        df_metadata.shape[0], dates.min(), dates.max())
    )
    display(df_metadata.tail(3))

URL for ukrainian language: https://uk.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Спеціальна:Нові_редагування&urlversion=2
  final - # rows: 291, start-date: 2025-05-19, end-date: 2025-06-17


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
288,19 травня 2025,2025-06-18,ukrainian,uk,/w/index.php?title=%D0%A2%D0%B5%D1%80%D0%B5%D0...,/w/index.php?title=%D0%A2%D0%B5%D1%80%D0%B5%D0...,Теребовлянський район,/wiki/%D0%A2%D0%B5%D1%80%D0%B5%D0%B1%D0%BE%D0%...,MrPuVo,"[Візуальний редактор, Редагування з мобільного...",2025-05-19
289,19 травня 2025,2025-06-18,ukrainian,uk,/w/index.php?title=%D0%A2%D0%B5%D1%80%D0%B5%D0...,/w/index.php?title=%D0%A2%D0%B5%D1%80%D0%B5%D0...,Теребовлянський район,/wiki/%D0%A2%D0%B5%D1%80%D0%B5%D0%B1%D0%BE%D0%...,MrPuVo,"[Візуальний редактор, Редагування з мобільного...",2025-05-19
290,19 травня 2025,2025-06-18,ukrainian,uk,/w/index.php?title=%D0%9C%D0%B0%D0%B1%D0%B0%D1...,/w/index.php?title=%D0%9C%D0%B0%D0%B1%D0%B0%D1...,Мабар,/wiki/%D0%9C%D0%B0%D0%B1%D0%B0%D1%80,Фаєнгольд Евеліна Миколаївна,"[перше редагування, Візуальний редактор, Редаг...",2025-05-19


In [7]:
# Run for all languages (see the config dictionary).

print("Support Languages:", ', '.join(list(language_dict.values())))
for language in list(config.keys()):
    print(language.upper())
    setup = config[language]

    # Filtered "recent changes" listing of the wiki for the last 30 days.
    url = (
        f"https://{setup['lang_wiki']}.wikipedia.org/w/index.php?hidebots=1&"
        "hidecategorization=1&"
        "hideWikibase=1&"
        "tagfilter=newcomer+task+copyedit&"
        "limit=5000&days=30&"
        f"title={setup['page_title']}&"
        "urlversion=2"
    )

    # Values passed to get_edit_pages_metadata together with the URL.
    metadata_init = {
        "date_update": today,
        "lang": language,
        "lang_code": setup["lang_wiki"]
    }

    print(f"  URL for {language} language: {url}")

    if setup["is_process"]:
        metadata = get_edit_pages_metadata(url, metadata_init)
        df_metadata_cur = pd.DataFrame(metadata).rename(columns=rename_columns_pages_dataframe)

        # Keep only rows that have an author.
        if "author" in df_metadata_cur.columns:
            df_metadata_cur = df_metadata_cur[df_metadata_cur["author"].notna()].copy().reset_index(drop=True)

        if not df_metadata_cur.empty:
            _update_date_column(df_metadata_cur, date_col_in="date_edit_wiki", date_col_out="date_edit")
            # Drop missing dates first: min()/max() on an object column that
            # mixes strings with missing values can fail on the comparison.
            dates_cur = df_metadata_cur["date_edit"].dropna()
            print("  current - # rows: {}, start-date: {}, end-date: {}".format(
                df_metadata_cur.shape[0],
                dates_cur.min(),
                dates_cur.max()
            ))

            # A missing or header-less history file (first run) is treated as
            # an empty history instead of aborting the whole loop.
            try:
                df_metadata_old = pd.read_csv(setup["path_save_pages"])
            except (FileNotFoundError, pd.errors.EmptyDataError):
                df_metadata_old = pd.DataFrame()

            # Newest update last, then keep the newest row per diff URL.
            # NOTE(review): rows with a missing url_diff count as duplicates of
            # each other here, so only one of them survives -- confirm intended.
            df_metadata = pd.concat([df_metadata_cur, df_metadata_old], axis=0)\
                            .sort_values(["date_update", "date_edit"])\
                            .drop_duplicates(subset=["url_diff"], keep="last")\
                            .reset_index(drop=True)

            dates_all = df_metadata["date_edit"].dropna()
            print("  final - # rows: {}, start-date: {}, end-date: {}".format(
                df_metadata.shape[0],
                dates_all.min(),
                dates_all.max()
            ))
            display(df_metadata.tail(3))
            # The dump lives in a sub-folder that may not exist yet.
            Path(setup["path_save_pages_current"]).parent.mkdir(parents=True, exist_ok=True)
            df_metadata_cur.to_csv(setup["path_save_pages_current"], index=False)
            df_metadata.to_csv(setup["path_save_pages"], index=False)
        else:
            print("  THERE ARE NO DATA FROM THE PAST 30 DAYS.")
        print()
    else:
        print(f"  {language.title()} is not included. Check config file.")
        print()

Support Languages: english, czech, estonian, german, greek, icelandic, italian, latvian, slovene, swedish, ukrainian
ENGLISH
  URL for english language: https://en.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Special:RecentChanges&urlversion=2
  English is not included. Check config file.

CZECH
  URL for czech language: https://cs.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Speciální:Poslední_změny&urlversion=2
  current - # rows: 59, start-date: 2025-05-19, end-date: 2025-06-17
  final - # rows: 538, start-date: 2024-09-29, end-date: 2025-06-17


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
535,17. 6. 2025,2025-06-18,czech,cs,/w/index.php?title=Wapiti_v%C3%BDchodn%C3%AD&c...,/w/index.php?title=Wapiti_v%C3%BDchodn%C3%AD&c...,Wapiti východní,/wiki/Wapiti_v%C3%BDchodn%C3%AD,Emzys,"[editace z Vizuálního editoru, Editační tipy, ...",2025-06-17
536,17. 6. 2025,2025-06-18,czech,cs,/w/index.php?title=SkyEurope_Airlines&curid=24...,/w/index.php?title=SkyEurope_Airlines&curid=24...,SkyEurope Airlines,/wiki/SkyEurope_Airlines,Dominik Záškoda,"[editace z Vizuálního editoru, Editační tipy, ...",2025-06-17
537,17. 6. 2025,2025-06-18,czech,cs,/w/index.php?title=Hladk%C3%A9_%C5%BDivotice&c...,/w/index.php?title=Hladk%C3%A9_%C5%BDivotice&c...,Hladké Životice,/wiki/Hladk%C3%A9_%C5%BDivotice,Nitram1906,"[editace z Vizuálního editoru, Editační tipy, ...",2025-06-17



ESTONIAN
  URL for estonian language: https://et.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Eri:Viimased_muudatused&urlversion=2
  current - # rows: 6, start-date: 2025-06-02, end-date: 2025-06-06
  final - # rows: 54, start-date: 2024-10-25, end-date: 2025-06-06


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
51,4. juuni 2025,2025-06-18,estonian,et,/w/index.php?title=India_pantergeko&curid=6323...,/w/index.php?title=India_pantergeko&curid=6323...,India pantergeko,/wiki/India_pantergeko,Ojassaar,"[Uue kasutaja ülesanne, Uue kasutaja ülesanne:...",2025-06-04
52,5. juuni 2025,2025-06-18,estonian,et,/w/index.php?title=Rippaed&curid=674836&diff=6...,/w/index.php?title=Rippaed&curid=674836&action...,Rippaed,/wiki/Rippaed,Sininine,"[Visuaalmuudatus, Uue kasutaja ülesanne, Uue k...",2025-06-05
53,6. juuni 2025,2025-06-18,estonian,et,/w/index.php?title=Javier_Cercas&curid=595519&...,/w/index.php?title=Javier_Cercas&curid=595519&...,Javier Cercas,/wiki/Javier_Cercas,Sininine,"[Visuaalmuudatus, Uue kasutaja ülesanne, Uue k...",2025-06-06



GERMAN
  URL for german language: https://de.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Spezial:Letzte_%C3%84nderungen&urlversion=2
  THERE ARE NO DATA FROM THE PAST 30 DAYS.

GREEK
  URL for greek language: https://el.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Ειδικό:ΠρόσφατεςΑλλαγές&urlversion=2
  current - # rows: 20, start-date: 2025-05-22, end-date: 2025-06-16
  final - # rows: 175, start-date: 2024-09-28, end-date: 2025-06-16


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
172,14 Ιουνίου 2025,2025-06-18,greek,el,/w/index.php?title=%CE%9C%CE%B9%CF%83%CF%81%CE...,/w/index.php?title=%CE%9C%CE%B9%CF%83%CF%81%CE...,Μισραΐμ Μέμφις,/wiki/%CE%9C%CE%B9%CF%83%CF%81%CE%B1%CE%90%CE%...,Kostivo,"[Οπτική επεξεργασία, Επεξεργασία από κινητό, Δ...",2025-06-14
173,16 Ιουνίου 2025,2025-06-18,greek,el,/w/index.php?title=%CE%A0%CE%B1%CE%BD%CE%B1%CE...,/w/index.php?title=%CE%A0%CE%B1%CE%BD%CE%B1%CE...,Παναγία η Μυρτιδιώτισσα,/wiki/%CE%A0%CE%B1%CE%BD%CE%B1%CE%B3%CE%AF%CE%...,Κισσόβρυος,"[Οπτική επεξεργασία, Αποστολή νεοεισερχόμενου,...",2025-06-16
174,16 Ιουνίου 2025,2025-06-18,greek,el,/w/index.php?title=%CE%A0%CE%B1%CE%BB%CE%AC%CF...,/w/index.php?title=%CE%A0%CE%B1%CE%BB%CE%AC%CF...,Παλάτι,/wiki/%CE%A0%CE%B1%CE%BB%CE%AC%CF%84%CE%B9,Κισσόβρυος,"[Οπτική επεξεργασία, Αποστολή νεοεισερχόμενου,...",2025-06-16



ICELANDIC
  URL for icelandic language: https://is.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Kerfiss%C3%AD%C3%B0a:N%C3%BDlegar_breytingar&urlversion=2
  THERE ARE NO DATA FROM THE PAST 30 DAYS.

ITALIAN
  URL for italian language: https://it.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Speciale:UltimeModifiche&urlversion=2
  current - # rows: 267, start-date: 2025-05-19, end-date: 2025-06-18
  final - # rows: 2836, start-date: 2024-09-28, end-date: 2025-06-18


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
2833,17 giu 2025,2025-06-18,italian,it,/w/index.php?title=Nintendo_New_York&curid=146...,/w/index.php?title=Nintendo_New_York&curid=146...,Nintendo New York,/wiki/Nintendo_New_York,Vukozlav,"[Modifica visuale, Modifica da mobile, Modific...",2025-06-17
2834,17 giu 2025,2025-06-18,italian,it,/w/index.php?title=Pietro_Sterbini&curid=22589...,/w/index.php?title=Pietro_Sterbini&curid=22589...,Pietro Sterbini,/wiki/Pietro_Sterbini,Vukozlav,"[Annullato, Modifica visuale, Modifica da mobi...",2025-06-17
2835,18 giu 2025,2025-06-18,italian,it,/w/index.php?title=Sam_Bartram&curid=8063046&d...,/w/index.php?title=Sam_Bartram&curid=8063046&a...,Sam Bartram,/wiki/Sam_Bartram,Allahuakibaro,"[Modifica visuale, Modifica da mobile, Modific...",2025-06-18



LATVIAN
  URL for latvian language: https://lv.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Special:RecentChanges&urlversion=2
  THERE ARE NO DATA FROM THE PAST 30 DAYS.

SLOVENE
  URL for slovene language: https://sl.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Posebno:ZadnjeSpremembe&urlversion=2
  current - # rows: 6, start-date: 2025-05-21, end-date: 2025-06-16
  final - # rows: 34, start-date: 2024-10-05, end-date: 2025-06-16


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
31,22. maj 2025,2025-06-18,slovene,sl,/w/index.php?title=%C5%A0panska_knji%C5%BEevno...,/w/index.php?title=%C5%A0panska_knji%C5%BEevno...,Španska književnost,/wiki/%C5%A0panska_knji%C5%BEevnost,Ostrorogi Jelen 1,"[vizualno urejanje, mobilno urejanje, mobilno ...",2025-05-22
32,22. maj 2025,2025-06-18,slovene,sl,/w/index.php?title=Pape%C5%BEinja_Ivana&curid=...,/w/index.php?title=Pape%C5%BEinja_Ivana&curid=...,Papežinja Ivana,/wiki/Pape%C5%BEinja_Ivana,Ostrorogi Jelen 1,"[vizualno urejanje, mobilno urejanje, mobilno ...",2025-05-22
33,16. junij 2025,2025-06-18,slovene,sl,/w/index.php?title=Tadej_Poga%C4%8Dar&curid=46...,/w/index.php?title=Tadej_Poga%C4%8Dar&curid=46...,Tadej Pogačar,/wiki/Tadej_Poga%C4%8Dar,Ostrorogi Jelen 1,"[vizualno urejanje, mobilno urejanje, mobilno ...",2025-06-16



SWEDISH
  URL for swedish language: https://sv.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Special:Senaste_ändringar&urlversion=2
  current - # rows: 103, start-date: 2025-05-19, end-date: 2025-06-18
  final - # rows: 358, start-date: 2024-09-29, end-date: 2025-06-18


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
355,18 juni 2025,2025-06-18,swedish,sv,/w/index.php?title=Camp_Sweden&curid=271219&di...,/w/index.php?title=Camp_Sweden&curid=271219&ac...,Camp Sweden,/wiki/Camp_Sweden,Mother of all evil,"[VE, Mobilredigering, Redigering via mobilsajt...",2025-06-18
356,18 juni 2025,2025-06-18,swedish,sv,/w/index.php?title=Kortes_tredje_lag_om_skenba...,/w/index.php?title=Kortes_tredje_lag_om_skenba...,Kortes tredje lag om skenbar rörelse,/wiki/Kortes_tredje_lag_om_skenbar_r%C3%B6relse,Mother of all evil,"[VE, Mobilredigering, Redigering via mobilsajt...",2025-06-18
357,18 juni 2025,2025-06-18,swedish,sv,/w/index.php?title=Dalm%C3%A5lare&curid=8597&d...,/w/index.php?title=Dalm%C3%A5lare&curid=8597&a...,Dalmålare,/wiki/Dalm%C3%A5lare,Mother of all evil,"[VE, Mobilredigering, Redigering via mobilsajt...",2025-06-18



UKRAINIAN
  URL for ukrainian language: https://uk.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Спеціальна:Нові_редагування&urlversion=2
  current - # rows: 291, start-date: 2025-05-19, end-date: 2025-06-17
  final - # rows: 1952, start-date: 2024-09-28, end-date: 2025-06-17


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
1949,17 червня 2025,2025-06-18,ukrainian,uk,/w/index.php?title=%D0%A1%D1%82%D1%80%D0%B0%D0...,/w/index.php?title=%D0%A1%D1%82%D1%80%D0%B0%D0...,Страговський стадіон,/wiki/%D0%A1%D1%82%D1%80%D0%B0%D0%B3%D0%BE%D0%...,Anna Lozian,"[Візуальний редактор, Завдання новачку, Завдан...",2025-06-17
1950,17 червня 2025,2025-06-18,ukrainian,uk,/w/index.php?title=%D0%A4%D0%B5%D0%BD%D0%B5%D1...,/w/index.php?title=%D0%A4%D0%B5%D0%BD%D0%B5%D1...,Фенербахче (спортивний клуб),/wiki/%D0%A4%D0%B5%D0%BD%D0%B5%D1%80%D0%B1%D0%...,Anna Lozian,"[Візуальний редактор, Завдання новачку, Завдан...",2025-06-17
1951,17 червня 2025,2025-06-18,ukrainian,uk,/w/index.php?title=%D0%A7%D0%B5%D1%80%D0%B2%D0...,/w/index.php?title=%D0%A7%D0%B5%D1%80%D0%B2%D0...,Червона гвардія Донбасу,/wiki/%D0%A7%D0%B5%D1%80%D0%B2%D0%BE%D0%BD%D0%...,Городоцька центральна публічна бібліотека,"[Візуальний редактор, Завдання новачку, Завдан...",2025-06-17





## Pages: Statistics

In [8]:
# Load every "*pages*.csv" file that sits directly in the data folder.
df_list = []
for i in sorted(os.listdir(folder_main)):
    if i.split('.')[-1]=='csv' and 'pages' in i:
        try:
            df_temp = pd.read_csv(os.path.join(folder_main, i))
        except pd.errors.EmptyDataError:
            # Placeholder file without a header row: counts as zero rows.
            df_temp = pd.DataFrame()
        print(f"{i}, # rows: {df_temp.shape[0]}")
        df_list.append(df_temp)

df_pages_all = pd.concat(df_list, axis=0).reset_index(drop=True)

# Number of rows per language. rename_axis/reset_index(name=...) names both
# columns explicitly; renaming the defaults produced by
# value_counts().reset_index() is fragile because those default names changed
# in pandas 2.0 ('index'/'lang' became 'lang'/'count').
df_num_pages = df_pages_all['lang'].value_counts()\
                                   .rename_axis('language')\
                                   .reset_index(name='# pages')

with open('metadata/pages.json', 'w', encoding='utf-8') as f:
    json.dump(df_num_pages.to_dict('records'), f, ensure_ascii=False, indent=4)

cs_pages.csv, # rows: 538
de_pages.csv, # rows: 1706
el_pages.csv, # rows: 175
en_pages.csv, # rows: 9842
et_pages.csv, # rows: 54
is_pages.csv, # rows: 0
it_pages.csv, # rows: 2836
lv_pages.csv, # rows: 20
sl_pages.csv, # rows: 34
sv_pages.csv, # rows: 358
uk_pages.csv, # rows: 1952


In [9]:
# Rows per update date for the 'ukrainian' entries of the combined frame.
(
    df_pages_all
    .groupby(['lang', 'date_update'], dropna=False)['lang_code']
    .count()
    .loc['ukrainian']
)

date_update
2024-10-28    146
2024-11-06     63
2024-11-10    110
2024-11-25     76
2024-12-16     75
2025-01-15    216
2025-02-15    307
2025-03-14    122
2025-04-12     18
2025-04-17    153
2025-05-04     88
2025-05-13    287
2025-06-18    291
Name: lang_code, dtype: int64

In [10]:
# Range of the edit dates that are present in the combined frame.
edit_dates_present = df_pages_all['date_edit'].dropna()
print("start-date: {}, end-date: {}".format(
    edit_dates_present.min(),
    edit_dates_present.max()
))

# Per-language count of the rows whose date_edit is missing.
df_pages_all.loc[df_pages_all['date_edit'].isna(), 'lang'].value_counts()

start-date: 2024-09-28, end-date: 2025-06-18


latvian    20
Name: lang, dtype: int64

## Data: Corrections Extraction

In [11]:
# First-run bootstrap: every language gets a corrections CSV that holds only
# the header row, unless the file is already there.
columns_corrections = [
    'delitions', 'n_del', 'text_del', 'text_del_tag',
    'insertions', 'n_ins', 'text_ins', 'text_ins_tag',
    'diff_url', 'diff_page',
]
for key, value in config.items():
    if os.path.isfile(value['path_save_corections']):
        continue
    df_empty = pd.DataFrame(columns=columns_corrections)
    df_empty.to_csv(value['path_save_corections'], index=False)

In [12]:
# Test the MediaWiki "compare" API on one pair of revisions.
session = requests.Session()
api_url = "https://uk.wikipedia.org/w/api.php"

PARAMS = {
    'action': "compare",
    'format': "json",
    'prop': "diff|diffsize|user|size|comment",
    'fromrev': 43736886, #42726929,
    'torev': 41719237, #43671592
}

# requests has no default timeout, so an unresponsive server would block the
# cell forever; raise_for_status() reports an HTTP error as such instead of
# failing later while decoding a non-JSON error page.
http_response = session.get(url=api_url, params=PARAMS, timeout=30)
http_response.raise_for_status()
response = http_response.json()
# print(response.keys())
display(pd.json_normalize(response['compare']))
# response['compare']['*'] holds the diff as HTML table rows; render it.
HTML(response['compare']['*'])

Unnamed: 0,fromsize,fromuser,fromuserid,fromcomment,fromparsedcomment,tosize,touser,touserid,tocomment,toparsedcomment,diffsize,*
0,12386,MinecAnton209,774975,"Виправлені орфографчні помилки, перефразування...","Виправлені орфографчні помилки, перефразування...",12145,Бучач-Львів,164714,,,32123,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Ря..."


In [13]:
def _add_tag(text_obj, cnt_input, tag_find, tag_wrap):
    """
    """
    text_temp = text_obj
    cnt = 0
    while cnt != cnt_input:
        text_init = text_temp.find_next(tag_find, {"class": "diffchange diffchange-inline"})
        text_temp.insert_before(tag_wrap)
        text_temp.insert_after(tag_wrap)
        text_temp = text_init
        cnt += 1
    return text_obj

def _describe_diff_side(cell, tag_name, marker):
    """Summarise one side (deleted or added) of a diff row.

    Returns ``(changes, n_changes, text, tagged_text)`` where ``changes`` holds
    the ``contents`` of every inline-change ``tag_name`` element in ``cell``,
    ``text`` is the plain text of the cell and ``tagged_text`` is the same text
    with ``marker`` placed around every inline change.
    """
    # Plain text must be taken before _add_tag modifies the cell.
    text = cell.get_text()
    change_tags = cell.find_all(tag_name, {"class": "diffchange diffchange-inline"})
    changes = [change.contents for change in change_tags]
    n_changes = len(changes)
    # +1 because _add_tag counts the cell itself as the first processed node.
    tagged_text = _add_tag(cell, cnt_input=n_changes + 1, tag_find=tag_name, tag_wrap=marker).get_text()
    return changes, n_changes, text, tagged_text


def get_corrections(content, init_dict, parser="html.parser"):
    """Extract the changed line pairs from the HTML of a revision diff.

    Every ``<tr>`` of ``content`` that contains both a ``diff-deletedline``
    cell and a ``diff-addedline`` cell yields one result; rows that carry only
    one of the two sides are skipped.

    Parameters
    ----------
    content : str
        Diff HTML, e.g. ``response['compare']['*']`` of the compare API call.
    init_dict : dict
        Extra key/value pairs merged into every result (they win on key clash).
    parser : str, optional
        Parser name handed to BeautifulSoup.  It is pinned explicitly because
        without it BeautifulSoup picks whichever parser happens to be
        installed, so results could differ between environments.

    Returns
    -------
    list of dict
        One dict per row with the keys ``delitions``, ``n_del``, ``text_del``,
        ``text_del_tag`` (deletions wrapped in ``{:-:}``), ``insertions``,
        ``n_ins``, ``text_ins``, ``text_ins_tag`` (insertions wrapped in
        ``{:+:}``) followed by the entries of ``init_dict``.
    """
    content_obj = BeautifulSoup(content, parser)

    results_list = []
    for row in content_obj.find_all('tr'):
        before_obj = row.find("td", {"class": "diff-deletedline"})
        after_obj = row.find("td", {"class": "diff-addedline"})

        # A correction needs both the old and the new version of the line.
        if not (isinstance(before_obj, Tag) and isinstance(after_obj, Tag)):
            continue

        deletions, n_deletions, text_deletions, tb = _describe_diff_side(before_obj, "del", "{:-:}")
        insertions, n_insertions, text_insertions, ta = _describe_diff_side(after_obj, "ins", "{:+:}")

        results_list.append({
            # NOTE: the key spelling "delitions" is part of the stored CSV schema.
            'delitions': deletions,
            'n_del': n_deletions,
            'text_del': text_deletions,
            'text_del_tag': tb,
            'insertions': insertions,
            'n_ins': n_insertions,
            'text_ins': text_insertions,
            'text_ins_tag': ta,
            **init_dict,
        })

    return results_list

In [14]:
# Corrections extracted from the sample diff fetched by the API test cell.
pd.DataFrame(
    get_corrections(response['compare']['*'], init_dict={'date': 'x'})
)

Unnamed: 0,delitions,n_del,text_del,text_del_tag,insertions,n_ins,text_ins,text_ins_tag,date
0,[[ ]],1,'''Єрген Юве'''&nbsp;({{lang-no|Jørgen Juve}};...,'''Єрген Юве'''&nbsp;({{lang-no|Jørgen Juve}};...,[[&nbsp;]],1,'''Єрген Юве'''&nbsp;({{lang-no|Jørgen Juve}};...,'''Єрген Юве'''&nbsp;({{lang-no|Jørgen Juve}};...,x
1,"[[років], [виступав], [зігравши], [забивши]]",4,Почав грати в футбол за команду «Уредд» із Пор...,Почав грати в футбол за команду «Уредд» із Пор...,"[[роув], [виступив], [зіграв], [забив]]",4,Почав грати в футбол за команду «Уредд» із Пор...,Почав грати в футбол за команду «Уредд» із Пор...,x
2,"[[ ], [та]]",2,З 1928 по 1937 роки Йерген Юве грав і за збірн...,З 1928 по 1937 роки Йерген Юве грав і за збірн...,[],0,З 1928 по 1937 роки Йерген Юве грав і за збірн...,З 1928 по 1937 роки Йерген Юве грав і за збірн...,x
3,"[[[[Літні Олімпійські ігри 1936|], []]], [[[Бе...",8,Юве був капітаном норвезької команди і на [[Лі...,Юве був капітаном норвезької команди і на {:-:...,[],0,Юве був капітаном норвезької команди і на Олім...,Юве був капітаном норвезької команди і на Олім...,x
4,"[[Вінгео], [читав]]",2,Футбольні експерти-сучасники називали Юве одни...,Футбольні експерти-сучасники називали Юве одни...,"[[вінгео], [читал]]",2,Футбольні експерти-сучасники називали Юве одни...,Футбольні експерти-сучасники називали Юве одни...,x
5,"[[[[Базель|], []]], [ З], [та], [ рік редактор...",9,Юве вивчився на юриста в [[Базель|Базелі]] в 1...,Юве вивчився на юриста в {:-:}[[Базель|{:-:}Ба...,[],0,"Юве вивчився на юриста в Базелі в 1931 році, п...","Юве вивчився на юриста в Базелі в 1931 році, п...",x
6,[[декілька]],1,Йорґен Юве написав декілька книг на спортивну ...,Йорґен Юве написав {:-:}декілька{:-:} книг на ...,[[декількп]],1,Йорґен Юве написав декількп книг на спортивну ...,Йорґен Юве написав {:+:}декількп{:+:} книг на ...,x
7,"[[ ], [вела]]",2,"Батько&nbsp;— Уле Мартін Юве, дубильщик по про...","Батько&nbsp;— Уле Мартін Юве, дубильщик по про...",[],0,"Батько&nbsp;— Уле Мартін Юве, дубильщик по про...","Батько&nbsp;— Уле Мартін Юве, дубильщик по про...",x
8,[[ ]],1,"Йорген&nbsp;— старший із шести дітей в сім'ї, ...","Йорген&nbsp;— старший із шести дітей в сім'ї, ...",[],0,"Йорген&nbsp;— старший із шести дітей в сім'ї, ...","Йорген&nbsp;— старший із шести дітей в сім'ї, ...",x
9,"[[[[Друга світова війна|], []]]]",2,Під час [[Друга світова війна|Другої сівтової ...,Під час {:-:}[[Друга світова війна|{:-:}Другої...,[],0,Під час Другої сівтової війни служив в британс...,Під час Другої сівтової війни служив в британс...,x


In [15]:
# Full, untruncated values of the second extracted correction.
(
    pd.DataFrame(get_corrections(response['compare']['*'], init_dict={'date': 'x'}))
    .iloc[1]
    .values
)

array([list([['років'], ['виступав'], ['зігравши'], ['забивши']]), 4,
       'Почав грати в футбол за команду «Уредд» із Порсгрунна у віці 16 років. В 1926 років переїхав в [[Осло]], де став гравцем «Люни». В 1928 році команда Юве пройшла у фінал Кубка Норвегії, але під час вирішальної гри програла «Ерн-Хортену». В сезоні 1930/1931 Юве виступав за кордоном, зігравши 12 матчів за швейцарський «Базель» та забивши за нього 10 голів.',
       'Почав грати в футбол за команду «Уредд» із Порсгрунна у віці 16 років. В 1926 {:-:}років{:-:} переїхав в [[Осло]], де став гравцем «Люни». В 1928 році команда Юве пройшла у фінал Кубка Норвегії, але під час вирішальної гри програла «Ерн-Хортену». В сезоні 1930/1931 Юве {:-:}виступав{:-:} за кордоном, {:-:}зігравши{:-:} 12 матчів за швейцарський «Базель» та {:-:}забивши{:-:} за нього 10 голів.',
       list([['роув'], ['виступив'], ['зіграв'], ['забив']]), 4,
       'Почав грати в футбол за команду «Уредд» із Порсгрунна у віці 16 років. В 1926 роув пе

In [16]:
def parse_column_url(df):
    """Add the revision ids found in the ``url_diff`` query string as columns.

    Rows without a ``url_diff`` are dropped and the index is reset; the input
    frame itself is not modified.  Three string columns are added:

    * ``id_cur``  -- value of the ``curid`` parameter
    * ``id_old``  -- value of the ``oldid`` parameter
    * ``id_diff`` -- value of the ``diff`` parameter

    A parameter that is absent from a URL gives ``None`` for that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``url_diff`` column holding (relative) URLs.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df`` with the three id columns added.
    """
    # Function-local import keeps this cell self-contained.
    from urllib.parse import parse_qs, urlsplit

    df_ = df[df['url_diff'].notna()].copy().reset_index(drop=True)

    # Parse each query string once. Parameters are looked up by their exact
    # name, so a page title that merely contains "diff", "curid" or "oldid"
    # can no longer be mistaken for the parameter itself.
    queries = [parse_qs(urlsplit(url).query) for url in df_['url_diff']]
    for column, parameter in (('id_cur', 'curid'), ('id_old', 'oldid'), ('id_diff', 'diff')):
        df_[column] = [
            query[parameter][0] if parameter in query else None
            for query in queries
        ]
    return df_

def get_corrections_all(df):
    """
    Query the MediaWiki ``compare`` API for every row of ``df`` and collect
    the records that ``get_corrections`` extracts from each returned diff.

    Parameters
    ----------
    df : pandas.DataFrame
        Pages table; the columns read here are ``lang_code``, ``url_diff``,
        ``id_old`` (sent as ``fromrev``) and ``id_diff`` (sent as ``torev``).
        The API host is built from the ``lang_code`` of the first row.

    Returns
    -------
    pandas.DataFrame
        All records returned by ``get_corrections`` across the rows. Empty
        when ``df`` has no rows or no response carried a ``compare`` payload.

    Raises
    ------
    requests.exceptions.RequestException
        If a request fails or exceeds the timeout.
    """
    # Nothing to fetch; also avoids an IndexError on df.iloc[0] below.
    if df.empty:
        return pd.DataFrame()

    lang_code = df.iloc[0]['lang_code']
    api_url = f"https://{lang_code}.wikipedia.org/w/api.php"

    results_list = []
    # The context manager releases pooled connections when the loop is done.
    with requests.Session() as session:
        for _, row in df.iterrows():
            params = {
                'action': "compare",
                'format': "json",
                'prop': "diff|diffsize|user|size|comment",
                'fromrev': row['id_old'],
                'torev': row['id_diff']
            }

            # Without a timeout a stalled connection would block the run forever.
            response = session.get(url=api_url, params=params, timeout=60).json()

            # Responses without a 'compare' payload contribute no rows.
            compare = response.get('compare')
            if compare is None:
                continue

            fromsize = int(compare.get('fromsize', 0))
            tosize = int(compare.get('tosize', 0))

            # Passed to get_corrections as its second argument; presumably it is
            # merged into every returned record (confirm in get_corrections).
            init_dict = {
                'diff_url': f"https://{row['lang_code']}.wikipedia.org" + row['url_diff'],
                'diff_page': fromsize - tosize
            }

            results_list.extend(get_corrections(compare['*'], init_dict))

    return pd.DataFrame(results_list)

In [17]:
%%time

# for one language
language = 'estonian'
setup = config[language]

# old
print("OLD:")
df_correction_old = pd.read_csv(setup['path_save_corections'])
display(df_correction_old.tail(3))

# current
df_pages_cur = pd.read_csv(setup['path_save_pages_current'])
df_pages_cur = parse_column_url(df_pages_cur)
df_correction_cur = get_corrections_all(df_pages_cur)
print("NEW:")
display(df_correction_cur.tail(3))

# results
df_correction = pd.concat([df_correction_old, df_correction_cur], axis=0)\
                  .drop_duplicates(['text_del', 'text_ins', 'diff_url'], keep='first')\
                  .reset_index(drop=True)
print("RESULT:")
df_correction.tail(3)

OLD:


Unnamed: 0,delitions,n_del,text_del,text_del_tag,insertions,n_ins,text_ins,text_ins_tag,diff_url,diff_page
135,[['1 400']],1,Metsad on säilinud vaid mägistes piirkondades ...,Metsad on säilinud vaid mägistes piirkondades ...,[['1400']],1,Metsad on säilinud vaid mägistes piirkondades ...,Metsad on säilinud vaid mägistes piirkondades ...,https://et.wikipedia.org/w/index.php?title=Jaa...,-9
136,[],0,Ta on maailma kõige põhjapoolsema levikuga mad...,Ta on maailma kõige põhjapoolsema levikuga mad...,[['<ref>{{Netiviide |kuupäev=2023-07-04 |pealk...,1,Ta on maailma kõige põhjapoolsema levikuga mad...,Ta on maailma kõige põhjapoolsema levikuga mad...,https://et.wikipedia.org/w/index.php?title=Har...,-238
137,[['o ']],1,"Vereplasma on õrnkollakas vedelik, mis moodust...","Vereplasma on õrnkollakas vedelik, mis moodust...",[['°']],1,"Vereplasma on õrnkollakas vedelik, mis moodust...","Vereplasma on õrnkollakas vedelik, mis moodust...",https://et.wikipedia.org/w/index.php?title=Ver...,0


NEW:


Unnamed: 0,delitions,n_del,text_del,text_del_tag,insertions,n_ins,text_ins,text_ins_tag,diff_url,diff_page
15,[[.]],1,Aretatud on palju sorte ja need on populaarsed...,Aretatud on palju sorte ja need on populaarsed...,[],0,Aretatud on palju sorte ja need on populaarsed...,Aretatud on palju sorte ja need on populaarsed...,https://et.wikipedia.org/w/index.php?title=Jaa...,9
16,"[[jaapani], [kaldakindlustaj]]",2,Mullatüübina eelistab jaapani enelas lubjarohk...,Mullatüübina eelistab {:-:}jaapani{:-:} enelas...,"[[Jaapani], [ on], [kalda kindlustaja]]",3,Mullatüübina eelistab Jaapani enelas lubjarohk...,Mullatüübina eelistab {:+:}Jaapani{:+:} enelas...,https://et.wikipedia.org/w/index.php?title=Jaa...,9
17,"[[jaapani], [ tervislikku], [ ]]",3,Pärast istutamist nõuab jaapani enelas hooldus...,Pärast istutamist nõuab {:-:}jaapani{:-:} enel...,[[Jaapani]],1,Pärast istutamist nõuab Jaapani enelas hooldus...,Pärast istutamist nõuab {:+:}Jaapani{:+:} enel...,https://et.wikipedia.org/w/index.php?title=Jaa...,9


RESULT:
CPU times: user 230 ms, sys: 16.1 ms, total: 246 ms
Wall time: 1.76 s


Unnamed: 0,delitions,n_del,text_del,text_del_tag,insertions,n_ins,text_ins,text_ins_tag,diff_url,diff_page
153,[[.]],1,Aretatud on palju sorte ja need on populaarsed...,Aretatud on palju sorte ja need on populaarsed...,[],0,Aretatud on palju sorte ja need on populaarsed...,Aretatud on palju sorte ja need on populaarsed...,https://et.wikipedia.org/w/index.php?title=Jaa...,9
154,"[[jaapani], [kaldakindlustaj]]",2,Mullatüübina eelistab jaapani enelas lubjarohk...,Mullatüübina eelistab {:-:}jaapani{:-:} enelas...,"[[Jaapani], [ on], [kalda kindlustaja]]",3,Mullatüübina eelistab Jaapani enelas lubjarohk...,Mullatüübina eelistab {:+:}Jaapani{:+:} enelas...,https://et.wikipedia.org/w/index.php?title=Jaa...,9
155,"[[jaapani], [ tervislikku], [ ]]",3,Pärast istutamist nõuab jaapani enelas hooldus...,Pärast istutamist nõuab {:-:}jaapani{:-:} enel...,[[Jaapani]],1,Pärast istutamist nõuab Jaapani enelas hooldus...,Pärast istutamist nõuab {:+:}Jaapani{:+:} enel...,https://et.wikipedia.org/w/index.php?title=Jaa...,9


In [18]:
%%time
# for all languages. It processes only diff.
for language in list(config.keys())[:]:
    print(language.upper())
    setup = config[language]
    
    df_correction_old = pd.read_csv(setup['path_save_corections'])
    num_corrections_old = df_correction_old.shape[0]
    print('  Corrections Old: {}'.format(num_corrections_old))
 
    if os.path.isfile(setup['path_save_pages_current']):
        df_pages_cur = pd.read_csv(setup['path_save_pages_current'])
        df_pages_cur = parse_column_url(df_pages_cur)
        df_correction_cur = get_corrections_all(df_pages_cur)
        num_corrections_cur = df_correction_cur.shape[0]
        print('  Corrections Current: {}'.format(num_corrections_cur))

        df_correction = pd.concat([df_correction_old, df_correction_cur], axis=0)\
                          .drop_duplicates(['text_del', 'text_ins', 'diff_url'], keep='first')\
                          .reset_index(drop=True)

        num_corrections_final = df_correction.shape[0]
        print('  Corrections Final: {}'.format(num_corrections_final))
        print('  {} corrections were added.'.format(num_corrections_final-num_corrections_old))
        # update corrections files
        if num_corrections_cur>0:
            df_correction.to_csv(setup['path_save_corections'], index=False)
            print('Saved!')
            print()
    else:
        print('  Corrections Current: {}'.format(0))
        print('  Corrections Final: {}'.format(0))
        print('  {} corrections were added.'.format(0))
        print()

ENGLISH
  Corrections Old: 29454
  Corrections Current: 0
  Corrections Final: 0
  0 corrections were added.

CZECH
  Corrections Old: 1193
  Corrections Current: 141
  Corrections Final: 1330
  137 corrections were added.
Saved!
ESTONIAN
  Corrections Old: 138
  Corrections Current: 18
  Corrections Final: 156
  18 corrections were added.
Saved!
GERMAN
  Corrections Old: 4672
  Corrections Current: 0
  Corrections Final: 0
  0 corrections were added.

GREEK
  Corrections Old: 562
  Corrections Current: 61
  Corrections Final: 622
  60 corrections were added.
Saved!
ICELANDIC
  Corrections Old: 0
  Corrections Current: 0
  Corrections Final: 0
  0 corrections were added.

ITALIAN
  Corrections Old: 6488
  Corrections Current: 783
  Corrections Final: 7252
  764 corrections were added.
Saved!
LATVIAN
  Corrections Old: 75
  Corrections Current: 0
  Corrections Final: 0
  0 corrections were added.

SLOVENE
  Corrections Old: 121
  Corrections Current: 14
  Corrections Final: 135
  14 cor

In [19]:
# %%time
# # Rebuild corrections for all languages from scratch. It processes the full [lang]_pages.csv file and overwrites [lang]_corrections.csv.
# for language in list(config.keys())[:]:
#     print(language.upper())
#     setup = config[language]

#     df_pages_temp = pd.read_csv(setup['path_save_pages'])
#     df_pages_temp = parse_column_url(df_pages_temp)
#     print('  Input: {}'.format(df_pages_temp.shape[0]))

#     if not df_pages_temp.empty:
#         df_correction_temp = get_corrections_all(df_pages_temp)
#         print('  Output: {}'.format(df_correction_temp.shape[0]))
#         df_correction_temp.to_csv(setup['path_save_corections'], index=False)

## Corrections: Statistics

In [20]:
# Load every corrections CSV found directly in `folder_main` and tag its rows
# with the language code taken from the file-name prefix (text before the first '_').
df_list = []
for file_name in sorted(os.listdir(folder_main)):
    if file_name.split('.')[-1]=='csv' and 'correction' in file_name:
        df_temp = pd.read_csv(os.path.join(folder_main, file_name))
        df_temp['code_lang'] = file_name.split('_')[0]
        print(f"{file_name}, # rows: {df_temp.shape[0]}")
        df_list.append(df_temp)

df_correction_all = pd.concat(df_list, axis=0).reset_index(drop=True)

# Name the index and the counts explicitly instead of renaming the default
# column labels: those labels differ between pandas versions
# ('index'/'code_lang' before 2.0, 'code_lang'/'count' from 2.0 on).
df_num_corrections = (
    df_correction_all['code_lang']
    .value_counts()
    .rename_axis('language_code')
    .reset_index(name='# edits')
)

# Codes without an entry in language_dict are left unchanged by replace().
df_num_corrections['language'] = df_num_corrections['language_code'].replace(language_dict)

with open('metadata/corrections.json', 'w', encoding='utf-8') as f:
    json.dump(df_num_corrections.to_dict('records'), f, ensure_ascii=False, indent=4)

cs_corrections.csv, # rows: 1330
de_corrections.csv, # rows: 4672
el_corrections.csv, # rows: 622
en_corrections.csv, # rows: 29454
et_corrections.csv, # rows: 156
is_corrections.csv, # rows: 0
it_corrections.csv, # rows: 7252
lv_corrections.csv, # rows: 75
sl_corrections.csv, # rows: 135
sv_corrections.csv, # rows: 936
uk_corrections.csv, # rows: 6998
