In [1]:
import os
import re
import json
import requests
from pathlib import Path
import datetime
import dateparser
import pandas as pd
import numpy as np
from IPython.core.display import HTML
from IPython.display import JSON

from bs4 import BeautifulSoup
from bs4.element import Tag

from utils import get_edit_pages_metadata, _update_date_column

## Initialization

In [2]:
def pretty_json_html(data):
    json_str = json.dumps(data, indent=4, ensure_ascii=False)
    html = f"<pre>{json_str}</pre>"
    display(HTML(html))

In [3]:
# Main folder for data 
folder_main = "../data"
Path(folder_main).mkdir(parents=True, exist_ok=True)

today = str(datetime.datetime.today().date())
print(f'Current Datetime is {today}.')

with open('config_lang.json') as f:
    config_lang = json.load(f)

# Extend config file
config = {}
for item in config_lang:
    lang_wiki = config_lang[item]['lang_wiki']
    config_temp = {
        "path_save_pages_current": f"{folder_main}/dumps/{lang_wiki}_pages_{today}.csv",
        "path_save_pages": f"{folder_main}/{lang_wiki}_pages.csv",
        "path_save_corections": f"{folder_main}/{lang_wiki}_corrections.csv"
    }
    config[item] = {**config_lang[item], **config_temp}
config

print()
print("Config:")
pretty_json_html(config)

print()
print("Languages:")
language_dict = {v["lang_wiki"]: v["language"] for k, v in config.items()}
pretty_json_html(language_dict)

with open('metadata/languages.json', 'w', encoding='utf-8') as f:
    json.dump(language_dict, f, ensure_ascii=False, indent=4)

Current Datetime is 2025-05-15.

Config:



Languages:


In [4]:
rename_columns_pages_dataframe = {
    "rozdíl": "url_diff",
    "historie": "url_history",
    "Unterschied": "url_diff",
    "Versionen": "url_history",
    "izmaiņas": "url_diff",
    "hronoloģija": "url_history",
    "erin": "url_diff",
    "ajal": "url_history",
    "διαφ.": "url_diff",
    "ιστορ.": "url_history",
    "diff": "url_diff",
    "cron": "url_history",
    "razl": "url_diff",
    "zgod": "url_history",
    "skillnad": "url_diff",
    "historik": "url_history",
    "різн.": "url_diff",
    "історія": "url_history",
    "hist": "url_history"
}

## Pages: Data

In [None]:
# create empty csv files for pages if it doesn't exist; 
# for first run
columns_pages = ['date_edit_wiki', 'date_update', 'lang', 'lang_code', 'url_diff',
                 'url_history', 'title', 'url', 'author', 'tags', 'date_edit']
for key, value in config.items():
    if not os.path.isfile(value['path_save_pages']):
        df_empty = pd.DataFrame(list())
        df_empty.to_csv(value['path_save_pages'], index=False)

In [5]:
# test run for one language
language = "ukrainian"
setup = config[language]

url = f"https://{setup['lang_wiki']}.wikipedia.org/w/index.php?hidebots=1&\
hidecategorization=1&\
hideWikibase=1&\
tagfilter=newcomer+task+copyedit&\
limit=5000&days=30&\
title={setup['page_title']}&\
urlversion=2"

metadata_init = {
    "date_update": today,
    "lang": language,
    "lang_code": setup["lang_wiki"]
}

print(f"URL for {language} language: {url}")

metadata = get_edit_pages_metadata(url, metadata_init)
df_metadata = pd.DataFrame(metadata).rename(columns=rename_columns_pages_dataframe)

if not df_metadata.empty:
    _update_date_column(df_metadata, date_col_in="date_edit_wiki", date_col_out="date_edit")
    dates = sorted([i for i in df_metadata["date_edit"].unique() if i is not None])
    print("  final - # rows: {}, start-date: {}, end-date: {}".format(
        df_metadata.shape[0], min(dates), max(dates))
    )
    display(df_metadata.tail(3))

URL for ukrainian language: https://uk.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Спеціальна:Нові_редагування&urlversion=2
  final - # rows: 288, start-date: 2025-04-13, end-date: 2025-05-13


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
285,14 квітня 2025,2025-05-13,ukrainian,uk,/w/index.php?title=%D0%9D%D0%BE%D0%B2%D0%BE%D0...,/w/index.php?title=%D0%9D%D0%BE%D0%B2%D0%BE%D0...,Новодівичий монастир,/wiki/%D0%9D%D0%BE%D0%B2%D0%BE%D0%B4%D1%96%D0%...,Nina Lpsh.,"[Завдання новачку, Завдання новачку: коректура]",2025-04-14
286,14 квітня 2025,2025-05-13,ukrainian,uk,/w/index.php?title=%D0%9D%D0%BE%D0%B2%D0%BE%D0...,/w/index.php?title=%D0%9D%D0%BE%D0%B2%D0%BE%D0...,Новодівичий монастир,/wiki/%D0%9D%D0%BE%D0%B2%D0%BE%D0%B4%D1%96%D0%...,Nina Lpsh.,"[Візуальний редактор, Завдання новачку, Завдан...",2025-04-14
287,13 квітня 2025,2025-05-13,ukrainian,uk,/w/index.php?title=%D0%A0%D0%B5%D0%B9%D0%BD%D0...,/w/index.php?title=%D0%A0%D0%B5%D0%B9%D0%BD%D0...,Рейнгольд Нібур,/wiki/%D0%A0%D0%B5%D0%B9%D0%BD%D0%B3%D0%BE%D0%...,Олег Бараболя,"[перше редагування, Візуальний редактор, Редаг...",2025-04-13


In [6]:
# run for all languages (see config dictionary)

print("Support Languages:", ', '.join(list(language_dict.values())))
for language in list(config.keys())[:]:
    print(language.upper())
    setup = config[language]
            
    url = f"https://{setup['lang_wiki']}.wikipedia.org/w/index.php?hidebots=1&\
hidecategorization=1&\
hideWikibase=1&\
tagfilter=newcomer+task+copyedit&\
limit=5000&days=30&\
title={setup['page_title']}&\
urlversion=2"
    
    metadata_init = {
            "date_update": today,
            "lang": language,
            "lang_code": setup["lang_wiki"]
        }
    
    print(f"  URL for {language} language: {url}")
    
    if setup["is_process"]:
        metadata = get_edit_pages_metadata(url, metadata_init)
        df_metadata_cur = pd.DataFrame(metadata).rename(columns=rename_columns_pages_dataframe)

        if "author" in df_metadata_cur.columns:
            df_metadata_cur = df_metadata_cur[df_metadata_cur["author"].notna()].copy().reset_index(drop=True)

        if not df_metadata_cur.empty:
            _update_date_column(df_metadata_cur, date_col_in="date_edit_wiki", date_col_out="date_edit")
            print("  current - # rows: {}, start-date: {}, end-date: {}".format(
                df_metadata_cur.shape[0], 
                df_metadata_cur["date_edit"].min(), 
                df_metadata_cur["date_edit"].max()
            ))

            df_metadata_old = pd.read_csv(setup["path_save_pages"])

            df_metadata = pd.concat([df_metadata_cur, df_metadata_old], axis=0)\
                            .sort_values(["date_update", "date_edit"])\
                            .drop_duplicates(subset=["url_diff"], keep="last")\
                            .reset_index(drop=True)

            print("  final - # rows: {}, start-date: {}, end-date: {}".format(
                df_metadata.shape[0], 
                df_metadata["date_edit"].min(), 
                df_metadata["date_edit"].max()
            ))
            display(df_metadata.tail(3))
            df_metadata_cur.to_csv(setup["path_save_pages_current"], index=False)
            df_metadata.to_csv(setup["path_save_pages"], index=False)
        else:
            print("  THERE ARE NO DATA FROM THE PAST 30 DAYS")
        print()
    else:
        print(f"  {language.title()} is not included. Check config file.")
        print()

Support Languages: english, czech, estonian, german, greek, icelandic, italian, latvian, slovene, swedish, ukrainian
ENGLISH
  URL for english language: https://en.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Special:RecentChanges&urlversion=2
  English is not included. Check config file.

CZECH
  URL for czech language: https://cs.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Speciální:Poslední_změny&urlversion=2
  current - # rows: 46, start-date: 2025-04-14, end-date: 2025-05-13
  final - # rows: 479, start-date: 2024-09-29, end-date: 2025-05-13


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
476,13. 5. 2025,2025-05-13,czech,cs,/w/index.php?title=Josef_Pleskot&curid=390531&...,/w/index.php?title=Josef_Pleskot&curid=390531&...,Josef Pleskot,/wiki/Josef_Pleskot,DobrodruhZnalec,"['editace z Vizuálního editoru', 'Editační tip...",2025-05-13
477,13. 5. 2025,2025-05-13,czech,cs,/w/index.php?title=Krotitel%C3%A9_duch%C5%AF&c...,/w/index.php?title=Krotitel%C3%A9_duch%C5%AF&c...,Krotitelé duchů,/wiki/Krotitel%C3%A9_duch%C5%AF,DobrodruhZnalec,"['editace z Vizuálního editoru', 'Editační tip...",2025-05-13
478,13. 5. 2025,2025-05-13,czech,cs,/w/index.php?title=Petr_Houdek&curid=1368352&d...,/w/index.php?title=Petr_Houdek&curid=1368352&a...,Petr Houdek,/wiki/Petr_Houdek,Davaderner,"['první editace', 'editace z Vizuálního editor...",2025-05-13



ESTONIAN
  URL for estonian language: https://et.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Eri:Viimased_muudatused&urlversion=2
  current - # rows: 9, start-date: 2025-04-28, end-date: 2025-05-10
  final - # rows: 48, start-date: 2024-10-25, end-date: 2025-05-10


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
45,10. mai 2025,2025-05-13,estonian,et,/w/index.php?title=Jaava_leopard&curid=692470&...,/w/index.php?title=Jaava_leopard&curid=692470&...,Jaava leopard,/wiki/Jaava_leopard,Ojassaar,"['Visuaalmuudatus', 'Uue kasutaja ülesanne', '...",2025-05-10
46,10. mai 2025,2025-05-13,estonian,et,/w/index.php?title=Harilik_r%C3%A4stik&curid=1...,/w/index.php?title=Harilik_r%C3%A4stik&curid=1...,Harilik rästik,/wiki/Harilik_r%C3%A4stik,Ojassaar,"['Visuaalmuudatus', 'Uue kasutaja ülesanne', '...",2025-05-10
47,10. mai 2025,2025-05-13,estonian,et,/w/index.php?title=Veri&curid=92510&diff=68747...,/w/index.php?title=Veri&curid=92510&action=his...,Veri,/wiki/Veri,Ojassaar,"['Visuaalmuudatus', 'Uue kasutaja ülesanne', '...",2025-05-10



GERMAN
  URL for german language: https://de.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Spezial:Letzte_%C3%84nderungen&urlversion=2
  THERE ARE NO DATA FROM THE PAST 30 DAYS

GREEK
  URL for greek language: https://el.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Ειδικό:ΠρόσφατεςΑλλαγές&urlversion=2
  current - # rows: 23, start-date: 2025-04-14, end-date: 2025-05-12
  final - # rows: 155, start-date: 2024-09-28, end-date: 2025-05-12


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
152,11 Μαΐου 2025,2025-05-13,greek,el,"/w/index.php?title=TSR,_Inc.&curid=49756&diff=...","/w/index.php?title=TSR,_Inc.&curid=49756&actio...","TSR, Inc.","/wiki/TSR,_Inc.",Antigoni R,"['Οπτική επεξεργασία', 'Επεξεργασία από κινητό...",2025-05-11
153,12 Μαΐου 2025,2025-05-13,greek,el,/w/index.php?title=Thor:_Love_and_Thunder&curi...,/w/index.php?title=Thor:_Love_and_Thunder&curi...,Thor: Love and Thunder,/wiki/Thor:_Love_and_Thunder,Romanouanat,"['Οπτική επεξεργασία', 'Αποστολή νεοεισερχόμεν...",2025-05-12
154,12 Μαΐου 2025,2025-05-13,greek,el,/w/index.php?title=%CE%9C%CE%B1%CF%81%CE%AF%CE...,/w/index.php?title=%CE%9C%CE%B1%CF%81%CE%AF%CE...,Μαρία Παπαγιάννη,/wiki/%CE%9C%CE%B1%CF%81%CE%AF%CE%B1_%CE%A0%CE...,Romanouanat,"['Οπτική επεξεργασία', 'Αποστολή νεοεισερχόμεν...",2025-05-12



ICELANDIC
  URL for icelandic language: https://is.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Kerfiss%C3%AD%C3%B0a:N%C3%BDlegar_breytingar&urlversion=2
  THERE ARE NO DATA FROM THE PAST 30 DAYS

ITALIAN
  URL for italian language: https://it.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Speciale:UltimeModifiche&urlversion=2
  current - # rows: 194, start-date: 2025-04-14, end-date: 2025-05-13
  final - # rows: 2569, start-date: 2024-09-28, end-date: 2025-05-13


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
2566,13 mag 2025,2025-05-13,italian,it,/w/index.php?title=Redo_Rescue&curid=10234484&...,/w/index.php?title=Redo_Rescue&curid=10234484&...,Redo Rescue,/wiki/Redo_Rescue,Sazek,"['Modifica visuale', 'Attività per i nuovi ute...",2025-05-13
2567,13 mag 2025,2025-05-13,italian,it,/w/index.php?title=AngelScript&curid=9996389&d...,/w/index.php?title=AngelScript&curid=9996389&a...,AngelScript,/wiki/AngelScript,Sazek,"['Modifica visuale', 'Attività per i nuovi ute...",2025-05-13
2568,13 mag 2025,2025-05-13,italian,it,/w/index.php?title=5_cm_al_secondo&curid=15433...,/w/index.php?title=5_cm_al_secondo&curid=15433...,5 cm al secondo,/wiki/5_cm_al_secondo,Sazek,"['Modifica visuale', 'Attività per i nuovi ute...",2025-05-13



LATVIAN
  URL for latvian language: https://lv.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Special:RecentChanges&urlversion=2
  THERE ARE NO DATA FROM THE PAST 30 DAYS

SLOVENE
  URL for slovene language: https://sl.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Posebno:ZadnjeSpremembe&urlversion=2
  current - # rows: 2, start-date: 2025-04-30, end-date: 2025-05-03
  final - # rows: 28, start-date: 2024-10-05, end-date: 2025-05-03


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
25,8. april 2025,2025-05-04,slovene,sl,/w/index.php?title=Osapska_jama&curid=158247&d...,/w/index.php?title=Osapska_jama&curid=158247&a...,Osapska jama,/wiki/Osapska_jama,SloUser25,"['vizualno urejanje', 'naloga začetnika', 'nal...",2025-04-08
26,30. april 2025,2025-05-13,slovene,sl,/w/index.php?title=Orang_Asli&curid=443502&dif...,/w/index.php?title=Orang_Asli&curid=443502&act...,Orang Asli,/wiki/Orang_Asli,Ostrorogi Jelen 1,"['vizualno urejanje', 'naloga začetnika', 'nal...",2025-04-30
27,3. maj 2025,2025-05-13,slovene,sl,/w/index.php?title=Streif&curid=196343&diff=64...,/w/index.php?title=Streif&curid=196343&action=...,Streif,/wiki/Streif,Jurekrajnc,"['vrnjeno', 'vizualno urejanje', 'mobilno urej...",2025-05-03



SWEDISH
  URL for swedish language: https://sv.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Special:Senaste_ändringar&urlversion=2
  current - # rows: 39, start-date: 2025-04-18, end-date: 2025-05-13
  final - # rows: 255, start-date: 2024-09-29, end-date: 2025-05-13


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
252,13 maj 2025,2025-05-13,swedish,sv,/w/index.php?title=%C3%96vers%C3%A4ttning&curi...,/w/index.php?title=%C3%96vers%C3%A4ttning&curi...,Översättning,/wiki/%C3%96vers%C3%A4ttning,Lille Omar,"['VE', 'Nybörjaruppgift', 'Nybörjaruppgift: ko...",2025-05-13
253,13 maj 2025,2025-05-13,swedish,sv,/w/index.php?title=Set-top-box&curid=484246&di...,/w/index.php?title=Set-top-box&curid=484246&ac...,Set-top-box,/wiki/Set-top-box,Brickmule,"['VE', 'Nybörjaruppgift', 'Nybörjaruppgift: ko...",2025-05-13
254,13 maj 2025,2025-05-13,swedish,sv,/w/index.php?title=Skorpion_I&curid=8761685&di...,/w/index.php?title=Skorpion_I&curid=8761685&ac...,Skorpion I,/wiki/Skorpion_I,Hamnpus,"['VE', 'Mobilredigering', 'Redigering via mobi...",2025-05-13



UKRAINIAN
  URL for ukrainian language: https://uk.wikipedia.org/w/index.php?hidebots=1&hidecategorization=1&hideWikibase=1&tagfilter=newcomer+task+copyedit&limit=5000&days=30&title=Спеціальна:Нові_редагування&urlversion=2
  current - # rows: 287, start-date: 2025-04-13, end-date: 2025-05-13
  final - # rows: 1661, start-date: 2024-09-28, end-date: 2025-05-13


Unnamed: 0,date_edit_wiki,date_update,lang,lang_code,url_diff,url_history,title,url,author,tags,date_edit
1658,13 травня 2025,2025-05-13,ukrainian,uk,/w/index.php?title=%D0%A1%D0%BF%D1%80%D0%B0%D0...,/w/index.php?title=%D0%A1%D0%BF%D1%80%D0%B0%D0...,Справа Єврейського антифашистського комітету,/wiki/%D0%A1%D0%BF%D1%80%D0%B0%D0%B2%D0%B0_%D0...,Світогор Лелеко,"[Завдання новачку, Завдання новачку: коректура]",2025-05-13
1659,13 травня 2025,2025-05-13,ukrainian,uk,/w/index.php?title=%D0%99%D0%BE%D1%81%D0%B8%D1...,/w/index.php?title=%D0%99%D0%BE%D1%81%D0%B8%D1...,Йосиф де Вохт,/wiki/%D0%99%D0%BE%D1%81%D0%B8%D1%84_%D0%B4%D0...,Світогор Лелеко,"[Завдання новачку, Завдання новачку: коректура]",2025-05-13
1660,13 травня 2025,2025-05-13,ukrainian,uk,/w/index.php?title=%D0%A1%D1%96%D0%BB%D0%B5%D1...,/w/index.php?title=%D0%A1%D1%96%D0%BB%D0%B5%D1...,Сілець (Червоноградська міська громада),/wiki/%D0%A1%D1%96%D0%BB%D0%B5%D1%86%D1%8C_(%D...,Світогор Лелеко,"[Завдання новачку, Завдання новачку: коректура]",2025-05-13





## Pages: Statistics

In [5]:
df_list = []
for i in sorted(os.listdir(folder_main)):
    if i.split('.')[-1]=='csv' and 'pages' in i:
        df_temp = pd.read_csv(os.path.join(folder_main, i))
        print(f"{i}, # rows: {df_temp.shape[0]}")
        df_list.append(df_temp)

df_pages_all = pd.concat(df_list, axis=0).reset_index(drop=True)

df_num_pages = df_pages_all['lang'].value_counts().reset_index()\
                                   .rename(columns={'index': 'language', 'lang': '# pages'})

with open('metadata/pages.json', 'w', encoding='utf-8') as f:
    json.dump(df_num_pages.to_dict('records'), f, ensure_ascii=False, indent=4)

cs_pages.csv, # rows: 479
de_pages.csv, # rows: 1706
el_pages.csv, # rows: 155
en_pages.csv, # rows: 9842
et_pages.csv, # rows: 48
is_pages.csv, # rows: 0
it_pages.csv, # rows: 2569
lv_pages.csv, # rows: 20
sl_pages.csv, # rows: 28
sv_pages.csv, # rows: 255
uk_pages.csv, # rows: 1661


In [6]:
df_pages_all.groupby(['lang', 'date_update'], dropna=False)['lang_code'].count()[:]['ukrainian']

date_update
2024-10-28    146
2024-11-06     63
2024-11-10    110
2024-11-25     76
2024-12-16     75
2025-01-15    216
2025-02-15    307
2025-03-14    122
2025-04-12     18
2025-04-17    153
2025-05-04     88
2025-05-13    287
Name: lang_code, dtype: int64

In [7]:
print("start-date: {}, end-date: {}".format(
    df_pages_all[df_pages_all['date_edit'].notna()]['date_edit'].min(),
    df_pages_all[df_pages_all['date_edit'].notna()]['date_edit'].max()
))

df_pages_all[df_pages_all['date_edit'].isna()]['lang'].value_counts()
# df_pages_all['date_edit'].value_counts(dropna=False).sort_index()[:]

start-date: 2024-09-28, end-date: 2025-05-13


latvian    20
Name: lang, dtype: int64

## Data: Corrections Extraction

In [45]:
# create empty csv files for corrections if it doesn't exist; 
# for first run
columns_corrections = ['delitions', 'n_del', 'text_del', 'text_del_tag', 'insertions', 
                       'n_ins', 'text_ins', 'text_ins_tag', 'diff_url', 'diff_page']
for key, value in config.items():
    if not os.path.isfile(value['path_save_corections']):
        df_empty = pd.DataFrame(list(), columns=columns_corrections)
        df_empty.to_csv(value['path_save_corections'], index=False)

In [17]:
# test API
session = requests.Session()
api_url = "https://uk.wikipedia.org/w/api.php"

PARAMS = {
    'action': "compare",
    'format': "json",
    'prop': "diff|diffsize|user|size|comment",
    'fromrev': 43736886, #42726929,
    'torev': 41719237, #43671592
}

response = session.get(url=api_url, params=PARAMS).json()
# print(response.keys())
display(pd.json_normalize(response['compare']))
HTML(response['compare']['*'])

Unnamed: 0,fromsize,fromuser,fromuserid,fromcomment,fromparsedcomment,tosize,touser,touserid,tocomment,toparsedcomment,diffsize,*
0,12386,MinecAnton209,774975,"Виправлені орфографчні помилки, перефразування...","Виправлені орфографчні помилки, перефразування...",12145,Бучач-Львів,164714,,,32123,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Ря..."


In [18]:
def _add_tag(text_obj, cnt_input, tag_find, tag_wrap):
    """
    """
    text_temp = text_obj
    cnt = 0
    while cnt != cnt_input:
        text_init = text_temp.find_next(tag_find, {"class": "diffchange diffchange-inline"})
        text_temp.insert_before(tag_wrap)
        text_temp.insert_after(tag_wrap)
        text_temp = text_init
        cnt += 1
    return text_obj

def get_corrections(content, init_dict):
    """
    """
    content_obj = BeautifulSoup(content)

    results_list = []
    for row in content_obj.find_all('tr'):
        results_dict = {}
        before_obj = row.find("td", {"class": "diff-deletedline"})
        after_obj = row.find("td", {"class": "diff-addedline"})
#         print('b', before_obj)
#         print('a', after_obj)
    
        if isinstance(before_obj, Tag):
            text_deletions = before_obj.get_text()
            deletions_obj = before_obj.find_all("del", {"class": "diffchange diffchange-inline"})
            deletions = [i.contents for i in deletions_obj]
            n_deletions = len(deletions)
            tb = _add_tag(before_obj, cnt_input=n_deletions+1, tag_find="del", tag_wrap="{:-:}").get_text()
            status_b = True
        else:
            status_b = False
            
        if isinstance(after_obj, Tag):
            text_insertions = after_obj.get_text()
            insertions_obj = after_obj.find_all("ins", {"class": "diffchange diffchange-inline"})
            insertions = [i.contents for i in insertions_obj]
            n_insertions = len(insertions)
            ta = _add_tag(after_obj, cnt_input=n_insertions+1, tag_find="ins", tag_wrap="{:+:}").get_text()
            status_a = True
        else:
            status_a = False
        
        if status_b and status_a:
            results_dict['delitions'] = deletions
            results_dict['n_del'] = n_deletions
            results_dict['text_del'] = text_deletions
            results_dict['text_del_tag'] = tb
            results_dict['insertions'] = insertions
            results_dict['n_ins'] = n_insertions
            results_dict['text_ins'] = text_insertions
            results_dict['text_ins_tag'] = ta

            results_list.append({**results_dict, **init_dict})

    return results_list

In [19]:
pd.DataFrame(get_corrections(response['compare']['*'], {'date': 'x'}))

Unnamed: 0,delitions,n_del,text_del,text_del_tag,insertions,n_ins,text_ins,text_ins_tag,date
0,[[ ]],1,'''Єрген Юве'''&nbsp;({{lang-no|Jørgen Juve}};...,'''Єрген Юве'''&nbsp;({{lang-no|Jørgen Juve}};...,[[&nbsp;]],1,'''Єрген Юве'''&nbsp;({{lang-no|Jørgen Juve}};...,'''Єрген Юве'''&nbsp;({{lang-no|Jørgen Juve}};...,x
1,"[[років], [виступав], [зігравши], [забивши]]",4,Почав грати в футбол за команду «Уредд» із Пор...,Почав грати в футбол за команду «Уредд» із Пор...,"[[роув], [виступив], [зіграв], [забив]]",4,Почав грати в футбол за команду «Уредд» із Пор...,Почав грати в футбол за команду «Уредд» із Пор...,x
2,"[[ ], [та]]",2,З 1928 по 1937 роки Йерген Юве грав і за збірн...,З 1928 по 1937 роки Йерген Юве грав і за збірн...,[],0,З 1928 по 1937 роки Йерген Юве грав і за збірн...,З 1928 по 1937 роки Йерген Юве грав і за збірн...,x
3,"[[[[Літні Олімпійські ігри 1936|], []]], [[[Бе...",8,Юве був капітаном норвезької команди і на [[Лі...,Юве був капітаном норвезької команди і на {:-:...,[],0,Юве був капітаном норвезької команди і на Олім...,Юве був капітаном норвезької команди і на Олім...,x
4,"[[Вінгео], [читав]]",2,Футбольні експерти-сучасники називали Юве одни...,Футбольні експерти-сучасники називали Юве одни...,"[[вінгео], [читал]]",2,Футбольні експерти-сучасники називали Юве одни...,Футбольні експерти-сучасники називали Юве одни...,x
5,"[[[[Базель|], []]], [ З], [та], [ рік редактор...",9,Юве вивчився на юриста в [[Базель|Базелі]] в 1...,Юве вивчився на юриста в {:-:}[[Базель|{:-:}Ба...,[],0,"Юве вивчився на юриста в Базелі в 1931 році, п...","Юве вивчився на юриста в Базелі в 1931 році, п...",x
6,[[декілька]],1,Йорґен Юве написав декілька книг на спортивну ...,Йорґен Юве написав {:-:}декілька{:-:} книг на ...,[[декількп]],1,Йорґен Юве написав декількп книг на спортивну ...,Йорґен Юве написав {:+:}декількп{:+:} книг на ...,x
7,"[[ ], [вела]]",2,"Батько&nbsp;— Уле Мартін Юве, дубильщик по про...","Батько&nbsp;— Уле Мартін Юве, дубильщик по про...",[],0,"Батько&nbsp;— Уле Мартін Юве, дубильщик по про...","Батько&nbsp;— Уле Мартін Юве, дубильщик по про...",x
8,[[ ]],1,"Йорген&nbsp;— старший із шести дітей в сім'ї, ...","Йорген&nbsp;— старший із шести дітей в сім'ї, ...",[],0,"Йорген&nbsp;— старший із шести дітей в сім'ї, ...","Йорген&nbsp;— старший із шести дітей в сім'ї, ...",x
9,"[[[[Друга світова війна|], []]]]",2,Під час [[Друга світова війна|Другої сівтової ...,Під час {:-:}[[Друга світова війна|{:-:}Другої...,[],0,Під час Другої сівтової війни служив в британс...,Під час Другої сівтової війни служив в британс...,x


In [20]:
pd.DataFrame(get_corrections(response['compare']['*'], {'date': 'x'})).iloc[1].values

array([list([['років'], ['виступав'], ['зігравши'], ['забивши']]), 4,
       'Почав грати в футбол за команду «Уредд» із Порсгрунна у віці 16 років. В 1926 років переїхав в [[Осло]], де став гравцем «Люни». В 1928 році команда Юве пройшла у фінал Кубка Норвегії, але під час вирішальної гри програла «Ерн-Хортену». В сезоні 1930/1931 Юве виступав за кордоном, зігравши 12 матчів за швейцарський «Базель» та забивши за нього 10 голів.',
       'Почав грати в футбол за команду «Уредд» із Порсгрунна у віці 16 років. В 1926 {:-:}років{:-:} переїхав в [[Осло]], де став гравцем «Люни». В 1928 році команда Юве пройшла у фінал Кубка Норвегії, але під час вирішальної гри програла «Ерн-Хортену». В сезоні 1930/1931 Юве {:-:}виступав{:-:} за кордоном, {:-:}зігравши{:-:} 12 матчів за швейцарський «Базель» та {:-:}забивши{:-:} за нього 10 голів.',
       list([['роув'], ['виступив'], ['зіграв'], ['забив']]), 4,
       'Почав грати в футбол за команду «Уредд» із Порсгрунна у віці 16 років. В 1926 роув пе

In [21]:
def parse_column_url(df):
    """
    """
    df_ = df.copy()
    df_ = df_[df_['url_diff'].notna()].copy().reset_index(drop=True)
    df_['id_cur'] = df_['url_diff'].str.split('&').apply(lambda x: [i for i in x if 'curid' in i][0]).str.split('=').str[1]
    df_['id_old'] = df_['url_diff'].str.split('&').apply(lambda x: [i for i in x if 'oldid' in i][0]).str.split('=').str[1]
    df_['id_diff'] = df_['url_diff'].str.split('&').apply(lambda x: [i for i in x if 'diff' in i][0]).str.split('=').str[1]
    return df_

def get_corrections_all(df):
    """
    """
    lang_code = df.iloc[0]['lang_code']
    session = requests.Session()
    api_url = f"https://{lang_code}.wikipedia.org/w/api.php"

    results_list = []
    for i, row in df.iterrows():
        PARAMS = {
            'action': "compare",
            'format': "json",
            'prop': "diff|diffsize|user|size|comment",
            'fromrev': row['id_old'],
            'torev': row['id_diff']
        }

        response = session.get(url=api_url, params=PARAMS).json()
        
        fromsize = int(response['compare'].get('fromsize', 0)) if 'compare' in response.keys() else 0
        tosize = int(response['compare'].get('tosize', 0)) if 'compare' in response.keys() else 0
        
        init_dict = {
            'diff_url': f"https://{row['lang_code']}.wikipedia.org" + row['url_diff'], 
            'diff_page': fromsize - tosize
        }

        response_data = get_corrections(response['compare']['*'], init_dict) if 'compare' in response.keys() else []
        results_list.extend(response_data)

    return pd.DataFrame(results_list)

In [23]:
%%time

# for one language
language = 'estonian'
setup = config[language]

# old
print("OLD:")
df_correction_old = pd.read_csv(setup['path_save_corections'])
display(df_correction_old.tail(3))

# current
df_pages_cur = pd.read_csv(setup['path_save_pages_current'])
df_pages_cur = parse_column_url(df_pages_cur)
df_correction_cur = get_corrections_all(df_pages_cur)
print("NEW:")
display(df_correction_cur.tail(3))

# results
df_correction = pd.concat([df_correction_old, df_correction_cur], axis=0)\
                  .drop_duplicates(['text_del', 'text_ins', 'diff_url'], keep='first')\
                  .reset_index(drop=True)
print("RESULT:")
df_correction.tail(3)

OLD:


Unnamed: 0,delitions,n_del,text_del,text_del_tag,insertions,n_ins,text_ins,text_ins_tag,diff_url,diff_page
123,"[[','], ['[[Operatsioon'], ['Kõrbetorm]]'], ['...",6,"15. jaanuaril 1995, [[Operatsioon Kõrbetorm]] ...","15. jaanuaril 1995{:-:},{:-:} {:-:}[[Operatsio...","[['tõsteti'], ['operatsioon'], ['Kõrbetormi kä...",3,15. jaanuaril 1995 tõsteti operatsioon Kõrbeto...,15. jaanuaril 1995 {:+:}tõsteti{:+:} {:+:}oper...,https://et.wikipedia.org/w/index.php?title=DEF...,7
124,[],0,Fotograaf Helsingi Ülikooli Aasia ja Aafrika k...,Fotograaf Helsingi Ülikooli Aasia ja Aafrika k...,"[['* '], ['[[Helsingi Ülikool|'], [']]'], [' k...",4,* Fotograaf [[Helsingi Ülikool|Helsingi Ülikoo...,{:+:}* {:+:}Fotograaf {:+:}[[Helsingi Ülikool|...,https://et.wikipedia.org/w/index.php?title=Ros...,-148
125,"[['Õppis'], ['õigusteadust'], ['alates 1267']]",3,Õppis õigusteadust [[Pariisi Ülikool]]is 10 aa...,{:-:}Õppis{:-:} {:-:}õigusteadust{:-:} [[Parii...,"[['1267.'], ['aastal astus Erwan'], [' ning õp...",4,1267. aastal astus Erwan [[Pariisi Ülikool]]is...,{:+:}1267.{:+:} {:+:}aastal astus Erwan{:+:} [...,https://et.wikipedia.org/w/index.php?title=Erw...,-85


NEW:


Unnamed: 0,delitions,n_del,text_del,text_del_tag,insertions,n_ins,text_ins,text_ins_tag,diff_url,diff_page
9,[[ sajandi kavalerid]],1,"Nagu paljud tema kaasaegsed, keskendus ta ajal...","Nagu paljud tema kaasaegsed, keskendus ta ajal...",[],0,"Nagu paljud tema kaasaegsed, keskendus ta ajal...","Nagu paljud tema kaasaegsed, keskendus ta ajal...",https://et.wikipedia.org/w/index.php?title=Lud...,18
10,[[ameerika]],1,'''Sean Scully''' (sündinud [[30. juuni]]l [[1...,'''Sean Scully''' (sündinud [[30. juuni]]l [[1...,[[Ameerika]],1,'''Sean Scully''' (sündinud [[30. juuni]]l [[1...,'''Sean Scully''' (sündinud [[30. juuni]]l [[1...,https://et.wikipedia.org/w/index.php?title=Sea...,0
11,[[maismaa]],1,'''RBS 15''' (Robotsystem 15) on [[lase-ja-unu...,'''RBS 15''' (Robotsystem 15) on [[lase-ja-unu...,[[maismaal asuvaid]],1,'''RBS 15''' (Robotsystem 15) on [[lase-ja-unu...,'''RBS 15''' (Robotsystem 15) on [[lase-ja-unu...,https://et.wikipedia.org/w/index.php?title=RBS...,-9


RESULT:
CPU times: user 266 ms, sys: 22.8 ms, total: 288 ms
Wall time: 1.99 s


Unnamed: 0,delitions,n_del,text_del,text_del_tag,insertions,n_ins,text_ins,text_ins_tag,diff_url,diff_page
135,[[ sajandi kavalerid]],1,"Nagu paljud tema kaasaegsed, keskendus ta ajal...","Nagu paljud tema kaasaegsed, keskendus ta ajal...",[],0,"Nagu paljud tema kaasaegsed, keskendus ta ajal...","Nagu paljud tema kaasaegsed, keskendus ta ajal...",https://et.wikipedia.org/w/index.php?title=Lud...,18
136,[[ameerika]],1,'''Sean Scully''' (sündinud [[30. juuni]]l [[1...,'''Sean Scully''' (sündinud [[30. juuni]]l [[1...,[[Ameerika]],1,'''Sean Scully''' (sündinud [[30. juuni]]l [[1...,'''Sean Scully''' (sündinud [[30. juuni]]l [[1...,https://et.wikipedia.org/w/index.php?title=Sea...,0
137,[[maismaa]],1,'''RBS 15''' (Robotsystem 15) on [[lase-ja-unu...,'''RBS 15''' (Robotsystem 15) on [[lase-ja-unu...,[[maismaal asuvaid]],1,'''RBS 15''' (Robotsystem 15) on [[lase-ja-unu...,'''RBS 15''' (Robotsystem 15) on [[lase-ja-unu...,https://et.wikipedia.org/w/index.php?title=RBS...,-9


In [26]:
%%time
# for all languages. It processes only diff.
for language in list(config.keys())[:]:
    print(language.upper())
    setup = config[language]
    
    df_correction_old = pd.read_csv(setup['path_save_corections'])
    num_corrections_old = df_correction_old.shape[0]
    print('  Corrections Old: {}'.format(num_corrections_old))
 
    if os.path.isfile(setup['path_save_pages_current']):
        df_pages_cur = pd.read_csv(setup['path_save_pages_current'])
        df_pages_cur = parse_column_url(df_pages_cur)
        df_correction_cur = get_corrections_all(df_pages_cur)
        num_corrections_cur = df_correction_cur.shape[0]
        print('  Corrections Current: {}'.format(num_corrections_cur))

        df_correction = pd.concat([df_correction_old, df_correction_cur], axis=0)\
                          .drop_duplicates(['text_del', 'text_ins', 'diff_url'], keep='first')\
                          .reset_index(drop=True)

        num_corrections_final = df_correction.shape[0]
        print('  Corrections Final: {}'.format(num_corrections_final))
        print('  {} corrections were added.'.format(num_corrections_final-num_corrections_old))
        # update corrections files
        if num_corrections_cur>0:
            df_correction.to_csv(setup['path_save_corections'], index=False)
            print('Saved!')
    else:
        print('  Corrections Current: {}'.format(0))
        print('  Corrections Final: {}'.format(0))
        print('  {} corrections were added.'.format(0))
        print()

ENGLISH
  Corrections Old: 12465
  Corrections Current: 0
  Corrections Final: 0
  0 corrections were added.

CZECH
  Corrections Old: 1114
  Corrections Current: 112
  Corrections Final: 1189
  75 corrections were added.

ESTONIAN
  Corrections Old: 126
  Corrections Current: 12
  Corrections Final: 138
  12 corrections were added.

GERMAN
  Corrections Old: 4672
  Corrections Current: 0
  Corrections Final: 0
  0 corrections were added.

GREEK
  Corrections Old: 492
  Corrections Current: 77
  Corrections Final: 561
  69 corrections were added.

ICELANDIC
  Corrections Old: 0
  Corrections Current: 0
  Corrections Final: 0
  0 corrections were added.

ITALIAN
  Corrections Old: 6024
  Corrections Current: 524
  Corrections Final: 6471
  447 corrections were added.

LATVIAN
  Corrections Old: 75
  Corrections Current: 0
  Corrections Final: 0
  0 corrections were added.

SLOVENE
  Corrections Old: 108
  Corrections Current: 13
  Corrections Final: 121
  13 corrections were added.

SWE

In [41]:
# %%time
# # run corrections for all languages from scratch. It processes a full [lang]_pages.csv file.
# for language in list(config.keys())[:]:
#     print(language.upper())
#     setup = config[language]

#     df_pages_temp = pd.read_csv(setup['path_save_pages'])
#     df_pages_temp = parse_column_url(df_pages_temp)
#     print('  Input: {}'.format(df_pages_temp.shape[0]))

#     if not df_pages_temp.empty:
#         df_correction_temp = get_corrections_all(df_pages_temp)
#         print('  Output: {}'.format(df_correction_temp.shape[0]))
#         df_correction_temp.to_csv(setup['path_save_corections'], index=False)

## Corrections: Statistics

In [8]:
df_list = []
for i in sorted(os.listdir(folder_main)):
    if i.split('.')[-1]=='csv' and 'correction' in i:
        df_temp = pd.read_csv(os.path.join(folder_main, i))
        df_temp['code_lang'] = i.split('_')[0]
        print(f"{i}, # rows: {df_temp.shape[0]}")
        df_list.append(df_temp)

df_correction_all = pd.concat(df_list, axis=0).reset_index(drop=True)

df_num_corrections = df_correction_all['code_lang'].value_counts().reset_index()\
                        .rename(columns={'index': 'language_code', 'code_lang': '# edits'})

df_num_corrections['language'] = df_num_corrections['language_code'].replace(language_dict)

with open('metadata/corrections.json', 'w', encoding='utf-8') as f:
    json.dump(df_num_corrections.to_dict('records'), f, ensure_ascii=False, indent=4)

cs_corrections.csv, # rows: 1193
de_corrections.csv, # rows: 4672
el_corrections.csv, # rows: 562
en_corrections.csv, # rows: 29454
et_corrections.csv, # rows: 138
is_corrections.csv, # rows: 0
it_corrections.csv, # rows: 6488
lv_corrections.csv, # rows: 75
sl_corrections.csv, # rows: 121
sv_corrections.csv, # rows: 654
uk_corrections.csv, # rows: 6165
