In [1]:
import pandas as pd
import csv

<b>Aim</b>: This notebook preprocesses the data to obtain the main metrics of Wikipedia article pages by WikiProject quality grade.

# 1. Data preprocessing

There are a total of 53,710,529 Wikipedia pages, but only 6,328,134 of them are articles.

In [2]:
# Load the complete page table. Quote handling is disabled (QUOTE_NONE), so any
# quote characters in the fields are read as literal text.
df_pages = pd.read_csv(
    'data/page.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
)
df_pages.shape

(53710529, 15)

In [3]:
# Load the per-article metrics table.
df_pages_m = pd.read_csv(
    'results/page_metrics.tsv',
    sep='\t',
)
df_pages_m.shape

(6328134, 16)

In [5]:
# Overall mean of every numeric metric, rounded to two decimals.
# Grouping on a constant key puts all rows into a single group. numeric_only=True
# explicitly skips text columns such as `title`; recent pandas versions raise a
# TypeError instead of silently dropping them.
df_pages_m.groupby(lambda x: True).mean(numeric_only=True).round(2).reset_index()

Unnamed: 0,index,page_id,creation,age,views,page_edits,editors,len,references,talks,talkers,links,linked,urls,ref_urls,ref_pubs
0,True,29591817.2,2010.9,9.59,3345.07,101.92,48.38,7844.68,4.6,9.19,5.38,87.77,80.53,10.33,3.79,0.59


Categories are imported, and WikiProject grades are identified through them.

In [4]:
# Load the category table.
df_ca = pd.read_csv(
    'data/category.tsv',
    sep='\t',
)
df_ca.shape

(2179622, 5)

All grades are identified.

In [None]:
def select_grade(categories, grade):
    """Return the assessment categories of one quality grade, tagged with it.

    Parameters
    ----------
    categories : pd.DataFrame
        Category table with a `title` column.
    grade : str
        Grade prefix as it appears in the category title (e.g. 'FA', 'Stub').

    Returns
    -------
    pd.DataFrame
        New frame holding the rows whose title matches
        `<grade>-Class_..._articles`, with an added `grade` column.
    """
    pattern = f'^{grade}-Class_.*_articles$'
    mask = categories['title'].str.contains(pattern, regex=True, na=False)
    # `.assign` builds a new frame, so no value is ever set on a filtered
    # selection of `categories` (which would raise SettingWithCopyWarning).
    return categories[mask].assign(grade=grade)


df_fa = select_grade(df_ca, 'FA')
df_fl = select_grade(df_ca, 'FL')
df_ga = select_grade(df_ca, 'GA')
df_aa = select_grade(df_ca, 'A')
df_ba = select_grade(df_ca, 'B')
df_cca = select_grade(df_ca, 'C')
df_sa = select_grade(df_ca, 'Start')
df_sta = select_grade(df_ca, 'Stub')
df_la = select_grade(df_ca, 'List')

# All graded assessment categories in one table, in the same order as before.
df_as = pd.concat([df_fa, df_fl, df_ga, df_aa, df_ba, df_cca, df_sa, df_sta, df_la])

The pages associated with each category are used to link WikiProjects to talk pages.

In [6]:
# Load the page-to-category links table.
df_cl = pd.read_csv(
    'data/page_category.tsv',
    sep='\t',
)
df_cl.shape

(165501704, 3)

Category links are filtered to only talk pages.

In [7]:
# Ids of the pages in namespace 1 (the talk pages).
talk_page_ids = df_pages.loc[df_pages['namespace'] == 1, 'page_id']

# Keep only the category links that belong to one of those pages.
is_talk_link = df_cl['page_id'].isin(talk_page_ids)
df_cl = df_cl[is_talk_link]
df_cl

Unnamed: 0,page_id,category_id,type
88,19887,2865710,page
125,30675,2865710,page
150,38987,2865710,page
231,104475,2865710,page
261,156815,2865710,page
...,...,...,...
165501562,68090768,231654172,page
165501570,68091549,168870449,page
165501574,68091652,334736,page
165501680,68102180,308387,page


There are a total of 5,635,313 talk pages with at least one assessment category (89.05% of the 6,328,134 Wikipedia articles). This number may shrink in later steps because of data inconsistencies or because some talk pages are linked to articles that are redirects.

In [8]:
# Links that point to a graded assessment category, reduced to distinct page ids.
in_assessment_category = df_cl['category_id'].isin(df_as['category_id'])
df_cl.loc[in_assessment_category, ['page_id']].drop_duplicates()

Unnamed: 0,page_id
14060850,128
14060851,1817
14060852,2214
14060853,3272
14060854,5417
...,...
164306402,66098958
164625233,51887553
164911541,67514625
165426867,65654138


The quality grade is added to the category-links DataFrame, which is thereby reduced to only the pages with an assessment category.

In [9]:
# Attach the category title and grade to each link; the inner join drops every
# link whose category is not a graded assessment category.
grade_lookup = df_as[['category_id', 'title', 'grade']]
df_cl = pd.merge(df_cl, grade_lookup, how='inner', on='category_id')
df_cl

Unnamed: 0,page_id,category_id,type,title,grade
0,128,504542,page,B-Class_Libertarianism_articles,B
1,1817,504542,page,B-Class_Libertarianism_articles,B
2,2214,504542,page,B-Class_Libertarianism_articles,B
3,3272,504542,page,B-Class_Libertarianism_articles,B
4,5417,504542,page,B-Class_Libertarianism_articles,B
...,...,...,...,...,...
15798518,67386334,38245,page,A-Class_Religion_articles,A
15798519,67416310,305261,page,Stub-Class_Unknown-importance_Pornography_arti...,Stub
15798520,67615296,247609774,page,Stub-Class_Dua_Lipa_articles,Stub
15798521,67970097,246710894,page,List-Class_African_diaspora_visual_arts_articles,List


Most Wikipedia pages are classified as Start and Stub.

In [10]:
# Number of distinct talk pages per grade (a page counts once for each grade it holds).
(
    df_cl[['page_id', 'grade']]
    .drop_duplicates()
    .groupby('grade')
    .size()
    .reset_index(name='talks')
)

Unnamed: 0,grade,talks
0,A,959
1,B,109893
2,C,396525
3,FA,5945
4,FL,3818
5,GA,34008
6,List,265946
7,Start,1841113
8,Stub,3154753


To select articles with grades it is necessary to first identify the talk pages.

In [11]:
# Restrict the page table to namespace 1 (talk pages).
talk_pages = df_pages[df_pages['namespace'] == 1]

# Join with the graded links. Both tables carry a `title` column, so the merge
# produces `title_x` (page title) and `title_y` (category title).
graded_links = df_cl[['page_id', 'title', 'grade']]
talk_pages_graded = talk_pages.merge(graded_links, how='inner', on='page_id')

df_pages = talk_pages_graded[['page_id', 'title_x', 'title_y', 'grade']].drop_duplicates()
df_pages

Unnamed: 0,page_id,title_x,title_y,grade
0,128,Atlas_Shrugged,B-Class_Libertarianism_articles,B
1,128,Atlas_Shrugged,B-Class_Objectivism_articles,B
2,128,Atlas_Shrugged,B-Class_Philosophy_articles,B
3,128,Atlas_Shrugged,B-Class_Women_writers_articles,B
4,128,Atlas_Shrugged,B-Class_novel_articles,B
...,...,...,...,...
15798518,68103035,Ross_McCann,Stub-Class_rugby_union_articles,Stub
15798519,68103039,Holly_Aitchison_(rugby_union),Stub-Class_England-related_articles,Stub
15798520,68103039,Holly_Aitchison_(rugby_union),Stub-Class_rugby_union_articles,Stub
15798521,68103043,Abi_Burton,Stub-Class_England-related_articles,Stub


Wikipedia article grade is identified (some may have more than one).

In [12]:
# Distinct (talk page title, grade) pairs.
title_grades = df_pages[['title_x', 'grade']].drop_duplicates()

# Left-join on title so articles without a grade are kept; then drop the
# helper join column and any duplicated rows.
df_pages_m = (
    df_pages_m
    .merge(title_grades, how='left', left_on='title', right_on='title_x')
    .drop(columns=['title_x'])
    .drop_duplicates()
)

In [15]:
# Display the merged metrics table to inspect the new `grade` column
# (missing where no graded talk page matched the article title).
df_pages_m

Unnamed: 0,page_id,title,creation,age,views,page_edits,editors,len,references,talks,talkers,links,linked,urls,ref_urls,ref_pubs,grade
0,12,Anarchism,2001,19,237226,19819,3773,96584,92,18720,925,1541,4130,69,43,65,GA
1,25,Autism,2001,19,469365,10563,3731,133536,226,5524,883,602,2454,175,70,185,FA
2,39,Albedo,2001,19,82923,1225,686,45483,37,136,72,245,1428,82,26,22,B
3,290,A,2001,20,449105,4778,2597,28174,30,618,391,275,823,21,15,9,C
4,303,Alabama,2001,19,324587,9997,4381,197906,207,464,188,1598,16227,273,177,26,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6504425,68103340,Karen_Doell,2021,0,0,1,1,1223,0,1,1,0,1,0,0,0,
6504426,68103349,John_W._Fewell,2021,0,0,15,4,521,0,1,1,0,0,0,0,0,
6504427,68103359,Carrie_Flemmer,2021,0,0,1,1,1300,0,1,1,0,1,0,0,0,
6504428,68103365,Dapp_Browsers,2021,0,0,3,2,2682,0,0,0,0,0,0,0,0,


In cases where there is no grade, `Rest` is used.

In [16]:
# Articles without any grade are labelled 'Rest'.
df_pages_m['grade'] = df_pages_m['grade'].fillna('Rest')

# 2. Stats

In [17]:
# Articles that received no grade.
is_rest = df_pages_m['grade'].eq('Rest')
df_pages_m.loc[is_rest]

Unnamed: 0,page_id,title,creation,age,views,page_edits,editors,len,references,talks,talkers,links,linked,urls,ref_urls,ref_pubs,grade
9,309,An_American_in_Paris,2001,20,10379,346,216,22835,23,85,38,199,230,37,23,4,Rest
28,579,Alien,2001,19,48328,4513,2480,5860,0,204,139,72,21,0,0,0,Rest
31,590,Austin_(disambiguation),2001,19,1811,759,462,2327,0,20,12,46,13,0,0,0,Rest
50,630,Ada,2001,19,5841,524,320,3869,0,11,9,59,4,0,0,0,Rest
51,632,Aberdeen_(disambiguation),2001,19,2249,402,179,6925,0,6,5,102,66,0,0,0,Rest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6504425,68103340,Karen_Doell,2021,0,0,1,1,1223,0,1,1,0,1,0,0,0,Rest
6504426,68103349,John_W._Fewell,2021,0,0,15,4,521,0,1,1,0,0,0,0,0,Rest
6504427,68103359,Carrie_Flemmer,2021,0,0,1,1,1300,0,1,1,0,1,0,0,0,Rest
6504428,68103365,Dapp_Browsers,2021,0,0,3,2,2682,0,0,0,0,0,0,0,0,Rest


In [21]:
# Mean of every numeric metric per grade, rounded to two decimals.
# numeric_only=True explicitly skips text columns such as `title`; recent pandas
# versions raise a TypeError instead of silently dropping them.
df_pages_m.groupby('grade').mean(numeric_only=True).round(2).reset_index()

Unnamed: 0,grade,page_id,creation,age,views,page_edits,editors,len,references,talks,talkers,links,linked,urls,ref_urls,ref_pubs
0,A,17027737.88,2007.74,12.74,16011.9,564.91,176.8,43329.86,31.76,92.21,27.9,236.56,202.01,33.32,18.54,8.51
1,B,18723808.9,2008.03,12.47,30359.32,705.41,297.62,35009.98,26.51,88.35,28.16,233.87,417.0,40.31,20.53,4.77
2,C,24478274.55,2009.57,10.92,15829.78,369.89,165.36,21676.02,15.4,35.32,15.03,164.23,234.08,25.95,12.48,2.37
3,FA,12134479.45,2006.16,14.33,64801.57,1491.35,516.93,61248.04,53.95,258.4,66.17,329.68,725.25,58.03,32.75,14.27
4,FL,22081283.09,2008.99,11.52,26685.54,593.61,179.13,51549.21,55.49,42.36,16.62,270.16,175.84,67.32,49.33,2.34
5,GA,20132601.0,2008.44,12.06,29229.07,724.13,275.71,39444.47,38.87,88.56,29.64,224.88,330.18,46.1,30.23,5.83
6,List,31677343.81,2011.4,9.13,3777.21,159.8,56.27,18202.39,9.2,9.07,4.98,174.78,107.34,22.82,7.87,0.53
7,Rest,33835613.67,2011.91,8.53,1600.68,43.78,22.22,4639.8,1.9,2.83,1.97,44.52,27.35,4.75,1.62,0.2
8,Start,25960561.0,2010.04,10.45,4094.29,129.52,63.13,10033.48,5.79,9.69,6.56,101.28,93.03,12.9,4.88,0.69
9,Stub,31138668.01,2011.29,9.2,710.17,40.23,22.85,3748.17,1.84,4.32,3.64,69.9,55.7,6.09,1.51,0.22


In [25]:
# Number of article rows per grade.
(
    df_pages_m
    .groupby('grade')
    .size()
    .reset_index(name='articles')
)

Unnamed: 0,grade,articles
0,A,958
1,B,109019
2,C,394065
3,FA,5945
4,FL,3816
5,GA,34004
6,List,253066
7,Rest,805423
8,Start,1818356
9,Stub,3079778
