In [1]:
import pandas as pd
import numpy as np
import csv

<b>Aim</b>: This script performs a data preprocessing to obtain the main metrics of Wikipedia article pages (e.g. number of views, editors...).

# 1. Data preprocessing

## 1.1. Pages

There are a total of 53,710,529 Wikipedia pages.

In [2]:
df_pages = pd.read_csv('data/page.tsv', sep='\t', quoting=csv.QUOTE_NONE)
df_pages

Unnamed: 0,page_id,namespace,title,is_redirect,is_new,touched,links_updated,latest,len,content_model,page_edits,creation,editors,views,references
0,10,0,AccessibleComputing,1,0,20210607122734,2.021061e+13,1002250816,111,wikitext,14.0,2001-01-21,13.0,186.0,
1,12,0,Anarchism,0,0,20210701093040,2.021070e+13,1030472204,96584,wikitext,19819.0,2001-10-11,3773.0,237226.0,92.0
2,13,0,AfghanistanHistory,1,0,20210629133822,2.021061e+13,783865149,90,wikitext,6.0,2001-04-05,5.0,47.0,
3,14,0,AfghanistanGeography,1,0,20210607122734,2.021061e+13,783865160,92,wikitext,7.0,2001-01-21,7.0,23.0,
4,15,0,AfghanistanPeople,1,0,20210629123442,2.021061e+13,783865293,95,wikitext,8.0,2001-01-21,7.0,16.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53710524,68103374,11,World_championships_in_2023,0,1,20210701094743,2.021070e+13,1031387361,22,wikitext,1.0,2021-07-01,1.0,,
53710525,68103375,3,2C0F:F5F0:4290:3F0:74A4:790D:12EC:9B01,0,1,20210701094755,2.021070e+13,1031387381,799,wikitext,1.0,2021-07-01,1.0,,
53710526,68103376,2,Asif_Khan_Tarand,0,1,20210701094801,2.021070e+13,1031387393,11,wikitext,3.0,2021-07-01,1.0,,
53710527,68103377,118,Juanita_Head_Walton,1,1,20210701094810,2.021070e+13,1031387407,80,wikitext,1.0,2021-07-01,1.0,,


<div class="alert alert-block alert-warning"><b>Warning: </b> There are a few pages without creation date.</div> 

Instead of the creation date, the number of years since the creation of a Wikipedia is calculated.

In [3]:
df_pages.loc[~df_pages['creation'].isna(),'age'] = (np.floor((pd.to_datetime('2021-07-01', format='%Y-%m-%d') - 
             pd.to_datetime(df_pages.loc[~df_pages['creation'].isna(),'creation'], format='%Y-%m-%d')).dt.days / 365.25)).astype(int)
df_pages

Unnamed: 0,page_id,namespace,title,is_redirect,is_new,touched,links_updated,latest,len,content_model,page_edits,creation,editors,views,references,age
0,10,0,AccessibleComputing,1,0,20210607122734,2.021061e+13,1002250816,111,wikitext,14.0,2001-01-21,13.0,186.0,,20.0
1,12,0,Anarchism,0,0,20210701093040,2.021070e+13,1030472204,96584,wikitext,19819.0,2001-10-11,3773.0,237226.0,92.0,19.0
2,13,0,AfghanistanHistory,1,0,20210629133822,2.021061e+13,783865149,90,wikitext,6.0,2001-04-05,5.0,47.0,,20.0
3,14,0,AfghanistanGeography,1,0,20210607122734,2.021061e+13,783865160,92,wikitext,7.0,2001-01-21,7.0,23.0,,20.0
4,15,0,AfghanistanPeople,1,0,20210629123442,2.021061e+13,783865293,95,wikitext,8.0,2001-01-21,7.0,16.0,,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53710524,68103374,11,World_championships_in_2023,0,1,20210701094743,2.021070e+13,1031387361,22,wikitext,1.0,2021-07-01,1.0,,,0.0
53710525,68103375,3,2C0F:F5F0:4290:3F0:74A4:790D:12EC:9B01,0,1,20210701094755,2.021070e+13,1031387381,799,wikitext,1.0,2021-07-01,1.0,,,0.0
53710526,68103376,2,Asif_Khan_Tarand,0,1,20210701094801,2.021070e+13,1031387393,11,wikitext,3.0,2021-07-01,1.0,,,0.0
53710527,68103377,118,Juanita_Head_Walton,1,1,20210701094810,2.021070e+13,1031387407,80,wikitext,1.0,2021-07-01,1.0,,,0.0


The creation date is also reduced to the creation year.

In [4]:
df_pages['creation'] = pd.to_numeric(df_pages['creation'].str.slice(start=0, stop=4))
df_pages

Unnamed: 0,page_id,namespace,title,is_redirect,is_new,touched,links_updated,latest,len,content_model,page_edits,creation,editors,views,references,age
0,10,0,AccessibleComputing,1,0,20210607122734,2.021061e+13,1002250816,111,wikitext,14.0,2001.0,13.0,186.0,,20.0
1,12,0,Anarchism,0,0,20210701093040,2.021070e+13,1030472204,96584,wikitext,19819.0,2001.0,3773.0,237226.0,92.0,19.0
2,13,0,AfghanistanHistory,1,0,20210629133822,2.021061e+13,783865149,90,wikitext,6.0,2001.0,5.0,47.0,,20.0
3,14,0,AfghanistanGeography,1,0,20210607122734,2.021061e+13,783865160,92,wikitext,7.0,2001.0,7.0,23.0,,20.0
4,15,0,AfghanistanPeople,1,0,20210629123442,2.021061e+13,783865293,95,wikitext,8.0,2001.0,7.0,16.0,,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53710524,68103374,11,World_championships_in_2023,0,1,20210701094743,2.021070e+13,1031387361,22,wikitext,1.0,2021.0,1.0,,,0.0
53710525,68103375,3,2C0F:F5F0:4290:3F0:74A4:790D:12EC:9B01,0,1,20210701094755,2.021070e+13,1031387381,799,wikitext,1.0,2021.0,1.0,,,0.0
53710526,68103376,2,Asif_Khan_Tarand,0,1,20210701094801,2.021070e+13,1031387393,11,wikitext,3.0,2021.0,1.0,,,0.0
53710527,68103377,118,Juanita_Head_Walton,1,1,20210701094810,2.021070e+13,1031387407,80,wikitext,1.0,2021.0,1.0,,,0.0


Wikipedia user talk pages are the most numerous, followed by the articles and their discussions.  More info: [https://en.wikipedia.org/wiki/Wikipedia:Namespace](https://en.wikipedia.org/wiki/Wikipedia:Namespace)

In [5]:
df_ns = df_pages.groupby(['namespace']).size().reset_index(name='pages')
df_ns['%'] = round(100*df_ns['pages']/sum(df_ns['pages']),2)
df_ns

Unnamed: 0,namespace,pages,%
0,0,15973778,29.74
1,1,8486598,15.8
2,2,3613692,6.73
3,3,17107370,31.85
4,4,1308017,2.44
5,5,183856,0.34
6,6,932500,1.74
7,7,695580,1.3
8,8,2246,0.0
9,9,1793,0.0


In [6]:
df_ns = df_pages.groupby(['namespace', 'is_redirect']).size().reset_index(name='pages')
df_ns['%'] = round(100*df_ns['pages']/sum(df_ns['pages']),2)
df_ns

Unnamed: 0,namespace,is_redirect,pages,%
0,0,0,6328134,11.78
1,0,1,9645644,17.96
2,1,0,7076573,13.18
3,1,1,1410025,2.63
4,2,0,3396651,6.32
5,2,1,217041,0.4
6,3,0,17011490,31.67
7,3,1,95880,0.18
8,4,0,1165841,2.17
9,4,1,142176,0.26


<div class="alert alert-block alert-info"> <b>Note:</b> Talks pages are preprocessed separately to obtaing Wikipedia article talks metrics.</div>

In [7]:
df_talks = df_pages[df_pages['namespace']==1]
df_talks = df_talks[df_talks['is_redirect']==0]
df_talks

Unnamed: 0,page_id,namespace,title,is_redirect,is_new,touched,links_updated,latest,len,content_model,page_edits,creation,editors,views,references,age
33,128,1,Atlas_Shrugged,0,0,20210630082942,2.021063e+13,1001415316,7577,wikitext,741.0,2001.0,241.0,132.0,,20.0
87,354,1,Algeria,0,0,20210630182441,2.021063e+13,1031274050,82853,wikitext,906.0,2001.0,337.0,493.0,,19.0
96,582,1,Altruism/Archive_1,0,0,20210606185146,2.021061e+13,966903238,149702,wikitext,284.0,2001.0,39.0,13.0,,20.0
150,672,1,Arc_de_Triomphe,0,0,20210622143531,2.021062e+13,1018969872,72419,wikitext,235.0,2002.0,94.0,186.0,,19.0
167,692,1,Archaeology,0,0,20210623123336,2.021062e+13,1030015934,51782,wikitext,430.0,2001.0,154.0,131.0,,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53710497,68103347,1,The_Vision_Splendid_Outback_Film_Festival,0,1,20210701094340,2.021070e+13,1031386933,178,wikitext,1.0,2021.0,1.0,,,0.0
53710511,68103361,1,Carrie_Flemmer,0,1,20210701094555,2.021070e+13,1031387186,374,wikitext,1.0,2021.0,1.0,,,0.0
53710521,68103371,1,John_W._Fewell,0,1,20210701094733,2.021070e+13,1031387345,162,wikitext,1.0,2021.0,1.0,,,0.0
53710522,68103372,1,"Kosh-Döbö,_Ak-Taala",0,1,20210701094739,2.021070e+13,1031387351,368,wikitext,1.0,2021.0,1.0,,,0.0


Fixed reading errors of some titles in the talk pages.

In [8]:
df_talks[df_talks['title'].isna()]

Unnamed: 0,page_id,namespace,title,is_redirect,is_new,touched,links_updated,latest,len,content_model,page_edits,creation,editors,views,references,age
197047,254874,1,,0,0,20210621201129,20210420000000.0,1015522850,15262,wikitext,63.0,2003.0,39.0,,,18.0
12723207,19370868,1,,0,0,20210622022107,20210510000000.0,1019418013,9597,wikitext,52.0,2008.0,27.0,,,12.0


In [9]:
df_talks.loc[df_talks['page_id']==254874, 'title'] = 'NaN'
df_talks.loc[df_talks['page_id']==19370868, 'title'] = 'N/A'

Only Wikipedia articles are selected in the main `dataframe`. There are 15,973,778 Wikipedia articles

In [10]:
df_pages = df_pages[df_pages['namespace']==0]
df_pages.shape

(15973778, 16)

Wikipedia articles that are redirect pages are removed. Wikipedia articles are reduced to 6,328,134.

In [11]:
df_pages = df_pages[df_pages['is_redirect']==0]
df_pages = df_pages[['page_id', 'title', 'creation', 'age', 'views', 'page_edits', 'editors', 'len', 'references']]
df_pages.shape

(6328134, 9)

Fixed the same reading errors of some titles in the article pages.

In [12]:
df_pages[df_pages['title'].isna()]

Unnamed: 0,page_id,title,creation,age,views,page_edits,editors,len,references
37183,49244,,2002.0,19.0,,718.0,369.0,20179,10.0
12013796,18460536,,2008.0,12.0,,228.0,146.0,5062,1.0


In [13]:
df_pages.loc[df_pages['page_id']==49244, 'title'] = 'NaN'
df_pages.loc[df_pages['page_id']==18460536, 'title'] = 'N/A'

The null values of the metrics are substituted by 0.

In [14]:
df_pages[['creation', 'age', 'views', 'page_edits', 'editors', 'len', 'references']] = df_pages[['creation', 'age', 'views', 'page_edits', 'editors', 'len', 'references']].fillna(0)

<div class="alert alert-block alert-warning"><b>Warning: </b> Wikipedia talk pages views are from articles. There are also a few duplicate pages.</div> 

In [15]:
df_talks = df_talks[df_talks['title'].isin(df_pages['title'].tolist())]
df_talks = df_talks[['title','page_edits','editors']]
df_talks.rename({'page_edits':'talks', 'editors':'talkers'}, axis=1, inplace=True)
df_talks

Unnamed: 0,title,talks,talkers
33,Atlas_Shrugged,741.0,241.0
87,Algeria,906.0,337.0
150,Arc_de_Triomphe,235.0,94.0
167,Archaeology,430.0,154.0
182,Android_(robot),244.0,124.0
...,...,...,...
53710497,The_Vision_Splendid_Outback_Film_Festival,1.0,1.0
53710511,Carrie_Flemmer,1.0,1.0
53710521,John_W._Fewell,1.0,1.0
53710522,"Kosh-Döbö,_Ak-Taala",1.0,1.0


There are 0 duplicated talk pages.

In [16]:
talks_duplicated = df_talks[['title']].groupby('title').size().reset_index(name='count')
talks_duplicated = talks_duplicated[talks_duplicated['count']>1]['title'].tolist()
len(talks_duplicated)

0

Talks page metrics are merged with article pages.

In [17]:
df_pages = df_pages.merge(df_talks, how='left', on='title')
df_pages[['talks', 'talkers']] = df_pages[['talks', 'talkers']].fillna(0)
df_pages

Unnamed: 0,page_id,title,creation,age,views,page_edits,editors,len,references,talks,talkers
0,12,Anarchism,2001.0,19.0,237226.0,19819.0,3773.0,96584,92.0,18720.0,925.0
1,25,Autism,2001.0,19.0,469365.0,10563.0,3731.0,133536,226.0,5524.0,883.0
2,39,Albedo,2001.0,19.0,82923.0,1225.0,686.0,45483,37.0,136.0,72.0
3,290,A,2001.0,20.0,449105.0,4778.0,2597.0,28174,30.0,618.0,391.0
4,303,Alabama,2001.0,19.0,324587.0,9997.0,4381.0,197906,207.0,464.0,188.0
...,...,...,...,...,...,...,...,...,...,...,...
6328129,68103340,Karen_Doell,2021.0,0.0,0.0,1.0,1.0,1223,0.0,1.0,1.0
6328130,68103349,John_W._Fewell,2021.0,0.0,0.0,15.0,4.0,521,0.0,1.0,1.0
6328131,68103359,Carrie_Flemmer,2021.0,0.0,0.0,1.0,1.0,1300,0.0,1.0,1.0
6328132,68103365,Dapp_Browsers,2021.0,0.0,0.0,3.0,2.0,2682,0.0,0.0,0.0


## 1.2. Links

### 1.2.1. Linking pages

The total number of Wikipedia pages that it links to.

In [18]:
df_links = pd.read_csv('data/page_links_freq.tsv', sep='\t')
df_links

Unnamed: 0,page_id,links
0,10,3
1,12,1541
2,13,3
3,14,3
4,15,3
...,...,...
15958747,68102338,1
15958748,68102341,5
15958749,68102343,1
15958750,68102349,1


In [19]:
df_pages = df_pages.merge(df_links, how='left', on='page_id')
df_pages[['links']] = df_pages[['links']].fillna(0)

### 1.2.2. Linked pages

The total number of Wikipedia pages from which it is linked.

In [20]:
df_linked = pd.read_csv('data/page_linked_freq.tsv', sep='\t')
df_linked

Unnamed: 0,page_id,linked
0,12,4130
1,25,2454
2,39,1428
3,53,23
4,290,823
...,...,...
9824257,68103340,1
9824258,68103353,15
9824259,68103355,1
9824260,68103359,1


In [21]:
df_pages = df_pages.merge(df_linked, how='left', on='page_id')
df_pages[['linked']] = df_pages[['linked']].fillna(0)

## 1.3. External links

### 1.3.1. All

Links to external websites are calculated.

In [22]:
df_urls = pd.read_csv('data/page_exturls_freq.tsv', sep='\t')
df_urls

Unnamed: 0,page_id,urls
0,12,69
1,25,175
2,39,82
3,290,21
4,303,273
...,...,...
5801987,68103177,2
5801988,68103180,2
5801989,68103214,7
5801990,68103231,1


In [23]:
df_pages = df_pages.merge(df_urls, how='left', on='page_id')
df_pages[['urls']] = df_pages[['urls']].fillna(0)
df_pages

Unnamed: 0,page_id,title,creation,age,views,page_edits,editors,len,references,talks,talkers,links,linked,urls
0,12,Anarchism,2001.0,19.0,237226.0,19819.0,3773.0,96584,92.0,18720.0,925.0,1541.0,4130.0,69.0
1,25,Autism,2001.0,19.0,469365.0,10563.0,3731.0,133536,226.0,5524.0,883.0,602.0,2454.0,175.0
2,39,Albedo,2001.0,19.0,82923.0,1225.0,686.0,45483,37.0,136.0,72.0,245.0,1428.0,82.0
3,290,A,2001.0,20.0,449105.0,4778.0,2597.0,28174,30.0,618.0,391.0,275.0,823.0,21.0
4,303,Alabama,2001.0,19.0,324587.0,9997.0,4381.0,197906,207.0,464.0,188.0,1598.0,16227.0,273.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6328129,68103340,Karen_Doell,2021.0,0.0,0.0,1.0,1.0,1223,0.0,1.0,1.0,0.0,1.0,0.0
6328130,68103349,John_W._Fewell,2021.0,0.0,0.0,15.0,4.0,521,0.0,1.0,1.0,0.0,0.0,0.0
6328131,68103359,Carrie_Flemmer,2021.0,0.0,0.0,1.0,1.0,1300,0.0,1.0,1.0,0.0,1.0,0.0
6328132,68103365,Dapp_Browsers,2021.0,0.0,0.0,3.0,2.0,2682,0.0,0.0,0.0,0.0,0.0,0.0


### 1.3.3. URLs referenced

Links to external websites included in references are calculated.

In [24]:
df_refurls = pd.read_csv('data/page_refurls_freq.tsv', sep='\t')
df_refurls.rename(columns={'urls':'ref_urls'}, inplace=True)
df_refurls

Unnamed: 0,page_id,ref_urls
0,12,43
1,25,70
2,39,26
3,290,15
4,303,177
...,...,...
3409207,63833834,3
3409208,63833859,4
3409209,63833892,9
3409210,63833916,3


In [25]:
df_pages = df_pages.merge(df_refurls, how='left', on='page_id')
df_pages[['ref_urls']] = df_pages[['ref_urls']].fillna(0)
df_pages

Unnamed: 0,page_id,title,creation,age,views,page_edits,editors,len,references,talks,talkers,links,linked,urls,ref_urls
0,12,Anarchism,2001.0,19.0,237226.0,19819.0,3773.0,96584,92.0,18720.0,925.0,1541.0,4130.0,69.0,43.0
1,25,Autism,2001.0,19.0,469365.0,10563.0,3731.0,133536,226.0,5524.0,883.0,602.0,2454.0,175.0,70.0
2,39,Albedo,2001.0,19.0,82923.0,1225.0,686.0,45483,37.0,136.0,72.0,245.0,1428.0,82.0,26.0
3,290,A,2001.0,20.0,449105.0,4778.0,2597.0,28174,30.0,618.0,391.0,275.0,823.0,21.0,15.0
4,303,Alabama,2001.0,19.0,324587.0,9997.0,4381.0,197906,207.0,464.0,188.0,1598.0,16227.0,273.0,177.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6328129,68103340,Karen_Doell,2021.0,0.0,0.0,1.0,1.0,1223,0.0,1.0,1.0,0.0,1.0,0.0,0.0
6328130,68103349,John_W._Fewell,2021.0,0.0,0.0,15.0,4.0,521,0.0,1.0,1.0,0.0,0.0,0.0,0.0
6328131,68103359,Carrie_Flemmer,2021.0,0.0,0.0,1.0,1.0,1300,0.0,1.0,1.0,0.0,1.0,0.0,0.0
6328132,68103365,Dapp_Browsers,2021.0,0.0,0.0,3.0,2.0,2682,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 1.4. References

In [26]:
df_ref = pd.read_csv('data/page_pub.tsv', sep='\t')
df_ref

Unnamed: 0,page_id,pub_id
0,10630303,1
1,12008665,2
2,23560884,3
3,3144280,3
4,652221,4
...,...,...
3728517,3148435,2367544
3728518,34084825,2367545
3728519,17547034,2367546
3728520,6265065,2367547


References to publications are identified.

In [27]:
df_pub = pd.read_csv('data/pub.tsv', sep='\t')
df_pub

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,pub_id,arxiv,asin,bibcode,doi,isbn,ismn,jfm,jstor,lccn,...,oclc,ol,osti,pmc,pmid,rfc,ssrn,url,usenetid,zbl
0,1,,,,10.4064/fm-17-1-152-170,,,,,,...,,,,,,,,eudml.org/doc/212513,,3.02701
1,2,,,,,9780959659634,,,,,...,,,,,,,,fieldgeologyclubsa.org.au/publications.htm,,
2,3,,,,10.1007/bf02086276,,,,,,...,,,,,,,,zenodo.org/record/1642598,,
3,4,,,,,0719013380,,,,,...,,,,,,,,books.google.com/?id=5367aaaaiaaj,,
4,5,,,,,0618133844,,,,,...,,,,,,,,archive.org/details/eastasiacultural00ebre_0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2367543,2367544,,,,,,,,,,...,,17845409m,,,,,,,,
2367544,2367545,,,,,,,,,,...,,,,,,,2247615.0,,,
2367545,2367546,,,,,,,,,,...,,,,,,,2617574.0,,,
2367546,2367547,,,,,9781585440450,,,,,...,13093276.0,,,,,,,,,


In [28]:
df_pub_doi = df_pub[~df_pub['doi'].isna()][['pub_id', 'doi']]
df_pub_doi

Unnamed: 0,pub_id,doi
0,1,10.4064/fm-17-1-152-170
2,3,10.1007/bf02086276
15,16,10.1177/000312240406900406
25,26,10.1016/0021-8693(83)90007-8
28,29,10.1007/bfb0067491
...,...,...
2367524,2367525,10.1108/eb032812
2367525,2367526,10.1006/bbrc.1993.2507
2367526,2367527,10.1136/bjsm.35.1.60
2367531,2367532,10.1109/mtas.2004.1337889


There are 1,017,673 unique DOI referenced.

In [29]:
len(set(df_pub_doi['doi']))

1017673

In [30]:
df_pub_isbn = df_pub[~df_pub['isbn'].isna()][['pub_id', 'isbn']]
df_pub_isbn

Unnamed: 0,pub_id,isbn
1,2,9780959659634
3,4,0719013380
4,5,0618133844
5,6,0933932049
6,7,9780300014952
...,...,...
2366797,2366798,0765606623
2367527,2367528,0607908262
2367528,2367529,9780319228760
2367534,2367535,0415222966


There are 895,723 unique ISBN referenced.

In [31]:
len(set(df_pub_isbn['isbn']))

895723

### 1.4.1. All

The total number of references is added.

In [32]:
df_ref = df_ref.groupby('page_id').size().reset_index(name='ref_pubs')
df_ref

Unnamed: 0,page_id,ref_pubs
0,12,65
1,25,185
2,39,22
3,290,9
4,303,26
...,...,...
1011643,63833456,5
1011644,63833466,1
1011645,63833648,1
1011646,63833769,1


In [33]:
df_pages = df_pages.merge(df_ref, how='left', on='page_id')
df_pages[['ref_pubs']] = df_pages[['ref_pubs']].fillna(0)
df_pages

Unnamed: 0,page_id,title,creation,age,views,page_edits,editors,len,references,talks,talkers,links,linked,urls,ref_urls,ref_pubs
0,12,Anarchism,2001.0,19.0,237226.0,19819.0,3773.0,96584,92.0,18720.0,925.0,1541.0,4130.0,69.0,43.0,65.0
1,25,Autism,2001.0,19.0,469365.0,10563.0,3731.0,133536,226.0,5524.0,883.0,602.0,2454.0,175.0,70.0,185.0
2,39,Albedo,2001.0,19.0,82923.0,1225.0,686.0,45483,37.0,136.0,72.0,245.0,1428.0,82.0,26.0,22.0
3,290,A,2001.0,20.0,449105.0,4778.0,2597.0,28174,30.0,618.0,391.0,275.0,823.0,21.0,15.0,9.0
4,303,Alabama,2001.0,19.0,324587.0,9997.0,4381.0,197906,207.0,464.0,188.0,1598.0,16227.0,273.0,177.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6328129,68103340,Karen_Doell,2021.0,0.0,0.0,1.0,1.0,1223,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
6328130,68103349,John_W._Fewell,2021.0,0.0,0.0,15.0,4.0,521,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
6328131,68103359,Carrie_Flemmer,2021.0,0.0,0.0,1.0,1.0,1300,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
6328132,68103365,Dapp_Browsers,2021.0,0.0,0.0,3.0,2.0,2682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
df_pages.creation = df_pages.creation.astype(int)
df_pages.age = df_pages.age.astype(int)
df_pages.views = df_pages.views.astype(int)
df_pages.page_edits = df_pages.page_edits.astype(int)
df_pages.editors = df_pages.editors.astype(int)
df_pages.len = df_pages.len.astype(int)
df_pages.references = df_pages.references.astype(int)
df_pages.talks = df_pages.talks.astype(int)
df_pages.talkers = df_pages.talkers.astype(int)
df_pages.links = df_pages.links.astype(int)
df_pages.linked = df_pages.linked.astype(int)
df_pages.urls = df_pages.urls.astype(int)
df_pages.ref_urls = df_pages.ref_urls.astype(int)
df_pages.ref_pubs = df_pages.ref_pubs.astype(int)

The indicators of all Wikipedia articles are exported.

In [35]:
df_pages.head(20)

Unnamed: 0,page_id,title,creation,age,views,page_edits,editors,len,references,talks,talkers,links,linked,urls,ref_urls,ref_pubs
0,12,Anarchism,2001,19,237226,19819,3773,96584,92,18720,925,1541,4130,69,43,65
1,25,Autism,2001,19,469365,10563,3731,133536,226,5524,883,602,2454,175,70,185
2,39,Albedo,2001,19,82923,1225,686,45483,37,136,72,245,1428,82,26,22
3,290,A,2001,20,449105,4778,2597,28174,30,618,391,275,823,21,15,9
4,303,Alabama,2001,19,324587,9997,4381,197906,207,464,188,1598,16227,273,177,26
5,305,Achilles,2001,19,383435,7189,3508,77042,27,282,143,637,1586,38,11,12
6,307,Abraham_Lincoln,2001,19,1382625,17489,5061,181659,160,8250,2064,1761,9212,221,51,102
7,308,Aristotle,2001,19,461580,9192,4065,145558,151,1637,563,2061,6167,188,80,81
8,309,An_American_in_Paris,2001,20,10379,346,216,22835,23,85,38,199,230,37,23,4
9,316,Academy_Award_for_Best_Production_Design,2001,20,31035,1231,421,97804,25,25,18,1369,1491,28,24,0


In [36]:
df_pages.to_csv('results/page_metrics.tsv', sep='\t', index=False)

Top Wikipedia articles are exported.

In [52]:
top_pages = []
top_pages = top_pages + df_pages.sort_values(by='views', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='page_edits', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='editors', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='len', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='references', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='talks', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='talkers', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='links', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='linked', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='urls', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='ref_urls', ascending=False)[0:1000]['page_id'].tolist()
top_pages = top_pages + df_pages.sort_values(by='ref_pubs', ascending=False)[0:1000]['page_id'].tolist()
top_pages = list(set(top_pages))
len(top_pages)

7374

In [53]:
df_pages[df_pages.page_id.isin(top_pages)].to_csv('results/page_metrics_top.tsv', sep='\t', index=False)

In [64]:
df_pages.loc[:, df_pages.columns[3:]].to_csv('results/page_only_metrics.tsv', sep='\t', index=False)