# A2 - Bias in Data Assignment

In [1]:
# import libraries 

import pandas as pd
import numpy as np

In [2]:
# import two datasets

page_data = pd.read_csv("page_data.csv")

WPDS_data = pd.read_csv("WPDS_2020_data.csv")

In [3]:
# Check the data has been imported properly

page_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


In [4]:
# Check the WPDS data has been imported properly 

WPDS_data.head()

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000


## Data Cleaning

In [5]:
# Remove rows where "page" starts with "Template" in page_data 

page_data = page_data[~page_data.page.str.contains("Template:")]

page_data

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568
...,...,...,...
47192,Yahya Jammeh,Gambia,807482007
47193,Lucius Fairchild,United States,807483006
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153
47195,Francis Fessenden,United States,807483270


In [6]:
# remove rows where "Name" contains all caps values  in WPDS_data

WPDS_clean = WPDS_data[~WPDS_data['Name'].str[:].str.isupper()]

# Note there is still one row left where the Type is specified as sub-region, need to remove this row as well
WPDS_clean = WPDS_clean[WPDS_clean['Type']=='Country']

WPDS_clean

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000
5,LY,Libya,Country,2019,6.891,6891000
6,MA,Morocco,Country,2019,35.952,35952000
7,SD,Sudan,Country,2019,43.849,43849000
...,...,...,...,...,...,...
229,WS,Samoa,Country,2019,0.200,200000
230,SB,Solomon Islands,Country,2019,0.715,715000
231,TO,Tonga,Country,2019,0.099,99000
232,TV,Tuvalu,Country,2019,0.010,10000


## Getting Article Quality Predictions 

In [7]:
# import the libaries

import json
import requests
import os

In [8]:
# Set the endpoint for using the API 
endpoint = 'https://ores.wikimedia.org/v3/scores/enwiki/?models=articlequality&revids={rev_id}'

In [9]:

# Customize these with your own information
headers = {
    'User-Agent': 'https://github.com/Sabrinawang06',
    'From': 'lxw5332@uw.edu'
}


# Define the API call
def api_call(endpoint, rev_id):
    call = requests.get(endpoint.format(rev_id = rev_id), headers=headers)
    response = call.json()
    return response




In [10]:
# create batch of 50 rev_id and loop over the whole table of page_data

res = [] # valid API returns 
error_log = [] # API returns with error 
n= page_data.shape[0]

for i in range(0,n,50):
    if i+50 > n:
        batch = api_call(endpoint, "|".join(str(x) for x in page_data.rev_id.iloc[i:n]))
    else:
        batch = api_call(endpoint, "|".join(str(x) for x in page_data.rev_id.iloc[i:i+50]))
        
    
    
    for j in batch['enwiki']['scores'].keys():
        try:     
            res.append([j, batch['enwiki']['scores'][j]['articlequality']['score']['prediction']])
        except KeyError:
            error_log.append([j,batch['enwiki']['scores'][j]])
            pass
        

            

In [11]:
# Conver previous output into dataframe and rename the columns 
prediction = pd.DataFrame(res).rename(columns={0: "rev_id", 1: "prediction"})
prediction

Unnamed: 0,rev_id,prediction
0,355319463,Stub
1,393276188,Stub
2,393822005,Stub
3,395521877,Stub
4,395526568,Stub
...,...,...
46420,807481636,C
46421,807482007,GA
46422,807483006,C
46423,807483153,GA


In [12]:
# Check the error log 
error_log

[['516633096',
  {'articlequality': {'error': {'message': 'RevisionNotFound: Could not find revision ({revision}:516633096)',
     'type': 'RevisionNotFound'}}}],
 ['550682925',
  {'articlequality': {'error': {'message': 'RevisionNotFound: Could not find revision ({revision}:550682925)',
     'type': 'RevisionNotFound'}}}],
 ['627547024',
  {'articlequality': {'error': {'message': 'RevisionNotFound: Could not find revision ({revision}:627547024)',
     'type': 'RevisionNotFound'}}}],
 ['636911471',
  {'articlequality': {'error': {'message': 'RevisionNotFound: Could not find revision ({revision}:636911471)',
     'type': 'RevisionNotFound'}}}],
 ['669987106',
  {'articlequality': {'error': {'message': 'RevisionNotFound: Could not find revision ({revision}:669987106)',
     'type': 'RevisionNotFound'}}}],
 ['671484594',
  {'articlequality': {'error': {'message': 'RevisionNotFound: Could not find revision ({revision}:671484594)',
     'type': 'RevisionNotFound'}}}],
 ['680981536',
  {'art

## Combining the Datasets

In [13]:
# Conver the rev_id type in the prediction dataframe to int64 for merging with page_data
prediction['rev_id'] = prediction['rev_id'].astype('int64')

# Merge the perdiction back to the page_data
pred_page = pd.merge(left=page_data, right=prediction, left_on='rev_id', right_on='rev_id')

In [14]:
# Check if all rows are assigned a prediction
pred_page[pred_page['prediction'].isna()]

Unnamed: 0,page,country,rev_id,prediction


In [15]:
pred_page

Unnamed: 0,page,country,rev_id,prediction
0,Bir I of Kanem,Chad,355319463,Stub
1,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,Stub
2,Yos Por,Cambodia,393822005,Stub
3,Julius Gregr,Czech Republic,395521877,Stub
4,Edvard Gregr,Czech Republic,395526568,Stub
...,...,...,...,...
46420,Hal Bidlack,United States,807481636,C
46421,Yahya Jammeh,Gambia,807482007,GA
46422,Lucius Fairchild,United States,807483006,C
46423,Fahd of Saudi Arabia,Saudi Arabia,807483153,GA


In [16]:
# Combine the WPDS data with the page data with prediction

full_res = pd.merge(left=pred_page, right=WPDS_clean, how='outer', left_on='country', right_on='Name')

In [17]:
full_res

Unnamed: 0,page,country,rev_id,prediction,FIPS,Name,Type,TimeFrame,Data (M),Population
0,Bir I of Kanem,Chad,355319463.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
1,Abdullah II of Kanem,Chad,498683267.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
2,Salmama II of Kanem,Chad,565745353.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
3,Kuri I of Kanem,Chad,565745365.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
4,Mohammed I of Kanem,Chad,565745375.0,Stub,TD,Chad,Country,2019.0,16.877,16877000.0
...,...,...,...,...,...,...,...,...,...,...
46446,,,,,PF,French Polynesia,Country,2019.0,0.280,280000.0
46447,,,,,GU,Guam,Country,2019.0,0.175,175000.0
46448,,,,,NC,New Caledonia,Country,2019.0,0.295,295000.0
46449,,,,,PW,Palau,Country,2019.0,0.018,18000.0


In [18]:
# Extract the rows with a null country or Name (no matching between WPDS and page data)
no_match = full_res[full_res['country'].isna()|full_res['Name'].isna()].reset_index()

In [19]:
# if wanted, the following code can be used to create a new columns in replace of Name and country (since there are no overlapping)
# no_match['new_country']=[no_match['Name'][i] if pd.isna(no_match['country'][i]) else no_match['country'][i] for i in range(len(no_match))]

In [20]:
# Output the csv file for no matching record 
no_match.to_csv('wp_wpds_countries-no_match.csv')

In [21]:
# Extract rows with matching record between WPDS and page data
match_record = pd.merge(left=pred_page, right=WPDS_clean, how='inner', left_on='country', right_on='Name')

# Select and rename interested columns 
match_record_out = match_record[['country','page','rev_id','prediction','Population']]

match_record_out.columns = ['country','article_name','revision_id','article_quality_est.','population']

match_record_out

Unnamed: 0,country,article_name,revision_id,article_quality_est.,population
0,Chad,Bir I of Kanem,355319463,Stub,16877000
1,Chad,Abdullah II of Kanem,498683267,Stub,16877000
2,Chad,Salmama II of Kanem,565745353,Stub,16877000
3,Chad,Kuri I of Kanem,565745365,Stub,16877000
4,Chad,Mohammed I of Kanem,565745375,Stub,16877000
...,...,...,...,...,...
44563,Seychelles,Rita Sinon,800323154,Stub,98000
44564,Seychelles,Sylvette Frichot,800323798,Stub,98000
44565,Seychelles,May De Silva,800969960,Start,98000
44566,Seychelles,Vincent Meriton,802051093,Stub,98000


In [22]:
# Output the matching record to csv file

match_record_out.to_csv('wp_wpds_politicians_by_country.csv')

## Analysis

In [23]:
# Count the number of articles for each country 
count = pd.DataFrame(match_record_out.groupby('country').size()).reset_index()
count.columns=['country','article_count']

count

Unnamed: 0,country,article_count
0,Afghanistan,319
1,Albania,456
2,Algeria,116
3,Andorra,34
4,Angola,106
...,...,...
178,Venezuela,130
179,Vietnam,187
180,Yemen,116
181,Zambia,25


In [24]:
# Count the number of high quality articles (FA or GA class) for each country 

count_high_quality = pd.DataFrame(match_record_out[match_record_out['article_quality_est.'].isin(['FA','GA'])].groupby('country').size()).reset_index()

count_high_quality.columns=['country','high_quality_article_count']

count_high_quality

Unnamed: 0,country,high_quality_article_count
0,Afghanistan,13
1,Albania,3
2,Algeria,2
3,Argentina,16
4,Armenia,5
...,...,...
141,Vanuatu,3
142,Venezuela,3
143,Vietnam,13
144,Yemen,3


In [25]:
# Combine the count tables to the population data 

count_combined = pd.merge(left=count, right=count_high_quality, left_on='country', right_on='country')

In [26]:
# Extract the population data for each country and he sub-region it belongs to 

region = WPDS_data[WPDS_data['Name'].isin(['AFRICA','LATIN AMERICA AND THE CARIBBEAN','ASIA','EUROPE','OCEANIA'])]
region

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
67,LATIN AMERICA AND THE CARIBBEAN,LATIN AMERICA AND THE CARIBBEAN,Sub-Region,2019,651.036,651036000
109,ASIA,ASIA,Sub-Region,2019,4625.927,4625927000
166,EUROPE,EUROPE,Sub-Region,2019,746.622,746622000
216,OCEANIA,OCEANIA,Sub-Region,2019,43.155,43155000


In [27]:
# Create an empty column to store the region information 
WPDS_data['region'] = np.nan

In [28]:
# Assign the region for each row based on the row index 

WPDS_data['region'][:67]='AFRICA'
WPDS_data['region'][67:109]='LATIN AMERICA AND THE CARIBBEAN'
WPDS_data['region'][109:166]='ASIA'
WPDS_data['region'][166:216]='EUROPE'
WPDS_data['region'][216:]='OCEANIA'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WPDS_data['region'][:67]='AFRICA'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WPDS_data['region'][67:109]='LATIN AMERICA AND THE CARIBBEAN'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WPDS_data['region'][109:166]='ASIA'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WPDS_data['region'][166:216]='EUROPE

In [29]:
# Merge all three dataset together to obtain the final table needed for analysis 

final_count = pd.merge(left=WPDS_data[['Name','Population','region']], right=count_combined, how='right', left_on='Name', right_on='country')

In [30]:
final_count

Unnamed: 0,Name,Population,region,country,article_count,high_quality_article_count
0,Afghanistan,38928000,ASIA,Afghanistan,319,13
1,Albania,2838000,EUROPE,Albania,456,3
2,Algeria,44357000,AFRICA,Algeria,116,2
3,Argentina,45377000,LATIN AMERICA AND THE CARIBBEAN,Argentina,491,16
4,Armenia,2956000,ASIA,Armenia,193,5
...,...,...,...,...,...,...
141,Vanuatu,321000,OCEANIA,Vanuatu,58,3
142,Venezuela,28645000,LATIN AMERICA AND THE CARIBBEAN,Venezuela,130,3
143,Vietnam,96209000,ASIA,Vietnam,187,13
144,Yemen,29826000,ASIA,Yemen,116,3


In [31]:
# Calculate the percentage per country 

country_proportion = final_count.copy()
country_proportion['article_per_population']= country_proportion['article_count']/country_proportion['Population']
country_proportion['high_quality_article_per_article']= country_proportion['high_quality_article_count']/country_proportion['article_count']

In [32]:
country_proportion.sort_values(by=['article_per_population'], ascending=False)

Unnamed: 0,Name,Population,region,country,article_count,high_quality_article_count,article_per_population,high_quality_article_per_article
133,Tuvalu,10000,OCEANIA,Tuvalu,54,4,5.400000e-03,0.074074
51,Iceland,368000,EUROPE,Iceland,201,2,5.461957e-04,0.009950
74,Luxembourg,632000,EUROPE,Luxembourg,178,1,2.816456e-04,0.005618
39,Fiji,896000,OCEANIA,Fiji,197,1,2.198661e-04,0.005076
80,Malta,522000,EUROPE,Malta,101,1,1.934866e-04,0.009901
...,...,...,...,...,...,...,...,...
38,Ethiopia,114916000,AFRICA,Ethiopia,101,2,8.789029e-07,0.019802
140,Uzbekistan,34174000,ASIA,Uzbekistan,28,3,8.193363e-07,0.107143
26,China,1402385000,ASIA,China,1129,40,8.050571e-07,0.035430
53,Indonesia,271739000,ASIA,Indonesia,209,9,7.691204e-07,0.043062


In [33]:
# Calculate the percentage per region (using total population from the original WPDS data)

region_proportion = pd.merge(pd.merge(pd.DataFrame(final_count.groupby('region')['article_count'].sum()).reset_index(),
         pd.DataFrame(final_count.groupby('region')['high_quality_article_count'].sum()).reset_index(),
         left_on='region', right_on='region'),
         region[['Name','Population']], left_on='region', right_on='Name').drop(['Name'], axis=1)

region_proportion['article_per_population']= region_proportion['article_count']/region_proportion['Population']
region_proportion['high_quality_article_per_article']= region_proportion['high_quality_article_count']/region_proportion['article_count']

In [34]:
region_proportion.sort_values(by=['article_per_population'], ascending=False)

Unnamed: 0,region,article_count,high_quality_article_count,Population,article_per_population,high_quality_article_per_article
4,OCEANIA,2811,63,43155000,6.5e-05,0.022412
2,EUROPE,14444,350,746622000,1.9e-05,0.024232
3,LATIN AMERICA AND THE CARIBBEAN,4917,76,651036000,8e-06,0.015457
0,AFRICA,8040,223,1337918000,6e-06,0.027736
1,ASIA,11515,316,4625927000,2e-06,0.027442


## Results

### Top 10 countries by coverage

In [35]:
country_proportion.sort_values(by=['article_per_population'], ascending=False).head(10)

Unnamed: 0,Name,Population,region,country,article_count,high_quality_article_count,article_per_population,high_quality_article_per_article
133,Tuvalu,10000,OCEANIA,Tuvalu,54,4,0.0054,0.074074
51,Iceland,368000,EUROPE,Iceland,201,2,0.000546,0.00995
74,Luxembourg,632000,EUROPE,Luxembourg,178,1,0.000282,0.005618
39,Fiji,896000,OCEANIA,Fiji,197,1,0.00022,0.005076
80,Malta,522000,EUROPE,Malta,101,1,0.000193,0.009901
141,Vanuatu,321000,OCEANIA,Vanuatu,58,3,0.000181,0.051724
33,Dominica,72000,LATIN AMERICA AND THE CARIBBEAN,Dominica,12,1,0.000167,0.083333
1,Albania,2838000,EUROPE,Albania,456,3,0.000161,0.006579
91,New Zealand,4987000,OCEANIA,New Zealand,783,13,0.000157,0.016603
78,Maldives,541000,ASIA,Maldives,83,1,0.000153,0.012048


### Bottom 10 countries by coverage

In [36]:
country_proportion.sort_values(by=['article_per_population'], ascending=True).head(10)

Unnamed: 0,Name,Population,region,country,article_count,high_quality_article_count,article_per_population,high_quality_article_per_article
52,India,1400100000,ASIA,India,968,13,6.913792e-07,0.01343
53,Indonesia,271739000,ASIA,Indonesia,209,9,7.691204e-07,0.043062
26,China,1402385000,ASIA,China,1129,40,8.050571e-07,0.03543
140,Uzbekistan,34174000,ASIA,Uzbekistan,28,3,8.193363e-07,0.107143
38,Ethiopia,114916000,AFRICA,Ethiopia,101,2,8.789029e-07,0.019802
63,"Korea, North",25779000,ASIA,"Korea, North",36,8,1.396486e-06,0.222222
129,Thailand,66534000,ASIA,Thailand,112,3,1.68335e-06,0.026786
8,Bangladesh,169809000,ASIA,Bangladesh,317,3,1.866803e-06,0.009464
143,Vietnam,96209000,ASIA,Vietnam,187,13,1.943685e-06,0.069519
121,Sudan,43849000,AFRICA,Sudan,95,2,2.166526e-06,0.021053


### Top 10 countries by relative quality

In [37]:
country_proportion.sort_values(by=['high_quality_article_per_article'], ascending=False).head(10)

Unnamed: 0,Name,Population,region,country,article_count,high_quality_article_count,article_per_population,high_quality_article_per_article
63,"Korea, North",25779000,ASIA,"Korea, North",36,8,1.396486e-06,0.222222
109,Saudi Arabia,35041000,ASIA,Saudi Arabia,117,15,3.338946e-06,0.128205
106,Romania,19241000,EUROPE,Romania,343,42,1.782652e-05,0.122449
23,Central African Republic,4830000,AFRICA,Central African Republic,66,8,1.36646e-05,0.121212
140,Uzbekistan,34174000,ASIA,Uzbekistan,28,3,8.193363e-07,0.107143
82,Mauritania,4650000,AFRICA,Mauritania,48,5,1.032258e-05,0.104167
46,Guatemala,18066000,LATIN AMERICA AND THE CARIBBEAN,Guatemala,83,7,4.594265e-06,0.084337
33,Dominica,72000,LATIN AMERICA AND THE CARIBBEAN,Dominica,12,1,0.0001666667,0.083333
125,Syria,19398000,ASIA,Syria,128,10,6.598618e-06,0.078125
11,Benin,12209000,AFRICA,Benin,91,7,7.453518e-06,0.076923


### Bottom 10 countries by relative quality

In [38]:
country_proportion.sort_values(by=['high_quality_article_per_article'], ascending=True).head(10)

Unnamed: 0,Name,Population,region,country,article_count,high_quality_article_count,article_per_population,high_quality_article_per_article
10,Belgium,11515000,EUROPE,Belgium,519,1,4.5e-05,0.001927
128,Tanzania,59734000,AFRICA,Tanzania,404,1,7e-06,0.002475
124,Switzerland,8634000,EUROPE,Switzerland,402,1,4.7e-05,0.002488
89,Nepal,29996000,ASIA,Nepal,356,1,1.2e-05,0.002809
101,Peru,32824000,LATIN AMERICA AND THE CARIBBEAN,Peru,350,1,1.1e-05,0.002857
94,Nigeria,206140000,AFRICA,Nigeria,676,2,3e-06,0.002959
104,Portugal,10255000,EUROPE,Portugal,318,1,3.1e-05,0.003145
27,Colombia,49444000,LATIN AMERICA AND THE CARIBBEAN,Colombia,285,1,6e-06,0.003509
73,Lithuania,2794000,EUROPE,Lithuania,244,1,8.7e-05,0.004098
87,Morocco,35952000,AFRICA,Morocco,206,1,6e-06,0.004854


### Geographic regions by coverage
#### Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population


In [39]:
region_proportion.sort_values(by=['article_per_population'], ascending=False)

Unnamed: 0,region,article_count,high_quality_article_count,Population,article_per_population,high_quality_article_per_article
4,OCEANIA,2811,63,43155000,6.5e-05,0.022412
2,EUROPE,14444,350,746622000,1.9e-05,0.024232
3,LATIN AMERICA AND THE CARIBBEAN,4917,76,651036000,8e-06,0.015457
0,AFRICA,8040,223,1337918000,6e-06,0.027736
1,ASIA,11515,316,4625927000,2e-06,0.027442


### Geographic regions by coverage
#### Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality

In [40]:
region_proportion.sort_values(by=['high_quality_article_per_article'], ascending=False)

Unnamed: 0,region,article_count,high_quality_article_count,Population,article_per_population,high_quality_article_per_article
0,AFRICA,8040,223,1337918000,6e-06,0.027736
1,ASIA,11515,316,4625927000,2e-06,0.027442
2,EUROPE,14444,350,746622000,1.9e-05,0.024232
4,OCEANIA,2811,63,43155000,6.5e-05,0.022412
3,LATIN AMERICA AND THE CARIBBEAN,4917,76,651036000,8e-06,0.015457
