### Importing necessary modules and libraries

In [2]:
from apikeys.KeyManager import KeyManager
import pandas as pd
import requests
import time
import json

### Read the data files for politicians and population

In [229]:
politicians_data = pd.read_csv("./politicians_by_country_AUG.2024.csv")
population_data = pd.read_csv("./population_by_country_AUG.2024.csv")


### Defining constants to be used in the API calls to Wikimedia and ORES

In [83]:
# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
API_HEADER_AGENT = 'User-Agent'

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_THROTTLE_WAIT = 0.1

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': 'rohitch@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2024'
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = politicians_data['name']

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}



#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_THROTTLE_WAIT = 0.1

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<rohitch@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "",         # your email address should go here
    'access_token'  : ""          # the access token you create will need to go here
}

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}


### Reading ORES API access token from Key Manager

In [84]:
keyman = KeyManager()
WIKIMEDIA_USERNAME = "Rohitraju3010"
key_info = keyman.findRecord(domain="api.wikimedia.org")
ACCESS_TOKEN = key_info[0]['key']

### Defining functions for requesting ores score per article, request page info per article

In [85]:
# function to call the wikimedia API and return the pageinfo for the article
def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    
    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    if API_HEADER_AGENT not in headers:
        raise Exception(f"The header data should include a '{API_HEADER_AGENT}' field that contains your UW email address.")

    if 'rohitch@uw' not in headers[API_HEADER_AGENT]:
        raise Exception(f"Use your UW email address in the '{API_HEADER_AGENT}' field.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [86]:
def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT, 
                                   model_name = API_ORES_EN_QUALITY_MODEL, 
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    
    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)
    
    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        #response = requests.get(request_url, headers=headers)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

### Data acquistion - getting revision id and article quality for all articles

In [None]:
#Request headers
hparams = REQUEST_HEADER_PARAMS_TEMPLATE.copy()
hparams['email_address'] = "rohitch@uw.edu"
hparams['access_token'] = ACCESS_TOKEN

#Request payload
rd = ORES_REQUEST_DATA_TEMPLATE.copy()

scores = []
rev_ids = []
rev_id_not_found_titles = []
score_not_found_titles = []

for i, title in enumerate(ARTICLE_TITLES):
    response = request_pageinfo_per_article(
        article_title = title,
    )
    try:
        rev_id = response['query']['pages'][list(response['query']['pages'].keys())[0]]['lastrevid']
        rev_ids.append(rev_id)
    except Exception as e:
        rev_id_not_found_titles.append(title)
        rev_ids.append("")
        scores.append("")
        continue

    if rev_id:
        rd['rev_id'] = rev_id
    
    try:
        score_response = request_ores_score_per_article(request_data=rd,
                                       header_params=hparams)
        score = score_response["enwiki"]["scores"][str(rev_id)]['articlequality']['score']['prediction']
        scores.append(score)
    except Exception as e :
        score_not_found_titles.append(title)
        scores.append("")
        continue

### Error rate and titles

In [99]:
rev_id_not_found_titles

['Barbara Eibinger-Miedl',
 'Mehrali Gasimov',
 'Kyaw Myint',
 'André Ngongang Ouandji',
 'Tomás Pimentel',
 'Richard Sumah',
 "Segun ''Aeroland'' Adewale",
 'Bashir Bililiqo']

In [101]:
score_not_found_titles

['Carlos Eduardo Moreira Ferreira']

In [103]:
print(f"The error rate of articles is {(len(rev_id_not_found_titles)+len(score_not_found_titles))/len(ARTICLE_TITLES)*100} %")

The error rate of articles is 0.12578616352201258 %


### Create a country, region and population mapping

In [230]:
region = None
country_region_pop_list = []

# Iterate over each row in the DataFrame
for index, row in population_data.iterrows():
    geography = row['Geography']
    population = row['Population']

    if geography.isupper():
        region = geography
    else:
        country_region_pop_list.append({'country': geography, 'region': region, 'population': float(population)})

country_region_pop = pd.DataFrame(country_region_pop_list)

### Combining the population and politician article datasets

In [232]:
politicians_data['article_quality'] = scores
politicians_data['revision_id'] = rev_ids

### Note:
In the politicians dataset, The Koreas are split as **'Korean, South'** and **'Korean'** (assuming the latter is North as there is an explicit category for South). 
In the populations dataset, they are split as **'Korea (North)'**,**'Korea (South)'**

Semantically, these labels refer to 2 countries, hence renaming them with unified labels so as to avoid loss of data while merging.

Similar issue with name is for **'Guinea-Bissau'** and **'GuineaBissau'**

In [238]:
politicians_data['country'] = politicians_data['country'].replace('Korea, South', 'Korea (South)')
politicians_data['country'] = politicians_data['country'].replace('Korean', 'Korea (North)')
politicians_data['country'] = politicians_data['country'].replace('Guinea-Bissau', 'GuineaBissau')

In [245]:
merged_article_data_outer = pd.merge(politicians_data, country_region_pop, how='outer', on='country')

### Finding countries with mismatch in article quality and population data

In [248]:
merged_article_data_outer[merged_article_data_outer.isnull().any(axis=1)]['country'].unique()

array(['Western Sahara', 'Mauritius', 'Mayotte', 'Reunion',
       'Sao Tome and Principe', 'eSwatini', 'Canada', 'United States',
       'Mexico', 'Curacao', 'Dominica', 'Guadeloupe', 'Jamaica',
       'Martinique', 'Puerto Rico', 'French Guiana', 'Suriname',
       'Georgia', 'Brunei', 'Philippines', 'China (Hong Kong SAR)',
       'China (Macao SAR)', 'Denmark', 'Iceland', 'Ireland',
       'United Kingdom', 'Liechtenstein', 'Netherlands', 'Romania',
       'Andorra', 'San Marino', 'Australia', 'Fiji', 'French Polynesia',
       'Guam', 'Kiribati', 'Nauru', 'New Caledonia', 'New Zealand',
       'Palau'], dtype=object)

In [251]:
cleaned_df = merged_article_data_outer[~merged_article_data_outer.isnull().any(axis=1)]
cleaned_df.rename(columns={'name': 'article_title'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df.rename(columns={'name': 'article_title'}, inplace=True)


In [253]:
cleaned_df.to_csv("wp_politicians_by_country.csv")

### Top and bottom countries by coverage

In [280]:
article_counts = cleaned_df.groupby("country").size().reset_index(name = "article_counts")
population_cleaned = cleaned_df[['country', 'population']].drop_duplicates()
article_counts_by_country = pd.merge(article_counts, population_cleaned, on='country')
article_counts_by_country['total_articles_per_capita (million people)'] = article_counts_by_country['article_counts'] / article_counts_by_country['population']

In [281]:
top_10_countries_by_coverage = article_counts_by_country.sort_values(by='total_articles_per_capita (million people)', ascending=False).head(12)
top_10_countries_by_coverage

Unnamed: 0,country,article_counts,population,total_articles_per_capita (million people)
157,Tuvalu,1,0.0,inf
99,Monaco,10,0.0,inf
4,Antigua and Barbuda,33,0.1,330.0
51,Federated States of Micronesia,14,0.1,140.0
96,Marshall Islands,13,0.1,130.0
152,Tonga,10,0.1,100.0
12,Barbados,25,0.3,83.333333
128,Seychelles,6,0.1,60.0
101,Montenegro,36,0.6,60.0
17,Bhutan,44,0.8,55.0


In [282]:
bottom_10_countries_by_coverage = article_counts_by_country.sort_values(by='total_articles_per_capita (million people)', ascending=True).head(10)
bottom_10_countries_by_coverage

Unnamed: 0,country,article_counts,population,total_articles_per_capita (million people)
31,China,16,1411.3,0.011337
67,India,151,1428.6,0.105698
57,Ghana,4,34.1,0.117302
125,Saudi Arabia,5,36.9,0.135501
167,Zambia,3,20.2,0.148515
111,Norway,1,5.5,0.181818
71,Israel,2,9.8,0.204082
45,Egypt,32,105.2,0.304183
37,Cote d'Ivoire,10,30.9,0.323625
50,Ethiopia,44,126.5,0.347826


### Top and bottom countries by high-quality articles

In [283]:
hq_article_counts = cleaned_df[cleaned_df['article_quality'].isin(['FA', 'GA'])].groupby("country").size().reset_index(name = "high_quality_article_counts")
hq_article_counts_by_country = pd.merge(hq_article_counts, population_cleaned, on='country')
hq_article_counts_by_country['high_quality_articles_per_capita (million people)'] = hq_article_counts_by_country['high_quality_article_counts'] / hq_article_counts_by_country['population']

In [285]:
top_10_countries_by_hqcount = hq_article_counts_by_country.sort_values(by='high_quality_articles_per_capita (million people)', ascending=False).head(10)
top_10_countries_by_hqcount

Unnamed: 0,country,high_quality_article_counts,population,high_quality_articles_per_capita (million people)
65,Montenegro,3,0.6,5.0
58,Luxembourg,2,0.7,2.857143
1,Albania,7,2.7,2.592593
52,Kosovo,4,1.7,2.352941
60,Maldives,1,0.6,1.666667
57,Lithuania,4,2.9,1.37931
25,Croatia,5,3.8,1.315789
40,Guyana,1,0.8,1.25
72,Palestinian Territory,6,5.5,1.090909
83,Slovenia,2,2.1,0.952381


#### **Note:** 
These are countries for which there is atleast 1 high-quality article. There are many countries which do not have high-quality articles for which the top 10 will not be unique. Hence, analyzed those countries that have atleast one high-quality article

In [286]:
bottom_10_countries_by_hqcount = hq_article_counts_by_country.sort_values(by='high_quality_articles_per_capita (million people)', ascending=True).head(10)
bottom_10_countries_by_hqcount

Unnamed: 0,country,high_quality_article_counts,population,high_quality_articles_per_capita (million people)
9,Bangladesh,1,173.5,0.005764
29,Egypt,1,105.2,0.009506
31,Ethiopia,2,126.5,0.01581
46,Japan,2,124.5,0.016064
71,Pakistan,4,240.5,0.016632
22,Colombia,1,52.2,0.019157
23,Congo DR,2,102.3,0.01955
102,Vietnam,2,98.9,0.020222
97,Uganda,1,48.6,0.020576
2,Algeria,1,46.8,0.021368


#### Subset of 10 (not unique) countries with zero articles of high-quality

In [321]:
cleaned_df[~cleaned_df['article_quality'].isin(['FA', 'GA'])].groupby("country").size().reset_index(name = "zero_high_quality_counts").sort_values(by='zero_high_quality_counts', ascending=False).head(10)

Unnamed: 0,country,zero_high_quality_counts
109,Nigeria,240
119,Poland,152
67,India,151
72,Italy,146
76,Kenya,122
137,Spain,119
73,Japan,115
53,France,114
122,Russia,110
134,Somalia,101


### Regions by coverage (Total articles per capita)

In [311]:
article_counts = cleaned_df.groupby("region").size().reset_index(name = "article_counts_by_region")
region_population = cleaned_df[['region', 'population']].drop_duplicates().groupby("region", as_index=False).agg({"population": "sum"})
article_counts_by_region = pd.merge(article_counts, region_population, on='region')
article_counts_by_region['total_articles_per_capita - Region (million people)'] = article_counts_by_region['article_counts_by_region'] / article_counts_by_region['population']

In [318]:
top_regions_by_coverage = article_counts_by_region.sort_values(by='total_articles_per_capita - Region (million people)', ascending=False)
top_regions_by_coverage['Rank'] = top_regions_by_coverage['total_articles_per_capita - Region (million people)'].rank(ascending=False).astype(int)
top_regions_by_coverage = top_regions_by_coverage[['Rank'] + [col for col in top_regions_by_coverage.columns if col != 'Rank']]
print(top_regions_by_coverage.to_string(index=False))

 Rank          region  article_counts_by_region  population  total_articles_per_capita - Region (million people)
    1 NORTHERN EUROPE                       191        27.8                                             6.870504
    2         OCEANIA                        72        10.9                                             6.605505
    3       CARIBBEAN                       219        36.3                                             6.033058
    4 SOUTHERN EUROPE                       797       150.9                                             5.281643
    5 CENTRAL AMERICA                       188        51.3                                             3.664717
    6  WESTERN EUROPE                       498       181.3                                             2.746828
    7  EASTERN EUROPE                       709       266.2                                             2.663411
    8    WESTERN ASIA                       610       295.4                                     

### Regions by high-quality article counts (Total articles per capita)

In [323]:
hq_article_counts = cleaned_df[cleaned_df['article_quality'].isin(['FA', 'GA'])].groupby("region").size().reset_index(name = "high_quality_article_counts_by_region")
hq_article_counts_by_region = pd.merge(hq_article_counts, region_population, on='region')
hq_article_counts_by_region['high_quality_articles_per_capita - Region (million people)'] = hq_article_counts_by_region['high_quality_article_counts_by_region'] / hq_article_counts_by_region['population']

In [325]:
top_regions_by_hq = hq_article_counts_by_region.sort_values(by='high_quality_articles_per_capita - Region (million people)', ascending=False)
top_regions_by_hq['Rank'] = top_regions_by_hq['high_quality_articles_per_capita - Region (million people)'].rank(ascending=False).astype(int)
top_regions_by_hq = top_regions_by_hq[['Rank'] + [col for col in top_regions_by_hq.columns if col != 'Rank']]
print(top_regions_by_hq.to_string(index=False))

 Rank          region  high_quality_article_counts_by_region  population  high_quality_articles_per_capita - Region (million people)
    1 SOUTHERN EUROPE                                     53       150.9                                                    0.351226
    2 NORTHERN EUROPE                                      9        27.8                                                    0.323741
    3       CARIBBEAN                                      9        36.3                                                    0.247934
    4 CENTRAL AMERICA                                     10        51.3                                                    0.194932
    5  EASTERN EUROPE                                     38       266.2                                                    0.142750
    6 SOUTHERN AFRICA                                      8        68.3                                                    0.117130
    7  WESTERN EUROPE                                     21       18

## DATA 512 -  Homework 2: Considering Bias in Data

### Goal of the project
The goal of this assignment is to explore the concept of bias in data using Wikipedia articles. It will focus on articles about political figures from various countries, analyzing potential biases in the data obtained from internet sources like Wikipedia, particularly in relation to regional disparities or underrepresentation. The objective is to examine the correlation between the number of articles about politicians, the percentage of high-quality articles, and the population of each country. Through this analysis, we aim to identify any underlying biases in the data and address key questions, such as whether internet articles and content are reliable and unbiased sources for data analysis.

### License
This project is developed and distributed under the [MIT LICENSE](https://opensource.org/licenses/MIT), ensuring flexibility and openness for users and contributors. It allows anyone to use, modify, and distribute the code with minimal restrictions.
Under the MIT License, you are permitted to freely use the software for any purpose, but it comes with no warranty. The only requirement is that the original license and copyright notice must be included in any copies or substantial portions of the software.

### Process flow:
#### 1. Data Acquisition:
The data retrieval beings with two input files `politicians_by_country_AUG.2024.csv` which is obtained by scarping the Wikipedia [Category:Politicians by nationality](https://en.wikipedia.org/wiki/Category:Politicians_by_nationality)  to generate a list of Wikipedia article pages about politicians from a wide range of countries, and the `population_by_country_AUG.2024.csv` downloaded from the [world population data sheet](https://www.prb.org/international/indicator/population/table/) published by the Population Reference Bureau.


