# Assignment 2: Bias in data
### Data 512
### Saturday, October 5
### Tara Wilson

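**TODO:** state the Python version used and list the libraries this notebook imports (`pandas`, `requests`, `json`, `logging`, `numpy`).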

In [1]:
import pandas as pd
import requests
import json
import logging
import numpy as np

In [2]:
# Send INFO-and-above log records to a file that is truncated on every run
# (filemode 'w'). Only the bare message is written: the format string has no
# %(asctime)s field, so datefmt is supplied but has no visible effect.
log_settings = {
    'filename': 'bias_in_data_error_log.log',
    'filemode': 'w',
    'level': logging.INFO,
    'format': '%(message)s',
    'datefmt': '%m-%d %H:%M',
}
logging.basicConfig(**log_settings)

In [3]:
# Load the two raw inputs from the local source_data/ folder:
# the article list (page, country, rev_id) and the 2018 population table.
page_data = pd.read_csv(filepath_or_buffer="source_data/page_data.csv")
population_data = pd.read_csv(filepath_or_buffer="source_data/WPDS_2018_data.csv")

In [4]:
# Preview the first rows of the raw article list (columns: page, country, rev_id).
page_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


In [5]:
# Preview the first rows of the raw population table
# (columns: Geography, Population mid-2018 (millions)).
population_data.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


Rows with page names that begin with the string "Template" need to be filtered out of `page_data`, as these are not Wikipedia articles and we do not want to include them in the analysis.

In [6]:
# Flag every row whose page name starts with "Template", then keep the rest.
is_template = page_data["page"].str.startswith("Template")
page_data = page_data[~is_template]
page_data.head()

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568


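**TODO — note about capitalized population rows:** entries in `population_data` whose `Geography` value is written in all caps (e.g. `AFRICA`) appear to be regional totals rather than individual countries; document here how they are handled.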

## ORES Requests

In [7]:
def get_ores_data(revision_ids):
    """Request ORES ``wp10`` quality scores for a batch of enwiki revisions.

    Parameters
    ----------
    revision_ids : iterable
        Revision ids to score. Each element is converted with ``str()`` and
        the results are joined with ``|`` into a single request.

    Returns
    -------
    dict
        The decoded JSON body of the ORES response.

    Raises
    ------
    requests.HTTPError
        If ORES answers with a 4xx/5xx status code.
    """
    # Identify the caller to the API operators.
    headers = {'User-Agent' : 'https://github.com/TaraWilson17', 'From' : 'wwtara@uw.edu'}

    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

    params = {'project' : 'enwiki',
              'model'   : 'wp10',
              'revids'  : '|'.join(str(x) for x in revision_ids)
              }
    # The headers were previously built but never passed, so the request went
    # out without the identifying User-Agent/From fields.
    api_call = requests.get(endpoint.format(**params), headers=headers)
    # Fail with the HTTP status instead of a confusing JSON/KeyError downstream.
    api_call.raise_for_status()
    return api_call.json()

In [8]:
# Number of article rows left after the Template filter.
len(page_data)

46701

In [9]:
# Parallel lists: revision_id[k] received the predicted class article_quality[k].
revision_id = []
article_quality = []

# Build a new frame rather than assigning into the filtered slice, which
# triggers pandas' SettingWithCopyWarning.
page_data = page_data.assign(rev_id=page_data["rev_id"].astype(np.int64))

# Number of revision ids sent to ORES per request.
ORES_BATCH_SIZE = 50

for batch_start in range(0, page_data.shape[0], ORES_BATCH_SIZE):
    batch_ids = page_data["rev_id"].iloc[batch_start:batch_start + ORES_BATCH_SIZE].tolist()
    batch_scores = get_ores_data(batch_ids)["enwiki"]["scores"]
    for rev, result in batch_scores.items():
        try:
            prediction = result["wp10"]["score"]["prediction"]
        except (KeyError, TypeError):
            # No usable score in the response for this revision: log and skip it.
            # Only lookup failures are caught, so Ctrl-C and real bugs still surface.
            logging.info("Unable to get a ORES response for revision id: %s", rev)
        else:
            revision_id.append(rev)
            article_quality.append(prediction)

In [19]:
# Pair each scored revision id with its predicted quality class.
article_data = pd.DataFrame(
    {
        "revision_id": revision_id,
        "article_quality": article_quality,
    }
)
article_data.head()

Unnamed: 0,revision_id,article_quality
0,355319463,Stub
1,393276188,Stub
2,393822005,Stub
3,395521877,Stub
4,395526568,Stub


In [20]:
# Convert the revision ids to integers so they can be matched against
# page_data's rev_id column.
article_data["revision_id"] = article_data["revision_id"].astype(str).astype(int)

# Attach page name and country to every scored revision, then drop the
# duplicate key column that the merge brings in.
all_article_data = (
    article_data
    .merge(page_data, left_on="revision_id", right_on="rev_id")
    .drop(columns=["rev_id"])
)
all_article_data.head()

Unnamed: 0,revision_id,article_quality,page,country
0,355319463,Stub,Bir I of Kanem,Chad
1,393276188,Stub,Information Minister of the Palestinian Nation...,Palestinian Territory
2,393822005,Stub,Yos Por,Cambodia
3,395521877,Stub,Julius Gregr,Czech Republic
4,395526568,Stub,Edvard Gregr,Czech Republic


In [21]:
# Attach the population figure to each article by matching its country against
# the Geography column (default inner join: unmatched rows on either side are dropped).
all_data = (
    all_article_data
    .merge(population_data, left_on="country", right_on="Geography")
    .drop(columns=["Geography"])
    .rename(columns={"Population mid-2018 (millions)": "population"})
)
all_data.head()

Unnamed: 0,revision_id,article_quality,page,country,population
0,355319463,Stub,Bir I of Kanem,Chad,15.4
1,498683267,Stub,Abdullah II of Kanem,Chad,15.4
2,565745353,Stub,Salmama II of Kanem,Chad,15.4
3,565745365,Stub,Kuri I of Kanem,Chad,15.4
4,565745375,Stub,Mohammed I of Kanem,Chad,15.4


In [22]:
# Write the combined dataset. all_data stores the article title under "page",
# so rename it on the way out; asking to_csv for a non-existent "article_name"
# column fails (or, on old pandas, writes an empty column).
all_data.rename(columns={"page": "article_name"}).to_csv(
    "wp_wpds_politicians_by_country.csv",
    sep=",",
    columns=["country", "article_name", "revision_id", "article_quality", "population"],
)

## Analysis

In [27]:
# Per-country summary: total article count, population, and the number of
# articles whose predicted class is FA or GA.
#
# A single groupby replaces the per-country filter + iterrows loop. It also
# removes the dependence on the inner loop variable `row` still being bound
# after the loop when the population was read.
HIGH_QUALITY_CLASSES = ["FA", "GA"]

article_stats = (
    all_data
    .assign(is_high_quality=all_data["article_quality"].isin(HIGH_QUALITY_CLASSES))
    # sort=False keeps countries in order of first appearance, as before.
    .groupby("country", sort=False)
    .agg(
        num_articles=("revision_id", "size"),
        # Population is a per-country value repeated on every row of the group.
        population=("population", "first"),
        num_high_quality_articles=("is_high_quality", "sum"),
    )
    .reset_index()
    .astype({"num_high_quality_articles": "int64"})
)
article_stats.head()

Unnamed: 0,country,num_articles,population,num_high_quality_articles
0,Chad,97,15.4,2
1,Cambodia,213,16.0,4
2,Canada,843,37.2,22
3,Egypt,235,97.0,9
4,Pakistan,1023,200.6,19


In [28]:
# populations in millions
# Population is stored as text in millions, possibly with thousands separators:
# strip the commas, parse as float, and convert to an absolute head count.
population_in_millions = article_stats["population"].str.replace(",", "").astype(float)
article_stats["population"] = population_in_millions * 1000000
article_stats.head()

Unnamed: 0,country,num_articles,population,num_high_quality_articles
0,Chad,97,15400000.0,2
1,Cambodia,213,16000000.0,4
2,Canada,843,37200000.0,22
3,Egypt,235,97000000.0,9
4,Pakistan,1023,200600000.0,19


In [29]:
# Articles (all, and FA/GA only) per person for each country.
population = article_stats["population"]
article_stats["articles_per_population"] = article_stats["num_articles"].div(population)
article_stats["quality_articles_per_population"] = article_stats["num_high_quality_articles"].div(population)
article_stats.head()

Unnamed: 0,country,num_articles,population,num_high_quality_articles,articles_per_population,quality_articles_per_population
0,Chad,97,15400000.0,2,6e-06,1.298701e-07
1,Cambodia,213,16000000.0,4,1.3e-05,2.5e-07
2,Canada,843,37200000.0,22,2.3e-05,5.913978e-07
3,Egypt,235,97000000.0,9,2e-06,9.278351e-08
4,Pakistan,1023,200600000.0,19,5e-06,9.471585e-08


## Result tables

### 1. Top 10 countries by coverage

In [33]:
# The ten countries with the most articles per person.
article_stats.nlargest(10, "articles_per_population")

Unnamed: 0,country,num_articles,population,num_high_quality_articles,articles_per_population,quality_articles_per_population
98,Tuvalu,54,10000.0,5,0.0054,0.0005
149,Nauru,52,10000.0,0,0.0052,0.0
39,San Marino,81,30000.0,0,0.0027,0.0
63,Monaco,40,40000.0,0,0.001,0.0
97,Liechtenstein,28,40000.0,0,0.0007,0.0
86,Tonga,63,100000.0,0,0.00063,0.0
104,Marshall Islands,37,60000.0,0,0.000617,0.0
66,Iceland,201,400000.0,2,0.000503,5e-06
166,Andorra,34,80000.0,0,0.000425,0.0
77,Grenada,36,100000.0,1,0.00036,1e-05


### 2. Bottom 10 countries by coverage

In [40]:
# The ten countries with the fewest articles per person.
article_stats.nsmallest(10, "articles_per_population")

Unnamed: 0,country,num_articles,population,num_high_quality_articles,articles_per_population,quality_articles_per_population
6,India,980,1371300000.0,17,7.146503e-07,1.2397e-08
58,Indonesia,210,265200000.0,10,7.918552e-07,3.770739e-08
20,China,1130,1393800000.0,41,8.107332e-07,2.941599e-08
150,Uzbekistan,28,32900000.0,2,8.510638e-07,6.079027e-08
106,Ethiopia,101,107500000.0,2,9.395349e-07,1.860465e-08
163,"Korea, North",36,25600000.0,7,1.40625e-06,2.734375e-07
178,Zambia,25,17700000.0,0,1.412429e-06,0.0
126,Thailand,112,66200000.0,3,1.691843e-06,4.531722e-08
125,Mozambique,58,30500000.0,0,1.901639e-06,0.0
115,Bangladesh,319,166400000.0,3,1.917067e-06,1.802885e-08


### 3. Top 10 countries by relative quality

In [36]:
# The ten countries with the most FA/GA articles per person.
article_stats.nlargest(10, "quality_articles_per_population")

Unnamed: 0,country,num_articles,population,num_high_quality_articles,articles_per_population,quality_articles_per_population
98,Tuvalu,54,10000.0,5,0.0054,0.0005
172,Dominica,12,70000.0,1,0.000171,1.4e-05
77,Grenada,36,100000.0,1,0.00036,1e-05
121,Vanuatu,58,300000.0,3,0.000193,1e-05
66,Iceland,201,400000.0,2,0.000503,5e-06
31,Ireland,376,4900000.0,21,7.7e-05,4e-06
124,Bhutan,33,800000.0,3,4.1e-05,4e-06
111,Maldives,83,400000.0,1,0.000208,3e-06
56,New Zealand,783,4900000.0,12,0.00016,2e-06
129,Israel,493,8500000.0,20,5.8e-05,2e-06


### 4. Bottom 10 countries by relative quality

In [35]:
# The ten countries with the fewest FA/GA articles per person.
# NOTE(review): every row in the recorded output has a ratio of 0.0, so this
# table is ten rows picked from a tie — confirm whether zero-count countries
# should be ranked some other way.
article_stats.nsmallest(10, "quality_articles_per_population")

Unnamed: 0,country,num_articles,population,num_high_quality_articles,articles_per_population,quality_articles_per_population
14,Malta,103,500000.0,0,0.000206,0.0
22,Angola,106,30400000.0,0,3e-06,0.0
28,Finland,569,5500000.0,0,0.000103,0.0
32,Tunisia,138,11600000.0,0,1.2e-05,0.0
39,San Marino,81,30000.0,0,0.0027,0.0
50,Uganda,185,44100000.0,0,4e-06,0.0
52,Moldova,423,3500000.0,0,0.000121,0.0
63,Monaco,40,40000.0,0,0.001,0.0
76,Turkmenistan,32,5900000.0,0,5e-06,0.0
80,Slovakia,116,5400000.0,0,2.1e-05,0.0


### 5. Geographic regions by coverage (by politician articles from countries in each region as a proportion of total regional population)

### 6. Geographic regions by coverage (by relative proportion of politician articles from countries in each region that are of GA and FA-quality)

## Reflections and implications