In [2]:
import os, json, time, datetime, urllib, re, requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Creating a df of politicans
politicians_df = pd.read_csv('/content/drive/My Drive/DATA_512_HW_2/Datasets/us_cities_by_state_SEPT.2023.csv')
population_df = pd.read_excel('/content/drive/My Drive/DATA_512_HW_2/Datasets/NST-EST2022.xlsx')


In [6]:
politicians_df.head(10)

Unnamed: 0,state,page_title,url
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama"
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama"
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama"
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama"
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama"
5,Alabama,"Albertville, Alabama","https://en.wikipedia.org/wiki/Albertville,_Ala..."
6,Alabama,"Alexander City, Alabama","https://en.wikipedia.org/wiki/Alexander_City,_..."
7,Alabama,"Aliceville, Alabama","https://en.wikipedia.org/wiki/Aliceville,_Alabama"
8,Alabama,"Allgood, Alabama","https://en.wikipedia.org/wiki/Allgood,_Alabama"
9,Alabama,"Altoona, Alabama","https://en.wikipedia.org/wiki/Altoona,_Alabama"


In [7]:
population_df.head()

Unnamed: 0,State,2022 Population
0,.Alabama,5074296
1,.Alaska,733583
2,.Arizona,7359197
3,.Arkansas,3045637
4,.California,39029342


Validating and cleaning data to handle inconsistencies


In [8]:
politicians_df.describe()

Unnamed: 0,state,page_title,url
count,22157,22157,22157
unique,48,21519,21519
top,Pennsylvania,County (United States),https://en.wikipedia.org/wiki/County_(United_S...
freq,2556,7,7


In [9]:
print(f'Politicians data shape: {politicians_df.shape}')
print(f'Politicians data duplicate record count: {politicians_df[politicians_df.duplicated()].shape}')
print(f'Politicians data duplicates (based on article url only) count: {politicians_df[politicians_df.duplicated(subset=["url"])].shape}')

Politicians data shape: (22157, 3)
Politicians data duplicate record count: (632, 3)
Politicians data duplicates (based on article url only) count: (638, 3)


In [10]:

# Considering only the last entry from the politicians data to eliminate duplicates
politicians_df = politicians_df[-politicians_df.duplicated()]

# Politicians data post clean up
print(f'Politicians data shape: {politicians_df.shape}')
print(f'Politicians data duplicate record count: {politicians_df[politicians_df.duplicated()].shape[0]}')

Politicians data shape: (21525, 3)
Politicians data duplicate record count: 0


In [11]:
ARTICLE_TITLES = politicians_df['page_title']
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': 'voreddy@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}


In [12]:

#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None,
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT,
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):

    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In the second step, you'll need to acquire estimated quality ratings for articles in the Wikipedia dataset using a machine learning system known as ORES (no longer an acronym). These ratings can be categorized from excellent to poor, ranging from FA (Featured article) to Stub (Stub-class article). The process includes:

a) Going through each line in the "politicians_by_country.SEPT.2023.csv" file.
b) Requesting page information to access the current page revision.
c) Submitting a request to ORES with the page's title and the current revision ID to receive the anticipated quality rating for each article.

In [None]:
all_pages = []
for article in ARTICLE_TITLES:
  info = request_pageinfo_per_article(article)
  for name in info['query']['pages']:
    all_pages.append(info['query']['pages'][name])

In [14]:
with open('/content/drive/My Drive/DATA_512_HW_2/processed_data/city_data.json', 'w') as f:
    json.dump(all_pages, f)
    print(f'Saved the file to {f.name}')

Saved the file to /content/drive/My Drive/DATA_512_HW_2/processed_data/city_data.json


In [13]:
cities = pd.read_json('/content/drive/My Drive/DATA_512_HW_2/processed_data/city_data.json')
title_ids = pd.Series(cities.lastrevid.values,index=cities.title).to_dict()

In [14]:
#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (60.0/5000.0)-API_LATENCY_ASSUMED

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "voreddy@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2023",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
# REQUEST_HEADER_PARAMS_TEMPLATE = {
#     'email_address' : "voreddy@uw.edu",         # your email address should go here
#     'access_token'  : "xxxx"
# }

REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "voreddy@uw.edu",         # your email address should go here
    'access_token'  : "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiJlYzJiOGM5MmI5ZGFjODVkN2YyMDhlMTMzODcyNzc4YiIsImp0aSI6ImM3MjAyMjU1MzQ2MTc0NjhhODg1ODRiYTVkZDgzOWEzYjNkN2Q4ZDUyYThiMjE0Y2I5MDQzOTljZWM3NmMyYjIyZmZlYjVkOGQwYjYwMjA3IiwiaWF0IjoxNjk3NTA5NzE2LjQ3NzA0OCwibmJmIjoxNjk3NTA5NzE2LjQ3NzA1MSwiZXhwIjozMzI1NDQxODUxNi40NzQ2OTMsInN1YiI6Ijc0MDIxOTE4IiwiaXNzIjoiaHR0cHM6Ly9tZXRhLndpa2ltZWRpYS5vcmciLCJyYXRlbGltaXQiOnsicmVxdWVzdHNfcGVyX3VuaXQiOjUwMDAsInVuaXQiOiJIT1VSIn0sInNjb3BlcyI6WyJiYXNpYyJdfQ.IeV-rxd_-f4EBUeDVARLoVI_u-J8tzI6HhK5sXCyC5A6-6Vih6A3Yff6VurohjOYv0Aswh8M8dU6zkScaHWA29EE03r9B_OmzUMfu2AdDVBY9OB7Z1q-qvrHCfGE_hDuLPtFGoc1H4jcXlyKasY0YKJ9w3vQ8E3uzgY5XWOwVGf0R6E3dDHPxrChmPrW9c1570g91QCjuXvDLAP62v5h0z-kJJZ1WHm9Opy-Cbic8EKsdcMYYMudthYPH6l08xFBpQIjytNKbOI-8WlCF7jDeeeXxdiSrSTBvusqaY4G1Boo9YGCTqVXjibYrNSOad9p5STjs1LktpBbLkDjAq5HIDLyGNEzOwdlOvvArqTvMMdZctT4dq5BKjXoGa5gX3c6Ee4I2ctxVvC8o673meHSGd9fAFbHyUe2hhqhMB6YPt-q480Pa_11b0NeeC_W5ENh_vKFGMwXshQB96q84SgehDGUyGz2lXBj2l3P1lqOxaeurX3CKzZN6W6BRh_3NHeo3xlfFlPQVS6pa2UDcZGG9I9jO8pRXJVOaLW9HDsGoF7LkzOeyVjyEx2MHu7YZeEMrDcIi5RsqlbeaNideAZ0KeBdnimKWQ0836Z0o_VLC4MlyLVUbbzQ3LywJLBK6wgrOvNGvolDyfLy3w3q2pDnUIfgLQ3PMFR729F7LTTkCxw"          # the access token you create will need to go here
}

#
#    A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
#
ARTICLE_REVISIONS = title_ids

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    These are used later - defined here so they, at least, have empty values
#
USERNAME = ""
ACCESS_TOKEN = ""
#

In [1]:
!pip install apikey

Collecting apikey
  Downloading apikey-0.2.4.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: apikey
  Building wheel for apikey (setup.py) ... [?25l[?25hdone
  Created wheel for apikey: filename=apikey-0.2.4-py3-none-any.whl size=6671 sha256=d19ded2c68c3916848a178ae654c27fa95493fd2d35d14f6aab9641e6170b305
  Stored in directory: /root/.cache/pip/wheels/d0/b2/c9/a4400b26c52c13f16c796d15694407a8c610a3098b9e886651
Successfully built apikey
Installing collected packages: apikey
Successfully installed apikey-0.2.4


In [15]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT,
                                   model_name = API_ORES_EN_QUALITY_MODEL,
                                   request_data = ORES_REQUEST_DATA_TEMPLATE,
                                   header_format = REQUEST_HEADER_TEMPLATE,
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):

    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token

    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")

    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)

    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)
            # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        #response = requests.get(request_url, headers=headers)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [None]:
#
#
#   Which article - the key for the article dictionary defined above
article_title = "Harmony, Indiana"
#
print(f"Getting LiftWing ORES scores for '{article_title}' with revid: {ARTICLE_REVISIONS[article_title]:d}")
#
#    Make the call, just pass in the article revision ID, email address, and access token
score = request_ores_score_per_article(article_revid=ARTICLE_REVISIONS[article_title],
                                       email_address="voreddy@uw.edu",
                                       access_token=ACCESS_TOKEN)
#
#    Output the result
try:
  for id in score['enwiki']['scores']:
except:
  print("No data for ",article_title)
#

In [16]:
quality = {}
for article_title in ARTICLE_REVISIONS:
  print(f"Getting LiftWing ORES scores for '{article_title}' with revid: {ARTICLE_REVISIONS[article_title]:d}")
  score = request_ores_score_per_article(article_revid=ARTICLE_REVISIONS[article_title],
                                       email_address="voreddy@uw.edu",
                                       access_token=ACCESS_TOKEN)
  for id in score['enwiki']['scores']:
    pred = score['enwiki']['scores'][id]['articlequality']['score']['prediction']
    quality[article_title] = pred

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Getting LiftWing ORES scores for 'Floyd, Iowa' with revid: 1165747026
Getting LiftWing ORES scores for 'Fonda, Iowa' with revid: 1165747057
Getting LiftWing ORES scores for 'Pocahontas County, Iowa' with revid: 1160185843
Getting LiftWing ORES scores for 'Fontanelle, Iowa' with revid: 1165747062
Getting LiftWing ORES scores for 'Forest City, Iowa' with revid: 1171091982
Getting LiftWing ORES scores for 'Fort Atkinson, Iowa' with revid: 1165747182
Getting LiftWing ORES scores for 'Fort Dodge, Iowa' with revid: 1176015674
Getting LiftWing ORES scores for 'Fort Madison, Iowa' with revid: 1165747198
Getting LiftWing ORES scores for 'Fostoria, Iowa' with revid: 1165747259
Getting LiftWing ORES scores for 'Franklin, Iowa' with revid: 1165747331
Getting LiftWing ORES scores for 'Fraser, Iowa' with revid: 1165747368
Getting LiftWing ORES scores for 'Fredericksburg, Iowa' with revid: 1165747373
Getting LiftWing ORES scores for 'Fr

KeyError: ignored

In [17]:
# Assuming you have a "quality" dictionary
quality_cities = list(quality.keys())
quality_predictions = list(quality.values())

quality_data = pd.DataFrame({'city': quality_cities, 'prediction': quality_predictions})
quality_data.to_csv("/content/drive/MyDrive/DATA_512_HW_2/processed_data/quality_predictions.csv", index=False)


In [18]:
import math

regions = pd.read_csv('/content/drive/MyDrive/DATA_512_HW_2/Datasets/US_States_by_Region-US_Census_Burea-Sheet1.csv')
import pandas as pd

# Assuming you have a "regions" DataFrame
regional_divisions = []
num_regions = regions.shape[0]
prev_region = ''
prev_division = ''

for i in range(num_regions):
    division = ''

    if type(regions.iloc[i]['REGION']) == str:
        division = division + str(regions.iloc[i]['REGION'])
        prev_region = regions.iloc[i]['REGION']
    else:
        division = division + str(prev_region)

    if type(regions.iloc[i]['DIVISION']) == str:
        division = division + '_' + str(regions.iloc[i]['DIVISION'])
        prev_division = regions.iloc[i]['DIVISION']
    else:
        division = division + '_' + str(prev_division)

    regional_divisions.append(division)

regions['regional_division'] = regional_divisions

state_data = regions[['STATE', 'regional_division']].dropna()

state_data.head(10)


Unnamed: 0,STATE,regional_division
2,Connecticut,Northeast_New England
3,Maine,Northeast_New England
4,Massachusetts,Northeast_New England
5,New Hampshire,Northeast_New England
6,Rhode Island,Northeast_New England
7,Vermont,Northeast_New England
9,New Jersey,Northeast_Middle Atlantic
10,New York,Northeast_Middle Atlantic
11,Pennsylvania,Northeast_Middle Atlantic
14,Illinois,Midwest_East North Central


In [21]:
# Load the predictions DataFrame from a CSV file
quality_predictions = pd.read_csv("/content/drive/MyDrive/DATA_512_HW_2/processed_data/quality_predictions.csv")

# Articles to be excluded
excluded_articles = ['2020 United States census', '2010 United States census', 'County (United States)',
                    'Population', 'Square mile', 'Federal Information Processing Standards',
                    'American National Standards Institute', 'Geographic Names Information System',
                    'Wikipedia:Citation needed']

# Remove the 'Unnamed: 0' column
quality_predictions = quality_predictions.drop("Unnamed: 0", axis=1)

# Filter out rows with articles to be excluded
filtered_predictions = quality_predictions[~quality_predictions['article'].isin(excluded_articles)]

# Extract city and state information
city_state_list = list(filtered_predictions['article'])
states = []
articles_without_state = []

for city_state in city_state_list:
    parts = city_state.split(", ")
    if len(parts) > 1:
        states.append(parts[-1])
    else:
        states.append("N/A")
        articles_without_state.append(city_state)


In [22]:
articles_without_state

['Utqiaġvik',
 'Los Angeles',
 'San Diego',
 'San Francisco',
 'Denver',
 'Miami',
 'Atlanta',
 'Echols County',
 'Indianapolis',
 'New Orleans',
 'Boston',
 'Nantucket',
 'Detroit',
 'Minneapolis',
 'St. Louis',
 'Las Vegas',
 'New York City',
 'Oklahoma City',
 'Philadelphia',
 'Pittsburgh',
 'Grand Divisions of Tennessee',
 'Salt Lake City',
 'Seattle',
 'Milwaukee']

In [27]:
filtered_predictions['state'] = states


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_predictions['state'] = states


In [23]:
pop_data = pd.read_excel('/content/drive/MyDrive/DATA_512_HW_2/Datasets/NST-EST2022.xlsx')
state_without_dots = []
for i in range(len(pop_data)):
  state_dot = ''.join(pop_data.iloc[i]['State'])
  state_without_dots.append(state_dot[1:])
pop_data['State'] = state_without_dots
pop_data.head()

Unnamed: 0,State,2022 Population
0,Alabama,5074296
1,Alaska,733583
2,Arizona,7359197
3,Arkansas,3045637
4,California,39029342


In [28]:
pred_reg = pd.merge(filtered_predictions,state_data, left_on='state',right_on='STATE',how='inner')


In [29]:
pred_reg.head()

Unnamed: 0,article,prediction,state,STATE,regional_division
0,"Abbeville, Alabama",C,Alabama,Alabama,South_East South Central
1,"Adamsville, Alabama",C,Alabama,Alabama,South_East South Central
2,"Addison, Alabama",C,Alabama,Alabama,South_East South Central
3,"Akron, Alabama",GA,Alabama,Alabama,South_East South Central
4,"Alabaster, Alabama",C,Alabama,Alabama,South_East South Central


In [30]:
pred_reg_pop = pd.merge(pred_reg,pop_data, left_on='state',right_on='State',how='inner')


In [31]:
city_ids = pd.read_json('/content/drive/MyDrive/DATA_512_HW_2/processed_data/city_data.json')
city_ids.head()

Unnamed: 0,pageid,ns,title,contentmodel,pagelanguage,pagelanguagehtmlcode,pagelanguagedir,touched,lastrevid,length,talkid,fullurl,editurl,canonicalurl,watchers,redirect,new
0,104730,0,"Abbeville, Alabama",wikitext,en,en,ltr,2023-10-10T22:35:37Z,1171163550,24706,281244.0,"https://en.wikipedia.org/wiki/Abbeville,_Alabama",https://en.wikipedia.org/w/index.php?title=Abb...,"https://en.wikipedia.org/wiki/Abbeville,_Alabama",,,
1,104761,0,"Adamsville, Alabama",wikitext,en,en,ltr,2023-10-10T22:35:37Z,1177621427,18040,281272.0,"https://en.wikipedia.org/wiki/Adamsville,_Alabama",https://en.wikipedia.org/w/index.php?title=Ada...,"https://en.wikipedia.org/wiki/Adamsville,_Alabama",,,
2,105188,0,"Addison, Alabama",wikitext,en,en,ltr,2023-10-10T22:35:37Z,1168359898,13309,281517.0,"https://en.wikipedia.org/wiki/Addison,_Alabama",https://en.wikipedia.org/w/index.php?title=Add...,"https://en.wikipedia.org/wiki/Addison,_Alabama",,,
3,104726,0,"Akron, Alabama",wikitext,en,en,ltr,2023-10-10T22:35:37Z,1165909508,11710,281240.0,"https://en.wikipedia.org/wiki/Akron,_Alabama",https://en.wikipedia.org/w/index.php?title=Akr...,"https://en.wikipedia.org/wiki/Akron,_Alabama",,,
4,105109,0,"Alabaster, Alabama",wikitext,en,en,ltr,2023-10-10T22:35:37Z,1179139816,20343,281444.0,"https://en.wikipedia.org/wiki/Alabaster,_Alabama",https://en.wikipedia.org/w/index.php?title=Ala...,"https://en.wikipedia.org/wiki/Alabaster,_Alabama",,,


In [32]:

rev_ids = []
for i in range(len(pred_reg_pop)):
  art = pred_reg_pop.iloc[i]['article']
  rev_ids.append(city_ids[city_ids['title']==art].iloc[0]['lastrevid'])

In [33]:
pred_reg_pop['revision_id'] = rev_ids
pred_reg_pop.drop(['STATE','State'],axis=1,inplace=True)
pred_reg_pop.rename(columns={'article': 'article_title', 'prediction': 'article_quality','2022 Population':'population'}, inplace=True)
cols = ['state','regional_division','population','article_title','revision_id','article_quality']
pred_reg_pop = pred_reg_pop[cols]
pred_reg_pop.to_csv('/content/drive/MyDrive/DATA_512_HW_2/processed_data/wp_scored_city_articles_by_state.csv',index=False)
