# Declarations

In [123]:
def import_or_install(package):
    """
    Import a package, installing it with pip first if the import fails.

    Adapted from:
    stackoverflow.com/questions/4527554/check-if-module-exists-if-not-install-it

    Input : string package (used both as the import name and the pip name)
    Output : None; installs package if it does not exist

    Raises subprocess.CalledProcessError if the pip command exits non-zero.
    """
    # Local imports: this function is defined (and may be called) before the
    # notebook's main import cell has been executed.
    import subprocess
    import sys

    try:
        __import__(package)
    except ImportError:
        # pip.main() is no longer part of pip's public API (removed in pip 10).
        # Running pip as a module of the current interpreter is the supported
        # way and guarantees the package lands in the kernel's environment.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

In [124]:
import pandas as pd
import numpy as np
import json
import pip
import re
from urllib.request import urlopen
import_or_install('wikidata')
import requests
from wikidata.client import Client
from scipy import stats
import powerlaw
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pickle
from fa2 import ForceAtlas2
import operator


## Function declarations

In [125]:
def load_all_to_df(*datasets):
    """
    Read several CSV files and stack them into one dataframe.

    Input : one or more strings, each the path of a CSV file
    Output: pandas dataframe containing all records of all CSVs, with a
            fresh 0..n-1 index
    """
    frames = []
    for csv_path in datasets:
        frames.append(pd.read_csv(csv_path))
    combined = pd.concat(frames, ignore_index=True)
    return combined

In [126]:
def get_wikipedia_url(wikidata):
    """
    Look up the English Wikipedia page linked to a Wikidata entity.

    Input : string wikidata ID
    Output: url stored under sitelinks -> enwiki -> url of the entity data
    """
    entity = Client().get(wikidata, load=True)
    sitelinks = entity.data['sitelinks']
    enwiki_link = sitelinks['enwiki']
    return enwiki_link['url']

In [128]:
def get_json_wikipedia(url):
    """
    Fetch a Wikipedia page through the MediaWiki API.

    Input : string wikipedia page url
    Output: string, the 'query' member of the API response (requested with
            prop=revisions&rvprop=content) re-serialised as JSON text
    """
    baseurl = 'https://en.wikipedia.org/w/api.php?'
    action = 'action=query'
    content = '&prop=revisions&rvprop=content'
    dataformat = '&format=json'
    # NOTE(review): get_wikipedia_title is not defined in the visible cells;
    # presumably it extracts the page title from the url -- confirm it is
    # defined (and returns a URL-safe title) before a fresh-kernel run.
    title = '&titles=' + get_wikipedia_title(url)
    query = "%s%s%s%s%s" % (baseurl,action,content,dataformat,title)

    # The context manager closes the HTTP response even if read() raises,
    # so repeated calls do not leak open connections.
    with urlopen(query) as wikiresponse:
        wikisource = wikiresponse.read()
    wikitext = wikisource.decode('utf-8')
    jsontext = json.loads(wikitext)['query']
    return json.dumps(jsontext)

In [134]:
def find_wikilinks(df = None):
    """
    Collect the wiki links found on each member's Wikipedia page.

    Input : pandas dataframe with a 'wikidata' column of wikidata IDs
    Output : the same dataframe (modified in place) with a new 'links'
             column holding, per row, the list of links found on the page
    """
    df['links'] = [
        get_links_from_url(get_wikipedia_url(entity_id))
        for entity_id in df.wikidata
    ]
    return df

In [135]:
def get_links_from_url(url):
    """
    Extract wiki link targets from a Wikipedia page.

    Input : string wikipedia page url
    Output: list of strings, the target of every [[target]] or
            [[target|label]] occurrence in the text returned by
            get_json_wikipedia (the label part is dropped)
    """
    link_regex = re.compile(r'\[\[(.*?)(?:\|.*?)?\]\]')
    page_text = get_json_wikipedia(url)
    return link_regex.findall(page_text)

# Load data for all members of the 116th Congress: House of Representatives and Senate

In [136]:
# Stack the two term-116 CSVs (paths are relative to the notebook's working
# directory) into a single frame of members.
df_members = load_all_to_df('..//term-116_congress.csv','..//term-116_senate.csv')
# find_wikilinks adds the 'links' column to its argument and returns that same
# object, so df_connections and df_members refer to one frame. For every row it
# queries Wikidata and then the Wikipedia API, so this cell needs network
# access and is slow.
df_connections = find_wikilinks(df_members)

In [137]:
# Display the members frame with its new 'links' column.
df_connections

Unnamed: 0,id,name,sort_name,email,twitter,facebook,group,group_id,area_id,area,chamber,term,start_date,end_date,image,gender,wikidata,wikidata_group,wikidata_area,links
0,eaf0104e-b8ae-4336-a353-ca5228fe2752,A. Donald McEachin,"McEachin, A.",,RepMcEachin,RepMcEachin,Democrat,democrat,VA-4,Virginia's 4th congressional district,House of Representatives,116,,,https://theunitedstates.io/images/congress/ori...,male,Q4647699,Q29552,Q7934058,"[Virginia, Randy Forbes, Virginia's 9th Senate..."
1,c5441370-42c5-4d83-b73e-6130cc4910cc,A. Drew Ferguson IV,"Ferguson, A.",,RepDrewFerguson,RepDrewFerguson,Republican,republican,GA-3,Georgia's 3rd congressional district,House of Representatives,116,,,https://theunitedstates.io/images/congress/ori...,male,Q26157659,Q29468,Q5547263,[Party leaders of the United States House of R...
2,05165473-b19d-484d-8de3-e6ed420cc299,Abby Finkenauer,"Finkenauer, Abby",,,,Democrat,democrat,IA-1,Iowa's 1st congressional district,House of Representatives,116,,,https://theunitedstates.io/images/congress/ori...,female,Q24435337,Q29552,Q13858962,"[Iowa, Ashley Hinson, Rod Blum, Pat Murphy (Io..."
3,6133038f-b893-45d2-aa09-a7feb433edf7,Abigail Davis Spanberger,"Spanberger, Abigail",,,,Democrat,democrat,VA-7,Virginia's 7th congressional district,House of Representatives,116,,,https://theunitedstates.io/images/congress/ori...,female,Q55603085,Q29552,Q7934064,"[Virginia, Dave Brat, Red Bank, New Jersey, De..."
4,e0b61cab-a183-4a44-bb0a-81f25fda8de3,Adam B. Schiff,"Schiff, Adam",,RepAdamSchiff,RepAdamSchiff,Democrat,democrat,CA-28,California's 28th congressional district,House of Representatives,116,,,https://theunitedstates.io/images/congress/ori...,male,Q350843,Q29552,Q5020000,[United States House Permanent Select Committe...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,be1429b6-d20f-4984-9c55-86fc973242df,Tim Scott,"Scott, Tim",,SenatorTimScott,SenatorTimScott,Republican,republican,ocd-division/country:us/state:sc,South Carolina,Senate,116,,,https://theunitedstates.io/images/congress/ori...,male,Q561315,Q29468,Q1456,[United States Senate Special Committee on Agi...
536,7cd602d6-2486-4971-abda-ae5516b7b6fc,Tina Smith,"Smith, Tina",,SenTinaSmith,USSenTinaSmith,Democrat,democrat,ocd-division/country:us/state:mn,Minnesota,Senate,116,,,https://theunitedstates.io/images/congress/ori...,female,Q18631509,Q29552,Q1527,"[Minnesota, Amy Klobuchar, Al Franken, List of..."
537,99537cde-00d9-41d3-af9c-247f13215944,Todd Young,"Young, Todd",,SenToddYoung,SenatorToddYoung,Republican,republican,ocd-division/country:us/state:in,Indiana,Senate,116,,,https://theunitedstates.io/images/congress/ori...,male,Q25483,Q29468,Q1415,"[Indiana, Mike Braun, Dan Coats, National Repu..."
538,d7bfdfde-86cd-4da5-99d5-5c560561d7cc,Tom Cotton,"Cotton, Tom",,SenTomCotton,SenatorTomCotton,Republican,republican,ocd-division/country:us/state:ar,Arkansas,Senate,116,,,https://theunitedstates.io/images/congress/ori...,male,Q3090307,Q29468,Q1612,"[Arkansas, John Boozman, Mark Pryor, Arkansas,..."


In [122]:
# Persist the frame (including the 'links' column) to connections.pkl in the
# current working directory.
# NOTE(review): this cell's execution count (122) is lower than the cells
# above (123-137) -- re-run the notebook top to bottom to confirm the pickle
# reflects the frame built above.
df_connections.to_pickle('connections.pkl')