In [2]:
import pdb
import pandas as pd
import requests
import signal
import pickle

## Helper functions for fetching redirects

In [2]:
def has_key(key, resp, pageid):
    """Report whether page ``pageid`` in an API response has a truthy ``key``.

    Note that this tests truthiness, not mere presence: a key that exists
    but maps to an empty or otherwise falsy value yields ``False``.

    Args:
        key: Field name to look up on the page entry.
        resp: Decoded API response containing ``resp["query"]["pages"]``.
        pageid: Key of the page entry inside ``resp["query"]["pages"]``.

    Returns:
        bool: ``True`` when the page entry holds a truthy value under ``key``.

    Raises:
        KeyError: if ``query``, ``pages`` or ``pageid`` is absent from ``resp``.
    """
    page_entry = resp["query"]["pages"][pageid]
    value = page_entry.get(key)
    return True if value else False

def matches(lst1, lst2):
    """Return the elements of ``lst1`` that also occur in ``lst2``.

    Order and duplicates follow ``lst1``; membership is tested with ``in``.

    Args:
        lst1: Iterable whose elements are filtered.
        lst2: Container checked for membership.

    Returns:
        list: elements of ``lst1`` found in ``lst2``.
    """
    common = []
    for item in lst1:
        if item in lst2:
            common.append(item)
    return common
    
    
def get_redirect_info(title, timeout=5):
    """Return the titles of the pages that redirect to ``title``.

    Sends ``action=query`` / ``prop=redirects`` requests to the English
    Wikipedia API and keeps following the ``continue`` token the API hands
    back until no further batch is offered.

    Args:
        title: Page title passed as the ``titles`` query parameter.
        timeout: Seconds ``requests`` waits on each HTTP request before
            giving up, so a stalled connection cannot block forever.

    Returns:
        list[str]: redirect titles in the order the API returned them;
        empty when the page has no redirects.

    Raises:
        requests.RequestException: on network failure, timeout or a
            non-success HTTP status.
        KeyError: if the decoded response has no ``query``/``pages`` payload.
    """
    params = {
        "action": "query",
        "format": "json",

        "titles": title,

        "prop": "redirects",

        # redirects
        "rdlimit": "max",
        "rdnamespace": 0,
        "rdprops": "pageid|title|fragment",
        # per the MediaWiki API docs, "!fragment" keeps only redirects
        # that do not point at a section of the target page
        "rdshow": "!fragment",
    }

    redirects = []

    # Iterate rather than recurse: one loop pass per continuation batch, so
    # a long result set cannot run into the interpreter's recursion limit.
    while True:
        response = requests.get(
            url="https://en.wikipedia.org/w/api.php",
            params=params,
            timeout=timeout)
        response.raise_for_status()
        payload = response.json()

        # Read every page entry instead of only the first key; a single
        # title normally produces exactly one entry.
        for page in payload["query"]["pages"].values():
            redirects.extend(rd["title"] for rd in page.get("redirects") or [])

        continuation = payload.get("continue")
        if not continuation:
            return redirects

        # Merge the continuation token(s) into the next request.
        params.update(continuation)

## RedirectsAPI class

In [3]:
class RedirectsAPI:
    """Look up redirects for each article in a DataFrame and pickle them in batches.

    ``df`` must provide a ``title`` and a ``visited`` column. Every lookup is
    guarded by a ``SIGALRM``-based timeout, which requires a platform that
    offers ``signal.SIGALRM`` and (per the ``signal`` module docs) that the
    instance is created in the main thread.
    """

    # Column layout of the batch frame that gets pickled.
    SAVE_COLUMNS = ['title', 'redirects']
    # Budget, in whole seconds, for one article lookup.
    API_TIMEOUT_SECONDS = 5

    def __init__(self, df, min_index=0, max_index=10**7):
        """Store ``df``, start an empty batch and install the alarm handler.

        Args:
            df: DataFrame of articles to process.
            min_index: Accepted for backward compatibility; not used.
            max_index: Accepted for backward compatibility; not used.
        """
        self.df = df

        self.reset_save_df()

        # setup timeout function
        def handle_alarm(signum, frame):
            raise RuntimeError("redirect lookup timed out")

        signal.signal(signal.SIGALRM, handle_alarm)

    def reset_save_df(self):
        """Replace ``save_df`` with an empty frame holding the batch columns."""
        self.save_df = pd.DataFrame(columns=self.SAVE_COLUMNS)

    def verify_results(self, redirect_info):
        """Return ``True`` when ``redirect_info`` is a dict (a successful lookup)."""
        return isinstance(redirect_info, dict)

    def get_redirect_info(self, title):
        """Fetch the redirects of ``title`` under a time limit.

        Returns:
            dict | None: ``{"title": ..., "redirects": [...]}`` on success,
            ``None`` when the lookup raised or exceeded the time limit.
        """
        signal.alarm(self.API_TIMEOUT_SECONDS)
        try:
            redirects = get_redirect_info(title)
            return {"title": title, "redirects": redirects}
        except Exception:
            # Best effort: a failed or timed-out lookup is reported as None.
            # KeyboardInterrupt is deliberately not caught, so the run can
            # still be stopped by hand.
            return None
        finally:
            # Always cancel the alarm; otherwise a lookup that failed early
            # leaves it pending and it fires later in unrelated code.
            signal.alarm(0)

    def pickle_save_df(self, filename):
        """Pickle ``save_df`` to ``../../data/raw/pickled_redirect_info/<filename>``."""
        with open("../../data/raw/pickled_redirect_info/{}".format(filename), "wb") as handler:
            pickle.dump(self.save_df, handler,
                        protocol=pickle.HIGHEST_PROTOCOL)

    def _save_batch(self, records, filename):
        """Materialise ``records`` as ``save_df`` and pickle it under ``filename``."""
        self.save_df = pd.DataFrame(records, columns=self.SAVE_COLUMNS)
        self.pickle_save_df(filename)

    def get_redirects(self, save_size=1000, save_index=0, save_name="redirect_api"):
        """Look up redirects for every row of ``df`` and pickle them in batches.

        Rows whose ``visited`` value is ``"fail"`` are skipped, as are rows
        whose lookup failed. Files are named ``<save_index>-<save_name>.pkl``;
        the index grows by one per written batch, and whatever is left at the
        end (possibly nothing) is written to one final file.

        Args:
            save_size: Number of rows that triggers writing a batch.
            save_index: Index used for the first file name.
            save_name: Suffix shared by all file names.
        """
        # Rows are gathered in a plain list and turned into a DataFrame once
        # per batch: DataFrame.append no longer exists in pandas 2.x and was
        # quadratic when called per row. Rows already pending in save_df are
        # carried over.
        records = self.save_df.to_dict("records")

        for _, row in self.df.iterrows():

            if row["visited"] == "fail":
                continue

            # get info for article (None if API call error)
            redirect_info = self.get_redirect_info(row["title"])

            if not self.verify_results(redirect_info):
                continue

            records.append(redirect_info)

            # if the batch is at capacity, save it to a pkl file and reinitialize
            if len(records) >= save_size:
                self._save_batch(records, f"{save_index}-{save_name}.pkl")
                save_index += 1
                self.reset_save_df()
                records = []

        # at the end, save the remaining entries to their own pkl file
        self._save_batch(records, f"{save_index}-{save_name}.pkl")

## Call API

In [4]:
# Load the visited-articles table and keep only its first 14,000 rows.
unique_df = pd.read_csv(
    "../../data/raw/07_updated_unique_articles_visited_api.csv"
).iloc[:14000]

# Fetch the redirects for those rows; the batches are pickled under the
# "redirect_api_6" name.
rd_api = RedirectsAPI(df=unique_df)
rd_api.get_redirects(save_name="redirect_api_6")