In [2]:
import sys
sys.path.append("../../utils")
sys.path.append("../../")

import pdb

import requests
import pandas as pd
import re

import signal
import time

import pickle

from keys import *

import warnings
warnings.filterwarnings("ignore")

In [3]:
def has_key(key, resp, pageid):
    """Tell whether page `pageid` in an API response carries a non-empty `key`.

    Args:
        key: Property name to look up on the page entry (e.g. "links").
        resp: Decoded API response; must contain resp["query"]["pages"][pageid].
        pageid: Key of the page entry inside resp["query"]["pages"].

    Returns:
        True when the property exists and is truthy; False when it is missing
        or empty (empty string, empty list, None, ...).
    """
    page_entry = resp["query"]["pages"][pageid]
    value = page_entry.get(key)
    if value:
        return True
    return False

def matches(lst1, lst2):
    """Collect the items of `lst1` that also occur in `lst2`.

    The order of `lst1` is kept and duplicates in `lst1` are kept as well;
    membership is decided with the `in` operator on `lst2`.

    Returns:
        A new list; the inputs are not modified.
    """
    shared = []
    for item in lst1:
        if item in lst2:
            shared.append(item)
    return shared
    
    
def get_article_info(title, timeout=10):
    """Fetch intro text, outgoing links, incoming links and categories for one article.

    Queries the English Wikipedia API and keeps following the response's
    `continue` tokens until the API reports no further batches.  Continuation
    is followed with a loop (not recursion), so articles that need a very
    large number of batches cannot exhaust the interpreter's recursion limit.

    Args:
        title: Value sent as the `titles` query parameter.
        timeout: Seconds passed to `requests.get` as the per-request timeout.

    Returns:
        A 4-tuple `(extract, links, linkshere, categories)` of lists:
        the intro extract strings, titles of linked pages, titles of pages
        linking here, and category titles.  Categories whose title contains
        "articles", "uses" or "commons" (case-insensitive) are dropped.
        All four lists are empty when the API returns no page data.

    Raises:
        RuntimeError: the response body contains an `error` entry.
        requests.RequestException: transport failure, timeout or HTTP error status.
    """
    params = {
        "action": "query",
        "format": "json",

        "titles": title,

        "prop": "extracts|info|links|linkshere|categories",

        # extracts
        "exintro": True,
        "explaintext": True,
        "exsectionformat": "plain",

        # links
        "pllimit": "max",
        "plnamespace": 0,

        # linkshere
        "lhlimit": "max",
        "lhnamespace": 0,
        "lhshow": "!redirect",

        # categories
        "cllimit": "max",
    }

    extract = []
    links = []
    linkshere = []
    categories = []

    while True:
        response = requests.get(
            url="https://en.wikipedia.org/w/api.php",
            params=params,
            timeout=timeout)
        response.raise_for_status()
        resp = response.json()

        # Surface API-level failures explicitly instead of as a KeyError below.
        if "error" in resp:
            raise RuntimeError("MediaWiki API error: {}".format(resp["error"]))

        # Only the first page entry is read, matching a single-title query.
        pages = resp.get("query", {}).get("pages", {})
        page = next(iter(pages.values()), {})

        if page.get("extract"):
            extract.append(page["extract"])

        links.extend(link["title"] for link in page.get("links") or [])
        linkshere.extend(lh["title"] for lh in page.get("linkshere") or [])

        for cat in page.get("categories") or []:
            # Skip categories whose name mentions articles/uses/commons.
            if not re.search(r"(articles)|(uses)|(commons)", cat["title"], re.I):
                categories.append(cat["title"])

        continuation = resp.get("continue")
        if not continuation:
            break
        # Merge the continuation tokens into the next request's parameters.
        params.update(continuation)

    return extract, links, linkshere, categories

In [10]:
class WikiAPI:
    """Checkpointed downloader of Wikipedia article info for a table of titles.

    The tracking table (`self.df`) is read from a CSV with `title` and
    `visited` columns.  `visited` holds the string "False" for rows still to
    fetch and is overwritten with "success" or "fail" as rows are processed.
    Successful results accumulate in `self.save_df` and are pickled in batches.
    """

    def __init__(self, path_to_df='../../data/raw/06_unique_articles_visited_api.csv'):
        """Load the tracking CSV and install the SIGALRM timeout handler.

        Args:
            path_to_df: CSV to read the tracking table from; the same path is
                overwritten when progress is checkpointed.
        """
        # Read `visited` as text: a file whose column is entirely False/True
        # would otherwise be parsed as bool and never equal the string "False".
        self.df = pd.read_csv(path_to_df, dtype={"visited": str})
        self.path_to_df = path_to_df
        self.reset_save_df()

        # setup timeout function
        def handle_alarm(signum, frame):
            raise RuntimeError("Wikipedia API call timed out")

        signal.signal(signal.SIGALRM, handle_alarm)

    def reset_save_df(self):
        """Replace `self.save_df` with an empty frame that has the result columns."""
        self.save_df = pd.DataFrame(
            columns=["title", "extract", "links", "linkshere", "categories"])

    def pickle_save_df(self, filename):
        """Pickle `self.save_df` to ../../data/raw/pickled_article_info/<filename>."""
        with open("../../data/raw/pickled_article_info/{}".format(filename), "wb") as handler:
            pickle.dump(self.save_df, handler,
                        protocol=pickle.HIGHEST_PROTOCOL)

    def get_article_info(self, title):
        """Fetch info for one title, giving up after 5 seconds.

        Returns:
            A dict with keys title/extract/links/linkshere/categories, or None
            when the lookup raised (including the timeout's RuntimeError).
        """
        signal.alarm(5)  # allow up to 5 seconds for each api call
        try:
            extract, links, linkshere, categories = get_article_info(title)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt still stops the crawl.
            return None
        finally:
            # Always cancel the alarm; a leftover one would fire later,
            # outside this try block, and abort the whole run.
            signal.alarm(0)

        return {"title": title, "extract": extract,
                "links": links, "linkshere": linkshere, "categories": categories}

    def mark_visited(self, index, state):
        """Set the `visited` cell of the row at integer position `index` to `state`."""
        # Single positional assignment; chained `df.visited.iloc[...] = ...`
        # may write to a temporary copy and leave `self.df` unchanged.
        visited_col = self.df.columns.get_loc("visited")
        self.df.iloc[index, visited_col] = state

    def verify_results(self, info):
        """Return True when `info` is a result dict with at least one non-empty field."""
        if not info:
            return False
        fields = ("extract", "links", "linkshere", "categories")
        return any(info[field] != [] for field in fields)

    def _append_record(self, record):
        """Add one result dict as a new row at the end of `self.save_df`."""
        row = pd.DataFrame([record], columns=self.save_df.columns)
        if self.save_df.empty:
            self.save_df = row
        else:
            self.save_df = pd.concat([self.save_df, row], ignore_index=True)

    def get_api_info(self, save_size=1000, save_index=0):
        """Fetch every row still marked "False" and checkpoint the results.

        Args:
            save_size: Number of successful rows that triggers a checkpoint.
            save_index: Number used in the first pickle file name
                ("<save_index>-api_info.pkl"); incremented after each checkpoint.

        Side effects:
            Updates `visited` in `self.df`, writes pickle files through
            `pickle_save_df`, and rewrites the CSV at `self.path_to_df` on each
            checkpoint and once more at the end.
        """
        # Snapshot the pending rows first so the table can be updated while looping.
        pending = [
            (position, title)
            for position, (title, visited)
            in enumerate(zip(self.df["title"], self.df["visited"]))
            if visited == "False"
        ]

        for position, title in pending:
            # get info for article (None if API call error)
            article_info = self.get_article_info(title)

            if not self.verify_results(article_info):
                # if not successful, mark as visited with "fail"
                self.mark_visited(position, "fail")
                continue

            # if successful, mark as visited with "success"
            self.mark_visited(position, "success")

            # buffer the result until the next checkpoint
            self._append_record(article_info)

            # if the save df is at capacity, save it to a pkl file and reinitialize
            if self.save_df.shape[0] == save_size:
                self.pickle_save_df(f"{save_index}-api_info.pkl")
                save_index += 1
                self.reset_save_df()
                self.df.to_csv(self.path_to_df, index=False)

        # at the end, save the remaining buffered entries to their own pkl file
        # and persist the visited markers
        self.pickle_save_df(f"{save_index}-api_info.pkl")
        self.df.to_csv(self.path_to_df, index=False)

    def seen_to_csv(self):
        """Write `self.seen` to article_title_relationships.csv as name/relation columns.

        NOTE(review): `self.seen` is never assigned anywhere in this class, so
        calling this raises AttributeError unless a caller sets it first.
        Presumably it maps title -> iterable of relations; confirm before use.
        """
        for title, relation in self.seen.items():
            self.seen[title] = list(relation)

        seen_df = pd.DataFrame(self.seen).T.reset_index()
        seen_df.columns = ["name", "relation"]
        seen_df.to_csv("article_title_relationships.csv", index=False)

In [11]:
# Load the 06 tracking table on its own so it can be inspected separately.
unique_visted = pd.read_csv('../../data/raw/06_unique_articles_visited_api.csv')

# Crawl against the 07 tracking table, numbering checkpoint files from 112.
api = WikiAPI(path_to_df="../../data/raw/07_updated_unique_articles_visited_api.csv")
api.get_api_info(save_index=112, save_size=1000)

In [117]:
# Preview the first rows (at most 12) of the tracking table.
unique_visted.head(12)

Unnamed: 0,title,visited
0,Division_by_Zero_(story),False
1,Indeterminate_form,False
2,Zero_divisor,False
3,Count_noun,False
4,Grammatical_conjugation,False
5,Inflection,False
6,Measure_word,False
7,English_numerals,False
8,Foundations_of_mathematics,False
9,0,False


In [8]:
# Reload checkpoint file 112 to check what was pickled.
with open("../../data/raw/pickled_article_info/112-api_info.pkl", mode="rb") as f:
    test_df = pickle.load(f)

test_df

Unnamed: 0,title,extract,links,linkshere,categories
