*IMPORTANT: Before using this notebook, make sure you have run `citation_helper.py`!*

# Bib grabber

For each BibTeX entry we need to send two requests to the server (one for the citation popup and one for the BibTeX link it contains), which greatly increases the risk of being rate-limited.

This notebook is a simple way to grab all the bib files, with support for resuming from the point where a previous run stopped.

### Initialize packages

In [None]:
import json
import os
import sys
import time
from tqdm import tqdm
import re
import requests

In [None]:
# Proxy mapping handed to `requests`: both HTTP and HTTPS traffic is routed
# through an HTTP proxy on localhost port 10809.
# NOTE(review): presumably a local proxy client must be listening on that
# port; adjust the addresses to match your environment.
proxies = {
    "http": "http://127.0.0.1:10809/",
    "https": "http://127.0.0.1:10809/"
}

In [None]:
# Browser-like headers sent with every request issued by this notebook.
# NOTE(review): "Cookie" is empty here; presumably it should be filled in with
# the cookie of a logged-in/verified browser session before running — confirm.
# NOTE(review): "Accept-Encoding" advertises "br"; `requests` only decodes
# Brotli-compressed responses when a Brotli library is installed — confirm it
# is available, otherwise response bodies may come back undecoded.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Cookie": ""
}

### Setup bs4

In [None]:
import warnings
from bs4 import BeautifulSoup


class SoupKitchen(object):
    """Factory for creating BeautifulSoup instances."""

    @staticmethod
    def make_soup(markup, parser=None):
        """Return a BeautifulSoup instance built from ``markup``.

        Args:
            markup: The document text to parse.
            parser: Optional name of the parser to use. When omitted,
                BeautifulSoup picks the most suitable one it can find.

        Returns:
            The BeautifulSoup object for ``markup``.
        """
        if parser is not None:
            return BeautifulSoup(markup, parser)

        # The caller left the choice to BeautifulSoup, which then warns that
        # no parser was specified ... and picks one anyway. Silence that
        # warning for this call only, so the process-wide warning filters
        # are left untouched.
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', 'No parser was explicitly specified')
            return BeautifulSoup(markup)

In [None]:
def get_http_response(url, timeout=30):
    """
    Helper method, sends HTTP request and returns response payload.

    Sleeps 5 seconds before every request to keep the request rate low.

    Args:
        url: Address to fetch with a GET request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The response body as text, or None when the request fails or the
        server answers with a status code other than 200.
    """
    time.sleep(5)
    try:
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    except requests.RequestException as err:
        # Report the failure instead of swallowing it, so the user can tell
        # a network/proxy problem apart from a rejected request.
        print(f"Failed to fetch {url}: {err}")
        return None

    if resp.status_code != 200:
        print(f"Failed to fetch {url}, status code {resp.status_code}")
        print(resp.text)
        return None
    return resp.text

In [None]:
def get_citation_data(aid):
    """
    Given an article id, retrieves the citation data behind the first link
    of the article's citation popup. Note, this requires that you adjusted
    the settings to tell Google Scholar to actually provide this
    information, *prior* to retrieving the article.

    Two requests are sent: one for the citation popup and one for the
    first link found in it.

    Args:
        aid: Article id, substituted into the ``q=info:<aid>`` query.

    Returns:
        The text returned by the citation link, or None when either request
        fails or the popup contains no link.
    """
    urlargs = {'aid': aid}
    GET_CITATION_URL = 'https://scholar.google.com/scholar?' \
                       + 'q=info:%(aid)s:scholar.google.com/' \
                       + '&output=cite&scirp=0&hl=en'
    html = get_http_response(url=GET_CITATION_URL % urlargs)
    if html is None:
        return None

    soup = SoupKitchen.make_soup(html)

    # find_all() returns a (possibly empty) list, never None, so test for
    # emptiness. Restricting to anchors that carry an href avoids a KeyError
    # on the lookup below.
    links = soup.find_all('a', href=True)
    if not links:
        return None

    # Force the export-format query parameter of the link to 4.
    # NOTE(review): 4 is presumably the BibTeX format — confirm against the
    # links in the citation popup.
    url_citation = re.sub(r'scisf=\d+', 'scisf=4', links[0]['href'])
    return get_http_response(url=url_citation)

### Load original raw files

In [None]:
# Names of all entries in the raw-input directory.
pl = os.listdir('../output/raw/')

In [None]:
for p in pl:
    with open('../output/raw/' + p, 'r', encoding='utf-8') as f:
        content = f.read()
    # Every "aid <id>" occurrence contributes the rest of its line as an id.
    aids = re.findall(r'aid (.*)', content)
    output_file = '../output/' + p + '.json'

    # Resume support: reload whatever a previous run already saved.
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    else:
        data = {}

    # Membership test rather than a length comparison, so that duplicate
    # ids in the raw file cannot make a complete result look unfinished.
    if all(aid in data for aid in aids):
        print(f"{p} finished")
        continue

    completed = True
    try:
        for aid in tqdm(aids):
            if aid in data:
                continue
            citation_data = get_citation_data(aid)
            if citation_data is None:
                # Fetch failed (e.g. rate-limited): stop working on this
                # file and keep what has been collected so far.
                completed = False
                break
            data[aid] = citation_data
            time.sleep(1)
    finally:
        # Save in all cases, including exceptions and Ctrl-C, so the next
        # run can resume instead of re-fetching everything.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

    if completed:
        print(f"{p} finished")
    else:
        print(f"{p} interrupted, partial results saved to {output_file}")

In [None]:
# Only the JSON result files: '../output/' also contains the 'raw' input
# directory, which cannot be loaded as JSON.
ans = sorted(name for name in os.listdir('../output/') if name.endswith('.json'))

### Transfer json to tex file

In [None]:
os.makedirs('./bibs', exist_ok=True)

for a in ans:
    output_file = '../output/' + a
    # Skip anything that is not a JSON result file (for example the 'raw'
    # sub-directory), which would otherwise crash json.load().
    if not (a.endswith('.json') and os.path.isfile(output_file)):
        continue
    with open(output_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # a[:-5] strips the '.json' suffix. Open with 'w' rather than 'a' so
    # that re-running this cell rewrites the file instead of appending a
    # second copy of every entry.
    with open('./bibs/' + a[:-5], 'w', encoding='utf-8') as f:
        for v in data.values():
            f.write(v)
            f.write('\n')