# Downloading All Research Projects XML Files from KAKEN

## Setting

Specify the range of years for which data is to be retrieved; KAKEN stores data from 1964 onward.

In [None]:
# First and last year to download (the download loop includes both ends);
# 1964 is the earliest year for which KAKEN stores data.
startyear, endyear = 1964, 2022

1. You need to register to use the CiNii Web API. See https://support.nii.ac.jp/ja/cinii/api/developer for details. After registration is complete, you will be notified of your appid by email.
 
2. You need to install MariaDB locally and keep it running while coding.


In [None]:
import configparser
import os
import re
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm_notebook

In [None]:
# CiNii Web API application id, sent as the `appid` query parameter.
# Set the KAKEN_APPID environment variable to use your own id; the literal
# below is only the fallback used when that variable is unset or empty.
# NOTE(review): an application id committed to source control is readable by
# anyone with access to this file -- consider removing the fallback and
# rotating the id.
appid = os.environ.get("KAKEN_APPID") or 'HhV6SqJ0yYlgkp0a887V'

# Base URL of the KAKEN OpenSearch endpoint; further query parameters are
# appended to it with "&".
ROOT_URL = 'https://kaken.nii.ac.jp/opensearch/?appid=' + appid

Create an empty directory to store all XML files.

In [None]:
# Create the output directory together with any missing parent directories;
# exist_ok=True makes re-running this cell harmless when it already exists.
os.makedirs(
    name="/users/norika_machome/GitHub/Capstone_KAKEN/Data/all_xml",
    exist_ok=True,
)

## Defining Function to make the file with year and index

In [None]:
def get_number_of_record(year):
    """Return the number of records KAKEN reports for a single year.

    Requests the XHTML search-result page with both ``s1`` and ``s2`` set to
    ``year`` and reads the first number that appears in the first
    ``p.search-term-selected`` element of that page.

    Args:
        year: Year to query, as an integer.

    Returns:
        The record count, as an integer.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page has no ``p.search-term-selected`` element or
            that element contains no number (same exception type the
            unchecked indexing raised before, now with a message).
    """
    url = ROOT_URL + "&format=xhtml" + "&s1=" + str(year) + "&s2=" + str(year) + "&o1=1"
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")
    tag = soup.select("p.search-term-selected")
    if not tag:
        raise IndexError(
            f"no 'p.search-term-selected' element in the result page for year {year}"
        )
    p = str(tag[0])
    # Digits followed by any number of ",digits" groups, so that a count such
    # as "1,234,567" is read in full rather than cut off after the second group.
    pattern = r"[+-]?[0-9]+(?:,[0-9]+)*"
    numberlist = re.findall(pattern, p)
    if not numberlist:
        raise IndexError(
            f"no number found in the 'p.search-term-selected' element for year {year}"
        )
    # Strip the thousands separators before converting to int.
    return int(numberlist[0].replace(",", ""))

## Download all XML files

In [None]:
# Download the search results of every year in pages of `rw` records and save
# each page as "<year>_<start>-<end>.xml" in the output directory.
rw = 500  # number of records requested per call
out_dir = "/users/norika_machome/GitHub/Capstone_KAKEN/Data/all_xml/"
failed = []  # (year, start, reason) for every page that could not be saved

for year in tqdm_notebook(range(startyear, endyear + 1)):
    cnt = get_number_of_record(year)
    # Pages start at record 1 and the last record is number `cnt`, so the
    # range has to include `cnt` itself; stopping at `cnt` skipped the final
    # page whenever cnt % rw == 1 (for example cnt == 1 or cnt == 501).
    for start in tqdm_notebook(range(1, cnt + 1, rw)):
        # Last record covered by this page (the final page may be shorter).
        end = min(start + rw - 1, cnt)

        url = (
            ROOT_URL
            + "&format=xml"
            + "&s1="
            + str(year)
            + "&s2="
            + str(year)
            + "&o1=1"
            + "&st="
            + str(start)
            + "&rw="
            + str(rw)
        )
        try:
            # Without a timeout a stalled connection would block the whole run.
            r = requests.get(url, timeout=60)
        except requests.RequestException as err:
            # Keep going with the remaining pages, but remember what is missing.
            failed.append((year, start, repr(err)))
            continue
        if r.status_code != 200:
            # Previously such pages were skipped without any notice.
            failed.append((year, start, f"HTTP {r.status_code}"))
            continue

        filename = f"{out_dir}{year}_{start}-{end}.xml"
        with open(filename, mode="w", encoding="utf-8") as f:
            f.write(r.text)

# Report every page that is missing so it can be downloaded again.
if failed:
    print(f"{len(failed)} page(s) could not be downloaded:")
    for failed_year, failed_start, reason in failed:
        print(f"  year={failed_year} start={failed_start}: {reason}")