# Store all projects between 2000 and 2022 to local MariaDB

## Overview

- Assumption: XML file of the research proposal is stored in the . /xml folder.
- Save the data at the time of the offer.
- In principle, data is obtained from the summary element. In principle, data should be obtained from the summary element.
Researcher information is located in two places: grantAward/summary/member and grantAward/memberList/member.
In the former, the same person does not appear more than once, but there are no codes for affiliations, etc. In the latter, there are codes for affiliations, etc., but the information is not available every year.
The latter has the affiliated institution code, but the same person appears more than once because of the annual performance report.
For the time being, we will obtain data from the former. When we have enough time, we would like to compare the data with the latter.

### Logistics

1. grantaward : The main part of the research proposal. Proposal number, research type, start year, end year, total direct cost, etc.


- Part 1: Items in the research proposal data that will not change, such as the proposal number and research type, and that are one-to-one with the proposal number.
- Part 2: Research organization in the adopted fiscal year.
- Part 3: Principal investigator for the year of adoption

The above three parts are combined using the proposal number as a key to create a single table, which is written in the DB.

The following table is basically a one-to-many relationship for grantaward. write to DB.

2. grantaward_member : Principal Investigator, Research Assignee, etc.
3. grantaward_field : Research field. Based on the system field sub-division table.
4. grantaward_review_section : Review section. Based on the review section table.
5. grantaward_annual : Amount of direct expenses per fiscal year.
6. grantaward_keyword : Keyword of the research proposal.
7. grantaward_paragraph : text data such as research outline
8. grantaward_product: research deliverables

## Preparation

In [1]:
import configparser
import os
import pickle
import re
import shutil
from glob import glob

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from lxml import etree
from sqlalchemy import create_engine
from sqlalchemy.types import Date, Integer, String, BigInteger
from tqdm import tqdm_notebook as tqdm

In [2]:
username= 'root'
password= 'Norika1124!'
database= 'Capstone_KAKEN'
appid = 'HhV6SqJ0yYlgkp0a887V'
url = 'mysql+pymysql://' + username + ':' + password + '@localhost:3306/' + database + '?charset=utf8'
engine = create_engine(url, echo=True)

データセットを作成する年度を指定する

In [17]:
startyear = 2000
endyear = 2022

## Extract data from xml files

In [18]:
# Projects
def kadai(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    kadailist = []
    for grantAward in tree.iterfind("grantAward"):
        projecttype = grantAward.get("projectType")
        awardnumber = grantAward.get("awardNumber")
        summary = grantAward.find("summary[@xml:lang='ja']", nsmap)
        projectstatus = summary.find("projectStatus")
        try:
            projectstatus_fiscalyear = projectstatus.get("fiscalYear")
        except AttributeError:
            projectstatus_fiscalyear = None
        try:
            projectstatus_statuscode = projectstatus.get("statusCode")
        except AttributeError:
            projectstatus_statuscode = None
        startfiscalyear = summary.find("periodOfAward").get("searchStartFiscalYear")
        endfiscalyear = summary.find("periodOfAward").get("searchEndFiscalYear")
        try:
            category_niicode = summary.find("category").get("niiCode")
        except AttributeError:
            category_niicode = None
        try:
            category = summary.find("category").text
        except AttributeError:
            category = None
        try:
            section_niicode = summary.find("section").get("niiCode")
        except AttributeError:
            section_niicode = None
        try:
            section = summary.find("section").text
        except AttributeError:
            section = None
        try:
            title_ja = summary.find("title").text
        except AttributeError:
            title_ja = None
        try:
            title_en = summary.find("title").text
        except AttributeError:
            title_en = None
        try:
            directcost = summary.find("overallAwardAmount/directCost").text
        except AttributeError:
            directcost = None
        try: 
            allocation_niicode = summary.find("allocation").get("niiCode")
        except AttributeError:
            allocation_niicode = None
        row = [
            awardnumber,
            projecttype,
            projectstatus_fiscalyear,
            projectstatus_statuscode,
            startfiscalyear,
            endfiscalyear,
            category_niicode,
            category,
            section_niicode,
            section,
            title_ja,
            title_en,
            directcost,
            allocation_niicode
        ]
        kadailist.append(row)
    dumpfilename = (
        "dump_kadai_2000_2022/main/main_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(kadailist, f)

In [19]:
# Institution
def institution(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    institutionlist = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")
        grantlist = grantAward.find("grantList")
        try:
            for grant in grantlist.iterfind("grant[@xml:lang='ja']", nsmap):
                fiscalyear = grant.get("fiscalYear")
                grant_sequence = grant.get("sequence")
                for institution in grant.iterfind("institution"):
                    institution_sequence = institution.get("sequence")
                    institution_niicode = institution.get("niiCode")
                    institution_mextcode = institution.get("mextCode")
                    institution_jspscode = institution.get("jspsCode")
                    institution_name = institution.text
                    row = [
                        awardnumber,
                        fiscalyear,
                        grant_sequence,
                        institution_sequence,
                        institution_niicode,
                        institution_mextcode,
                        institution_jspscode,
                        institution_name,
                    ]
                    institutionlist.append(row)
        except AttributeError:
            row = [awardnumber] + [None] * 7
    dumpfilename = (
        "dump_kadai_2000_2022/institution/institution_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(institutionlist, f)

In [20]:
# Researcher
def member(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    memberlist = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")
        summary = grantAward.find("summary[@xml:lang='ja']", nsmap)
        for member in summary.iterfind("member", nsmap):
            sequence = member.get("sequence")
            try:
                participate = member.get("participate")
            except AttributeError:
                participate = None
            eradcode = member.get("eradCode")
            role = member.get("role")
            try:
                fullname = member.find("personalName/fullName").text
            except AttributeError:
                fullname = None
            try:
                familyname = member.find("personalName/familyName").text
            except AttributeError:
                familyname = None
            try:
                givenname = member.find("personalName/givenName").text
            except AttributeError:
                givenname = None
            try:
                familyname_yomi = member.find("personalName/familyName").get("yomi")
            except AttributeError:
                familyname_yomi = None
            try:
                givenname_yomi = member.find("personalName/givenName").get("yomi")
            except AttributeError:
                givenname_yomi = None
            row = [
                awardnumber,
                sequence,
                participate,
                eradcode,
                role,
                fullname,
                familyname,
                givenname,
                familyname_yomi,
                givenname_yomi,
            ]
            memberlist.append(row)
    dumpfilename = (
        "dump_kadai_2000_2022/member/member_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(memberlist, f)

In [21]:
# Project Field
def field(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    fieldlist = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")
        summary = grantAward.find("summary[@xml:lang='ja']", nsmap)
        for field in summary.iterfind("field"):
            field_sequence = field.get("sequence")
            field_path = field.get("path")
            field_niicode = field.get("niiCode")
            field_table = field.get("fieldTable")
            field_name = field.text
            row = [
                awardnumber,
                field_sequence,
                field_path,
                field_niicode,
                field_table,
                field_name,
            ]
            fieldlist.append(row)
    dumpfilename = (
        "dump_kadai_2000_2022/field/field_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(fieldlist, f)

In [22]:
# Review Section in Application
def review_section(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    review_secitonlist = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")
        summary = grantAward.find("summary[@xml:lang='ja']", nsmap)
        for review_section in summary.iterfind("review_section", nsmap):
            review_section_sequence = review_section.get("sequence")
            review_section_niicode = review_section.get("niiCode")
            review_section_table_type = review_section.get("tableType")
            review_section_name = review_section.text
            row = [
                awardnumber,
                review_section_sequence,
                review_section_niicode,
                review_section_table_type,
                review_section_name,
            ]
            review_secitonlist.append(row)
        dumpfilename = (
            "dump_kadai_2000_2022/review_section/review_section_"
            + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
            + ".dump"
        )
    with open(dumpfilename, "wb") as f:
        pickle.dump(review_secitonlist, f)

In [23]:
# FY direct cost
def annual(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    directcostlist = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")
        for awardamountlist in grantAward.iterfind("awardAmountList"):
            sequence = awardamountlist.get("sequence")
            for awardamount in awardamountlist.iterfind("awardAmount"):
                try:
                    fiscalyear = awardamount.get("fiscalYear")
                except AttributeError:
                    fiscalyear = None
                try:
                    directcost = awardamount.find("directCost").text
                except AttributeError:
                    directcost = None
                row = [awardnumber, sequence, fiscalyear, directcost]
                directcostlist.append(row)
    dumpfilename = (
        "dump_kadai_2000_2022/annual/annual_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(directcostlist, f)

In [24]:
# Keyword in research
def keyword(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    keywordlist = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")
        try:
            keywordList = grantAward.find("summary[@xml:lang='ja']/keywordList", nsmap)
            for keyword in keywordList.iterfind("keyword"):
                keyword_sequence = keyword.get("sequence")
                keyword_text = keyword.text
                row = [awardnumber, keyword_sequence, keyword_text]
                keywordlist.append(row)
        except AttributeError:
            row = [awardnumber] + [None] * 2
    dumpfilename = (
        "dump_kadai_2000_2022/keyword/keyword_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(keywordlist, f)

In [25]:
# Text data from research
def paragraph(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    textlist = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")
        summary = grantAward.find("summary[@xml:lang='ja']", nsmap)
        try:
            for paragraphlist in summary.iterfind("paragraphList"):
                paragraphlist_sequence = paragraphlist.get("sequence")
                paragraphlist_parentid = paragraphlist.get("parentId")
                paragraphlist_type = paragraphlist.get("type")
                for paragraph in paragraphlist.iterfind("paragraph"):
                    paragraph_sequence = paragraph.get("sequence")
                    paragraph_text = paragraph.text
                    row = [
                        awardnumber,
                        paragraphlist_sequence,
                        paragraphlist_parentid,
                        paragraphlist_type,
                        paragraph_sequence,
                        paragraph_text,
                    ]
                    textlist.append(row)
        except AttributeError:
            row = [awardnumber] + [None] * 5
            textlist.append(row)
    dumpfilename = (
        "dump_kadai_2000_2022/paragraph/paragraph_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(textlist, f)

In [26]:
# RESEARCH PRODUCT
def product(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    productlist = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")
        try:
            productlistenriched = grantAward.find("productListEnriched")
            for product in productlistenriched.iterfind("product"):
                product_type = product.get("type")
                sequence = product.get("sequence")
                try:
                    reviewed = product.get("reviewed")
                except AttributeError:
                    reviewed = None
                try:
                    doi = product.find("doi").text
                except AttributeError:
                    doi = None
                try:
                    author_ja = product.find("author[@xml:lang='ja']", nsmap).text
                except AttributeError:
                    author_ja = None
                try:
                    author_en = product.find("author[@xml:lang='en']", nsmap).text
                except AttributeError:
                    author_en = None
                try:
                    title_ja = product.find("title[@xml:lang='ja']", nsmap).text
                except AttributeError:
                    title_ja = None
                try:
                    title_en = product.find("title[@xml:lang='en']", nsmap).text
                except AttributeError:
                    title_en = None
                try:
                    journaltitle_ja = product.find(
                        "journalTitle[@xml:lang='ja']", nsmap
                    ).text
                except AttributeError:
                    journaltitle_ja = None
                try:
                    journaltitle_en = product.find(
                        "journalTitle[@xml:lang='en']", nsmap
                    ).text
                except AttributeError:
                    journaltitle_en = None
                try:
                    year = product.find("year").text
                except AttributeError:
                    year = None
                row = [
                    awardnumber,
                    product_type,
                    sequence,
                    reviewed,
                    doi,
                    author_ja,
                    author_en,
                    title_ja,
                    title_en,
                    journaltitle_ja,
                    journaltitle_en,
                    year,
                ]
                productlist.append(row)
        except:
            row = [awardnumber] + [None] * 11
            productlist.append(row)

    dumpfilename = (
        "dump_kadai_2000_2022/product/product_"
        + re.search("[0-9]{4}_[0-9]+-[0-9]+.xml", xmlfile).group()
        + ".dump"
    )
    with open(dumpfilename, "wb") as f:
        pickle.dump(productlist, f)

Extract the data related to projects in xml file

In [27]:
# Make dump_file empty
target_dir = "dump_kadai_2000_2022"
if os.path.isdir(target_dir):
    shutil.rmtree(target_dir)
parts = [
    "main",
    "institution",
    "member",
    "field",
    "review_section",
    "annual",
    "keyword",
    "paragraph",
    "product",
]
dirlist = [target_dir + "/" + p for p in parts]
for d in dirlist:
    os.makedirs(d)

In [28]:
# create xml file
filenames = []
for i in range(startyear, endyear + 1):
    globdir = "/Users/norika_machome/GitHub/Capstone_KAKEN/Data/all_xml/" + str(i) + "*.xml"
    filenames.extend(glob(globdir))

# parse the code to runt the big funciton 
def parse(xmlfile):
    kadai(xmlfile)
    institution(xmlfile)
    member(xmlfile)
    field(xmlfile)
    review_section(xmlfile)
    annual(xmlfile)
    keyword(xmlfile)
    paragraph(xmlfile)
    product(xmlfile)

# Parallel computation
Parallel(n_jobs=-1, verbose=1)([delayed(parse)(i) for i in filenames])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   48.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 27.7min
[Parallel(n_jobs=-1)]: Done 1170 out of 1170 | elapsed: 37.3min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

## Crete the function to process data

In [3]:
def merge_list(parts):
    lists = []
    for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):
        with open(dump, mode="rb") as f:
            l = pickle.load(f)
            lists += l
    return lists

## Research Basic Table

### Components 1: Main Research data

In [30]:
# Merge list
lists = merge_list("main")
# Convert list to dataframe
columns = [
    "awardnumber",
    "projecttype",
    "projectstatus_fiscalyear",
    "projectstatus_statuscode",
    "startfiscalyear",
    "endfiscalyear",
    "category_niicode",
    "category",
    "section_niicode",
    "section",
    "title_ja",
    "title_en",
    "directcost",
    "allocation_niicode"
]
base_main = pd.DataFrame(lists, columns=columns)
# If there is no overlap, make it to index
assert not base_main["awardnumber"].duplicated().any(), "awardnumber is duplicated."
base_main = base_main.set_index("awardnumber")
# Category and Section has many missng values, so drop them here
base_main = base_main.drop(columns=["category", "section"])
base_main

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0_level_0,projecttype,projectstatus_fiscalyear,projectstatus_statuscode,startfiscalyear,endfiscalyear,category_niicode,section_niicode,title_ja,title_en,directcost,allocation_niicode
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
17689047,project,2006,project_closed,2005,2006,64,,緑内障に対する神経栄養因子を用いた遺伝子治療の前臨床研究,緑内障に対する神経栄養因子を用いた遺伝子治療の前臨床研究,20200000,hojokin
17689046,project,2006,project_closed,2005,2007,64,,糖尿病網膜症における毛細血管障害・脱落予防と正常血管の再構築,糖尿病網膜症における毛細血管障害・脱落予防と正常血管の再構築,14200000,hojokin
17689045,project,2007,project_closed,2005,2007,64,,骨髄血管前駆細胞を使った網膜血管再生治療及び網膜神経保護療法の開発,骨髄血管前駆細胞を使った網膜血管再生治療及び網膜神経保護療法の開発,21700000,hojokin
17689044,project,2006,project_closed,2005,2006,64,,上気道における樹状細胞の粘膜免疫誘導機能の解析と経鼻DNAワクチンへの応用,上気道における樹状細胞の粘膜免疫誘導機能の解析と経鼻DNAワクチンへの応用,6200000,hojokin
17689043,project,2006,project_closed,2005,2006,64,,cDNAマイクロアレーを用いたCDDP耐性機序の究明と耐性解除による頭頚部癌制圧,cDNAマイクロアレーを用いたCDDP耐性機序の究明と耐性解除による頭頚部癌制圧,7200000,hojokin
...,...,...,...,...,...,...,...,...,...,...,...
17K01675,project,2019,project_closed,2017,2019,72,17,慣性センサーを用いたスポーツ活動のモニタリングとフィードバック手法の開発,慣性センサーを用いたスポーツ活動のモニタリングとフィードバック手法の開発,3400000,kikin
17K01674,project,2019,project_closed,2017,2019,72,17,実践的な中長距離走のランニング技術の測定・評価法の確立,実践的な中長距離走のランニング技術の測定・評価法の確立,2800000,kikin
17K01673,project,2020,granted,2017,2021,72,17,進化論的マインドセットがプレッシャー下の意思決定と運動制御方略に及ぼす影響,進化論的マインドセットがプレッシャー下の意思決定と運動制御方略に及ぼす影響,3600000,kikin
17K01672,project,2019,project_closed,2017,2019,72,17,筋収縮反応と筋疲労後の回復時間を改善させる新たなストレッチング方法の確立,筋収縮反応と筋疲労後の回復時間を改善させる新たなストレッチング方法の確立,3500000,kikin


### Componet2: Institution at the acceptation

In [31]:
# merge list
lists = merge_list("institution")
# convert list to dataframe
columns = [
    "awardnumber",
    "fiscalyear",
    "grant_sequence",
    "institution_sequence",
    "institution_niicode",
    "institution_mextcode",
    "institution_jspscode",
    "institution_name",
]
base_institution = pd.DataFrame(lists, columns=columns)
# Retrieve the row with the smallest fiscalyear per awardnumber (i.e., the row of the research organization at the time of adoption)
oldest = base_institution.groupby("awardnumber")["fiscalyear"].min().reset_index()
# Leave only lines in df that match oldest
base_institution = pd.merge(oldest, base_institution, on=["awardnumber", "fiscalyear"])
# Make sure there are no duplicate issue numbers and set them in the index
assert not base_institution["awardnumber"].duplicated().any(), "awardnumber is duplicated."
base_institution = base_institution.set_index("awardnumber")
# Delete unused columns
base_institution = base_institution.drop(columns=["fiscalyear", "grant_sequence", "institution_sequence"])
base_institution

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0_level_0,institution_niicode,institution_mextcode,institution_jspscode,institution_name
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00F00001,0010101,10101,10101,北海道大学
00F00004,0017102,17102,17102,九州大学
00F00005,0022601,22601,22601,東京都立大学
00F00006,0064401,64401,64401,国立民族学博物館
00F00007,0064401,64401,64401,国立民族学博物館
...,...,...,...,...
99J82502,0013901,13901,13901,名古屋大学
99J82602,0014301,14301,14301,京都大学
99J82901,0011301,11301,11301,東北大学
99J83101,0013901,13901,13901,名古屋大学


### Component3: Researcher info at acceptance

In [32]:
# merge list
lists = merge_list("member")
# convert list to dataframe
columns = [
    "awardnumber",
    "sequence",
    "participate",
    "eradcode",
    "role",
    "fullname",
    "familyname",
    "givenname",
    "familyname_yomi",
    "givenname_yomi",
]
base_member = pd.DataFrame(lists, columns=columns)
# 代表者のみ抽出
daihyou = [
    "principal_investigator",
    "area_organizer",
    "principal_investigator_support",
    "research_fellow",
    "foreign_research_fellow",
]
base_member = base_member[base_member["role"].isin(daihyou)]
# データ型を指定する
base_member = base_member.astype({"sequence": int})
# awardnumberごとにsequenceが最大のレコードのみ抽出する。生のXMLを眺めてみると、sequenceが大きいほど古い年度のデータなので。
seqmax = base_member.groupby('awardnumber')['sequence'].max().reset_index()
base_member = pd.merge(seqmax, base_member, on=['awardnumber', 'sequence'])
# 課題番号に重複がないことを確認して、インデックスに設定する
assert not base_member["awardnumber"].duplicated().any(), "awardnumber is duplicated."
base_member = base_member.set_index("awardnumber")
base_member

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0_level_0,sequence,participate,eradcode,role,fullname,familyname,givenname,familyname_yomi,givenname_yomi
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00F00001,3,,,foreign_research_fellow,HANKO L.,HANKO,L.,,
00F00004,2,,,foreign_research_fellow,OH S.,OH,S.,,
00F00005,2,,,foreign_research_fellow,BAO Z.,BAO,Z.,,
00F00006,2,,,foreign_research_fellow,LEE A.,LEE,A.,,
00F00007,2,,,foreign_research_fellow,KIM H. S.,KIM,H. S.,,
...,...,...,...,...,...,...,...,...,...
99J83101,1,,,research_fellow,藤井 洋之,藤井,洋之,フジイ,ヒロユキ
99J83201,1,,,research_fellow,中石 智之,中石,智之,ナカイシ,トモユキ
S001,1,,60117603,area_organizer,今井 浩三,今井,浩三,イマイ,コウゾウ
S002,1,,70135292,area_organizer,小原 雄治,小原,雄治,コハラ,ユウジ


### Integrate 3 components above

In [33]:
# merge three dataframe 
base = base_main.join(base_institution)
base = base.join(base_member)
base

Unnamed: 0_level_0,projecttype,projectstatus_fiscalyear,projectstatus_statuscode,startfiscalyear,endfiscalyear,category_niicode,section_niicode,title_ja,title_en,directcost,...,institution_name,sequence,participate,eradcode,role,fullname,familyname,givenname,familyname_yomi,givenname_yomi
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17689047,project,2006,project_closed,2005,2006,64,,緑内障に対する神経栄養因子を用いた遺伝子治療の前臨床研究,緑内障に対する神経栄養因子を用いた遺伝子治療の前臨床研究,20200000,...,九州大学,1.0,,20380389,principal_investigator,池田 康博,池田,康博,イケダ,ヤスヒロ
17689046,project,2006,project_closed,2005,2007,64,,糖尿病網膜症における毛細血管障害・脱落予防と正常血管の再構築,糖尿病網膜症における毛細血管障害・脱落予防と正常血管の再構築,14200000,...,京都大学,1.0,,80335265,principal_investigator,鈴間 潔,鈴間,潔,スズマ,キヨシ
17689045,project,2007,project_closed,2005,2007,64,,骨髄血管前駆細胞を使った網膜血管再生治療及び網膜神経保護療法の開発,骨髄血管前駆細胞を使った網膜血管再生治療及び網膜神経保護療法の開発,21700000,...,京都大学,1.0,,30314222,principal_investigator,大谷 篤史,大谷,篤史,オオタニ,アツシ
17689044,project,2006,project_closed,2005,2006,64,,上気道における樹状細胞の粘膜免疫誘導機能の解析と経鼻DNAワクチンへの応用,上気道における樹状細胞の粘膜免疫誘導機能の解析と経鼻DNAワクチンへの応用,6200000,...,大分大学,1.0,,40325717,principal_investigator,児玉 悟,児玉,悟,コダマ,サトル
17689043,project,2006,project_closed,2005,2006,64,,cDNAマイクロアレーを用いたCDDP耐性機序の究明と耐性解除による頭頚部癌制圧,cDNAマイクロアレーを用いたCDDP耐性機序の究明と耐性解除による頭頚部癌制圧,7200000,...,東北大学,1.0,,50375060,principal_investigator,小川 武則,小川,武則,オガワ,タケノリ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17K01675,project,2019,project_closed,2017,2019,72,17,慣性センサーを用いたスポーツ活動のモニタリングとフィードバック手法の開発,慣性センサーを用いたスポーツ活動のモニタリングとフィードバック手法の開発,3400000,...,鹿屋体育大学,1.0,,70325819,principal_investigator,和田 智仁,和田,智仁,ワダ,トモヒト
17K01674,project,2019,project_closed,2017,2019,72,17,実践的な中長距離走のランニング技術の測定・評価法の確立,実践的な中長距離走のランニング技術の測定・評価法の確立,2800000,...,鹿屋体育大学,1.0,,30404544,principal_investigator,松村 勲,松村,勲,マツムラ,イサオ
17K01673,project,2020,granted,2017,2021,72,17,進化論的マインドセットがプレッシャー下の意思決定と運動制御方略に及ぼす影響,進化論的マインドセットがプレッシャー下の意思決定と運動制御方略に及ぼす影響,3600000,...,広島大学,1.0,,40281159,principal_investigator,関矢 寛史,関矢,寛史,セキヤ,ヒロシ
17K01672,project,2019,project_closed,2017,2019,72,17,筋収縮反応と筋疲労後の回復時間を改善させる新たなストレッチング方法の確立,筋収縮反応と筋疲労後の回復時間を改善させる新たなストレッチング方法の確立,3500000,...,広島大学,1.0,,10536783,principal_investigator,前田 慶明,前田,慶明,マエダ,ノリアキ


In [1]:
# Drop foreign key
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE grantaward_review_section DROP FOREIGN KEY fk_grantaward_review_section_grantaward;")
        con.execute("ALTER TABLE grantaward_field DROP FOREIGN KEY fk_grantaward_field_grantaward;")
        con.execute("ALTER TABLE grantaward_annual DROP FOREIGN KEY fk_grantaward_annual_grantaward;")
        con.execute("ALTER TABLE grantaward_member DROP FOREIGN KEY fk_grantaward_member_grantaward;")
        con.execute("ALTER TABLE grantaward_paragraph DROP FOREIGN KEY fk_grantaward_paragraph_grantaward;")
        con.execute("ALTER TABLE grantaward_keyword DROP FOREIGN KEY fk_grantaward_keyword_grantaward;")
        con.execute("ALTER TABLE grantaward_product DROP FOREIGN KEY fk_grantaward_product_grantaward;")
        con.execute("ALTER TABLE grantaward DROP PRIMARY KEY;")
        con.execute("ALTER TABLE grantaward_member DROP PRIMARY KEY;")
        con.execute("ALTER TABLE grantaward_field DROP PRIMARY KEY;")
        con.execute("ALTER TABLE grantaward_review_section DROP PRIMARY KEY;")
        con.execute("ALTER TABLE grantaward_annual DROP PRIMARY KEY;")
        con.execute("ALTER TABLE grantaward_keyword DROP PRIMARY KEY;")
        con.execute("ALTER TABLE grantaward_paragraph DROP PRIMARY KEY;")
        con.execute("ALTER TABLE grantaward_product DROP PRIMARY KEY;")
except:
    pass

In [35]:
# upload it to database
base.to_sql(
    "grantaward",
    engine,
    if_exists="replace",
    dtype={
        "awardnumber": String(255),
        "startfiscalyear": Integer,
        "endfiscalyear": Integer,
        "projectstatus_fiscalyear": Integer,
        "category_niicode": Integer,
        "section_niicode": Integer,
        "institution_niicode": String(7),
        "directcost": BigInteger,
        "sequence": Integer,
        "eradcode": String(8),
    },
)

2022-12-08 21:08:03,251 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 21:08:03,252 INFO sqlalchemy.engine.Engine [generated in 0.00099s] {'table_schema': 'Capstone_KAKEN', 'table_name': 'grantaward'}
2022-12-08 21:08:03,258 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-08 21:08:03,261 INFO sqlalchemy.engine.Engine 
CREATE TABLE grantaward (
	awardnumber VARCHAR(255), 
	projecttype TEXT, 
	projectstatus_fiscalyear INTEGER, 
	projectstatus_statuscode TEXT, 
	startfiscalyear INTEGER, 
	endfiscalyear INTEGER, 
	category_niicode INTEGER, 
	section_niicode INTEGER, 
	title_ja TEXT, 
	title_en TEXT, 
	directcost BIGINT, 
	allocation_niicode TEXT, 
	institution_niicode VARCHAR(7), 
	institution_mextcode TEXT, 
	institution_jspscode TEXT, 
	institution_name TEXT, 
	sequence INTEGER, 
	participate TEXT, 
	eradcode VARCHAR(8), 
	role TEXT, 
	fullname TEXT, 
	familyname TEXT, 
	

581224

In [36]:
# Set main and foreign key in database
with engine.connect() as con:
    con.execute("ALTER TABLE grantaward ADD PRIMARY KEY(awardnumber)")
    con.execute(
        "ALTER TABLE grantaward ADD CONSTRAINT category_niicode_1 FOREIGN KEY (category_niicode) REFERENCES master_categories(category_niicode);"
    )
    con.execute(
        "ALTER TABLE grantaward ADD CONSTRAINT section_niicode_1 FOREIGN KEY (section_niicode) REFERENCES master_sections(section_niicode);"
    )
    con.execute(
        "ALTER TABLE grantaward ADD CONSTRAINT institution_niicode_1 FOREIGN KEY (institution_niicode) REFERENCES master_institutions(institution_niicode);"
    )

2022-12-08 21:09:24,770 INFO sqlalchemy.engine.Engine ALTER TABLE grantaward ADD PRIMARY KEY(awardnumber)
2022-12-08 21:09:24,776 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:09:32,060 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:09:32,062 INFO sqlalchemy.engine.Engine ALTER TABLE grantaward ADD CONSTRAINT category_niicode_1 FOREIGN KEY (category_niicode) REFERENCES master_categories(category_niicode);
2022-12-08 21:09:32,064 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:09:42,471 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:09:42,472 INFO sqlalchemy.engine.Engine ALTER TABLE grantaward ADD CONSTRAINT section_niicode_1 FOREIGN KEY (section_niicode) REFERENCES master_sections(section_niicode);
2022-12-08 21:09:42,473 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:09:42,632 INFO sqlalchemy.engine.Engine ROLLBACK


OperationalError: (pymysql.err.OperationalError) (1005, 'Can\'t create table `capstone_kaken`.`grantaward` (errno: 150 "Foreign key constraint is incorrectly formed")')
[SQL: ALTER TABLE grantaward ADD CONSTRAINT section_niicode_1 FOREIGN KEY (section_niicode) REFERENCES master_sections(section_niicode);]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

---

## Create Resarchers Table

In [60]:
# merge list 
lists = merge_list("member")
# convert list to dataframe
columns = [
    "awardnumber",
    "sequence",
    "participate",
    "eradcode",
    "role",
    "fullname",
    "familyname",
    "givenname",
    "familyname_yomi",
    "givenname_yomi",
]
member = pd.DataFrame(lists, columns=columns)
# Check the datatype for erad code (resercher ID)
assert member["eradcode"].str.match('^[0-9]*$').all(), "eradcode contains non-integer letter."
member

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0,awardnumber,sequence,participate,eradcode,role,fullname,familyname,givenname,familyname_yomi,givenname_yomi
0,16K00896,1,,20547236,principal_investigator,小竹 英一,小竹,英一,コタケ,エイイチ
1,16K00896,2,,20332273,co_investigator_buntan,今場 司朗,今場,司朗,コンバ,シロウ
2,16K00895,1,,10301384,principal_investigator,菊池 秀彦,菊池,秀彦,キクチ,ヒデヒコ
3,16K00895,2,,,research_collaborator,栗林 太,栗林,太,,
4,16K00894,1,,90280289,principal_investigator,森脇 千夏,森脇,千夏,モリワキ,チナツ
...,...,...,...,...,...,...,...,...,...,...
1292968,22591506,5,,20398019,co_investigator_buntan,池本 哲也,池本,哲也,,
1292969,22591505,1,,30423370,principal_investigator,天野 尋暢,天野,尋暢,,
1292970,22591505,2,,90359894,co_investigator_buntan,田代 裕尊,田代,裕尊,,
1292971,22591505,3,,10363061,co_investigator_buntan,大段 秀樹,大段,秀樹,,


In [63]:
member = member.drop(['fullname','familyname','givenname','familyname_yomi','givenname_yomi'], axis =1)
member

Unnamed: 0,awardnumber,sequence,participate,eradcode,role
0,16K00896,1,,20547236,principal_investigator
1,16K00896,2,,20332273,co_investigator_buntan
2,16K00895,1,,10301384,principal_investigator
3,16K00895,2,,,research_collaborator
4,16K00894,1,,90280289,principal_investigator
...,...,...,...,...,...
1292968,22591506,5,,20398019,co_investigator_buntan
1292969,22591505,1,,30423370,principal_investigator
1292970,22591505,2,,90359894,co_investigator_buntan
1292971,22591505,3,,10363061,co_investigator_buntan


In [64]:
# uploadt it to database
member.to_sql(
    "grantaward_member",
    engine,
    if_exists="replace",
    dtype={"awardnumber": String(255), "sequence": Integer, "eradcode": String(8)},
)

2022-12-08 21:31:22,291 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 21:31:22,292 INFO sqlalchemy.engine.Engine [cached since 1399s ago] {'table_schema': 'Capstone_KAKEN', 'table_name': 'grantaward_member'}
2022-12-08 21:31:22,296 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 21:31:22,296 INFO sqlalchemy.engine.Engine [cached since 1399s ago] {'table_schema': 'Capstone_KAKEN', 'table_name': 'grantaward_member'}
2022-12-08 21:31:22,300 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `Capstone_KAKEN`
2022-12-08 21:31:22,302 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:31:22,311 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `grantaward_member`
2022-12-08 21:31:22,312 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:31:22,316 INFO sqlalchemy.e

1292973

In [67]:
#### Set main and foreign keys
###with engine.connect() as con:
###    con.execute("ALTER TABLE `grantaward_member` ADD PRIMARY KEY(`index`);")
###    con.execute(
###        "ALTER TABLE `grantaward_member` ADD CONSTRAINT fk_grantaward_member_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);"
###    )

---

## Create resarch field category table

In [40]:
# merge list
lists = merge_list("field")
# convert list to dataframe
columns = [
    "awardnumber",
    "field_sequence",
    "field_path",
    "field_niicode",
    "field_table",
    "field_name",
]
field = pd.DataFrame(lists,columns=columns)
field

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0,awardnumber,field_sequence,field_path,field_niicode,field_table,field_name
0,17K04798,1,000621,621,saimoku,人文社会系
1,17K04798,1,000621000647,647,saimoku,社会科学
2,17K04798,1,000621000647000677,677,saimoku,教育学
3,17K04798,1,000621000647000677001060,1060,saimoku,教科教育学
4,17K04797,1,000621,621,saimoku,人文社会系
...,...,...,...,...,...,...
1578177,17730392,1,000621000647000672000674,674,saimoku,教育心理学
1578178,17730391,1,000621,621,saimoku,人文社会系
1578179,17730391,1,000621000647,647,saimoku,社会科学
1578180,17730391,1,000621000647000672,672,saimoku,心理学


In [41]:
# uploadt it to database
field.to_sql(
    "grantaward_field",
    engine,
    if_exists="replace",
    dtype={
        "awardnumber": String(255),
        "field_niicode": Integer,
        "field_path": String(255),
    },
)

2022-12-08 21:14:55,127 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 21:14:55,128 INFO sqlalchemy.engine.Engine [cached since 411.9s ago] {'table_schema': 'Capstone_KAKEN', 'table_name': 'grantaward_field'}
2022-12-08 21:14:55,133 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-08 21:14:55,135 INFO sqlalchemy.engine.Engine 
CREATE TABLE grantaward_field (
	`index` BIGINT, 
	awardnumber VARCHAR(255), 
	field_sequence TEXT, 
	field_path VARCHAR(255), 
	field_niicode INTEGER, 
	field_table TEXT, 
	field_name TEXT
)


2022-12-08 21:14:55,136 INFO sqlalchemy.engine.Engine [no key 0.00108s] {}
2022-12-08 21:14:55,280 INFO sqlalchemy.engine.Engine CREATE INDEX ix_grantaward_field_index ON grantaward_field (`index`)
2022-12-08 21:14:55,281 INFO sqlalchemy.engine.Engine [no key 0.00105s] {}
2022-12-08 21:14:55,650 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:14:56,662 IN

1578182

In [68]:
#### Set main and foreign keys
###with engine.connect() as con:
###    con.execute("ALTER TABLE grantaward_field ADD PRIMARY KEY(awardnumber);")
###    con.execute(
###        "ALTER TABLE grantaward_field ADD CONSTRAINT fk_grantaward_field_grantaward FOREIGN KEY (awardnumber) REFERENCES grantaward(awardnumber);"
###    )
###    con.execute(
###        "ALTER TABLE grantaward_field ADD CONSTRAINT fk_grantaward_field_field_niicode FOREIGN KEY (field_niicode) REFERENCES master_fields(field_niicode);"
###    )
###    con.execute(
###        "ALTER TABLE grantaward_field ADD CONSTRAINT fk_grantaward_field_field_path FOREIGN KEY (field_path) REFERENCES master_fields(field_path);"
###    )

---

## Create application review section table

In [69]:
# merge list
lists = merge_list("review_section")
# convert list to dataframe
columns = [
    "awardnumber",
    "review_section_sequence",
    "review_section_niicode",
    "review_section_table_type",
    "review_section_name",
]
review_section = pd.DataFrame(lists, columns=columns)
review_section

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0,awardnumber,review_section_sequence,review_section_niicode,review_section_table_type,review_section_name
0,13226056,1,483,review_section_ryoiki,生物系
1,13226055,1,483,review_section_ryoiki,生物系
2,13226054,1,483,review_section_ryoiki,生物系
3,13226053,1,483,review_section_ryoiki,生物系
4,13226052,1,483,review_section_ryoiki,生物系
...,...,...,...,...,...
146622,19H02994,1,222,review_section,小区分40010:森林科学関連
146623,19H02993,1,222,review_section,小区分40010:森林科学関連
146624,19H02992,1,222,review_section,小区分40010:森林科学関連
146625,19H02991,1,222,review_section,小区分40010:森林科学関連


In [70]:
# uploadt it to database
review_section.to_sql(
    "grantaward_review_section",
    engine,
    if_exists="replace",
    dtype={"awardnumber": String(255), "review_section_niicode": Integer},
)

2022-12-08 21:33:09,831 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 21:33:09,832 INFO sqlalchemy.engine.Engine [cached since 1507s ago] {'table_schema': 'Capstone_KAKEN', 'table_name': 'grantaward_review_section'}
2022-12-08 21:33:09,836 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-08 21:33:09,838 INFO sqlalchemy.engine.Engine 
CREATE TABLE grantaward_review_section (
	`index` BIGINT, 
	awardnumber VARCHAR(255), 
	review_section_sequence TEXT, 
	review_section_niicode INTEGER, 
	review_section_table_type TEXT, 
	review_section_name TEXT
)


2022-12-08 21:33:09,839 INFO sqlalchemy.engine.Engine [no key 0.00102s] {}
2022-12-08 21:33:09,968 INFO sqlalchemy.engine.Engine CREATE INDEX ix_grantaward_review_section_index ON grantaward_review_section (`index`)
2022-12-08 21:33:09,969 INFO sqlalchemy.engine.Engine [no key 0.00171s] {}
2022-12-08 21:33:10,302 INFO sqlalchem

146627

In [71]:
# Set main and foreign keys
try:
    with engine.connect() as con:
        con.execute("ALTER TABLE `grantaward_review_section` ADD PRIMARY KEY(`index`);")
        con.execute(
            "ALTER TABLE `grantaward_review_section` ADD CONSTRAINT fk_grantaward_review_section_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);"
        )
        con.execute(
            "ALTER TABLE `grantaward_review_section` ADD CONSTRAINT fk_grantaward_review_section_review_section_niicode FOREIGN KEY (`review_section_niicode`) REFERENCES `master_review_sections`(`review_section_niicode`);"
        )
except:
    pass

2022-12-08 21:33:17,611 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_review_section` ADD PRIMARY KEY(`index`);
2022-12-08 21:33:17,612 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:33:18,312 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:33:18,313 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_review_section` ADD CONSTRAINT fk_grantaward_review_section_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);
2022-12-08 21:33:18,314 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:33:20,645 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:33:20,648 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_review_section` ADD CONSTRAINT fk_grantaward_review_section_review_section_niicode FOREIGN KEY (`review_section_niicode`) REFERENCES `master_review_sections`(`review_section_niicode`);
2022-12-08 21:33:20,649 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:33:23,162 INFO sqlalchemy.engine.Engine COMMIT


---

## Fiscal year award cost

In [72]:
# merge list
lists = merge_list("annual")
# convert list to dataframe
columns = ["awardnumber", "sequence", "fiscalyear", "directcost"]
annual = pd.DataFrame(lists, columns=columns)
annual

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0,awardnumber,sequence,fiscalyear,directcost
0,12026201,1,2000,2300000
1,12025229,1,2000,3500000
2,12025228,1,2000,3200000
3,12025227,1,2000,3600000
4,12025226,1,2000,3500000
...,...,...,...,...
1532438,18760635,1,2007,1600000
1532439,18760634,1,2006,2100000
1532440,18760634,1,2007,1100000
1532441,18760633,1,2006,2600000


In [73]:
# uploadt it to database
annual.to_sql(
    "grantaward_annual",
    engine,
    if_exists="replace",
    dtype={
        "awardnumber": String(255),
        "sequence": Integer,
        "fiscalyaer": Integer,
        "directcost": BigInteger,
    },
)



2022-12-08 21:33:30,804 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 21:33:30,805 INFO sqlalchemy.engine.Engine [cached since 1528s ago] {'table_schema': 'Capstone_KAKEN', 'table_name': 'grantaward_annual'}
2022-12-08 21:33:30,814 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-08 21:33:30,816 INFO sqlalchemy.engine.Engine 
CREATE TABLE grantaward_annual (
	`index` BIGINT, 
	awardnumber VARCHAR(255), 
	sequence INTEGER, 
	fiscalyear TEXT, 
	directcost BIGINT
)


2022-12-08 21:33:30,818 INFO sqlalchemy.engine.Engine [no key 0.00197s] {}
2022-12-08 21:33:30,926 INFO sqlalchemy.engine.Engine CREATE INDEX ix_grantaward_annual_index ON grantaward_annual (`index`)
2022-12-08 21:33:30,927 INFO sqlalchemy.engine.Engine [no key 0.00079s] {}
2022-12-08 21:33:31,147 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:33:31,881 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-1

1532443

In [74]:
# Set main and foreign keys
with engine.connect() as con:
    con.execute("ALTER TABLE `grantaward_annual` ADD PRIMARY KEY(`index`);")
    con.execute(
        "ALTER TABLE `grantaward_annual` ADD CONSTRAINT fk_grantaward_annual_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);"
    )

2022-12-08 21:34:15,665 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_annual` ADD PRIMARY KEY(`index`);
2022-12-08 21:34:15,667 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:34:21,128 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:34:21,131 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_annual` ADD CONSTRAINT fk_grantaward_annual_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);
2022-12-08 21:34:21,132 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:34:41,786 INFO sqlalchemy.engine.Engine COMMIT


---

## Create keywords table

In [75]:
# merge list
lists = merge_list("keyword")
# convert list to dataframe
columns = ["awardnumber", "keyword_sequence", "keyword_text"]
keyword = pd.DataFrame(lists, columns=columns)
keyword

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0,awardnumber,keyword_sequence,keyword_text
0,15K09896,1,MRI
1,15K09896,2,アルツハイマー病
2,15K09896,3,定量的磁化率マップ
3,15K09895,1,薬学
4,15K09895,2,放射線
...,...,...,...
4712050,25289210,7,廃校
4712051,25289210,8,空き家
4712052,25289210,9,空家
4712053,25289210,10,ネットワーク


In [76]:
# uploadt it to database
keyword.to_sql(
    "grantaward_keyword",
    engine,
    if_exists="replace",
    dtype={"awardnumber": String(255)},
)



2022-12-08 21:34:55,602 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 21:34:55,603 INFO sqlalchemy.engine.Engine [cached since 1612s ago] {'table_schema': 'Capstone_KAKEN', 'table_name': 'grantaward_keyword'}
2022-12-08 21:34:55,606 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-08 21:34:55,608 INFO sqlalchemy.engine.Engine 
CREATE TABLE grantaward_keyword (
	`index` BIGINT, 
	awardnumber VARCHAR(255), 
	keyword_sequence TEXT, 
	keyword_text TEXT
)


2022-12-08 21:34:55,609 INFO sqlalchemy.engine.Engine [no key 0.00109s] {}
2022-12-08 21:34:55,719 INFO sqlalchemy.engine.Engine CREATE INDEX ix_grantaward_keyword_index ON grantaward_keyword (`index`)
2022-12-08 21:34:55,722 INFO sqlalchemy.engine.Engine [no key 0.00239s] {}
2022-12-08 21:34:55,958 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:34:57,281 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-08 21:35

4712055

In [77]:
# Set main and foreign keys
with engine.connect() as con:
    con.execute("ALTER TABLE `grantaward_keyword` ADD PRIMARY KEY(`index`);")
    con.execute(
        "ALTER TABLE `grantaward_keyword` ADD CONSTRAINT fk_grantaward_keyword_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);"
    )

2022-12-08 21:36:59,096 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_keyword` ADD PRIMARY KEY(`index`);
2022-12-08 21:36:59,098 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:37:16,705 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:37:16,706 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_keyword` ADD CONSTRAINT fk_grantaward_keyword_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);
2022-12-08 21:37:16,707 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:38:21,903 INFO sqlalchemy.engine.Engine COMMIT


---

## Create Research Institution details table

In [78]:
# merge list
lists = merge_list("paragraph")
# convert list to dataframe
columns = [
    "awardnumber",
    "paragraphlist_sequence",
    "paragraphlist_parentid",
    "paragraphlist_type",
    "paragraph_sequence",
    "paragraph_text",
]
paragraph = pd.DataFrame(lists, columns=columns)
paragraph

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0,awardnumber,paragraphlist_sequence,paragraphlist_parentid,paragraphlist_type,paragraph_sequence,paragraph_text
0,20J15414,1,20J154142020,outline_of_research_initial,1,申請者は修飾塩基の光架橋反応を用いることで人工核酸SNAの二重鎖形成の光制御に成功している。...
1,20J15414,2,20J154142020jisseki,outline_of_research_performance,1,これまでに申請者らは、人工核酸SNA: Serinol nucleic acidに２残基の修...
2,20J15414,2,20J154142020jisseki,outline_of_research_performance,2,今年度はPVAとは異なる波長の光を用いたSNAの光制御を目指し、新たにEVA : 8-Per...
3,20J15414,2,20J154142020jisseki,outline_of_research_performance,3,SNA中の2残基のEVAは波長530 nmの光照射によりPVA同様の架橋反応を起こしたが、そ...
4,20J15414,3,classification20J15414progress2020,progress,1,本年度は、新たな光応答性核酸塩基NVA: 8-Naphthylvinyl adenineを開...
...,...,...,...,...,...,...
1692434,17H00978,3,17H009782020jisseki,planning_scheme,2,本研究では、対象別に①障害班、②難病班、③性的マイノリティ班の３つの班を構築する。また、班横...
1692435,17H00978,3,17H009782020jisseki,planning_scheme,3,①　障害班は前身の研究で行ったパネル調査の分析を進める。（担当者を下記に＊で表示）また、２８...
1692436,17H00978,3,17H009782020jisseki,planning_scheme,4,②　難病班では、３０６の指定難病へと拡大された疾患群および難病指定に至らない希少性疾患患者が...
1692437,17H00978,3,17H009782020jisseki,planning_scheme,5,③　性的マイノリティ班は機能的には問題がないものの社会的な問題に直面する少数派の問題の分析を...


In [79]:
# uploadt it to database
paragraph.to_sql(
    "grantaward_paragraph",
    engine,
    if_exists="replace",
    dtype={"awardnumber": String(255)},
)



2022-12-08 21:38:37,949 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 21:38:37,950 INFO sqlalchemy.engine.Engine [cached since 1835s ago] {'table_schema': 'Capstone_KAKEN', 'table_name': 'grantaward_paragraph'}
2022-12-08 21:38:37,954 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-08 21:38:37,955 INFO sqlalchemy.engine.Engine 
CREATE TABLE grantaward_paragraph (
	`index` BIGINT, 
	awardnumber VARCHAR(255), 
	paragraphlist_sequence TEXT, 
	paragraphlist_parentid TEXT, 
	paragraphlist_type TEXT, 
	paragraph_sequence TEXT, 
	paragraph_text TEXT
)


2022-12-08 21:38:37,957 INFO sqlalchemy.engine.Engine [no key 0.00115s] {}
2022-12-08 21:38:38,068 INFO sqlalchemy.engine.Engine CREATE INDEX ix_grantaward_paragraph_index ON grantaward_paragraph (`index`)
2022-12-08 21:38:38,069 INFO sqlalchemy.engine.Engine [no key 0.00132s] {}
2022-12-08 21:38:38,307 INFO sqlalchemy.engine.

1692439

In [80]:
# Set main and foreign keys
with engine.connect() as con:
    con.execute("ALTER TABLE `grantaward_paragraph` ADD PRIMARY KEY(`index`);")
    con.execute(
        "ALTER TABLE `grantaward_paragraph` ADD CONSTRAINT fk_grantaward_paragraph_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);"
    )

2022-12-08 21:40:38,461 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_paragraph` ADD PRIMARY KEY(`index`);
2022-12-08 21:40:38,464 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:41:08,509 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 21:41:08,513 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_paragraph` ADD CONSTRAINT fk_grantaward_paragraph_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);
2022-12-08 21:41:08,515 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 21:41:42,097 INFO sqlalchemy.engine.Engine COMMIT


## Create Output table

In [4]:
# merge list
lists = merge_list("product")
# convert list to dataframe
columns = [
    "awardnumber",
    "product_type",
    "sequence",
    "reviewed",
    "doi",
    "author_ja",
    "author_en",
    "title_ja",
    "title_en",
    "journaltitle_ja",
    "journaltitle_en",
    "year",
]
product = pd.DataFrame(lists, columns=columns)
product

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dump in tqdm(glob("dump_kadai_2000_2022/" + parts + "/" + parts + "*.dump")):


  0%|          | 0/1170 [00:00<?, ?it/s]

Unnamed: 0,awardnumber,product_type,sequence,reviewed,doi,author_ja,author_en,title_ja,title_en,journaltitle_ja,journaltitle_en,year
0,17J01289,jointInternational,1,,,,,コロンビア大学(米国),,,,
1,17J01289,journal_article,2,true,10.1038/s41586-019-0878-z,"T.Tanoue, S.Morita, D.R.Plichta, A.N.Skelly, W...",,A defined commensal consortium elicits CD8 T c...,,Nature,,2019
2,17J01289,presentation,3,,,"Keiko Yasuma, Munehiro Furuichi, Takaaki Kawag...",,Identifying commensal bacteria that exert colo...,,,,2019
3,17J01285,presentation,1,,,"藤本 千賀子, 鍵 裕之, 篠崎 彩子, 三村 耕一, 西田 民人, 後藤 弘匡",,室温におけるアミノ酸の圧力誘起ペプチド化,,,,2018
4,17J01285,presentation,2,,,藤本 千賀子、鍵 裕之、篠崎 彩子、三村 耕一、西田 民人、森井 尚之、奈良 雅之,,室温高圧下におけるアラニンのペプチド生成とスタンダード合成の検討,,,,2017
...,...,...,...,...,...,...,...,...,...,...,...,...
9705646,18H03009,journal_article,13,true,10.3389/fcell.2020.00156,"Nakamura T, Iwamoto T, Nakamura HM, Shindo Y, ...",,Regulation of miR-1-Mediated Connexin 43 Expre...,,Frontiers in Cell and Developmental Biology,,2020
9705647,18H03009,journal_article,14,true,10.1074/jbc.ra118.003373,Han Xue、Yoshizaki Keigo、Miyazaki Kanako、Arai C...,,The transcription factor NKX2-3 mediates p21 e...,,The Journal of biological chemistry,,2018
9705648,18H03008,journal_article,1,true,10.3390/ijms22031098,Mori Takahiro、Kumagai Kenichi、Nasu Keisuke、Yos...,,Clonal Expansion of Tumor-Infiltrating T Cells...,,International Journal of Molecular Sciences,,2021
9705649,18H03008,presentation,2,,,"熊谷賢一, 大橋祥浩, 中村那々美, 仲宗根 康成, 市川はるの, 川口浩司, 堀江彰久, ...",,ErbB4は口腔扁平上皮癌における頸部リンパ節転移の危険因子である,,,,2018


In [5]:
product = product.drop(['author_ja', 'author_en'],axis=1)
product

Unnamed: 0,awardnumber,product_type,sequence,reviewed,doi,title_ja,title_en,journaltitle_ja,journaltitle_en,year
0,17J01289,jointInternational,1,,,コロンビア大学(米国),,,,
1,17J01289,journal_article,2,true,10.1038/s41586-019-0878-z,A defined commensal consortium elicits CD8 T c...,,Nature,,2019
2,17J01289,presentation,3,,,Identifying commensal bacteria that exert colo...,,,,2019
3,17J01285,presentation,1,,,室温におけるアミノ酸の圧力誘起ペプチド化,,,,2018
4,17J01285,presentation,2,,,室温高圧下におけるアラニンのペプチド生成とスタンダード合成の検討,,,,2017
...,...,...,...,...,...,...,...,...,...,...
9705646,18H03009,journal_article,13,true,10.3389/fcell.2020.00156,Regulation of miR-1-Mediated Connexin 43 Expre...,,Frontiers in Cell and Developmental Biology,,2020
9705647,18H03009,journal_article,14,true,10.1074/jbc.ra118.003373,The transcription factor NKX2-3 mediates p21 e...,,The Journal of biological chemistry,,2018
9705648,18H03008,journal_article,1,true,10.3390/ijms22031098,Clonal Expansion of Tumor-Infiltrating T Cells...,,International Journal of Molecular Sciences,,2021
9705649,18H03008,presentation,2,,,ErbB4は口腔扁平上皮癌における頸部リンパ節転移の危険因子である,,,,2018


In [6]:
product = product.drop(['title_en','title_ja'],axis=1)

In [7]:
# uploadt it to database
product.to_sql(
    "grantaward_product",
    engine,
    if_exists="replace",
    dtype={"awardnumber": String(255), "year": Integer},
)



2022-12-08 22:17:21,712 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-12-08 22:17:21,713 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 22:17:21,716 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2022-12-08 22:17:21,717 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 22:17:21,719 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2022-12-08 22:17:21,720 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 22:17:21,725 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 22:17:21,726 INFO sqlalchemy.engine.Engine [generated in 0.00092s] {'table_schema': 'Capstone_KAKEN', 'table_name': 'grantaward_product'}
2022-12-08 22:17:21,743 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-12-08 22:17:21,745 INFO sqlalchemy.engine.Engine [cached since 0.01987s a

9705651

In [8]:
# Set main and foreign keys
with engine.connect() as con:
    con.execute("ALTER TABLE `grantaward_product` ADD PRIMARY KEY(`index`);")
    con.execute(
        "ALTER TABLE `grantaward_product` ADD CONSTRAINT fk_grantaward_product_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);"
    )

2022-12-08 22:31:28,185 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_product` ADD PRIMARY KEY(`index`);
2022-12-08 22:31:28,188 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 22:32:09,095 INFO sqlalchemy.engine.Engine COMMIT
2022-12-08 22:32:09,097 INFO sqlalchemy.engine.Engine ALTER TABLE `grantaward_product` ADD CONSTRAINT fk_grantaward_product_grantaward FOREIGN KEY (`awardnumber`) REFERENCES `grantaward`(`awardnumber`);
2022-12-08 22:32:09,099 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-12-08 22:34:06,493 INFO sqlalchemy.engine.Engine COMMIT
