# Store KAKEN Master Documents to Local MariaDB

## Preparation

- KAKEN master data is maintained in a git repository, where the latest data is always available. Clone or pull the repository from https://bitbucket.org/niijp/grants_masterxml_kaken/ and keep it in sync with your local `./grants_masterxml_kaken` folder.

This notebook stores the following five KAKEN master files:

1. category_master_kakenhi.xml
2. field_master_kakenhi.xml
3. institution_master_kakenhi.xml
4. review_section_master_kakenhi.xml
5. section_master_kakenhi.xml

In [None]:
import configparser
import datetime

import networkx as nx
import numpy as np
import pandas as pd
import pymysql
from dateutil.relativedelta import relativedelta
from lxml import etree
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.types import Date, Integer, String, Text

In [None]:
# Database connection settings. The "####" values are redacted placeholders:
# replace them with real string values before running this cell.
username= ####
password= ####
database= ####
# NOTE(review): appid is not referenced anywhere else in the visible notebook -- confirm it is still needed.
appid = ####
# SQLAlchemy URL for the server at localhost:3306 through the PyMySQL driver, using the utf8 charset
url = 'mysql+pymysql://' + username + ':' + password + '@localhost:3306/' + database + '?charset=utf8'
# echo=True makes the engine log every SQL statement it issues
engine = create_engine(url, echo=True)

## 1. category_master_kakenhi

In [None]:
# Read the category master and collect one [NII code, English name] pair per <category> element
tree = etree.parse("/Users/norika_machome/GitHub/Capstone_KAKEN/External_Sources/grants_masterxml_kaken/category_master_kakenhi.xml")
categorylist = [
    [entry.find("code[@type='nii']").text, entry.find("name[@lang='en']").text]
    for table in tree.iterfind("category_table")
    for entry in table.iterfind("category")
]
# Build the dataframe and remove rows that are exact repeats
columns = ["category_niicode", "category_name"]
df = pd.DataFrame(categorylist, columns=columns).drop_duplicates()
# The NII code becomes the index, so it must be unique after de-duplication
has_duplicate_codes = df["category_niicode"].duplicated().any()
assert not has_duplicate_codes, "category_niicode is duplicated."
df = df.set_index("category_niicode")
df

In [None]:
# Drop the grantaward foreign key before the table is replaced
# (presumably it references master_categories and would block the replace -- confirm).
# This is best effort: on a fresh database the constraint may not exist yet, so a
# database error is reported and the cell carries on instead of aborting.
try:
    with engine.connect() as con:
        # text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings
        con.execute(text("ALTER TABLE grantaward DROP FOREIGN KEY category_niicode_1;"))
except SQLAlchemyError as exc:
    print("Skipped dropping foreign key category_niicode_1:", exc)

# Write the category master to MariaDB, replacing any existing master_categories table
df.to_sql(
    "master_categories",
    engine,
    if_exists="replace",
    dtype={"category_niicode": Integer, "category_name": String(255)},
)

In [None]:
# Read the category master again, this time keeping the table-level attributes
# (type, start/end date) together with every per-category detail
tree = etree.parse("/Users/norika_machome/GitHub/Capstone_KAKEN/External_Sources/grants_masterxml_kaken/category_master_kakenhi.xml")
columns = [
    "category_table_type",
    "category_table_start_date",
    "category_table_end_date",
    "category_path",
    "category_name_ja",
    "category_name_en",
    "category_niicode",
    "category_name_yomi"
]
categorylist = []
for table in tree.iterfind("category_table"):
    # Attributes shared by every category listed in this table
    table_attrs = [table.get("type"), table.get("start_date"), table.get("end_date")]
    for entry in table.iterfind("category"):
        # name_yomi is optional: store None when the element is absent
        yomi_node = entry.find("name_yomi")
        categorylist.append(
            table_attrs
            + [
                entry.get("path"),
                entry.find("name[@lang='ja']").text,
                entry.find("name[@lang='en']").text,
                entry.find("code[@type='nii']").text,
                yomi_node.text if yomi_node is not None else None,
            ]
        )
# One dataframe row per <category> element
df = pd.DataFrame(categorylist, columns=columns)
df

In [None]:
#
# Convert the master from "one row per category with a start and end date"
# to "one row per category per applicable fiscal year".
#

# Treat a missing end date as "still valid today"
df["category_table_end_date_mod"] = df["category_table_end_date"].fillna(datetime.datetime.today())

# Reduce the dates to years. The end date is shifted back three months first, so an
# end date in January-March counts towards the previous year
# (presumably because the fiscal year starts in April -- confirm).
df["category_table_startfiscalyear"] = pd.to_datetime(df["category_table_start_date"]).map(lambda x: x.year)
df["category_table_endfiscalyear"] = pd.to_datetime(df["category_table_end_date_mod"]).map(lambda x: (x - relativedelta(months=3)).year)
df["delta"] = df["category_table_endfiscalyear"] - df["category_table_startfiscalyear"]

# Every distinct (table type, starting fiscal year) combination
type_and_fiscalyear = df[["category_table_type", "category_table_startfiscalyear"]].drop_duplicates().values

# Repeat the rows of each combination once per fiscal year it covers (delta + 1 copies).
# The copies are collected and concatenated once: DataFrame.append was removed in
# pandas 2.0, and appending inside the loop re-copied the accumulated frame every time.
pieces = []
for table_type, start_year in type_and_fiscalyear:
    temp = df[
        (df["category_table_type"] == table_type)
        & (df["category_table_startfiscalyear"] == start_year)
    ]
    multi = temp["delta"].unique()
    assert len(multi) == 1, "年度が集合内で単一ではありません"
    pieces.extend([temp] * (int(multi[0]) + 1))
if pieces:
    results = pd.concat(pieces, ignore_index=True)
else:
    # Nothing to expand: keep an empty frame with the expected columns
    results = pd.DataFrame(columns=df.columns.values)

# Number the copies of each category 0, 1, 2, ... and add that offset to the starting
# year to obtain the fiscal year each copy applies to
results["cum"] = results.groupby(["category_table_type", "category_table_start_date", "category_path"]).cumcount()
results["applied_fiscalyear"] = results["category_table_startfiscalyear"] + results["cum"]

# Keep only the columns that are stored in the database
results = results[[
    "category_table_type",
    "category_table_start_date",
    "category_table_end_date",
    "applied_fiscalyear",
    "category_path",
    "category_niicode",
    "category_name_ja",
    "category_name_en",
    "category_name_yomi"
]]
results

In [None]:
# Drop the grantaward foreign key on category_niicode before the table is rewritten.
# This is best effort: the constraint may already be gone, so a database error is
# reported and the notebook continues instead of the error being silently swallowed.
try:
    with engine.connect() as con:
        # text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings
        con.execute(text("ALTER TABLE grantaward DROP FOREIGN KEY category_niicode_1;"))
except SQLAlchemyError as exc:
    print("Skipped dropping foreign key category_niicode_1:", exc)



In [None]:
# Persist the per-fiscal-year category rows, replacing any existing table of that name
results.to_sql(name="master_categories_fiscalyear", con=engine, if_exists="replace", dtype={"applied_fiscalyear": Integer})


In [None]:
# Enforce one row per (table type, applied fiscal year, NII code) in master_categories_fiscalyear.
# text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings.
with engine.connect() as con:
    con.execute(text("ALTER TABLE `master_categories_fiscalyear` ADD UNIQUE(`category_table_type`, `applied_fiscalyear`, `category_niicode`);"))

## 2. field_master_kakenhi

In [None]:
# Read the field master and flatten the nested <field> hierarchy into rows.
# Layers 1 to 4 are recorded; anything nested deeper than layer 4 is not read.
tree = etree.parse("/Users/norika_machome/GitHub/Capstone_KAKEN/External_Sources/grants_masterxml_kaken/field_master_kakenhi.xml")
fieldlist = []
for field_table in tree.iterfind("field_table"):
    # Attributes shared by every field listed in this table
    table_attrs = [
        field_table.get("type"),
        field_table.get("start_date"),
        field_table.get("end_date"),
    ]
    # Depth-first walk with an explicit stack of (element, layer) pairs. Children are
    # pushed in reverse so they are popped, and therefore recorded, in document order,
    # each parent immediately followed by its own descendants.
    pending = [(node, 1) for node in reversed(list(field_table.iterfind("field")))]
    while pending:
        node, layer = pending.pop()
        fieldlist.append(
            table_attrs
            + [
                node.get("path"),
                node.find("code[@type='nii']").text,
                node.find("name[@lang='en']").text,
                layer,
            ]
        )
        if layer < 4:
            pending.extend(
                (child, layer + 1) for child in reversed(list(node.iterfind("field")))
            )
# One dataframe row per recorded <field> element
columns = [
    "field_table_type",
    "field_table_start_date",
    "field_table_end_date",
    "field_path",
    "field_niicode",
    "field_name",
    "layer",
]
df = pd.DataFrame(fieldlist, columns=columns)
df

In [None]:

# Convert the field master from "one row per field with a start and end date"
# to "one row per field per applicable fiscal year".

# Treat a missing end date as "still valid today"
df["field_table_end_date_mod"] = df["field_table_end_date"].fillna(datetime.datetime.today())

# Reduce the dates to years. The end date is shifted back three months first, so an
# end date in January-March counts towards the previous year
# (presumably because the fiscal year starts in April -- confirm).
df["field_table_startfiscalyear"] = pd.to_datetime(df["field_table_start_date"]).map(lambda x: x.year)
df["field_table_endfiscalyear"] = pd.to_datetime(df["field_table_end_date_mod"]).map(lambda x: (x - relativedelta(months=3)).year)
df["delta"] = df["field_table_endfiscalyear"] - df["field_table_startfiscalyear"]

# Every distinct (table type, starting fiscal year) combination
type_and_fiscalyear = df[["field_table_type", "field_table_startfiscalyear"]].drop_duplicates().values

# Repeat the rows of each combination once per fiscal year it covers (delta + 1 copies).
# The copies are collected and concatenated once: DataFrame.append was removed in
# pandas 2.0, and appending inside the loop re-copied the accumulated frame every time.
pieces = []
for table_type, start_year in type_and_fiscalyear:
    temp = df[
        (df["field_table_type"] == table_type)
        & (df["field_table_startfiscalyear"] == start_year)
    ]
    multi = temp["delta"].unique()
    assert len(multi) == 1, "年度が集合内で単一ではありません"
    pieces.extend([temp] * (int(multi[0]) + 1))
if pieces:
    results = pd.concat(pieces, ignore_index=True)
else:
    # Nothing to expand: keep an empty frame with the expected columns
    results = pd.DataFrame(columns=df.columns.values)

# Number the copies of each field 0, 1, 2, ... and add that offset to the starting
# year to obtain the fiscal year each copy applies to
results["cum"] = results.groupby(["field_table_type", "field_table_start_date", "field_path"]).cumcount()
results["applied_fiscalyear"] = results["field_table_startfiscalyear"] + results["cum"]

# Keep only the columns that are stored in the database
results = results[[
    "field_table_type",
    "field_table_start_date",
    "field_table_end_date",
    "applied_fiscalyear",
    "field_path",
    "field_niicode",
    "field_name",
    "layer",
]]
results

In [None]:
# Drop the foreign keys on the field columns before master_fields is replaced.
# Each statement gets its own attempt: previously all three shared one try block, so a
# failure on the first (for example a constraint that is already gone) skipped the rest.
# This is best effort: database errors are reported and the notebook continues.
drop_statements = [
    "ALTER TABLE grantaward DROP FOREIGN KEY field_niicode_1;",
    "ALTER TABLE grantaward_field DROP FOREIGN KEY fk_grantaward_field_field_niicode;",
    "ALTER TABLE grantaward_field DROP FOREIGN KEY fk_grantaward_field_field_path;",
]
for statement in drop_statements:
    try:
        with engine.connect() as con:
            # text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings
            con.execute(text(statement))
    except SQLAlchemyError as exc:
        print("Skipped:", statement, "->", exc)


In [None]:
# Persist the per-fiscal-year field rows as master_fields, replacing any existing table.
# Every stored column gets an explicit SQL type.
master_fields_dtype = {
    "field_table_type": String(255),
    "field_table_start_date": Date,
    "field_table_end_date": Date,
    "applied_fiscalyear": Integer,
    "field_path": String(255),
    "field_niicode": String(255),
    "field_name": String(255),
    "layer": Integer,
}
results.to_sql(name="master_fields", con=engine, if_exists="replace", dtype=master_fields_dtype)


In [None]:
# Add indexes on field_niicode and field_path, then enforce one row per
# (table type, applied fiscal year, path, NII code) in master_fields.
# text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings.
with engine.connect() as con:
    con.execute(text("ALTER TABLE master_fields ADD INDEX (field_niicode);"))
    con.execute(text("ALTER TABLE master_fields ADD INDEX (field_path);"))
    con.execute(text("ALTER TABLE `master_fields` ADD UNIQUE( `field_table_type`, `applied_fiscalyear`, `field_path`, `field_niicode`);"))

## 3. institution_master_kakenhi

Master list for `institutions_table_group` (information provided by the NII staff in charge of research outputs [NII成果担当] on 12 June 2020).

In [None]:
# Lookup rows for the institution table groups: [group code, group name (Japanese)].
# The names are runtime data written to the database as-is. Code 0 is the catch-all
# "other" entry and is listed last.
table_group_master = [
    [1, "国立大学"],
    [2, "公立大学"],
    [3, "私立大学"],
    [4, "短期大学"],
    [5, "大学共同利用機関法人"],
    [6, "文部科学省等施設等機関"],
    [7, "高等専門学校"],
    [8, "文部科学大臣が指定する機関（国の設置する研究所その他の機関）"],
    [9, "文部科学大臣が指定する機関（地方公共団体の設置する研究所その他の機関）"],
    [10, "文部科学大臣が指定する機関（法律により直接設立された法人）"],
    [11, "文部科学大臣が指定する機関（民法３４条の規定により設立された法人）"],
    [12, "文部科学大臣が指定する機関（民間等の研究機関）"],
    [13, "国際連合大学研究所等"],
    [0, "その他"],
]


In [None]:
# Drop the foreign key that points at the table-group master before that table is replaced.
# This is best effort: a database error is reported and the notebook continues instead
# of the error being silently swallowed.
# NOTE(review): later in this notebook the constraint institutions_ibfk_1 is created on
# `master_institutions`, but this statement targets `institutions` -- confirm the table name.
try:
    with engine.connect() as con:
        # text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings
        con.execute(text("ALTER TABLE institutions DROP FOREIGN KEY institutions_ibfk_1;"))
except SQLAlchemyError as exc:
    print("Skipped dropping foreign key institutions_ibfk_1:", exc)

In [None]:
# Turn the group list into a dataframe indexed by the group code and write it to
# MariaDB, replacing any existing master_institutions_table_group table
table_group_master = pd.DataFrame(
    table_group_master,
    columns=["institutions_table_group", "institutions_table_group_name"],
).set_index("institutions_table_group")
table_group_dtype = {
    "institutions_table_group": Integer,
    "institutions_table_group_name": String(255),
}
table_group_master.to_sql(
    name="master_institutions_table_group",
    con=engine,
    if_exists="replace",
    dtype=table_group_dtype,
)


In [None]:
# Make the group code the primary key of master_institutions_table_group.
# text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings.
with engine.connect() as con:
    con.execute(text("ALTER TABLE `master_institutions_table_group` ADD PRIMARY KEY(`institutions_table_group`);"))

### Institution Table

In [None]:
# Read the institution master: one row per <institution>, tagged with the group
# attribute of the <institution_table> that contains it
tree = etree.parse("/Users/norika_machome/GitHub/Capstone_KAKEN/External_Sources/grants_masterxml_kaken/institution_master_kakenhi.xml")
institutionlist = []
for institution_table in tree.iterfind("institution_table"):
    institution_table_group = institution_table.get("group")
    for institution in institution_table.iterfind("institution"):
        # name_yomi and the JSPS code are optional: store None when the element is absent
        yomi_node = institution.find("name_yomi")
        jsps_node = institution.find("code[@type='jsps']")
        institutionlist.append([
            institution_table_group,
            institution.find("code[@type='nii']").text,
            institution.find("code[@type='mext']").text,
            jsps_node.text if jsps_node is not None else None,
            institution.find("name[@lang='ja']").text,
            institution.find("name[@lang='en']").text,
            yomi_node.text if yomi_node is not None else None,
        ])
# Convert the list to a dataframe
columns = [
    "institution_table_group",
    "institution_niicode",
    "institution_mextcode",
    "institution_jspscode",
    "institution_name_ja",
    "institution_name_en",
    "institution_name_yomi",
]
df = pd.DataFrame(institutionlist, columns=columns)
# Remove exact duplicate rows, then check that the NII code is unique before using it as the index
df = df.drop_duplicates()
# The message now names the column that is actually checked (it used to say category_niicode)
assert not df["institution_niicode"].duplicated().any(), "institution_niicode is duplicated."
df = df.set_index("institution_niicode")
df

In [None]:
# Drop the foreign keys on institution codes before master_institutions is replaced.
# Each statement gets its own attempt so that one failure (for example a constraint that
# is already gone) no longer prevents the following drops from running.
# This is best effort: database errors are reported and the notebook continues.
# NOTE(review): later in this notebook the institutions_standard_ibfk_* constraints are
# created on `master_institutions_standard`, but these statements target
# `institutions_standard` -- confirm the table name.
drop_statements = [
    "ALTER TABLE grantaward DROP FOREIGN KEY institution_niicode_1;",
    "ALTER TABLE institutions_standard DROP FOREIGN KEY institutions_standard_ibfk_1;",
    "ALTER TABLE institutions_standard DROP FOREIGN KEY institutions_standard_ibfk_2;",
]
for statement in drop_statements:
    try:
        with engine.connect() as con:
            # text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings
            con.execute(text(statement))
    except SQLAlchemyError as exc:
        print("Skipped:", statement, "->", exc)


In [None]:

# Persist the institution master, replacing any existing master_institutions table.
# The group and the three code columns get explicit SQL types; the remaining columns
# are left to pandas' type inference.
master_institutions_dtype = {
    "institution_table_group": Integer,
    "institution_niicode": String(7),
    "institution_mextcode": String(7),
    "institution_jspscode": String(7),
}
df.to_sql(name="master_institutions", con=engine, if_exists="replace", dtype=master_institutions_dtype)


In [None]:
# Add the foreign key from master_institutions.institution_table_group to the table-group master.
# text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings.
with engine.connect() as con:
    con.execute(text("ALTER TABLE `master_institutions` ADD CONSTRAINT institutions_ibfk_1 FOREIGN KEY (`institution_table_group`) REFERENCES `master_institutions_table_group`(`institutions_table_group`) ON DELETE RESTRICT ON UPDATE RESTRICT;"))

In [None]:
# Treat every change of institution code (succession, absorption, merger) as a directed
# edge "old code -> new code", built from the difference_list element of the XML file
# (note which side is "before" and which is "after" for each relation).
tree = etree.parse("/Users/norika_machome/GitHub/Capstone_KAKEN/External_Sources/grants_masterxml_kaken/institution_master_kakenhi.xml")
edgelist = []
for institution in tree.iterfind("difference_list/institution"):
    niicode = institution.get("nii_code")
    # succeeded_from: the referenced institution came before this one
    predecessor = institution.find("succeeded_from/institution")
    if predecessor is not None:
        edgelist.append([predecessor.get("nii_code"), niicode])
    # absorbed_into: this institution came before the referenced one
    absorber = institution.find("absorbed_into/institution")
    if absorber is not None:
        edgelist.append([niicode, absorber.get("nii_code")])
    # merged_from: each referenced institution came before this one
    # (iterfind simply yields nothing when there is no such element)
    for merged in institution.iterfind("merged_from/institution"):
        edgelist.append([merged.get("nii_code"), niicode])

# Build the directed graph and, for every node, the shortest path to each node it can reach.
# dict() keeps this working on NetworkX releases where shortest_path() without
# source/target yields (node, paths) pairs instead of returning a dict.
g = nx.DiGraph()
g.add_edges_from(edgelist)
shortestpath = dict(nx.shortest_path(g))

# Build the conversion table
transitionlist = []
for dic in shortestpath.values():
    # Use the last path recorded for this source node. NOTE(review): this relies on it
    # being the longest one, i.e. the path that ends at the code currently in use --
    # confirm for institutions that have more than one terminal successor.
    path = list(dic.values())[-1]
    if len(path) >= 2:
        # Keep only the two end points; intermediate codes are dropped.
        # A one-node path means the code never changed and is handled below.
        transitionlist.append([path[0], path[-1]])

# Institutions whose code never changed map onto themselves. Sorting the codes makes the
# row order reproducible (a set has no stable order) and also copes with an empty set.
transition = pd.DataFrame(transitionlist, columns=["before", "after"])
unchanged_codes = sorted(set(df.index) - set(transition["before"]))
residue = pd.DataFrame({"before": unchanged_codes, "after": unchanged_codes})
results = pd.concat([transition, residue]).reset_index(drop=True)
results

In [None]:
# Persist the before -> after code conversion table, replacing any existing
# master_institutions_standard table. Both code columns are stored as String(7).
code_columns_dtype = {"before": String(7), "after": String(7)}
results.to_sql(name="master_institutions_standard", con=engine, if_exists="replace", dtype=code_columns_dtype)


In [None]:

# Add foreign keys so that both `before` and `after` must be NII codes present in master_institutions.
# text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings.
with engine.connect() as con:
    con.execute(text("ALTER TABLE `master_institutions_standard` ADD CONSTRAINT institutions_standard_ibfk_1 FOREIGN KEY (`before`) REFERENCES `master_institutions`(`institution_niicode`) ON DELETE RESTRICT ON UPDATE RESTRICT;"))
    con.execute(text("ALTER TABLE `master_institutions_standard` ADD CONSTRAINT institutions_standard_ibfk_2 FOREIGN KEY (`after`) REFERENCES `master_institutions`(`institution_niicode`) ON DELETE RESTRICT ON UPDATE RESTRICT;"))

## 4. review_section_master_kakenhi

In [None]:
# Read the review-section master and flatten the nested <review_section> hierarchy into rows.
# Layers 1 to 3 are recorded; anything nested deeper than layer 3 is not read.
tree = etree.parse("/Users/norika_machome/GitHub/Capstone_KAKEN/External_Sources/grants_masterxml_kaken/review_section_master_kakenhi.xml")
review_sectionlist = []
for review_section_table in tree.iterfind("review_section_table"):
    # Attributes shared by every review section listed in this table
    table_attrs = [
        review_section_table.get("type"),
        review_section_table.get("start_date"),
        review_section_table.get("end_date"),
    ]
    # Depth-first walk with an explicit stack of (element, layer) pairs. Children are
    # pushed in reverse so they are popped, and therefore recorded, in document order,
    # each parent immediately followed by its own descendants.
    pending = [(node, 1) for node in reversed(list(review_section_table.iterfind("review_section")))]
    while pending:
        node, layer = pending.pop()
        review_sectionlist.append(
            table_attrs
            + [
                node.get("path"),
                node.find("code[@type='nii']").text,
                node.find("name[@lang='en']").text,
                layer,
            ]
        )
        if layer < 3:
            pending.extend(
                (child, layer + 1) for child in reversed(list(node.iterfind("review_section")))
            )
# One dataframe row per recorded <review_section> element
columns = [
    "review_section_table_type",
    "review_section_table_start_date",
    "review_section_table_end_date",
    "review_section_path",
    "review_section_niicode",
    "review_section_name",
    "layer",
]
df = pd.DataFrame(review_sectionlist, columns=columns)
df

In [None]:

# Convert the review-section master from "one row per review section with a start and
# end date" to "one row per review section per applicable fiscal year".

# Treat a missing end date as "still valid today"
df["review_section_table_end_date_mod"] = df["review_section_table_end_date"].fillna(datetime.datetime.today())

# Reduce the dates to years. The end date is shifted back three months first, so an
# end date in January-March counts towards the previous year
# (presumably because the fiscal year starts in April -- confirm).
df["review_section_table_startfiscalyear"] = pd.to_datetime(df["review_section_table_start_date"]).map(lambda x: x.year)
df["review_section_table_endfiscalyear"] = pd.to_datetime(df["review_section_table_end_date_mod"]).map(lambda x: (x - relativedelta(months=3)).year)
df["delta"] = df["review_section_table_endfiscalyear"] - df["review_section_table_startfiscalyear"]

# Every distinct (table type, starting fiscal year) combination
type_and_fiscalyear = df[["review_section_table_type", "review_section_table_startfiscalyear"]].drop_duplicates().values

# Repeat the rows of each combination once per fiscal year it covers (delta + 1 copies).
# The copies are collected and concatenated once: DataFrame.append was removed in
# pandas 2.0, and appending inside the loop re-copied the accumulated frame every time.
pieces = []
for table_type, start_year in type_and_fiscalyear:
    temp = df[
        (df["review_section_table_type"] == table_type)
        & (df["review_section_table_startfiscalyear"] == start_year)
    ]
    multi = temp["delta"].unique()
    assert len(multi) == 1, "年度が集合内で単一ではありません"
    pieces.extend([temp] * (int(multi[0]) + 1))
if pieces:
    results = pd.concat(pieces, ignore_index=True)
else:
    # Nothing to expand: keep an empty frame with the expected columns
    results = pd.DataFrame(columns=df.columns.values)

# Number the copies of each review section 0, 1, 2, ... and add that offset to the
# starting year to obtain the fiscal year each copy applies to
results["cum"] = results.groupby(["review_section_table_type", "review_section_table_start_date", "review_section_path"]).cumcount()
results["applied_fiscalyear"] = results["review_section_table_startfiscalyear"] + results["cum"]

# Keep only the columns that are stored in the database
results = results[[
    "review_section_table_type",
    "review_section_table_start_date",
    "review_section_table_end_date",
    "applied_fiscalyear",
    "review_section_path",
    "review_section_niicode",
    "review_section_name",
    "layer"
]]
results

In [None]:
# Drop the foreign key on the review-section code before master_review_sections is replaced.
# This is best effort: the constraint may not exist yet, so a database error is reported
# and the cell carries on instead of the error being silently swallowed.
try:
    with engine.connect() as con:
        # text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings
        con.execute(text("ALTER TABLE grantaward_review_section DROP FOREIGN KEY fk_grantaward_review_section_review_section_niicode;"))
except SQLAlchemyError as exc:
    print("Skipped dropping foreign key fk_grantaward_review_section_review_section_niicode:", exc)

# Write the per-fiscal-year review sections to MariaDB, replacing any existing table
results.to_sql(
    "master_review_sections",
    engine,
    if_exists="replace",
    dtype={
        "review_section_table_type": String(255),
        "review_section_table_start_date": Date,
        "review_section_table_end_date": Date,
        "applied_fiscalyear": Integer,
        "review_section_path": String(255),
        "review_section_niicode": Integer,
        "layer": Integer,
    },
)


In [None]:
# Add an index on review_section_niicode, then enforce one row per
# (table type, applied fiscal year, path, NII code) in master_review_sections.
# text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings.
with engine.connect() as con:
    con.execute(text("ALTER TABLE master_review_sections ADD INDEX (review_section_niicode);"))
    con.execute(text("ALTER TABLE `master_review_sections` ADD UNIQUE( `review_section_table_type`, `applied_fiscalyear`, `review_section_path`, `review_section_niicode`);"))

## 5. section_master_kakenhi

In [None]:
# Read the section master and collect one [NII code, English name] pair per <section> element
tree = etree.parse("/Users/norika_machome/GitHub/Capstone_KAKEN/External_Sources/grants_masterxml_kaken/section_master_kakenhi.xml")
sectionlist = [
    [entry.find("code[@type='nii']").text, entry.find("name[@lang='en']").text]
    for table in tree.iterfind("section_table")
    for entry in table.iterfind("section")
]
# Build the dataframe and remove rows that are exact repeats
columns = ["section_niicode", "section_name"]
df = pd.DataFrame(sectionlist, columns=columns).drop_duplicates()
# The NII code becomes the index, so it must be unique after de-duplication
has_duplicate_codes = df["section_niicode"].duplicated().any()
assert not has_duplicate_codes, "section_niicode is duplicated."
df = df.set_index("section_niicode")
df

In [None]:
# Drop the grantaward foreign key on section_niicode before master_sections is replaced.
# This is best effort: the constraint may not exist yet, so a database error is reported
# and the cell carries on instead of the error being silently swallowed.
try:
    with engine.connect() as con:
        # text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings
        con.execute(text("ALTER TABLE grantaward DROP FOREIGN KEY section_niicode_1;"))
except SQLAlchemyError as exc:
    print("Skipped dropping foreign key section_niicode_1:", exc)

# Write the section master to MariaDB, replacing any existing master_sections table
df.to_sql("master_sections", engine, if_exists="replace", dtype={"section_niicode": Integer})

In [None]:
# Read the section master again, this time keeping the table-level attributes
# (type, start/end date) together with every per-section detail
tree = etree.parse("/Users/norika_machome/GitHub/Capstone_KAKEN/External_Sources/grants_masterxml_kaken/section_master_kakenhi.xml")
columns = [
    "section_table_type",
    "section_table_start_date",
    "section_table_end_date",
    "section_niicode",
    "section_mextcode",
    "section_name_ja",
    "section_name_en",
    "section_name_yomi",
]
sectionlist = []
for table in tree.iterfind("section_table"):
    # Attributes shared by every section listed in this table
    table_attrs = [table.get("type"), table.get("start_date"), table.get("end_date")]
    for entry in table.iterfind("section"):
        # The MEXT code is optional: store None when the element is absent.
        # name_yomi is read without such a guard, so it is required here.
        mext_node = entry.find("code[@type='mext']")
        sectionlist.append(
            table_attrs
            + [
                entry.find("code[@type='nii']").text,
                mext_node.text if mext_node is not None else None,
                entry.find("name[@lang='ja']").text,
                entry.find("name[@lang='en']").text,
                entry.find("name_yomi").text,
            ]
        )
# One dataframe row per <section> element
df = pd.DataFrame(sectionlist, columns=columns)
df

In [None]:
#
# Convert the master from "one row per application section with a start and end date"
# to "one row per application section per applicable fiscal year".
#

# Treat a missing end date as "still valid today"
df["section_table_end_date_mod"] = df["section_table_end_date"].fillna(datetime.datetime.today())

# Reduce the dates to years. The end date is shifted back three months first, so an
# end date in January-March counts towards the previous year
# (presumably because the fiscal year starts in April -- confirm).
df["section_table_startfiscalyear"] = pd.to_datetime(df["section_table_start_date"]).map(lambda x: x.year)
df["section_table_endfiscalyear"] = pd.to_datetime(df["section_table_end_date_mod"]).map(lambda x: (x - relativedelta(months=3)).year)
df["delta"] = df["section_table_endfiscalyear"] - df["section_table_startfiscalyear"]

# Every distinct (table type, starting fiscal year) combination
type_and_fiscalyear = df[["section_table_type", "section_table_startfiscalyear"]].drop_duplicates().values

# Repeat the rows of each combination once per fiscal year it covers (delta + 1 copies).
# The copies are collected and concatenated once: DataFrame.append was removed in
# pandas 2.0, and appending inside the loop re-copied the accumulated frame every time.
pieces = []
for table_type, start_year in type_and_fiscalyear:
    temp = df[
        (df["section_table_type"] == table_type)
        & (df["section_table_startfiscalyear"] == start_year)
    ]
    multi = temp["delta"].unique()
    assert len(multi) == 1, "年度が集合内で単一ではありません"
    pieces.extend([temp] * (int(multi[0]) + 1))
if pieces:
    results = pd.concat(pieces, ignore_index=True)
else:
    # Nothing to expand: keep an empty frame with the expected columns
    results = pd.DataFrame(columns=df.columns.values)

# Number the copies of each section 0, 1, 2, ... and add that offset to the starting
# year to obtain the fiscal year each copy applies to
results["cum"] = results.groupby(["section_table_type", "section_table_start_date", "section_niicode"]).cumcount()
results["applied_fiscalyear"] = results["section_table_startfiscalyear"] + results["cum"]

# Keep only the columns that are stored in the database
results = results[[
    "section_table_type",
    "section_table_start_date",
    "section_table_end_date",
    "applied_fiscalyear",
    "section_niicode",
    "section_mextcode",
    "section_name_ja",
    "section_name_en",
    "section_name_yomi",
]]
results

In [None]:
# Drop the grantaward foreign key on section_niicode before the table is written.
# This is best effort: the constraint may already be gone, so a database error is
# reported and the cell carries on instead of the error being silently swallowed.
try:
    with engine.connect() as con:
        # text() wraps the raw SQL because SQLAlchemy 2.x no longer executes plain strings
        con.execute(text("ALTER TABLE grantaward DROP FOREIGN KEY section_niicode_1;"))
except SQLAlchemyError as exc:
    print("Skipped dropping foreign key section_niicode_1:", exc)

# Write the per-fiscal-year sections to MariaDB, replacing any existing table
results.to_sql("master_sections_fiscalyear", engine, if_exists="replace", dtype={"section_niicode": Integer})