In [1]:
import json
import os

base_dir = "./raw_data/basho"

torikumi_item_keys = [
    "id",
    "bashoId",
    "division",
    "day",
    "matchNo",
    "eastId",
    "eastShikona",
    "eastRank",
    "westId",
    "westShikona",
    "westRank",
    "kimarite",
    "winnerId",
    "winnerEn",
    "winnerJp",
].sort()

all_results = []
for file_name in os.listdir(base_dir):
    with open(os.path.join(base_dir, file_name)) as f:
        data = json.load(f)
        if "torikumi" in data:
            torikumi = data["torikumi"]
            # there were no column mismatches
            # for result in torikumi:
            # if list(result.keys()).sort() != torikumi_item_keys:
            #     print(file_name)
            all_results += torikumi

In [2]:
import pandas as pd

matches_df = pd.DataFrame.from_records(all_results)
matches_df = matches_df.drop(
    columns=["id", "eastShikona", "westShikona", "winnerEn", "winnerJp"]
)

matches_df.to_csv("processed_data/matches.csv", index=False)

matches_df

Unnamed: 0,bashoId,division,day,matchNo,eastId,eastRank,westId,westRank,kimarite,winnerId
0,200003,Makuuchi,9,1,3845,Juryo 1 West,3856,Maegashira 13 West,kotenage,3856
1,200003,Makuuchi,9,2,3745,Maegashira 11 East,3834,Juryo 1 East,tsukiotoshi,3834
2,200003,Makuuchi,9,3,3743,Maegashira 9 East,3838,Maegashira 12 West,yorikiri,3838
3,200003,Makuuchi,9,4,3747,Maegashira 8 East,3836,Maegashira 14 East,oshidashi,3836
4,200003,Makuuchi,9,5,3860,Maegashira 7 East,794,Maegashira 12 East,oshitaoshi,794
...,...,...,...,...,...,...,...,...,...,...
718836,200711,Makuuchi,7,16,3181,Maegashira 3 East,3858,Ozeki 1 West,tsukidashi,3858
718837,200711,Makuuchi,7,17,3845,Ozeki 1 East,3835,Maegashira 4 West,yorikiri,3845
718838,200711,Makuuchi,7,18,3398,Sekiwake 1 West,4071,Ozeki 2 West,yorikiri,4071
718839,200711,Makuuchi,7,19,4226,Ozeki 2 East,3247,Sekiwake 1 East,fusen,3247


In [5]:
import pandas as pd

with open("./raw_data/rikishi.json") as f:
    data = json.load(f)
    rikishi_df = pd.DataFrame.from_records(data["records"])

rikishi_df = rikishi_df.drop(
    columns=[
        "sumodbId",
        "nskId",
        "shikonaEn",
        "intai",
        "updatedAt",
        "shikonaJp",
        "createdAt",
    ]
)

rikishi_df.to_csv("processed_data/rikishi.csv", index=False)

rikishi_df

Unnamed: 0,id,heya,birthDate,shusshin,debut,height,weight,currentRank
0,7522,-,0001-01-01T00:00:00Z,-,195401,,,
1,2973,Hanakago,1943-03-02T00:00:00Z,"Hokkaido, Kamikawa-gun, Higashikagura-cho",195907,,,
2,6428,Oshima,1971-02-24T00:00:00Z,"Osaka-fu, Osaka-shi, Tsurumi-ku",198705,178.5,111.0,
3,3683,Naruto,1978-04-05T00:00:00Z,"Ehime-ken, Uma-gun, Shingu-mura - Ehime-ken, S...",199403,177.0,113.5,
4,6560,Takadagawa,1967-05-14T00:00:00Z,"Tokyo-to, Edogawa-ku",198303,186.0,77.0,
...,...,...,...,...,...,...,...,...
8927,8929,Asakayama,2008-05-16T15:00:00Z,Saitama,202405,175.0,134.0,Jonokuchi 17 West
8928,8930,Hakkaku,2007-04-25T15:00:00Z,Shimane,202405,170.0,110.0,Jonokuchi 19 West
8929,8931,Tatsunami,2001-10-08T15:00:00Z,Aomori,202405,175.0,160.0,Sandanme 90 East
8930,8932,Isegahama,2000-08-23T15:00:00Z,Mongolia,202405,192.0,127.0,Jonokuchi 3 West


In [4]:
df = (
    matches_df.merge(rikishi_df, left_on="westId", right_on="id")
    .merge(rikishi_df, left_on="eastId", right_on="id", suffixes=["_west", "_east"])
    .drop(columns=["id_east", "id_west"])
)

df

Unnamed: 0,bashoId,division,day,matchNo,eastId,eastRank,westId,westRank,kimarite,winnerId,...,currentRank_west,createdAt_west,heya_east,birthDate_east,shusshin_east,debut_east,height_east,weight_east,currentRank_east,createdAt_east
0,200003,Makuuchi,9,1,3845,Juryo 1 West,3856,Maegashira 13 West,kotenage,3856,...,,,Sadogatake,1976-04-11T00:00:00Z,"Aichi-ken, Okazaki-shi",199903,182.0,154.0,,
1,200003,Makuuchi,9,2,3745,Maegashira 11 East,3834,Juryo 1 East,tsukiotoshi,3834,...,,,Isenoumi,1972-06-16T00:00:00Z,"Kyoto-fu, Kyoto-shi, Nishikyo-ku",199503,177.0,128.5,,
2,200003,Makuuchi,9,3,3743,Maegashira 9 East,3838,Maegashira 12 West,yorikiri,3838,...,,,Mihogaseki,1970-03-21T00:00:00Z,"Kumamoto-ken, Uto-shi",199201,178.0,126.0,,
3,200003,Makuuchi,9,4,3747,Maegashira 8 East,3836,Maegashira 14 East,oshidashi,3836,...,,,Takasago,1969-12-11T00:00:00Z,"Aichi-ken, Ichinomiya-shi",199203,176.0,142.3,,
4,200003,Makuuchi,9,5,3860,Maegashira 7 East,794,Maegashira 12 East,oshitaoshi,794,...,,,Tokitsukaze,1973-11-08T00:00:00Z,"Nagasaki-ken, Fukue-shi",199603,184.0,134.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718836,200711,Makuuchi,7,16,3181,Maegashira 3 East,3858,Ozeki 1 West,tsukidashi,3858,...,,,Michinoku,1985-08-10T00:00:00Z,"Mongolia, Ulan-Bator - Mongolia, Sükhbaatar",200111,186.0,150.0,,
718837,200711,Makuuchi,7,17,3845,Ozeki 1 East,3835,Maegashira 4 West,yorikiri,3845,...,,,Sadogatake,1976-04-11T00:00:00Z,"Aichi-ken, Okazaki-shi",199903,182.0,154.0,,
718838,200711,Makuuchi,7,18,3398,Sekiwake 1 West,4071,Ozeki 2 West,yorikiri,4071,...,,,Takasago,1981-08-07T00:00:00Z,"Mongolia, Ulan-Bator",200001,184.5,136.0,,
718839,200711,Makuuchi,7,19,4226,Ozeki 2 East,3247,Sekiwake 1 East,fusen,3247,...,,,Sadogatake,1983-02-19T00:00:00Z,"Bulgaria, Veliko Tarnovo",200211,202.0,140.0,,
