In [87]:
import pandas as pd
import numpy as np
from unidecode import unidecode

In [2]:
def clean_wrestler_info(wrestler_info: pd.DataFrame):
    """Method to make the general wreslter info easier to parse

    Modifies it in place, so it does not return anything"""
    # This adds the country of birth as a column to the dataframe
    birth_countries = wrestler_info[~wrestler_info['Birthplace'].isnull()]['Birthplace'].apply(lambda st: st[st.find("(")+1:st.find(")")])

    wrestler_info['Birth Country'] = birth_countries
    height_and_weight = wrestler_info[~wrestler_info['Weight and Height'].isnull()]['Weight and Height'].str.split('at', expand=True)

    weight = height_and_weight[~height_and_weight[0].isnull()][0].apply(lambda st: st[st.find("(")+1:st.find(")")])
    weight = pd.to_numeric(weight.str[:-3], errors='coerce')
    weight = weight[~weight.isnull()]
    wrestler_info['Weight'] = weight

    height = height_and_weight[~height_and_weight[1].isnull()][1].apply(lambda st: st[st.find("(")+1:st.find(")")])
    height = pd.to_numeric(height.str[:-2], errors='coerce')
    height = height[~height.isnull()]
    wrestler_info['Height'] = height

    wrestler_info.drop(['Weight and Height'], axis=1, inplace=True)

    dob = wrestler_info['Date of Birth'].str.replace(r"\(.*\)","", regex=True)
    dob = pd.to_datetime(dob, errors='coerce')

    debut = wrestler_info['Debut'].str.replace(r"\(.*\)","", regex=True)
    debut = pd.to_datetime(debut, errors='coerce')

    death = wrestler_info['Date of Death'].str.replace(r"\(.*\)","", regex=True)
    death = pd.to_datetime(dob, errors='coerce')

    wrestler_info['Date of Birth'] = dob
    wrestler_info['Debut'] = debut
    wrestler_info['Date of Death'] = death

In [119]:
wrestler_df = pd.read_csv('Data/general_info_per_wrestler_final.csv')

wrestler_df['wrestler_name'] = wrestler_df['wrestler_name'].apply(lambda x: unidecode(x))

clean_wrestler_info(wrestler_df)

fact_counts = pd.read_csv('facts_counts_final.csv')

match_wrestler_info = pd.read_csv('Data/5_star_matches_wrestler_list.csv')
match_wrestler_info.drop(['Unnamed: 0'], axis=1, inplace=True)

In [120]:
problems = []

# Used for creating the resulting merged dataframe
cur_row = 0
merged_df = pd.DataFrame(index=range(12*len(match_wrestler_info.index)), columns=range(18))

for i in range(len(match_wrestler_info.index)):
    # First for loop is to go through all of the rows
    cur_match_info = match_wrestler_info.iloc[i][:8]
    # Weird transpose it to make some concats easier later on
    cur_match_info = pd.DataFrame(cur_match_info).transpose()

    cur_wrestler_set = match_wrestler_info.iloc[i][8:]
    #print(cur_match_info)

    # Might be more efficient to precreate it as a df or something, but whatev for now
    new_rows_to_add = []

    for j in range(12):
        # There's max 12 wrestlers, so hard coding is good let's gooooo
        cur_wrestler = cur_wrestler_set[j]

        if pd.isna(cur_wrestler):
            # Break should be valid, the wrestlers should be somewhat ordered
            break

        to_append = wrestler_df[wrestler_df['wrestler_name'] == cur_wrestler]

        if len(to_append.index) != 1:
            # It means that it's a different number than 1, so there's a problem somewhere
            problems.append((cur_wrestler, i))
            continue

        merged_df.iloc[cur_row] = pd.concat([cur_match_info.reset_index(drop=True), to_append.reset_index(drop=True)], axis=1)

        cur_row += 1

merged_df.dropna(how='all', inplace=True)

In [125]:
problem_wrestlers = set()
for i in range(len(problems)):
    problem_wrestlers.add(problems[i][0])

In [126]:
problem_wrestlers

{'A.J. Styles',
 'Aja Kong',
 'Akira Hokuto',
 'Andrade "Cien" Almas',
 'Animal',
 'Bandido',
 'Big Van Vader',
 'Black Taurus',
 'Bull Nakano',
 'Chigusa Nagayo',
 'Cima',
 'Cutie Suzuki',
 'Devil Masami',
 'Donovan Dijak',
 'Dynamite Kansai',
 'Eddy Guerrero',
 'El Hijo Del Santo',
 'Etsuko Mita',
 'Flamita',
 'Fénix',
 'Genichiro Tenryu',
 'Hawk',
 'Hikari Fukuoka',
 'Jaguar Yokota',
 'Johnny Ace',
 'Kazue Nagahori',
 'Keiji Mutoh',
 'Kyoko Inoue',
 'Lioness Asuka',
 'Luchasaurus',
 'Manami Toyota',
 'Mayumi Ozaki',
 "Men's Teioh",
 'Michael Oku',
 'Mika Komatsu',
 'Mika Suzuki',
 'Mika Takahashi',
 'Mima Shimoda',
 'Mitsuko Nishiwaki',
 'Naoki Sano',
 'Octagón',
 'Pac',
 'Pentagon Jr.',
 'Pentagón Jr.',
 'Psicosis',
 'Razor Ramon',
 'Rey Misterio Jr.',
 'Sachiko Nakamura',
 'Sakie Hasegawa',
 'Samson Fuyuki',
 'Sanada',
 'Satoru Asako',
 'Satoru Sayama',
 'Shinobu Kandori',
 'Shiryu',
 'Shoichi Funaki',
 'Sid Vicious',
 'Stone Cold Steve Austin',
 'Syuri',
 'Taka Michinoku',
 'Taka

In [146]:
# I think it's easier to fix the matches than it is to fix the wrestler list, so this is just that.
problem_wrestler_fix = {'A.J. Styles': 350, 'Aja Kong': 577, 'Akira Hokuto': 578, 'Andrade "Cien" Almas' : 2581, 'Animal': 98,'Bandido': 27228, 'Big Van Vader': 51, 'Black Taurus': 5533, 'Bull Nakano': 595, 'Chigusa Nagayo': 2020, 'Cima': 968, 'Cutie Suzuki': 2070, 'Devil Masami': 2016, 'Donovan Dijak': 28415, 'Dynamite Kansai': 2023, 'Eddy Guerrero': 64, 'El Hijo Del Santo': 835, 'Etsuko Mita': 2025, 'Flamita': 18326, 'Fénix': 17005, 'Genichiro Tenryu': 938, 'Hawk': 99, 'Hikari Fukuoka': 8758, 'Jaguar Yokota': 3161, 'Johnny Ace': 638, 'Kazue Nagahori': 18980, 'Keiji Mutoh': 276, 'Kyoko Inoue': 674, 'Lioness Asuka': 623, 'Luchasaurus': 23527, 'Manami Toyota': 1380, 'Mayumi Ozaki': 2022, "Men's Teioh": 1449, 'Michael Oku': 34867, 'Mika Komatsu': 18981, 'Mika Suzuki': 3181, 'Mika Takahashi': 27017, 'Mima Shimoda': 2026, 'Mitsuko Nishiwaki': 27168, 'Naoki Sano': 995, 'Octagón': 1576, 'Pac': 2606, 'Pentagon Jr.': 13328, 'Pentagón Jr.': 13328, 'Psicosis': 779, 'Razor Ramon': 44, 'Rey Misterio Jr.': 66, 'Sakie Hasegawa': 2489, 'Samson Fuyuki': 1008, 'Sanada': 3194, 'Satoru Asako': 1058, 'Shinobu Kandori': 2035, 'Shiryu': 150, 'Shoichi Funaki': 111, 'Sid Vicious': 82, 'Stone Cold Steve Austin': 9, 'Syuri': 10764, 'Taka Michinoku': 127, 'Takako Inoue': 675, 'The War Machine': 104, 'Tiger Mask IV': 351, 'Toshiyo Yamada': 2044, 'Tsuyoshi Kohsaka': 11796, 'Utami Hayashishita': 35239, 'Walter': 2372, 'Wild Pegasus': 24, 'Yumi Ogura': 3151, 'Yumiko Hotta': 2033, 'Satoru Sayama': 766}

# I have to fix tiger mask separately because there's two of them :(
# First is the first with NJPW, second is the first with AJPW
tiger_masks = [766, 745]

# These are the wrestlers that just aren't contained in the database, for whatever reason
# They're both women, and both worked with AJW. Their matches were in house shows
to_remove = ['Sachiko Nakamura', 'Yachiya Hirata']

I'll document the parsing issues for posterity's sake:
- A bunch of the wrestlers weren't found directly for some of the following reasons:
    - Different spelling of name (for example, different subtitles, or adjectives)
    - Accents, and other language specific things
    - Multiple wrestlers with same name
- Tiger mask appears in multiple matches, but in each match he represents a different person
- Two wrestlers didn't exist in the database. Searching for them elsewhere also gave no significant results
    - Both were female AJW wrestlers, whose match was in a 5 star house show
    - They are: Sachiko Nakamura and Yachiya Hirata

To fix this up I just ran the code again. Is it efficient? No. Is it easy to program? Hell yeah

In [147]:
ultra_problems = []

# Used for creating the resulting merged dataframe
cur_row = 0
merged_df = pd.DataFrame(index=range(12*len(match_wrestler_info.index)), columns=range(18))

for i in range(len(match_wrestler_info.index)):
    # First for loop is to go through all of the rows
    cur_match_info = match_wrestler_info.iloc[i][:8]
    # Weird transpose it to make some concats easier later on
    cur_match_info = pd.DataFrame(cur_match_info).transpose()

    cur_wrestler_set = match_wrestler_info.iloc[i][8:]
    #print(cur_match_info)

    # Might be more efficient to precreate it as a df or something, but whatev for now
    new_rows_to_add = []

    for j in range(12):
        # There's max 12 wrestlers, so hard coding is good let's gooooo
        cur_wrestler = cur_wrestler_set[j]

        if pd.isna(cur_wrestler):
            # Break should be valid, the wrestlers should be somewhat ordered
            break

        if cur_wrestler in problem_wrestlers:
            # So this is just a way of manually searching through the problem wrestler list
            # Hopefully it works, for my

            # I also have to deal with a case where it is not in the problem wrestler fix... fml
            # Or where the index wasn't parsed, for whatever reason...
            cur_wrestler = wrestler_df[wrestler_df['Unnamed: 0'] == problem_wrestler_fix[cur_wrestler]]['wrestler_name'].iloc[0]

        to_append = wrestler_df[wrestler_df['wrestler_name'] == cur_wrestler]

        if len(to_append.index) != 1:
            # It means that the wrestler wasn't parsed at all, which is not cool tbh
            ultra_problems.append((cur_wrestler, i))
            continue

        merged_df.iloc[cur_row] = pd.concat([cur_match_info.reset_index(drop=True), to_append.reset_index(drop=True)], axis=1)

        cur_row += 1

merged_df.dropna(how='all', inplace=True)

IndexError: single positional indexer is out-of-bounds

In [156]:
cur_wrestler in problem_wrestler_fix
#wrestler_df[wrestler_df['Unnamed: 0'] == problem_wrestler_fix[cur_wrestler]]['wrestler_name']

True