In [1]:
import pandas as pd
import numpy as np
from unidecode import unidecode

# Cleaning up the data

The first step will be to clean up (more) the two datasets. The code is given, but the general idea of the cleaning is just to make the data easier to parse.

## Cleaning wrestler info

We'll start with cleaning up the wrestling info that's contained. Some processing has already been done, this cleaning is mainly done for the sake of making the analysis easier

In [2]:
def country_simplifier(row):
    """Due to the countries where pro wrestling is more popular, we will simplify the countries. This is mainly using only the 'main' wrestling countries, and leaving the others as 'other'.

    This is also to deal with issues of the names being mispelled for whatever reason"""
    if (row['Birth Country'] == 'United States of America') or (row['Birth Country'] == 'United States of Americ'):
        return 'USA'
    elif (row['Birth Country'] == 'Canada') or (row['Birth Country'] == 'Canad'):
        # Because of how canada and US are linked in wrestling world, I'll mark the two together
        return 'USA'
    elif (row['Birth Country'] == 'Japa') or (row['Birth Country'] == 'Japan'):
        return 'Japan'
    elif (row['Birth Country'] == 'Mexico') or (row['Birth Country'] == 'Mexic'):
        return 'Mexico'
    elif (row['Birth Country'] == 'United Kingdom') or (row['Birth Country'] == 'United Kingdo'):
        return 'UK'
    else:
        return 'Other'

In [3]:
def clean_wrestler_info(wrestler_info: pd.DataFrame):
    """Method to make the general wreslter info easier to parse

    Modifies it in place, so it does not return anything"""
    # This adds the country of birth as a column to the dataframe
    birth_countries = wrestler_info[~wrestler_info['Birthplace'].isnull()]['Birthplace'].apply(lambda st: st[st.find("(")+1:st.find(")")])

    wrestler_info['Birth Country'] = birth_countries
    height_and_weight = wrestler_info[~wrestler_info['Weight and Height'].isnull()]['Weight and Height'].str.split('at', expand=True)

    weight = height_and_weight[~height_and_weight[0].isnull()][0].apply(lambda st: st[st.find("(")+1:st.find(")")])
    weight = pd.to_numeric(weight.str[:-3], errors='coerce')
    weight = weight[~weight.isnull()]
    wrestler_info['Weight'] = weight

    height = height_and_weight[~height_and_weight[1].isnull()][1].apply(lambda st: st[st.find("(")+1:st.find(")")])
    height = pd.to_numeric(height.str[:-2], errors='coerce')
    height = height[~height.isnull()]
    wrestler_info['Height'] = height

    wrestler_info.drop(['Weight and Height'], axis=1, inplace=True)

    # TODO: Modify DOB, Debut, and Date of Death so that the year can be parsed separately
    # Needed because the year is sometimes all the info that's contained, due to how old info was stored
    dob = wrestler_info['Date of Birth'].str.replace(r"\(.*\)","", regex=True)
    dob = pd.to_datetime(dob, errors='coerce')

    debut = wrestler_info['Debut'].str.replace(r"\(.*\)","", regex=True)
    debut = pd.to_datetime(debut, errors='coerce')

    death = wrestler_info['Date of Death'].str.replace(r"\(.*\)","", regex=True)
    death = pd.to_datetime(death, errors='coerce')

    wrestler_info['Date of Birth'] = dob
    wrestler_info['Debut'] = debut
    wrestler_info['Date of Death'] = death

    wrestler_info['Region'] = wrestler_info.apply(lambda row: country_simplifier(row), axis=1)


In [4]:
wrestler_df = pd.read_csv('all_info_merged_old.csv')
wrestler_df.drop(['Unnamed: 0'], axis=1, inplace=True)

# Extra step for consistency at some other point
wrestler_df['wrestler_name'] = wrestler_df['wrestler_name'].apply(lambda x: unidecode(x))

clean_wrestler_info(wrestler_df)

match_wrestler_info = pd.read_csv('Data/5_star_matches_wrestler_list.csv')
match_wrestler_info.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
wrestler_df

Unnamed: 0,id,Real Name,Birthplace,Date of Birth,Debut,wrestler_name,Date of Death,Height,Weight,Alias,...,Hair Matches Lost,Mask Loss,Mask Wins,Student,Booked Promotions,Familiy Tie,Mask Losses,Fighting Style,Birth Country,Region
0,1,Peter Senerchia,"New York City, New York (United States of Amer...",1967-10-11,1987-06-03,Taz,NaT,1.75,112.0,1,...,0.0,0,0.0,0,0.0,0,0.0,0,United States of America,USA
1,2,Scott Anthony Levy,"Short Hills, New Jersey (United States of Amer...",1964-09-08,1988-02-20,Raven,NaT,1.83,111.0,1,...,0.0,0,0.0,0,0.0,0,0.0,0,United States of America,USA
2,3,Troy Shane Martin,"Pittsburgh, Pennsylvania (United States of Ame...",1964-11-21,1982-01-01,Shane Douglas,NaT,1.85,113.0,1,...,0.0,0,0.0,0,1.0,0,0.0,0,United States of America,USA
3,4,James D. Fullington,"Philadelphia, Pennsylvania (United States of A...",1963-06-16,1989-01-01,The Sandman,NaT,1.91,111.0,1,...,0.0,0,0.0,1,0.0,0,0.0,0,United States of America,USA
4,5,Perry Arthur Satullo,"Cleveland, Ohio (United States of America)",1966-10-25,1990-10-27,Perry Saturn,NaT,1.78,106.0,1,...,0.0,0,2.0,0,0.0,0,0.0,0,United States of America,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26245,30188,Catherine Costigan,Limerick (Ireland),NaT,2010-12-18,Catherine Costigan,NaT,1.63,48.0,1,...,,0,,0,,0,0.0,0,Ireland,Other
26246,30189,Amber Leibrock,"Hayward, California (United States of America)",1988-02-08,2015-06-09,Amber Leibrock,NaT,1.80,66.0,1,...,,0,,0,,0,0.0,0,United States of America,USA
26247,30190,Gonzalo Mendiola Ortega,"San Antonio, Texas (United States of America)",1933-04-09,NaT,Pancho Lomas,2011-10-22,,,1,...,,0,,0,,0,0.0,0,United States of America,USA
26248,30191,Otto Frederick Roehm,,1882-08-02,NaT,Otto Roehm,1958-05-29,1.65,70.0,1,...,,0,,0,,0,0.0,0,,Other


## Cleaning the 5 star matches

The cleaning is mainly adding the country of the promotion for each match

In [6]:
promotion_country = {'CWF': 'USA', 'NJPW': 'Japan', 'UWF': 'Japan', 'AJPW': 'Japan', 'AJW': 'Japan', 'JCP': 'USA', 'WCW': 'USA', 'JWP': 'Japan', 'WWF': 'USA', 'AAA': 'Mexico', 'ECW': 'USA', 'Michinoku Pro': 'Japan', 'RINGS': 'Japan', 'Noah': 'Japan', 'ROH': 'USA', 'TNA': 'USA', 'WWE': 'USA', 'PWG': 'USA', 'La Triple W': 'Other', 'AEW': 'USA', 'OTT': 'Other', 'RPW': 'UK', 'Stardom': 'Japan'}

In [7]:
match_wrestler_info['Promotion Country'] = match_wrestler_info['Promotion'].map(promotion_country)

# This is an ugly way of rearranging the columns, but it helps so much with visualization
cols = match_wrestler_info.columns.tolist()
cols = cols[:8] + [cols[-1]] + cols[8:-1]

match_wrestler_info = match_wrestler_info[cols]

In [8]:
match_wrestler_info

Unnamed: 0,Date,Match,Promotion,Event,Rating,# of teams,Team Size,PPV,Promotion Country,Wrestler 1,...,Wrestler 3,Wrestler 4,Wrestler 5,Wrestler 6,Wrestler 7,Wrestler 8,Wrestler 9,Wrestler 10,Wrestler 11,Wrestler 12
0,"April 7, 1982",Ric Flair vs. Butch Reed,CWF,Miami Beach show,5.00,2,1,False,USA,Ric Flair,...,,,,,,,,,,
1,"April 21, 1983",Dynamite Kid vs. Tiger Mask,NJPW,Big Fight Series IINight 19,5.00,2,1,True,Japan,Dynamite Kid,...,,,,,,,,,,
2,"December 5, 1984",Kazuo Yamazaki vs. Nobuhiko Takada,UWF,Year-End SpecialDay 10,5.00,2,1,True,Japan,Kazuo Yamazaki,...,,,,,,,,,,
3,"December 8, 1984",Bruiser Brody and Stan Hansen vs. Dory Funk Jr...,AJPW,Real World Tag LeagueNight 15,5.00,2,2,True,Japan,Bruiser Brody,...,Dory Funk Jr.,Terry Funk,,,,,,,,
4,"March 9, 1985",Kuniaki Kobayashi vs. Tiger Mask,AJPW,85 Gekitoh! Exciting WarsNight 14,5.00,2,1,True,Japan,Kuniaki Kobayashi,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,"November 13, 2021",Adam Page vs. Kenny Omega,AEW,Full Gear,5.50,2,1,True,USA,Adam Page,...,,,,,,,,,,
192,"December 15, 2021",Adam Page vs. Bryan Danielson,AEW,Winter is Coming,5.00,2,1,False,USA,Adam Page,...,,,,,,,,,,
193,"January 5, 2022",Kazuchika Okada vs. Will Ospreay,NJPW,Wrestle Kingdom 16Night 2,5.75,2,1,True,Japan,Kazuchika Okada,...,,,,,,,,,,
194,"January 26, 2022",Cody Rhodes vs. Sammy Guevara,AEW,Beach Break,5.00,2,1,False,USA,Cody Rhodes,...,,,,,,,,,,


# Merging the datasets

Here we merge the datasets.

Because we're relying on the names of the wrestlers and the names contained in the matches to be the same, this does lead to problems. So this section is a bit longer than it would be if there was just an index to work with

In [9]:
problems = []

# Used for creating the resulting merged dataframe
cur_row = 0

# So it's the info stored in the wrestler_df, plus 8 (which is the match info that actually matters)5151
merged_df = pd.DataFrame(index=range(12*len(match_wrestler_info.index)), columns=range(len(wrestler_df.columns)+9))

for i in range(len(match_wrestler_info.index)):
    # First for loop is to go through all of the rows
    cur_match_info = match_wrestler_info.iloc[i][:9]
    # Weird transpose it to make some concats easier later on
    cur_match_info = pd.DataFrame(cur_match_info).transpose()

    cur_wrestler_set = match_wrestler_info.iloc[i][9:]
    #print(cur_match_info)

    # Might be more efficient to precreate it as a df or something, but whatev for now
    new_rows_to_add = []

    for j in range(12):
        # There's max 12 wrestlers in a match, so hard coding is good let's gooooo
        cur_wrestler = cur_wrestler_set[j]

        if pd.isna(cur_wrestler):
            # Break should be valid, the wrestlers should be somewhat ordered
            break

        to_append = wrestler_df[wrestler_df['wrestler_name'] == cur_wrestler]

        if len(to_append.index) != 1:
            # It means that it's a different number than 1, so there's a problem somewhere
            problems.append((cur_wrestler, i))
            continue

        merged_df.iloc[cur_row] = pd.concat([cur_match_info.reset_index(drop=True), to_append.reset_index(drop=True)], axis=1)

        cur_row += 1

merged_df.dropna(how='all', inplace=True)

In [10]:
problem_wrestlers = set()
for i in range(len(problems)):
    problem_wrestlers.add(problems[i][0])

In [11]:
problem_wrestlers

{'A.J. Styles',
 'Aja Kong',
 'Akira Hokuto',
 'Andrade "Cien" Almas',
 'Bandido',
 'Big Van Vader',
 'Black Taurus',
 'Bull Nakano',
 'Chigusa Nagayo',
 'Cima',
 'Cutie Suzuki',
 'Dan Kroffat',
 'Devil Masami',
 'Donovan Dijak',
 'Dragon Kid',
 'Dynamite Kansai',
 'Eddy Guerrero',
 'El Hijo Del Santo',
 'Etsuko Mita',
 'Flamita',
 'Fénix',
 'Genichiro Tenryu',
 'Hawk',
 'Hikari Fukuoka',
 'Jaguar Yokota',
 'Kazue Nagahori',
 'Keiji Mutoh',
 'Kyoko Inoue',
 'Lioness Asuka',
 'Luchasaurus',
 'Manami Toyota',
 'Mayumi Ozaki',
 "Men's Teioh",
 'Michael Oku',
 'Mika Komatsu',
 'Mika Suzuki',
 'Mika Takahashi',
 'Mima Shimoda',
 'Mitsuko Nishiwaki',
 'Naoki Sano',
 'Octagón',
 'Pac',
 'Pentagon Jr.',
 'Pentagón Jr.',
 'Psicosis',
 'Razor Ramon',
 'Rey Misterio Jr.',
 'Sachiko Nakamura',
 'Sakie Hasegawa',
 'Samson Fuyuki',
 'Sanada',
 'Satoru Sayama',
 'Shinobu Kandori',
 'Shiryu',
 'Shoichi Funaki',
 'Sid Vicious',
 'Stone Cold Steve Austin',
 'Syuri',
 'Taka Michinoku',
 'Takako Inoue',
 

In [12]:
# I think it's easier to fix the matches than it is to fix the wrestler list, so this is just that.

# Wrestlers I'm unsure of the ID (because the website was down)
# Dragon Kid: 970
# Dan Kroffat
problem_wrestler_fix = {'A.J. Styles': 350, 'Aja Kong': 577, 'Akira Hokuto': 578, 'Andrade "Cien" Almas' : 2581, 'Animal': 98,'Bandido': 27228, 'Big Van Vader': 51, 'Black Taurus': 5533, 'Bull Nakano': 595, 'Chigusa Nagayo': 2020, 'Cima': 968, 'Cutie Suzuki': 2070, 'Dan Kroffat': 839, 'Devil Masami': 2016, 'Donovan Dijak': 28415, 'Dragon Kid': 970, 'Dynamite Kansai': 2023, 'Eddy Guerrero': 64, 'El Hijo Del Santo': 835, 'Etsuko Mita': 2025, 'Flamita': 18326, 'Fénix': 17005, 'Genichiro Tenryu': 938, 'Hawk': 99, 'Hikari Fukuoka': 8758, 'Jaguar Yokota': 3161, 'Johnny Ace': 638, 'Kazue Nagahori': 18980, 'Keiji Mutoh': 276, 'Kyoko Inoue': 674, 'Lioness Asuka': 623, 'Luchasaurus': 23527, 'Manami Toyota': 1380, 'Mayumi Ozaki': 2022, "Men's Teioh": 1449, 'Michael Oku': 34867, 'Mika Komatsu': 18981, 'Mika Suzuki': 3181, 'Mika Takahashi': 27017, 'Mima Shimoda': 2026, 'Mitsuko Nishiwaki': 27168, 'Naoki Sano': 995, 'Octagón': 1576, 'Pac': 2606, 'Pentagon Jr.': 13328, 'Pentagón Jr.': 13328, 'Psicosis': 779, 'Razor Ramon': 44, 'Rey Misterio Jr.': 66, 'Sakie Hasegawa': 2489, 'Samson Fuyuki': 1008, 'Sanada': 3194, 'Satoru Asako': 1058, 'Shinobu Kandori': 2035, 'Shiryu': 150, 'Shoichi Funaki': 111, 'Sid Vicious': 82, 'Stone Cold Steve Austin': 9, 'Syuri': 10764, 'Taka Michinoku': 127, 'Takako Inoue': 675, 'The War Machine': 104, 'Tiger Mask IV': 351, 'Toshiyo Yamada': 2044, 'Tsuyoshi Kohsaka': 11796, 'Utami Hayashishita': 35239, 'Walter': 2372, 'Wild Pegasus': 24, 'Yumi Ogura': 3151, 'Yumiko Hotta': 2033, 'Satoru Sayama': 766}

# I have to fix tiger mask separately because there's two of them :(
# First is the first with NJPW, second is the first with AJPW
tiger_masks = [766, 745]

# These are the wrestlers that just aren't contained in the database, for whatever reason
# They're both women, and both worked with AJW. Their matches were in house shows
to_remove = ['Sachiko Nakamura', 'Yachiya Hirata']

I'll document the parsing issues for posterity's sake:
- A bunch of the wrestlers weren't found directly for some of the following reasons:
    - Different spelling of name (for example, different subtitles, or adjectives)
    - Accents, and other language specific things
    - Multiple wrestlers with same name
- Tiger mask appears in multiple matches, but in each match he represents a different person
    - This was fixed by modifying the 5 star matches to have the name of the wrestler who fought
- Two wrestlers didn't exist in the database. Searching for them elsewhere also gave no significant results
    - Both were female AJW wrestlers, whose match was in a 5 star house show
    - They are: Sachiko Nakamura and Yachiya Hirata
- Wrestlers that are famous but have multiple people with same name:
    - Johnny ace (John Laurinaitis)
- Some wrestlers don't have a Real Name listed even if the name is public information:
    -  Jordan Devlin

To fix this up I just ran the code again. Is it efficient? No. Is it easy to program? Hell yeah

In [13]:
ultra_problems = []

# Used for creating the resulting merged dataframe
cur_row = 0
merged_df = pd.DataFrame(index=range(12*len(match_wrestler_info.index)), columns=range(len(wrestler_df.columns)+9))
merged_df.columns = match_wrestler_info.columns[:9].union(wrestler_df.columns, sort=False)

for i in range(len(match_wrestler_info.index)):
    # First for loop is to go through all of the rows
    cur_match_info = match_wrestler_info.iloc[i][:9]
    # Weird transpose it to make some concats easier later on
    cur_match_info = pd.DataFrame(cur_match_info).transpose()

    cur_wrestler_set = match_wrestler_info.iloc[i][9:]
    #print(cur_match_info)

    # Might be more efficient to precreate it as a df or something, but whatev for now
    new_rows_to_add = []

    for j in range(12):
        # There's max 12 wrestlers, so hard coding is good let's gooooo
        cur_wrestler = cur_wrestler_set[j]

        if pd.isna(cur_wrestler):
            # Break should be valid, the wrestlers should be somewhat ordered
            break

        if cur_wrestler in to_remove:
            # This is just a way of dealing with the few wrestlers that are not listed
            break

        if cur_wrestler in problem_wrestler_fix:
            # So this is just a way of manually searching through the problem wrestler list

            to_append = wrestler_df[wrestler_df['id'] == problem_wrestler_fix[cur_wrestler]]
        else:
            to_append = wrestler_df[wrestler_df['wrestler_name'] == cur_wrestler]

        if len(to_append.index) != 1:
            # It means that the wrestler wasn't parsed at all, which is not cool tbh
            ultra_problems.append((cur_wrestler, i))
            continue

        merged_df.iloc[cur_row] = pd.concat([cur_match_info.reset_index(drop=True), to_append.reset_index(drop=True)], axis=1)

        cur_row += 1

merged_df.dropna(how='all', inplace=True)

In [14]:
ultra_problems

[]

Means there are no more issues, woo!

In [15]:
merged_df['Birth Country'].unique()

array(['United States of America', 'United Kingdom', 'Japan',
       'United States of Americ', 'New Zealand', 'Canada', 'Mexico',
       'Japa', 'Austria', 'Spain', 'Mexic', 'Ireland', 'Russia'],
      dtype=object)