In [64]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from typing import List, Tuple

## To-Do
* Tag Teams
* Stables
* Trainer
* Finisher
* Trademark Moves

In [65]:
def getGeneralInfo(wrestlerID: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
        Arguments:
            wrestlerID: wrestlingdata index for this wrestler, minimum is 1, maximum is 30195
            
        Returns:
            GeneralInfo, Facts
    """
    A = requests.get('https://www.wrestlingdata.com/index.php?befehl=bios&wrestler=%d'%wrestlerID)
    wrestler = BeautifulSoup(A.text, 'html.parser')
    children = list(wrestler.find(title="General Information").parent.parent.children)
    GeneralInfo = {c.attrs['title'] : [list(c.children)[3].text.strip('\n')] for c in wrestler.find(title="General Information").parent.parent.children if 'attrs' in c.__dict__ and 'title' in c.attrs}
    wrestler_name = list(wrestler.find(style="width:100%;", cellpadding="4", cellspacing="2").children)[1].find(style="font-size: 14px;").text.strip('\n')
    res = pd.DataFrame(GeneralInfo, index = [wrestlerID])
    res['wrestler_name'] = [wrestler_name]
    tables2 = wrestler.find(title='Facts')
    B = pd.read_html(str(list(list(tables2.parent.parent.parent.parent.parent.children)[3].children)[1].table))
    return res.transpose(), B[0]

In [66]:
def get50wrestlers(top_index: int) -> List[int]:
    """
        Arguments:
            top_index: the index of the 'Rankings' page, minimum is 1, maximum is 105
        
        Returns:
            A list of wrestlerID's corresponding to the list of wrestlers on the Rankings page with the given page number "top_index"
    """
    B = requests.get('https://www.wrestlingdata.com/index.php?befehl=bios&letter=2&seite=%d'%top_index)
    wrestlerlist = BeautifulSoup(B.text, 'html.parser')

    result_list = []

    # I modified the parsing loop because it had issues in cases where the wrestler didn't have a hyperlink
    # This should work?
    for i in range(len(list(wrestlerlist.find(title="Liste der Wrestler").children)[3:])):
        x = list(wrestlerlist.find(title="Liste der Wrestler").children)[3:][i]
        if len(list(list(x.children)[2].children)) > 1:
            resulting_int = int(list(list(x.children)[2].children)[1].attrs['href'][32:])
        result_list.append(resulting_int)

    return result_list

In [67]:
def getSample(sample_indices: List[int]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
        Arguments:
            sample_indices: A list of wrestlerID's to be used as index for this sample
        
        Returns:
            generalInfo, allFacts
    """
    sample_GeneralInfo = [None for _ in range(len(sample_indices))]
    sample_Facts = [None for _ in range(len(sample_indices))]
    for i in range(len(sample_indices)):
        sample_GeneralInfo[i], sample_Facts[i] = getGeneralInfo(sample_indices[i])
    generalInfo = pd.concat([x.transpose() for x in sample_GeneralInfo])
    
    res = pd.Index([])
    for x in sample_Facts:
        res = pd.concat([pd.Series(res), pd.Series(x[0].value_counts().index)])
    fact_columns = res.value_counts().index
    likely_columns = [x for x in fact_columns if len(x) < 40]
    ts = [None for _ in generalInfo.index]
    for i in range(len(generalInfo.index)):
        t = sample_Facts[i].groupby(0).agg(**{"%d"%generalInfo.index[i]: (1, set)})
        ts[i] = t.loc[t.index.intersection(likely_columns)].transpose()
    allFacts = pd.concat(ts)
    return generalInfo, allFacts

In [77]:
indices = range(1, 106) # these are the indices currently missing
w150 = []
for i in indices:
    w150 += get50wrestlers(i)
print('done')

done


In [81]:
# Writing them so that they don't have to be parsed ever again
textfile = open("wrestler_index_ordered.txt", "w")
for element in w150:
    textfile.write(str(element) + "\n")
textfile.close()

In [82]:
generalInfo, allFacts = getSample(w150)

fact_counts = pd.DataFrame({col: allFacts[col].apply(lambda x : x.__len__() if type(x) == set else 0) for col in allFacts.columns})

fact_counts.to_csv('facts_counts_final.csv')

generalInfo.to_csv('Data/general_info_per_wrestler_final.csv')

print('Finished writing')

Finished writing


Indexes that need to be manually done:
4, 35, 41, 51, 81, 86, 93, 100

In [71]:
wrestler_info = pd.read_csv('Data/general_info_per_wrestler.csv')
wrestler_info.rename(columns={'Unnamed: 0': 'key'}, inplace=True)
wrestler_info.drop(['Height', 'Unnamed: 9'], axis=1, inplace=True)
wrestler_info.astype({'Weight and Height': 'str', 'Birthplace' : 'str', 'Date of Birth': 'str', 'Debut': 'str', 'Date of Death': 'str'})

Unnamed: 0,key,Real Name,Weight and Height,Birthplace,Date of Birth,Debut,wrestler_name,Date of Death
0,6990,Kazuchika Okada,240 lbs. (109 kg) at 6'3'' (1.91 m),"Anjo, Aichi (Japan)",8th November 1987 (age 34),29th August 2004 (17 years ago),Kazuchika Okada,
1,216,Curtis Michael Hennig,260 lbs. (118 kg) at 6'3'' (1.91 m),"Robbinsdale, Minnesota (United States of America)",28th March 1958,30th January 1981 (41 years ago),Curt Hennig,10th February 2003
2,458,Keiichi Yamada,208 lbs. (94 kg) at 5'6'' (1.68 m),"Hiroshima, Hiroshima (Japan)",30th November 1964 (age 57),3rd March 1984 (38 years ago),Jushin Thunder Liger,
3,766,Satoru Sayama,212 lbs. (96 kg) at 5'8'' (1.73 m),"Shimonoseki, Yamaguchi (Japan)",24th November 1957 (age 64),28th May 1976 (45 years ago),First Tiger Mask,
4,64,Eduardo Gori Guerrero Llanes,220 lbs. (100 kg) at 5'8'' (1.73 m),"El Paso, Texas (United States of America)",9th October 1967,18th September 1987 (34 years ago),Eddie Guerrero,13th November 2005
...,...,...,...,...,...,...,...,...
4792,26139,Kim Winslow,,Nevada (United States of America),,,Kim Winslow,
4793,6452,Héctor Pérez González,,Mexico,23rd July 1964,1983,Ángel o Demonio,5th June 2021
4794,4998,unknown,,,,,Jimmy Shoulders,
4795,3262,unknown,,,,,Mike Khoury,


# TODO
There's a temporary list of wrestlers. We've gotta parse them to clean the list up. Parsing will focus on the following:
- Separating height and weight (we'll use kilos and meters for easiness sake)
- Separating country of birth
- Parsing date columns into datetime format

In [72]:
# This adds the country of birth as a column to the dataframe
birth_countries = wrestler_info[~wrestler_info['Birthplace'].isnull()]['Birthplace'].apply(lambda st: st[st.find("(")+1:st.find(")")])

wrestler_info['Birth Country'] = birth_countries

In [73]:
height_and_weight = wrestler_info[~wrestler_info['Weight and Height'].isnull()]['Weight and Height'].str.split('at', expand=True)

weight = height_and_weight[~height_and_weight[0].isnull()][0].apply(lambda st: st[st.find("(")+1:st.find(")")])
weight = pd.to_numeric(weight.str[:-3], errors='coerce')
weight = weight[~weight.isnull()]
wrestler_info['Weight'] = weight

height = height_and_weight[~height_and_weight[1].isnull()][1].apply(lambda st: st[st.find("(")+1:st.find(")")])
height = pd.to_numeric(height.str[:-2], errors='coerce')
height = height[~height.isnull()]
wrestler_info['Height'] = height

wrestler_info.drop(['Weight and Height'], axis=1, inplace=True)

In [74]:
dob = wrestler_info['Date of Birth'].str.replace(r"\(.*\)","", regex=True)
dob = pd.to_datetime(dob, errors='coerce')

debut = wrestler_info['Debut'].str.replace(r"\(.*\)","", regex=True)
debut = pd.to_datetime(debut, errors='coerce')

death = wrestler_info['Date of Death'].str.replace(r"\(.*\)","", regex=True)
death = pd.to_datetime(dob, errors='coerce')

wrestler_info['Date of Birth'] = dob
wrestler_info['Debut'] = debut
wrestler_info['Date of Death'] = death

In [75]:
wrestler_info

Unnamed: 0,key,Real Name,Birthplace,Date of Birth,Debut,wrestler_name,Date of Death,Birth Country,Weight,Height
0,6990,Kazuchika Okada,"Anjo, Aichi (Japan)",1987-11-08,2004-08-29,Kazuchika Okada,1987-11-08,Japan,109.0,1.91
1,216,Curtis Michael Hennig,"Robbinsdale, Minnesota (United States of America)",1958-03-28,1981-01-30,Curt Hennig,1958-03-28,United States of America,118.0,1.91
2,458,Keiichi Yamada,"Hiroshima, Hiroshima (Japan)",1964-11-30,1984-03-03,Jushin Thunder Liger,1964-11-30,Japan,94.0,1.68
3,766,Satoru Sayama,"Shimonoseki, Yamaguchi (Japan)",1957-11-24,1976-05-28,First Tiger Mask,1957-11-24,Japan,96.0,1.73
4,64,Eduardo Gori Guerrero Llanes,"El Paso, Texas (United States of America)",1967-10-09,1987-09-18,Eddie Guerrero,1967-10-09,United States of America,100.0,1.73
...,...,...,...,...,...,...,...,...,...,...
4792,26139,Kim Winslow,Nevada (United States of America),NaT,NaT,Kim Winslow,NaT,United States of America,,
4793,6452,Héctor Pérez González,Mexico,1964-07-23,1983-01-01,Ángel o Demonio,1964-07-23,Mexic,,
4794,4998,unknown,,NaT,NaT,Jimmy Shoulders,NaT,,,
4795,3262,unknown,,NaT,NaT,Mike Khoury,NaT,,,
