In [12]:
from io import StringIO
from typing import List, Tuple

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## To-Do
* Tag Teams
* Stables
* Trainer
* Finisher
* Trademark Moves

In [14]:
def getGeneralInfo(wrestlerID: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
        Scrape the biography page of a single wrestler on wrestlingdata.com.

        Arguments:
            wrestlerID: wrestlingdata index for this wrestler, minimum is 1, maximum is 30195

        Returns:
            GeneralInfo, Facts
            GeneralInfo: DataFrame with a single column labelled wrestlerID; one row per
                titled entry of the "General Information" box plus a 'wrestler_name' row.
            Facts: the first table pandas parses out of the page's "Facts" section.

        Raises:
            requests.exceptions.HTTPError: the server answered with an error status.
            requests.exceptions.Timeout: the server did not answer within the timeout.
            AttributeError / IndexError: the page does not have the expected layout.
    """
    # A timeout keeps a stalled connection from hanging the whole scrape, and
    # raise_for_status surfaces HTTP errors before we try to parse an error page.
    response = requests.get(
        'https://www.wrestlingdata.com/index.php?befehl=bios&wrestler=%d' % wrestlerID,
        timeout=30,
    )
    response.raise_for_status()
    wrestler = BeautifulSoup(response.text, 'html.parser')

    # The element titled "General Information" sits two levels below the
    # container whose titled children are the individual info rows.
    info_box = wrestler.find(title="General Information").parent.parent
    GeneralInfo = {}
    for node in info_box.children:
        # Text nodes carry no attribute dict; only tags with a title are info rows.
        attrs = getattr(node, 'attrs', None)
        if attrs is None or 'title' not in attrs:
            continue
        # The displayed value is the 4th child of the row (layout-dependent).
        GeneralInfo[attrs['title']] = [list(node.children)[3].text.strip('\n')]

    # The wrestler's name is the 14px-font element in the second child of the header table.
    header_table = wrestler.find(style="width:100%;", cellpadding="4", cellspacing="2")
    wrestler_name = list(header_table.children)[1].find(style="font-size: 14px;").text.strip('\n')

    res = pd.DataFrame(GeneralInfo, index=[wrestlerID])
    res['wrestler_name'] = [wrestler_name]

    # Walk up from the "Facts" title to the enclosing container, then down to the
    # <table> that holds the facts themselves.
    facts_anchor = wrestler.find(title='Facts')
    facts_container = list(facts_anchor.parent.parent.parent.parent.parent.children)[3]
    facts_table = list(facts_container.children)[1].table
    # pandas deprecated passing literal HTML strings to read_html; wrap in StringIO.
    facts = pd.read_html(StringIO(str(facts_table)))
    return res.transpose(), facts[0]

In [15]:
def get50wrestlers(top_index: int) -> List[int]:
    """
        Collect the wrestler IDs listed on one page of the wrestlingdata.com 'Rankings'.

        Arguments:
            top_index: the index of the 'Rankings' page, minimum is 1, maximum is 105

        Returns:
            A list of wrestlerID's corresponding to the list of wrestlers on the Rankings page with the given page number "top_index"

        Raises:
            requests.exceptions.HTTPError: the server answered with an error status.
            requests.exceptions.Timeout: the server did not answer within the timeout.
            AttributeError / IndexError / ValueError: the page does not have the expected layout.
    """
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status surfaces HTTP errors before we try to parse an error page.
    response = requests.get(
        'https://www.wrestlingdata.com/index.php?befehl=bios&letter=2&seite=%d' % top_index,
        timeout=30,
    )
    response.raise_for_status()
    wrestlerlist = BeautifulSoup(response.text, 'html.parser')

    # The first three children of the list element are skipped
    # (presumably header/whitespace nodes -- TODO confirm against the live page).
    rows = list(wrestlerlist.find(title="Liste der Wrestler").children)[3:]

    wrestler_ids = []
    for row in rows:
        # The link to the wrestler's bio is the 2nd child of the row's 3rd child.
        link = list(list(row.children)[2].children)[1]
        # Everything after the first 32 characters of the href is parsed as the ID
        # (assumes a fixed-length URL prefix ending in "wrestler=" -- TODO confirm).
        wrestler_ids.append(int(link.attrs['href'][32:]))
    return wrestler_ids

In [16]:
def getSample(sample_indices: List[int]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
        Scrape every wrestler in `sample_indices` and assemble two sample-wide tables.

        Arguments:
            sample_indices: A list of wrestlerID's to be used as index for this sample

        Returns:
            generalInfo, allFacts
            generalInfo: one row per wrestler (indexed by wrestlerID), one column per
                general-information field returned by getGeneralInfo.
            allFacts: one row per wrestler (labelled with the wrestlerID as a string),
                one column per retained fact label; each cell is the set of values
                listed under that label, missing where the wrestler has none.

        Raises:
            ValueError: if `sample_indices` is empty (pd.concat has nothing to concatenate).
    """
    # Scrape each wrestler once and split the (info, facts) pairs into two lists.
    scraped = [getGeneralInfo(wrestler_id) for wrestler_id in sample_indices]
    sample_GeneralInfo = [info for info, _ in scraped]
    sample_Facts = [facts for _, facts in scraped]

    # getGeneralInfo returns the info transposed; flip it back so each wrestler is a row.
    generalInfo = pd.concat([info.transpose() for info in sample_GeneralInfo])

    # Gather the distinct fact labels (column 0) of every wrestler and concatenate
    # them once, instead of re-concatenating a growing Series on every iteration.
    per_wrestler_labels = [
        pd.Series(facts[0].value_counts().index) for facts in sample_Facts
    ]
    fact_columns = pd.concat(per_wrestler_labels).value_counts().index
    # Keep only short labels (presumably longer entries are free text rather than
    # real fact headings -- verify against the site).
    likely_columns = [label for label in fact_columns if len(label) < 40]

    ts = []
    for wrestler_id, facts in zip(generalInfo.index, sample_Facts):
        # Collapse the facts table to one row per label, holding the set of its values.
        grouped = facts.groupby(0).agg(**{"%d" % wrestler_id: (1, set)})
        ts.append(grouped.loc[grouped.index.intersection(likely_columns)].transpose())
    allFacts = pd.concat(ts)
    return generalInfo, allFacts

In [5]:
# Rankings pages to scrape; each call to get50wrestlers returns the IDs of one page.
indices = [1, 2, 3]  # take top 150 wrestlers of all time
w150 = []
for i in indices:
    # Append this page's wrestler IDs to the running list, preserving page order.
    w150.extend(get50wrestlers(i))

In [6]:
# Scrape every wrestler ID collected in w150 and build the two sample-wide tables
# returned by getSample (general info and per-label fact sets).
generalInfo, allFacts = getSample(w150)

In [7]:
def _fact_set_size(cell):
    """Return the number of elements when `cell` is exactly a set, otherwise 0."""
    return len(cell) if type(cell) is set else 0


# Same shape as allFacts, but each cell holds how many values the wrestler has
# under that fact label; cells that are not sets (e.g. missing entries) count as 0.
fact_counts = pd.DataFrame(
    {col: allFacts[col].apply(_fact_set_size) for col in allFacts.columns}
)

In [8]:
# Persist the per-wrestler fact counts as CSV; the path is relative, so the file
# lands in the notebook's current working directory.
fact_counts.to_csv('facts_counts.csv')