In [274]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import numpy as np

url = 'https://en.wikipedia.org/wiki/List_of_professional_wrestling_matches_rated_5_or_more_stars_by_Dave_Meltzer#Wrestlers_with_the_most_5-or-higher-star_matches'

req = requests.get(url)
print(f"Request terminated with status code {req.status_code}")
print(f"Response encoded with {req.encoding}")
# as before, we can add the HTML to our soup
wiki_soup = BeautifulSoup(req.text, 'html.parser')

Request terminated with status code 200
Response encoded with UTF-8


In [283]:
tables = wiki_soup('table') # your code goes here
# and here we need to look the table titles to find id of the table that interests us

matches_df = pd.read_html(str(tables[0]))[0]
for i in range(1,5):
    matches_df = pd.concat([matches_df, pd.read_html(str(tables[i]))[0]])

# These columns are fluff that are only helpful in wikipedi
matches_df = matches_df.drop(['Unnamed: 0', 'Ref.'], axis=1)
matches_df = matches_df.reset_index(drop=True)

# Reformatting the rating to be a series of floats
matches_df['Rating'] = matches_df['Rating'].astype(str).map(lambda x: x.rstrip('[]abcd')).astype(float)

# Manually fixing one of the entries, because this is honestly the easiest way to proceed
matches_df['Match'][0] = matches_df['Match'][0][:-3]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches_df['Match'][0] = matches_df['Match'][0][:-3]


In [284]:
# Splitting who fought who
temp_split = matches_df['Match'].str.split('vs.', expand=True)

# Splitting each person in teams
temp = temp_split.apply(lambda x: x.str.replace('and', ',')).apply(lambda x: x.str.split(','))

def clean_up_list(x):
    for col in x:
        if col is not None:
            if ' ' in col:
                # At some point empty characters are created, beats me why
                col.remove(' ')
            for i in range(len(col)):
                elem = col[i]
                # This is to deal with tag team names, just removing them and only leave the wrestler's name
                elem = re.sub(r'^.*?\(', '', elem)
                elem = elem.replace(')', '')
                elem = elem.strip()
                col[i] = elem
    return x

temp.apply(lambda row: clean_up_list(row))

new_column_names = []
for elem in temp.columns:
    new_column_names.append('Participant ' + str(elem+1))

temp.columns = new_column_names

number_teams = pd.Series(temp.count(axis=1), name='# of teams')
team_size = pd.Series(temp['Participant 1'].map(lambda a: len(a)), name='Team Size')

matches_df = pd.concat([matches_df, temp, number_teams, team_size], axis=1)

In [285]:
matches_df

Unnamed: 0,Date,Match,Promotion,Event,Rating,Participant 1,Participant 2,Participant 3,Participant 4,Participant 5,Participant 6,# of teams,Team Size
0,"April 7, 1982",Ric Flair vs. Butch Reed,CWF,Miami Beach show,5.00,[Ric Flair],[Butch Reed],,,,,2,1
1,"April 21, 1983",Dynamite Kid vs. Tiger Mask,NJPW,Big Fight Series IINight 19,5.00,[Dynamite Kid],[Tiger Mask],,,,,2,1
2,"December 5, 1984",Kazuo Yamazaki vs. Nobuhiko Takada,UWF,Year-End SpecialDay 10,5.00,[Kazuo Yamazaki],[Nobuhiko Takada],,,,,2,1
3,"December 8, 1984",Bruiser Brody and Stan Hansen vs. Dory Funk Jr...,AJPW,Real World Tag LeagueNight 15,5.00,"[Bruiser Brody, Stan Hansen]","[Dory Funk Jr., Terry Funk]",,,,,2,2
4,"March 9, 1985",Kuniaki Kobayashi vs. Tiger Mask,AJPW,85 Gekitoh! Exciting WarsNight 14,5.00,[Kuniaki Kobayashi],[Tiger Mask],,,,,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,"November 13, 2021",Adam Page vs. Kenny Omega,AEW,Full Gear,5.50,[Adam Page],[Kenny Omega],,,,,2,1
192,"December 15, 2021",Adam Page vs. Bryan Danielson,AEW,Winter is Coming,5.00,[Adam Page],[Bryan Danielson],,,,,2,1
193,"January 5, 2022",Kazuchika Okada vs. Will Ospreay,NJPW,Wrestle Kingdom 16Night 2,5.75,[Kazuchika Okada],[Will Ospreay],,,,,2,1
194,"January 26, 2022",Cody Rhodes vs. Sammy Guevara,AEW,Beach Break,5.00,[Cody Rhodes],[Sammy Guevara],,,,,2,1


In [286]:
non_ppv_events = ['Real World Tag League Night 15', 'Real World Tag League Night 23', 'Real World Tag League Night 15', 'Real World Tag League Night 16', 'Miami Beach show', 'Big Fight Series II Night 19', '85 Gekitoh! Exciting Wars Night 14', 'Summer Night Festival in Budokan', 'New Years War Super Battle Night 25', 'Battle of the Belts 2', 'World Wide Wrestling', 'Kawasaki show', 'Spring Flare Up Night 21', 'Unknown', 'New Year Giant Series Night 17', 'Landover, MD show', 'Super Power Series Night 18', 'Super Power Series Night 19', 'October Giant Series Night 2', 'October Giant Series Night 16', 'Korakuen Hall show', 'Fan Appreciation Day', 'Explosion Tour Night 13', 'Super Power Series Night 6', 'Super Power Series Night 8', 'Summer Action Series Night 2', 'Mid Summer Typhoon', 'Dream Slam I', 'Dream Slam II', 'Korakuen Hall show', 'Summer Action Series Night 1', 'Summer Action Series Night 22', 'Summer Action Series II Night 10', 'New Year Giant Series Night 20', 'Super Power Series Night 6', 'Super Power Series Night 16', '	Summer Struggle', 'Wrestlemarinpiad', 'Matamoros show', 'Champion Carnival Night 19', 'Champion Carnival Night 19', 'Super Power Series Night 15', 'Summer Action Series Night 1', 'Destiny', 'Mexico City show', 'Big Ass Extreme Bash', 'Super Power Series Night 5', 'Super Power Series Night 15', '3rd Anniversary', 'Real World Tag League Night 16', 'Best of the Super Juniors IV Night 17', 'Super Power Series Night 16', 'Real World Tag League Night 15', 'Fourth Fighting Integration', '26th Anniversary Show', 'Super Power Series Night 14', 'October Giant Series Night 11', 'Navigate For Evolution', 'Joe vs. Punk II', 'Joe vs. Kobashi', 'Supercard of Honor', 'Battle of Los Angeles Night 2', 'Best of the Super Juniors Night 14', 'Battle of Los Angeles Night 3', 'All Star Weekend 13 Night 2', 'Wrestle Kingdom 12', 'NXT TakeOver: Philadelphia', 'Strong Style Evolved', 'Sakura Genesis', 'NXT TakeOver: New Orleans', 'Total Rumble 8', 'Wrestling Dontaku', 'Best of the Super Juniors Final', 'Dominion 6.9 in Osaka-jo Hall', 'NXT', 'G1 Climax 28 Night 2', 'G1 Climax 28 Night 4', 'G1 Climax 28 Night 6', 'G1 Climax 28 Night 14', 'G1 Climax 28 Night 17', 'G1 Climax 28 Night 18', 'G1 Climax 28 Night Final', 'Destruction in Kobe', 'Fighting Spirit Unleashed', 'Road to Tokyo Dome Night 2', 'Wrestle Kingdom 13', 'New Japan Cup Final', '	Best of the Super Juniors Night 8', 'NXT TakeOver: New York', 'NXT TakeOver: XXV', 'Best of the Super Juniors Final', 'Dominion 6.9 in Osaka-jo Hall', 'G1 Climax 29 Night 5', 'G1 Climax 29 Night 6', 'G1 Climax 29 Night 7', 'Sixteen', 'G1 Climax 29 Night 13', 'G1 Climax 29 Night 14', 'G1 Climax 29 Night 16', 'G1 Climax 29 Night 17', 'G1 Climax 29 Night Finals', 'NXT UK TakeOver: Cardiff', 'Battle of Los Angeles Night 3', '5th Year Anniversary', 'High Stakes', 'Dynamite', 'G1 Climax 30 Night 5', 'G1 Climax 30 Night 13', 'NXT UK', 'The BEST ~Final Chronicle 2020~', "New Year's Smash Night 1", 'The New Beginning in Nagoya', 'New Japan Cup Night 8', 'New Japan Cup Night 13', 'Dynamite', 'Wrestling Dontaku 2021 Night 2', 'Tokyo Dream Cinderella', 'NXT TakeOver 36', 'G1 Climax 31 Night 1', 'AEW Grand Slam', 'Héroes Inmortales XIV', 'Winter is Coming', 'Beach Break', 'High Stakes']

matches_df['PPV'] = pd.Series(np.where(matches_df['Event'].isin(non_ppv_events), False, True))

In [287]:
matches_df.to_csv('5_star_matches_list.csv')

# Alternate take
This is another way of presenting the data, focusing on making the people wrestling the matches more accessible.

In [316]:
#temp_split = matches_df['Match'].str.split('vs.', expand=True)
temp_split = matches_df['Match'].str.replace(', and', ',').str.replace('and', ',').str.replace('vs.', ',')
# Splitting each person in teams
#temp = temp_split.apply(lambda x: x.str.replace('and', ',')).apply(lambda x: x.str.split(',', expand=True))

  temp_split = matches_df['Match'].str.replace(', and', ',').str.replace('and', ',').str.replace('vs.', ',')


In [320]:
wreslters_in_match = temp_split.str.split(',', expand=True)

def clean_up_wrestlers(x):
    #print(x)
    for i in range(len(x)):
        elem = x[i]
        if elem is not None:
        # This is to deal with tag team names, just removing them and only leave the wrestler's name
            elem = re.sub(r'^.*?\(', '', elem)
            elem = elem.replace(')', '')
            elem = elem.strip()
            x[i] = elem
    return x


wreslters_in_match.apply(lambda row: clean_up_wrestlers(row))

new_column_names = []
for elem in wreslters_in_match.columns:
    new_column_names.append('Wrestler ' + str(elem+1))

wreslters_in_match.columns = new_column_names

In [321]:
wreslters_in_match

Unnamed: 0,Wrestler 1,Wrestler 2,Wrestler 3,Wrestler 4,Wrestler 5,Wrestler 6,Wrestler 7,Wrestler 8,Wrestler 9,Wrestler 10,Wrestler 11,Wrestler 12
0,Ric Flair,Butch Reed,,,,,,,,,,
1,Dynamite Kid,Tiger Mask,,,,,,,,,,
2,Kazuo Yamazaki,Nobuhiko Takada,,,,,,,,,,
3,Bruiser Brody,Stan Hansen,Dory Funk Jr.,Terry Funk,,,,,,,,
4,Kuniaki Kobayashi,Tiger Mask,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
191,Adam Page,Kenny Omega,,,,,,,,,,
192,Adam Page,Bryan Danielson,,,,,,,,,,
193,Kazuchika Okada,Will Ospreay,,,,,,,,,,
194,Cody Rhodes,Sammy Guevara,,,,,,,,,,


In [322]:
to_make_csv_wrestlers = pd.concat([matches_df.drop(['Participant 1', 'Participant 2', 'Participant 3', 'Participant 4', 'Participant 5', 'Participant 6'], axis=1), wreslters_in_match], axis=1)

In [325]:
to_make_csv_wrestlers.to_csv('5_star_matches_wrestler_list.csv')