In [6]:
from createdata.scrape_fight_links import get_this_event_fight_links
from createdata.scrape_fight_data import create_fight_data_csv

event_links = ['http://ufcstats.com/event-details/b16a7e6a627e9789', 'http://ufcstats.com/event-details/c0c1bc0766df4c00',
              'http://ufcstats.com/event-details/6c9383ffab2725a5', 'http://ufcstats.com/event-details/2f8f3c69522db931',
              'http://ufcstats.com/event-details/9e0f28d1f639ad73', 'http://ufcstats.com/event-details/abcf7e55a0a9ed89']
event_links.reverse()
event_and_fight_links = get_this_event_fight_links(event_links)
create_fight_data_csv(event_and_fight_links, 'new_data.csv')

Scraping all fight data: 
Progress: |██████████████████████████████████████████████████| 100.00% Complete


In [1]:
import pandas as pd
import numpy as np
import math

df_orig = pd.read_csv('data/total_fight_data.csv', sep=';')
df = pd.read_csv('data/new_data.csv', sep=';')
fighter_details = pd.read_csv('data/fighter_details.csv', index_col='fighter_name')

In [2]:
df = pd.concat([df, df_orig]).reset_index(drop=True)

In [3]:
df[:90]

Unnamed: 0,R_fighter,B_fighter,R_KD,B_KD,R_SIG_STR.,B_SIG_STR.,R_SIG_STR_pct,B_SIG_STR_pct,R_TOTAL_STR.,B_TOTAL_STR.,...,B_GROUND,win_by,last_round,last_round_time,Format,Referee,date,location,Fight_type,Winner
0,Max Holloway,Frankie Edgar,0,0,129 of 328,96 of 264,39%,36%,131 of 330,96 of 264,...,1 of 2,Decision - Unanimous,5,5:00,5 Rnd (5-5-5-5-5),Herb Dean,"July 27, 2019","Edmonton, Alberta, Canada",UFC Featherweight Title Bout,Max Holloway
1,Cris Cyborg,Felicia Spencer,0,0,122 of 216,38 of 85,56%,44%,138 of 234,64 of 115,...,4 of 6,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Yves Lavigne,"July 27, 2019","Edmonton, Alberta, Canada",Women's Featherweight Bout,Cris Cyborg
2,Geoff Neal,Niko Price,0,1,41 of 61,24 of 57,67%,42%,53 of 73,42 of 79,...,4 of 10,KO/TKO,2,2:39,3 Rnd (5-5-5),Dan Miragliotta,"July 27, 2019","Edmonton, Alberta, Canada",Welterweight Bout,Geoff Neal
3,Olivier Aubin-Mercier,Arman Tsarukyan,0,0,23 of 48,47 of 105,47%,44%,36 of 61,110 of 174,...,22 of 29,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Marc Goddard,"July 27, 2019","Edmonton, Alberta, Canada",Lightweight Bout,Arman Tsarukyan
4,Marc-Andre Barriault,Krzysztof Jotko,0,0,39 of 77,40 of 117,50%,34%,62 of 102,98 of 181,...,0 of 0,Decision - Split,3,5:00,3 Rnd (5-5-5),Herb Dean,"July 27, 2019","Edmonton, Alberta, Canada",Middleweight Bout,Krzysztof Jotko
5,Alexis Davis,Viviane Araujo,0,0,68 of 137,67 of 148,49%,45%,134 of 213,79 of 162,...,1 of 1,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Yves Lavigne,"July 27, 2019","Edmonton, Alberta, Canada",Women's Flyweight Bout,Viviane Araujo
6,Hakeem Dawodu,Yoshinori Horie,1,0,80 of 154,39 of 79,51%,49%,89 of 166,41 of 81,...,0 of 0,KO/TKO,3,4:09,3 Rnd (5-5-5),Kyle Cardinal,"July 27, 2019","Edmonton, Alberta, Canada",Featherweight Bout,Hakeem Dawodu
7,Gavin Tucker,SeungWoo Choi,0,0,37 of 62,23 of 96,59%,23%,49 of 77,42 of 116,...,0 of 0,Submission,3,3:17,3 Rnd (5-5-5),Marc Goddard,"July 27, 2019","Edmonton, Alberta, Canada",Featherweight Bout,Gavin Tucker
8,Alexandre Pantoja,Deiveson Figueiredo,0,1,61 of 151,67 of 113,40%,59%,68 of 160,71 of 117,...,10 of 16,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Dan Miragliotta,"July 27, 2019","Edmonton, Alberta, Canada",Flyweight Bout,Deiveson Figueiredo
9,Gillian Robertson,Sarah Frota,0,0,29 of 42,9 of 20,69%,45%,51 of 64,33 of 46,...,6 of 12,KO/TKO,2,4:13,3 Rnd (5-5-5),Kyle Cardinal,"July 27, 2019","Edmonton, Alberta, Canada",Women's Flyweight Bout,Gillian Robertson


In [4]:
weight_classes = ['Women\'s Strawweight', 'Women\'s Bantamweight', 
                  'Women\'s Featherweight', 'Women\'s Flyweight', 'Lightweight', 
                  'Welterweight', 'Middleweight','Light Heavyweight', 
                  'Heavyweight', 'Featherweight','Bantamweight', 'Flyweight', 'Open Weight']
columns = ['R_SIG_STR.', 'B_SIG_STR.', 'R_TOTAL_STR.', 'B_TOTAL_STR.',
       'R_TD', 'B_TD', 'R_HEAD', 'B_HEAD', 'R_BODY','B_BODY', 'R_LEG', 'B_LEG', 
        'R_DISTANCE', 'B_DISTANCE', 'R_CLINCH','B_CLINCH', 'R_GROUND', 'B_GROUND']

pct_columns = ['R_SIG_STR_pct','B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct']


In [5]:
def make_weight_class(X):
    for weight_class in weight_classes:
        if weight_class in X:
            return weight_class
    if X == 'Catch Weight Bout' or 'Catchweight Bout':
        return 'Catch Weight'
    else:
        return 'Open Weight'

In [6]:
def get_total_time(row):
    if row['Format'] in time_in_first_round.keys():
        return (row['last_round'] - 1) * time_in_first_round[row['Format']] + row['last_round_time']
    elif row['Format'] in exception_format_time.keys():
        if (row['last_round'] - 1) >= 2:
            return exception_format_time[row['Format']][0] + (row['last_round'] - 2) * \
                    exception_format_time[row['Format']][1] + row['last_round_time']
        else:
            return (row['last_round'] - 1) * exception_format_time[row['Format']][0] + row['last_round_time']
    
# So if the fight ended in round 1, we only need last_round_time. 
# If it ended in round 2, we need the full time of round 1 and the last_round_time
# This works for fights with same time in each round and fights with only two rounds.

In [7]:
def get_no_of_rounds(X):
    if X == 'No Time Limit':
        return 1
    else:
        return len(X.split('(')[1].replace(')', '').split('-'))

In [8]:
attempt_suffix = '_att'
landed_suffix = '_landed'

for column in columns:
    df[column+attempt_suffix] = df[column].apply(lambda X: int(X.split('of')[1]))
    df[column+landed_suffix] = df[column].apply(lambda X: int(X.split('of')[0]))
    
df.drop(columns, axis=1, inplace=True)
df['Winner'].fillna('Draw', inplace=True)

for column in pct_columns:
    df[column] = df[column].apply(lambda X: float(X.replace('%', ''))/100)
    
df['title_bout'] = df['Fight_type'].apply(lambda X: True if 'Title Bout' in X else False)
df['weight_class'] = df['Fight_type'].apply(make_weight_class)

time_in_first_round = {'3 Rnd (5-5-5)': 5*60, '5 Rnd (5-5-5-5-5)': 5*60, '1 Rnd + OT (12-3)': 12*60,
       'No Time Limit': 1, '3 Rnd + OT (5-5-5-5)': 5*60, '1 Rnd (20)': 1*20,
       '2 Rnd (5-5)': 5*60, '1 Rnd (15)': 15*60, '1 Rnd (10)': 10*60,
       '1 Rnd (12)':12*60, '1 Rnd + OT (30-5)': 30*60, '1 Rnd (18)': 18*60, '1 Rnd + OT (15-3)': 15*60,
       '1 Rnd (30)': 30*60, '1 Rnd + OT (31-5)': 31*5,
       '1 Rnd + OT (27-3)': 27*60, '1 Rnd + OT (30-3)': 30*60}

exception_format_time = {'1 Rnd + 2OT (15-3-3)': [15*60, 3*60], '1 Rnd + 2OT (24-3-3)': [24*60, 3*60]}

df['last_round_time'] = df['last_round_time'].apply(lambda X: int(X.split(':')[0])*60 + int(X.split(':')[1]))

df['total_time_fought(seconds)'] = df.apply(get_total_time, axis=1)

df['no_of_rounds'] = df['Format'].apply(get_no_of_rounds)

df2 = df.copy()
df2.drop(['Format', 'Fight_type', 'last_round_time','R_KD', 'B_KD', 'R_SIG_STR_pct',
       'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct', 'R_SUB_ATT', 'B_SUB_ATT',
       'R_PASS', 'B_PASS', 'R_REV', 'B_REV', 'win_by', 'last_round', 
        'R_SIG_STR._att', 'R_SIG_STR._landed',
       'B_SIG_STR._att', 'B_SIG_STR._landed', 'R_TOTAL_STR._att',
       'R_TOTAL_STR._landed', 'B_TOTAL_STR._att', 'B_TOTAL_STR._landed',
       'R_TD_att', 'R_TD_landed', 'B_TD_att', 'B_TD_landed', 'R_HEAD_att',
       'R_HEAD_landed', 'B_HEAD_att', 'B_HEAD_landed', 'R_BODY_att',
       'R_BODY_landed', 'B_BODY_att', 'B_BODY_landed', 'R_LEG_att',
       'R_LEG_landed', 'B_LEG_att', 'B_LEG_landed', 'R_DISTANCE_att',
       'R_DISTANCE_landed', 'B_DISTANCE_att', 'B_DISTANCE_landed',
       'R_CLINCH_att', 'R_CLINCH_landed', 'B_CLINCH_att', 'B_CLINCH_landed',
       'R_GROUND_att', 'R_GROUND_landed', 'B_GROUND_att', 'B_GROUND_landed',
        'total_time_fought(seconds)'], axis = 1, inplace=True)

In [9]:
red_fighters = df['R_fighter'].value_counts().index
blue_fighters = df['B_fighter'].value_counts().index

fighters = list(set(red_fighters) | set(blue_fighters))

In [10]:
def get_renamed_winner(row):
    if row['R_fighter'] == row['Winner']:
        return 'Red'
    elif row['B_fighter'] == row['Winner']:
        return 'Blue'
    elif row['Winner'] == 'Draw':
        return 'Draw'

df2['Winner'] = df2[['R_fighter', 'B_fighter', 'Winner']].apply(get_renamed_winner, axis=1)

In [11]:
df = pd.concat([df,pd.get_dummies(df['win_by'], prefix='win_by')],axis=1)
df.drop(['win_by'],axis=1, inplace=True)

In [12]:
Numerical_columns = ['hero_KD', 'opp_KD', 'hero_SIG_STR_pct',
       'opp_SIG_STR_pct', 'hero_TD_pct', 'opp_TD_pct', 'hero_SUB_ATT', 'opp_SUB_ATT',
       'hero_PASS', 'opp_PASS', 'hero_REV', 'opp_REV', 'hero_SIG_STR._att', 'hero_SIG_STR._landed',
       'opp_SIG_STR._att', 'opp_SIG_STR._landed', 'hero_TOTAL_STR._att',
       'hero_TOTAL_STR._landed', 'opp_TOTAL_STR._att', 'opp_TOTAL_STR._landed',
       'hero_TD_att', 'hero_TD_landed', 'opp_TD_att', 'opp_TD_landed', 'hero_HEAD_att',
       'hero_HEAD_landed', 'opp_HEAD_att', 'opp_HEAD_landed', 'hero_BODY_att',
       'hero_BODY_landed', 'opp_BODY_att', 'opp_BODY_landed', 'hero_LEG_att',
       'hero_LEG_landed', 'opp_LEG_att', 'opp_LEG_landed', 'hero_DISTANCE_att',
       'hero_DISTANCE_landed', 'opp_DISTANCE_att', 'opp_DISTANCE_landed',
       'hero_CLINCH_att', 'hero_CLINCH_landed', 'opp_CLINCH_att', 'opp_CLINCH_landed',
       'hero_GROUND_att', 'hero_GROUND_landed', 'opp_GROUND_att', 'opp_GROUND_landed',
       'total_time_fought(seconds)']

Categorical_columns = ['win_by', 'last_round',
        'Winner', 'title_bout']

In [13]:
import re

def lreplace(pattern, sub, string):
    """
    Replaces 'pattern' in 'string' with 'sub' if 'pattern' starts 'string'.
    """
    return re.sub('^%s' % pattern, sub, string)

In [14]:
red = df.groupby('R_fighter')
blue = df.groupby('B_fighter')

In [15]:
def get_fighter_red(fighter_name):
    try:
        fighter_red = red.get_group(fighter_name)
    except:
        return None
    rename_columns = {}
    for column in fighter_red.columns:
        if re.search('^R_', column) is not None:
            rename_columns[column] = lreplace('R_', 'hero_', column)
        elif re.search('^B_', column) is not None:
            rename_columns[column] = lreplace('B_', 'opp_', column)
    fighter_red = fighter_red.rename(rename_columns, axis='columns')
    return fighter_red

In [16]:
def get_fighter_blue(fighter_name):
    try:
        fighter_blue = blue.get_group(fighter_name)
    except:
        return None
    rename_columns = {}
    for column in fighter_blue.columns:
        if re.search('^B_', column) is not None:
            rename_columns[column] = lreplace('B_', 'hero_', column)
        elif re.search('^R_', column) is not None:
            rename_columns[column] = lreplace('R_', 'opp_', column)
    fighter_blue = fighter_blue.rename(rename_columns, axis='columns')
    return fighter_blue

In [17]:
def get_result_stats(result_list):
    result_list.reverse() # To get it in ascending order
    current_win_streak = 0
    current_lose_streak = 0
    longest_win_streak = 0
    wins = 0
    losses = 0
    draw = 0
    for result in result_list:
        if result == 'hero':
            wins += 1
            current_win_streak += 1
            current_lose_streak = 0
            if longest_win_streak < current_win_streak:
                longest_win_streak += 1
        elif result == 'opp':
            losses += 1
            current_win_streak = 0
            current_lose_streak += 1
        elif result == 'draw':
            draw += 1
            current_lose_streak = 0
            current_win_streak = 0
            
    return current_win_streak, current_lose_streak, longest_win_streak, wins, losses, draw

In [18]:
win_by_columns = ['win_by_Decision - Majority', 'win_by_Decision - Split',
       'win_by_Decision - Unanimous', 'win_by_KO/TKO','win_by_Submission',
       'win_by_TKO - Doctor\'s Stoppage']

In [19]:
temp_blue_frame = pd.DataFrame()
temp_red_frame = pd.DataFrame()
latest_fighter_stats = pd.DataFrame()
result_stats = ['current_win_streak', 'current_lose_streak', 'longest_win_streak', 'wins', 'losses', 'draw']

for fighter_name in fighters:
    fighter_red = get_fighter_red(fighter_name)
    fighter_blue = get_fighter_blue(fighter_name)
    fighter_index = None

    if fighter_red is None:
        fighter = fighter_blue
        fighter_index = 'blue'
    elif fighter_blue is None:
        fighter = fighter_red
        fighter_index = 'red'
    else:
        fighter = pd.concat([fighter_red, fighter_blue]).sort_index()

    fighter['Winner'] = fighter['Winner'].apply(lambda X: 'hero' if X == fighter_name else 'opp')

    latest = fighter[Numerical_columns].mean()
    latest['total_rounds_fought'] = fighter['last_round'].sum()
    latest['total_title_bouts'] = fighter[fighter['title_bout']==True]['title_bout'].count()
    latest['hero_fighter'] = fighter_name
    results = get_result_stats(list(fighter['Winner']))
    for result_stat, result in zip(result_stats, results):
        latest[result_stat] = result
    win_by_results = fighter[fighter['Winner'] == 'hero'][win_by_columns].sum()
    for win_by_column,win_by_result in zip(win_by_columns, win_by_results):
        latest[win_by_column] = win_by_result
    latest.name = fighter_name

    latest_fighter_stats = latest_fighter_stats.append(latest)
    
    for i, index in enumerate(fighter.index):
        fighter_slice = fighter[(i+1):]
        s = fighter_slice[Numerical_columns].mean()
        s['total_rounds_fought'] = fighter_slice['last_round'].sum()
        s['total_title_bouts'] = fighter_slice[fighter_slice['title_bout']==True]['title_bout'].count()
        s['hero_fighter'] = fighter_name
        results = get_result_stats(list(fighter_slice['Winner']))
        for result_stat, result in zip(result_stats, results):
            s[result_stat] = result
        win_by_results = fighter_slice[fighter_slice['Winner'] == 'hero'][win_by_columns].sum()
        for win_by_column,win_by_result in zip(win_by_columns, win_by_results):
            s[win_by_column] = win_by_result
        s.name = index

        if fighter_index is None:
            if index in fighter_blue.index:
                temp_blue_frame = temp_blue_frame.append(s)
            elif index in fighter_red.index:
                temp_red_frame = temp_red_frame.append(s)
        elif fighter_index == 'blue':
            temp_blue_frame = temp_blue_frame.append(s)
        elif fighter_index == 'red':
            temp_red_frame = temp_red_frame.append(s)

In [20]:
fighter_details = fighter_details[fighter_details.index.isin(fighters)]

In [21]:
def convert_to_cms(X):
    if X is np.NaN:
        return X
    elif len(X.split("'")) == 2:
        feet = float(X.split("'")[0])
        inches = int(X.split("'")[1].replace(' ', '').replace('"',''))
        return (feet * 30.48) + (inches * 2.54)
    else:
        return float(X.replace('"','')) * 2.54

In [22]:
fighter_details['Height_cms'] = fighter_details['Height'].apply(convert_to_cms)
fighter_details['Reach_cms'] = fighter_details['Reach'].apply(convert_to_cms)

In [23]:
fighter_details['Weight_lbs'] = fighter_details['Weight'].apply(lambda X: float(X.replace(' lbs.', '')) if X is not np.NaN else X)

In [24]:
fighter_details.drop(['Height', 'Weight', 'Reach'], axis=1, inplace=True)

In [25]:
fighter_details.reset_index(inplace=True)
latest_fighter_stats.reset_index(inplace=True)
temp_red_frame.reset_index(inplace=True)
temp_blue_frame.reset_index(inplace=True)

In [26]:
temp_blue_frame = temp_blue_frame.merge(fighter_details, left_on='hero_fighter', right_on='fighter_name', how='left')
temp_blue_frame.set_index('index', inplace=True)

In [27]:
latest_fighter_stats = latest_fighter_stats.merge(fighter_details, left_on='index', right_on='fighter_name', how='left')
latest_fighter_stats.set_index('index', inplace=True)

In [28]:
temp_red_frame = temp_red_frame.merge(fighter_details, left_on='hero_fighter', right_on='fighter_name', how='left')
temp_red_frame.set_index('index', inplace=True)

In [29]:
temp_blue_frame.drop('fighter_name', axis=1, inplace=True)
temp_red_frame.drop('fighter_name', axis=1, inplace=True)
latest_fighter_stats.drop('fighter_name', axis=1, inplace=True)

In [30]:
blue_frame = temp_blue_frame.add_prefix('B_')
red_frame = temp_red_frame.add_prefix('R_')

In [31]:
frame = blue_frame.join(red_frame, how='outer')

In [32]:
rename_cols = {}
for col in frame.columns:
    if 'hero' in col:
        rename_cols[col] = col.replace('_hero_', '_avg_').replace('.', '')
    if 'opp' in col:
        rename_cols[col] = col.replace('_opp_', '_avg_opp_').replace('.', '')
    if 'win_by' in col:
        rename_cols[col] = col.replace(' ', '').replace('-', '_').replace('\'s', '_')

In [33]:
fs_rename_cols = {}
for col in latest_fighter_stats.columns:
    if 'hero' in col:
        fs_rename_cols[col] = col.replace('hero_', 'avg_').replace('.', '')
    if 'opp' in col:
        fs_rename_cols[col] = col.replace('opp_', 'avg_opp_').replace('.', '')
    if 'win_by' in col:
        fs_rename_cols[col] = col.replace(' ', '').replace('-', '_').replace('\'s', '_')

In [34]:
frame.rename(rename_cols, axis='columns', inplace=True)
latest_fighter_stats.rename(fs_rename_cols, axis='columns', inplace=True)

In [35]:
frame.drop(['R_avg_fighter','B_avg_fighter'], axis=1, inplace=True)
latest_fighter_stats.drop(['avg_fighter'], axis=1, inplace=True)

In [36]:
df2 = df2.join(frame, how='outer')

In [37]:
df2['R_DOB'] = pd.to_datetime(df2['R_DOB'])
df2['B_DOB'] = pd.to_datetime(df2['B_DOB'])
df2['date'] = pd.to_datetime(df2['date'])

In [38]:
def get_age(row):
    B_age = (row['date'] - row['B_DOB']).days
    R_age = (row['date'] - row['R_DOB']).days
    if np.isnan(B_age)!=True:
        B_age = math.floor(B_age/365.25)
    if np.isnan(R_age)!=True:
        R_age = math.floor(R_age/365.25)
    return pd.Series([B_age, R_age], index=['B_age', 'R_age'])

In [39]:
df2[['B_age', 'R_age']]= df2[['date', 'R_DOB', 'B_DOB']].apply(get_age, axis=1)

In [40]:
df2['R_Reach_cms'].fillna(df2['R_Height_cms'], inplace=True)
df2['B_Reach_cms'].fillna(df2['B_Height_cms'], inplace=True)
df2.fillna(df2.median(), inplace=True)

In [41]:
latest_fighter_stats['Reach_cms'].fillna(latest_fighter_stats['Height_cms'], inplace=True)
latest_fighter_stats.fillna(latest_fighter_stats.median(), inplace=True)

In [42]:
df2['R_Stance'].fillna('Orthodox', inplace=True)
df2['B_Stance'].fillna('Orthodox', inplace=True)
latest_fighter_stats['Stance'].fillna('Orthodox', inplace=True)

In [43]:
df2 = pd.concat([df2, pd.get_dummies(df2[['weight_class', 'B_Stance', 'R_Stance']])], axis=1)
df2.drop(columns=['weight_class', 'B_Stance', 'R_Stance', 
                  'location', 'date', 'R_fighter', 'B_fighter', 'R_DOB', 'B_DOB', 'Referee'], inplace=True)
df2.drop(df2.index[df2['Winner'] == 'Draw'], inplace = True)

In [44]:
latest_fighter_stats = pd.concat([latest_fighter_stats, pd.get_dummies(latest_fighter_stats[['Stance']])], axis=1)
latest_fighter_stats.drop(columns=['Stance'], inplace=True)

In [73]:
rename_cols = {}
for col in df2.columns:
    rename_cols[col] = col.replace(' ', '').replace('-', '_').replace('\'s', '_')
lfs_rename_cols = {}
for col in latest_fighter_stats.columns:
    lfs_rename_cols[col] = col.replace(' ', '').replace('-', '_').replace('\'s', '_')
df2.rename(rename_cols, axis='columns', inplace=True)
latest_fighter_stats.rename(lfs_rename_cols, axis='columns', inplace=True)

In [81]:
df2.to_csv('data/new_preprocessed_data.csv', index=False)
latest_fighter_stats.to_csv('data/latest_fighter_stats.csv', index=True)

### I have to now deal with 
* `dob` to `age`, 
* adding `weight_class` as dummies,  
* adding `title_bout`, 
* adding `no_of_rounds`, 
* adding `R_` and `B_`

In [28]:
import pandas as pd
import numpy as np
import math

df_orig = pd.read_csv('data/total_fight_data.csv', sep=';')
df = pd.read_csv('data/new_data.csv', sep=';')
fighter_details = pd.read_csv('data/fighter_details.csv', index_col='fighter_name')

In [29]:
df = pd.concat([df, df_orig]).reset_index(drop=True)

In [30]:
weight_classes = ['Women\'s Strawweight', 'Women\'s Bantamweight', 
                  'Women\'s Featherweight', 'Women\'s Flyweight', 'Lightweight', 
                  'Welterweight', 'Middleweight','Light Heavyweight', 
                  'Heavyweight', 'Featherweight','Bantamweight', 'Flyweight', 'Open Weight']

In [31]:
def make_weight_class(X):
    for weight_class in weight_classes:
        if weight_class in X:
            return weight_class
    if X == 'Catch Weight Bout' or 'Catchweight Bout':
        return 'Catch Weight'
    else:
        return 'Open Weight'

In [32]:
df['weight_class'] = df['Fight_type'].apply(make_weight_class)

In [33]:
df = df[['R_fighter', 'B_fighter', 'weight_class']]

In [34]:
df1 = df[['R_fighter', 'weight_class']].rename(index=str, columns={'R_fighter':'fighter'})
df2 = df[['B_fighter', 'weight_class']].rename(index=str, columns={'B_fighter':'fighter'})

                                               
df3 = pd.concat([df1,df2]).reset_index(drop=True)

In [35]:
df3 = df3.drop_duplicates().reset_index(drop=True)

df3.to_csv('data/weight_classes.csv', index=False)

In [16]:
import pandas as pd
import pickle
import numpy as np
import math

df = pd.read_csv('data/latest_fighter_stats.csv', index_col='index')
with open('data/cols.list', 'rb') as c:
    cols = pickle.load(c)

In [17]:
df_weight_classes = {'Flyweight':'weight_class_Flyweight',
 'Bantamweight':'weight_class_Bantamweight',
 'Featherweight':'weight_class_Featherweight',
 'Lightweight':'weight_class_Lightweight',
 'Welterweight':'weight_class_Welterweight',
 'Middleweight':'weight_class_Middleweight',
 'Light Heavyweight':'weight_class_LightHeavyweight',
 'Heavyweight':'weight_class_Heavyweight',
 'Women\'s Strawweight':'weight_class_Women_Strawweight',
 'Women\'s Flyweight':'weight_class_Women_Flyweight',
 'Women\'s Bantamweight':'weight_class_Women_Bantamweight',
'Women\'s Featherweight':'weight_class_Women_Featherweight',
 'Catch Weight':'weight_class_CatchWeight',
 'Open Weight':'weight_class_OpenWeight'}

In [18]:
def get_age(X):

    median_age = 29

    DOB = pd.to_datetime(X)
    today = pd.to_datetime('today')

    if pd.isnull(DOB):
        return median_age
    else:
        age = math.floor((today-DOB).days/365.25)
        return age

In [19]:
title_bout = True
no_of_rounds = 5
weightclass = 'Middleweight'

cols_dict = {df_weight_classes[k]:(1 if weightclass==k else 0) for k in df_weight_classes.keys()}
cols_dict.update({'title_bout':title_bout, 'no_of_rounds':no_of_rounds})

In [20]:
extra_cols = pd.DataFrame([list(cols_dict.values())], columns=cols_dict.keys())

In [21]:
df['age'] = df['DOB'].apply(get_age)
df.drop(columns=['DOB'], inplace=True)

In [22]:
r = df.loc[['Alex Perez']].add_prefix('R_').reset_index(drop=True)
b = df.loc[['Frank Hamaker']].add_prefix('B_').reset_index(drop=True)

In [23]:
final = pd.concat([r,b,extra_cols], axis=1)[cols]

In [24]:
final

Unnamed: 0,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,B_avg_DISTANCE_att,...,B_Stance_OpenStance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_OpenStance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch
0,True,5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,1,0,0,0,0,1,0,0,0


In [25]:
with open('data/xgb-model-1.sav', 'rb') as mdl:
    model = pickle.load(mdl)
with open('data/standard.scaler', 'rb') as ss:
    scaler = pickle.load(ss)

In [26]:
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.1, learning_rate=0.1,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan,
       n_estimators=244, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8)

In [27]:
def normalize(df: pd.DataFrame, scaler) -> pd.DataFrame:
    df_num = df.select_dtypes(include=[np.float, np.int])
    df[list(df_num.columns)] = scaler.transform(df[list(df_num.columns)])
    return df

In [15]:
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

model.predict_proba(np.array(normalize(final, scaler)))

array([[ 0.51440406,  0.48559597]], dtype=float32)

In [38]:
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

model.predict_proba(np.array(normalize(final, scaler)))[0]

array([ 0.05455363,  0.94544637], dtype=float32)