# UFC WINNER PREDICTION

.

In [None]:
#Import Cell
#used to import all the libraries and functions used
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import sys, warnings, os
from sklearn.dummy import DummyClassifier

In [None]:
#To ignore max-iteration warnings while cross validating scores
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

In [None]:
#Setting columns and rows to display all the results
pd.set_option("display.max_columns", None, "display.max_rows", None)

In [None]:
#Reading the dataset
ufc_master_ds = pd.read_csv("../input/ultimate-ufc-dataset/ufc-master.csv")
#Seperating label from input
label = ufc_master_ds.Winner
#I have removed "B_Women's Featherweight_rank" because imputing with this feature in the dataset gives me a ton of errors in the baseline model.
X = ufc_master_ds.drop(['Winner',"B_Women's Featherweight_rank"], axis =1)

In [None]:
X.head()

# 1. Creating a baseline model

 Since this is a classification task, the target value will be 'Winner'.

***Encoding categorical variables***

I'll be encoding the categorical columns prior to doing anything so it'll be easier for me to split the data while avoiding data leakage.

In [None]:
#Separating the features based on their data types
cat_col = [col for col in X.columns if X[col].dtypes == 'object']
num_col = [col for col in X.columns if col not in cat_col]

In [None]:
enc = LabelEncoder()
for i in X[cat_col]:
    #using astype(str) to avoid columns with 'float and str' to throw errors
    X[i] = enc.fit_transform(X[i].astype(str))

In [None]:
#Also encoding Label for Red to be 1 and Blue to be 0 
label = [1 if win == 'Red' else 0 for win in label]

Now that this is done, I can split the dataset into training and test set.

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, label, random_state = 2, test_size = 0.3)

***Finding and filling null values***

Building a baseline model would mean imputing all the null values with their 'means' (or mode/ median but I chose mean here) since this is a baseline model

In [None]:
X_train.isnull().sum().sort_values(ascending=False)

Since, it is baseline model, imputation would be pretty basic and simple:
1. Impute the numerical columns with mean
2. Fill the categorical columns with 'most_frequent'

In [None]:
imp = SimpleImputer(strategy='most_frequent')
imp.fit(X_train[num_col])
X_train[num_col] = imp.transform(X_train[num_col])
X_valid[num_col] = imp.transform(X_valid[num_col])

In [None]:
cat_imp = SimpleImputer(strategy = 'most_frequent')
cat_imp.fit(X_train[cat_col])
X_train[cat_col] = cat_imp.transform(X_train[cat_col])
X_valid[cat_col] = cat_imp.transform(X_valid[cat_col])

In [None]:
X_train.sample(10)

***Building a baseline model***

In [None]:
#A DummyClassifier is used to be a baseline to compare a better model's performance later on
base_model = DummyClassifier(random_state=2)

In [None]:
base_model.fit(X_train,y_train)

In [None]:
preds = base_model.predict(X_valid)
accuracy_score(y_valid, preds)

49% is the accuracy for this model/ baseline accuracy

Now that the baseline model is built let's focus on building an actual model for our prediction

# 2. Data Visualization
Feature visualization on ufc_master_ds to gain some insight into our data 

In [None]:
ufc_master_ds.head()

In [None]:
#Encoding label so it is easier to find correlation
ufc_master_ds['Winner'] = [1 if winner == 'Red' else 0 for winner in ufc_master_ds.Winner]

Now, let's see the correlation between variables and target and then select the appropriate variable to visualize

In [None]:
num_corr_col = [col for col in ufc_master_ds.columns if ufc_master_ds[col].dtype == 'int64' or ufc_master_ds[col].dtype == 'float64']
corr_dict = {}
#Getting absolute values of correlation since we would need to inspect negative correlation too
for col in num_corr_col:
    corr_dict[col] = abs(ufc_master_ds[col].corr(ufc_master_ds['Winner']))

In [None]:
for w in sorted(corr_dict, key=corr_dict.get):
    print(w, corr_dict[w])

The biggest contributors i.e. columns with a greater than 0.25 correlation (in increasing order of importance) are:
* B_td_landed_bout,
* R_td_pct_bout,
* B_sig_str_landed_bout,
* B_tot_str_landed_bout,
* R_tot_str_landed_bout,
* R_pass_bout,
* R_kd_bout,
* B_sig_str_pct_bout,
* B_pass_bout,
* B_ev,
* R_ev,
* R_odds,
* B_odds,
* R_sig_str_pct_bout,
* B_kd_bout

Since none of the bout variables have any information on them, I'll have to do some inspection and visualization and see if I can find out the meaning of these.

In [None]:
ufc_master_ds['B_kd_bout'].unique()

In [None]:
#Getting null values percentage
(ufc_master_ds['B_kd_bout'].isnull().sum()/ufc_master_ds.shape[0])*100

More than 38% of these values are null

In [None]:
#For visualization purposes
ufc_master_ds['Winner'] = ['Red' if winner == 1 else 'Blue' for winner in ufc_master_ds.Winner]

Let's see the countplot

In [None]:
sns.countplot(x=ufc_master_ds['B_kd_bout'], hue = ufc_master_ds['Winner']);

A majority of these values belong to 0, then 1,2,3 with a rare appearance in 4. All belong to float64 type. Also, these are bout related.

According to the official UFC site, 'kd' refers to knockdown, so these could be knockdowns dealt to/by the blue player

In [None]:
#Analysing "_odds" variables
sns.scatterplot(x="B_odds", y="R_odds", hue="Winner", data = ufc_master_ds);

As B_odds increase, there are more "Red" winners. Same for blue, so it seems that, with the exception of a few outliers, all of the winner are the ones that are bet against. These seem like odds to lose rather than odds to win

In [None]:
#Just to be sure
ufc_master_ds["Winner"].loc[ufc_master_ds["B_odds"]>1].value_counts()

These are the winner distributions where the odds of Blue player winning are greater than 0, but the winners are clearly Red players.
Same would be applicable for B_odds

In [None]:
#Null values in _sig_str_pct_bout variables
[(ufc_master_ds[col].isnull().sum()/ufc_master_ds.shape[0])*100 for col in ['R_sig_str_pct_bout','B_sig_str_pct_bout']]
#Same number of missing values as _kd_bout variables

In [None]:
sns.scatterplot(x='R_sig_str_pct_bout',y='B_sig_str_pct_bout',hue = 'Winner', data=ufc_master_ds);

There does seem to be somewhat of a linear correlation in significant striking accuracy and winner of the bout

In [None]:
#Lets inspect _ev variables
sns.scatterplot(x='B_ev', y='R_ev',hue = 'Winner', data=ufc_master_ds);

When B_ev is increasing in value, the winners are Red and when R_ev is increasing, the winners are Blue.

In [None]:
fig, ax = plt.subplots(1,2, figsize=(15,7))
sns.scatterplot(x='B_ev', y='R_odds',hue = 'Winner', data=ufc_master_ds, ax=ax[0]);
sns.scatterplot(x='B_ev', y='B_odds',hue = 'Winner', data=ufc_master_ds, ax=ax[1]);
fig.show()

As the chances of Red winning increases (i.e. R_odds decreases), the profit i.e. B_ev increases.
This indicates that B_ev is the profit on Red rather than blue and the other way round would be the same for R_ev

The _bout features would have more or less similar relationship with one another

Let's move to _Stance variables

In [None]:
fig, ax = plt.subplots(1,2, figsize=(15,7))
sns.countplot(ufc_master_ds['Winner'], hue = ufc_master_ds['R_Stance'], ax=ax[0])
sns.countplot(ufc_master_ds['Winner'], hue = ufc_master_ds['B_Stance'], ax=ax[1])
ax[0].title.set_text('Stances of Red Players')
ax[1].title.set_text('Stances of Blue Players')
fig.show()

Orthodox is the superior stance followed by Southpaw and then Switch

# 3. Feature Engineering, Filling null values and Final Model

In [None]:
ufc_master_ds.head()

Considering the number of variables, it would make sense to combine similar variables to increase their correlation and reduce complexity by limiting the input features

Some variables are already engineered. They have combined features, so I'll just be doing those that haven't yet been touched.
These are the already engineered features:
1. lose_streak_dif: (Blue lose streak) - (Red lose streak) 
2. winstreakdif: (Blue win streak) - (Red win streak)
3. longest_win_streak_dif: (Blue longest win streak) - (Red longest win streak)
4. win_dif: (Blue wins) - (Red wins)
5. loss_dif: (Blue losses) - (Red losses)
6. total_round_dif: (Blue total rounds fought) - (Red total rounds fought)
7. total_title_bout_dif: (Blue number of title fights) - (Red number of title fights)
8. ko_dif: (Blue wins by KO/TKO) - (Red wins by KO/TKO)
9. sub_dif: (Blue wins by submission) - (Red wins by submission)
10. height_dif: (Blue height) - (Red height) in cms
11. reach_dif: (Blue reach) - (Red reach) in cms
12. age_dif: (Blue age) - (Red age)
13. sig_str_dif: (Blue sig strikes per minute) - (Red sig strikes per minute)
14. avg_sub_att_dif: (Blue submission attempts) - (Red submission attempts)
15. avg_td_dif: (Blue TD attempts) - (Red TD attempts)

In [None]:
ufc_master_ds['draw_diff'] = (ufc_master_ds['B_draw']-ufc_master_ds['R_draw'])
ufc_master_ds['avg_sig_str_pct_diff'] = (ufc_master_ds['B_avg_SIG_STR_pct']-ufc_master_ds['R_avg_SIG_STR_pct'])
ufc_master_ds['avg_TD_pct_diff'] = (ufc_master_ds['B_avg_TD_pct']-ufc_master_ds['B_avg_TD_pct'])
ufc_master_ds['win_by_Decision_Majority_diff'] = (ufc_master_ds['B_win_by_Decision_Majority']-ufc_master_ds['R_win_by_Decision_Majority'])
ufc_master_ds['win_by_Decision_Split_diff'] = (ufc_master_ds['B_win_by_Decision_Split']-ufc_master_ds['R_win_by_Decision_Split'])
ufc_master_ds['win_by_Decision_Unanimous_diff'] = (ufc_master_ds['B_win_by_Decision_Unanimous']-ufc_master_ds['R_win_by_Decision_Unanimous'])
ufc_master_ds['win_by_TKO_Doctor_Stoppage_diff'] = (ufc_master_ds['B_win_by_TKO_Doctor_Stoppage']-ufc_master_ds['R_win_by_TKO_Doctor_Stoppage'])

In [None]:
ufc_master_ds['odds_diff'] = (ufc_master_ds['B_odds']-ufc_master_ds['R_odds'])
ufc_master_ds['ev_diff'] = (ufc_master_ds['B_ev']-ufc_master_ds['R_ev'])

ufc_master_ds['kd_bout_diff']=(ufc_master_ds['B_kd_bout']-ufc_master_ds['R_kd_bout'])
ufc_master_ds['sig_str_landed_bout_diff']=(ufc_master_ds['B_sig_str_landed_bout']-ufc_master_ds['R_sig_str_landed_bout'])
ufc_master_ds['sig_str_attempted_bout_diff']=(ufc_master_ds['B_sig_str_attempted_bout']-ufc_master_ds['R_sig_str_attempted_bout'])
ufc_master_ds['sig_str_attempted_bout_diff']=(ufc_master_ds['B_sig_str_attempted_bout']-ufc_master_ds['R_sig_str_attempted_bout'])
ufc_master_ds['sig_str_pct_bout_diff']=(ufc_master_ds['B_sig_str_pct_bout']-ufc_master_ds['R_sig_str_pct_bout'])
ufc_master_ds['tot_str_landed_bout_diff']=(ufc_master_ds['B_tot_str_landed_bout']-ufc_master_ds['R_tot_str_landed_bout'])
ufc_master_ds['tot_str_attempted_bout_diff']=(ufc_master_ds['B_tot_str_attempted_bout']-ufc_master_ds['R_tot_str_attempted_bout'])
ufc_master_ds['td_landed_bout_diff']=(ufc_master_ds['B_td_landed_bout']-ufc_master_ds['R_td_landed_bout'])
ufc_master_ds['td_attempted_bout_diff']=(ufc_master_ds['B_td_attempted_bout']-ufc_master_ds['R_td_attempted_bout'])
ufc_master_ds['td_pct_bout_diff']=(ufc_master_ds['B_td_pct_bout']-ufc_master_ds['R_td_pct_bout'])
ufc_master_ds['td_pct_bout_diff']=(ufc_master_ds['B_td_pct_bout']-ufc_master_ds['R_td_pct_bout'])
ufc_master_ds['sub_attempts_bout_diff']=(ufc_master_ds['B_sub_attempts_bout']-ufc_master_ds['R_sub_attempts_bout'])
ufc_master_ds['pass_bout_diff']=(ufc_master_ds['B_pass_bout']-ufc_master_ds['R_pass_bout'])
ufc_master_ds['rev_bout_diff']=(ufc_master_ds['B_rev_bout']-ufc_master_ds['R_rev_bout'])

After extracting the necessary information from these variables, there's no more need for them. So, I'll just drop them

In [None]:
#Dropping variables
var_drop = [
'B_odds',
'R_odds',
'B_ev',
'R_ev',
'R_kd_bout',
'B_kd_bout',
'R_sig_str_landed_bout',
'B_sig_str_landed_bout',
'R_sig_str_attempted_bout',
'B_sig_str_attempted_bout',
'R_sig_str_pct_bout',
'B_sig_str_pct_bout',
'R_tot_str_landed_bout',
'B_tot_str_landed_bout',
'R_tot_str_attempted_bout',
'B_tot_str_attempted_bout',
'R_td_landed_bout',
'B_td_landed_bout',
'R_td_attempted_bout',
'B_td_attempted_bout',
'R_td_pct_bout',
'B_td_pct_bout',
'R_sub_attempts_bout',
'B_sub_attempts_bout',
'R_pass_bout',
'B_pass_bout',
'R_rev_bout',
'B_rev_bout',
'B_current_lose_streak', 'R_current_lose_streak',
'B_current_win_streak', 'R_current_win_streak',
'B_longest_win_streak', 'R_longest_win_streak',
'B_wins', 'R_wins',
'B_losses', 'R_losses',
'B_total_rounds_fought', 'R_total_rounds_fought',
'B_total_title_bouts', 'R_total_title_bouts',
'B_win_by_KO/TKO', 'R_win_by_KO/TKO',
'B_win_by_Submission', 'R_win_by_Submission',
'B_Height_cms', 'R_Height_cms',
'B_Reach_cms', 'R_Reach_cms',
'B_age', 'R_age',
'B_avg_SIG_STR_landed', 'R_avg_SIG_STR_landed',
'B_avg_SUB_ATT', 'R_avg_SUB_ATT',
'B_avg_TD_landed', 'R_avg_TD_landed',
'B_draw','B_avg_SIG_STR_pct','B_avg_TD_pct','B_win_by_Decision_Majority','B_win_by_Decision_Split','B_win_by_Decision_Unanimous','B_win_by_TKO_Doctor_Stoppage',
'R_draw','R_avg_SIG_STR_pct','R_avg_TD_pct','R_win_by_Decision_Majority','R_win_by_Decision_Split','R_win_by_Decision_Unanimous','R_win_by_TKO_Doctor_Stoppage']
ufc_master_ds.drop(var_drop, axis=1, inplace = True)

Next step will be dropping the variables that are common to both the fighters(like date, country, etc.) and thus provide no advantage to anyone

In [None]:
comm_drop = [
'date','location','country','weight_class','gender','no_of_rounds','empty_arena','constant_1','finish','finish_details','finish_round','finish_round_time','total_fight_time_secs','B_Weight_lbs','R_Weight_lbs'
]
ufc_master_ds.drop(comm_drop, axis=1, inplace = True)

Before moving on to the rank variables, the _Stance variables need some encoding. I'll encode them as I visualized them above.i.e. Orthodox is superior so it will be 4, Southpaw after that will be 3, Switch, 2, and Open Stance 1

In [None]:
ufc_master_ds.B_Stance.unique()

In [None]:
#It has one spelling mistake
ufc_master_ds['B_Stance'].loc[ufc_master_ds['B_Stance']=='Switch '] = 'Switch'
#R_Stance doesn't have this error, so we're cool

In [None]:
stance = ['B_Stance', 'R_Stance']

In [None]:
for x in stance:
    ufc_master_ds[x] = [4 if st == 'Orthodox'
                           else 3 if st == 'Southpaw'
                           else 2 if st == 'Switch'
                           else 1 for st in ufc_master_ds[x]]
#using -1 and 1 for both red and blue so there is no misunderstanding that one variable is better than the other    
ufc_master_ds['better_rank'] = [-1 if rank == 'Red'
                               else 1 if rank == 'Blue'
                               else 0 for rank in ufc_master_ds['better_rank']]

ufc_master_ds['title_bout'] = [1 if tb==True else 0 for tb in ufc_master_ds['title_bout']]

In [None]:
ufc_master_ds['Stance_diff'] = (ufc_master_ds['B_Stance'] - ufc_master_ds['R_Stance'])
ufc_master_ds.drop(stance, axis = 1, inplace = True)

In [None]:
ufc_master_ds.head()

In [None]:
#Encoding label so it is easier to find correlation
ufc_master_ds['Winner'] = [1 if winner == 'Red' else 0 for winner in ufc_master_ds.Winner]

In [None]:
ufc_master_ds.loc[:,'B_match_weightclass_rank':'better_rank'].isnull().sum()

Almost all of _rank variables except better_rank are empty. I'll keep it and remove the rest

In [None]:
ufc_master_ds.drop(ufc_master_ds.loc[:,'B_match_weightclass_rank':'B_Pound-for-Pound_rank'], axis=1, inplace = True)

In [None]:
ufc_master_ds.sample(10)

In [None]:
label = ufc_master_ds.Winner
ufc_master_ds.drop(['Winner'], axis=1, inplace = True)

In [None]:
#Encoding the remaining categorical variables
cat_col = ['R_fighter', 'B_fighter']
enc = LabelEncoder()
for i in ufc_master_ds[cat_col]:
    ufc_master_ds[i] = enc.fit_transform(ufc_master_ds[i])

***Splitting the data in training and testing data set***

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(ufc_master_ds, label, test_size = 0.3, random_state=2)

In [None]:
#At this point all the null values are the ones that have been left empty by error/mistake and are not left empty deliberately.
#So, it would make sense to fill in these with mean rather than 0 or anything else
impute = SimpleImputer(strategy = 'mean')
impute.fit(X_train)
X_train = impute.transform(X_train)
X_valid = impute.transform(X_valid)

In [None]:
RF_model = RandomForestClassifier(random_state=2)

In [None]:
RF_model.fit(X_train, y_train)

In [None]:
preds = RF_model.predict(X_valid)
accuracy_score(y_valid, preds)

**77% is the accuracy after feature engineering**

In [None]:
#Built a model after doing GridSearch but not putting the code here because the cell takes up a lot of time
RF_model = RandomForestClassifier(n_estimators = 350, max_depth = 12, random_state = 2)

In [None]:
RF_model.fit(X_train, y_train)
preds = RF_model.predict(X_valid)
accuracy_score(y_valid, preds)

**77.9% which is almost 78% accuracy**

So, after careful feature engineering and data visualization, the accuracy of our model increased by almost 30%.

Note : It should be considered that the model used as a baseline was a DummyClassifier and not a proper model.
