In [25]:
import math
import os.path
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import json
import random
import statsmodels.api as sm
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [52]:
data = pd.read_excel("data.xlsx")
data.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Henry Cejudo,Marlon Moraes,Marc Goddard,2019,"Chicago, Illinois, USA",Red,True,Bantamweight,5,0,...,2,0,0,8,Orthodox,162.56,162.56,135.0,31.0,32.0
1,Valentina Shevchenko,Jessica Eye,Robert Madrigal,2019,"Chicago, Illinois, USA",Red,True,Women's Flyweight,5,0,...,0,2,0,5,Southpaw,165.1,167.64,125.0,32.0,31.0
2,Tony Ferguson,Donald Cerrone,Dan Miragliotta,2019,"Chicago, Illinois, USA",Red,False,Lightweight,3,0,...,3,6,1,14,Orthodox,180.34,193.04,155.0,36.0,35.0
3,Jimmie Rivera,Petr Yan,Kevin MacDonald,2019,"Chicago, Illinois, USA",Blue,False,Bantamweight,3,0,...,1,0,0,6,Orthodox,162.56,172.72,135.0,26.0,29.0
4,Tai Tuivasa,Blagoy Ivanov,Dan Miragliotta,2019,"Chicago, Illinois, USA",Blue,False,Heavyweight,3,0,...,2,0,0,3,Southpaw,187.96,190.5,264.0,32.0,26.0


### Apenas a categoria Peso-Pesados ('Heavyweights')

In [53]:
data.weight_class = data.weight_class.astype('category')
data.Winner = data.Winner.astype('category')

In [54]:
data.Winner.value_counts()

Red     3470
Blue    1591
Draw      83
Name: Winner, dtype: int64

### Blue = 0
### Red = 1

In [55]:
data_heavy = data.loc[(data.weight_class=='Heavyweight'),:]
bool_to_number = {False: 0, True: 1}
string_to_number = {'Blue': 0, 'Red': 1, 'Draw': 2}
data_heavy['title_bout'] = data_heavy['title_bout'].map(bool_to_number)
data_heavy['Winner'] = data_heavy['Winner'].map(string_to_number)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [56]:
data_heavy_util = data_heavy.drop(['Referee','date','location'], axis=1)
data_heavy_util.dropna(inplace=True)

In [57]:
data_heavy_util.head()

Unnamed: 0,R_fighter,B_fighter,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
4,Tai Tuivasa,Blagoy Ivanov,0,0,Heavyweight,3,0,1,0,17.0,...,2,0,0,3,Southpaw,187.96,190.5,264.0,32.0,26.0
60,Arjan Bhullar,Juan Adams,1,0,Heavyweight,3,0,1,0,8.0,...,0,0,0,2,Orthodox,185.42,190.5,245.0,27.0,32.0
63,Greg Hardy,Dmitrii Smoliakov,1,0,Heavyweight,3,2,0,0,5.0,...,0,0,0,0,Orthodox,195.58,203.2,265.0,36.0,30.0
69,Andrei Arlovski,Augusto Sakai,0,0,Heavyweight,3,0,1,0,35.0,...,9,2,0,16,Orthodox,190.5,195.58,240.0,27.0,40.0
75,Alistair Overeem,Aleksei Oleinik,1,0,Heavyweight,5,0,2,0,4.25,...,6,0,0,9,Orthodox,193.04,203.2,265.0,41.0,38.0


In [58]:
colunas = list(data_heavy_util)

In [59]:
colunas.remove('R_fighter')
colunas.remove('B_fighter')
colunas.remove('title_bout')
colunas.remove('weight_class')
colunas.remove('no_of_rounds')
colunas.remove('B_total_rounds_fought')
colunas.remove('R_total_rounds_fought')

In [60]:
dataf = data_heavy_util.loc[:,colunas]
dataf.head()

Unnamed: 0,Winner,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,B_avg_DISTANCE_att,B_avg_DISTANCE_landed,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
4,0,0,1,0,17.0,14.5,2.5,2.0,201.0,59.5,...,2,0,0,3,Southpaw,187.96,190.5,264.0,32.0,26.0
60,1,0,1,0,8.0,8.0,15.0,13.0,85.0,38.0,...,0,0,0,2,Orthodox,185.42,190.5,245.0,27.0,32.0
63,1,2,0,0,5.0,3.5,5.5,3.5,16.0,5.5,...,0,0,0,0,Orthodox,195.58,203.2,265.0,36.0,30.0
69,0,0,1,0,35.0,21.0,38.0,30.0,167.0,69.0,...,9,2,0,16,Orthodox,190.5,195.58,240.0,27.0,40.0
75,1,0,2,0,4.25,3.875,5.375,4.125,26.625,10.5,...,6,0,0,9,Orthodox,193.04,203.2,265.0,41.0,38.0


### Correlações

In [61]:
data_heavy_blue_winners = data_heavy.loc[(data_heavy.Winner==0),:]
data_heavy_blue_winners = data_heavy_blue_winners.drop(['Winner'], axis=1)
data_heavy_red_winners = data_heavy.loc[(data_heavy.Winner==1),:]
data_heavy_red_winners = data_heavy_red_winners.drop(['Winner'], axis=1)

In [62]:
blue_winners_correlation = data_heavy_blue_winners.corr()
blue_winners_correlation.head(2)

Unnamed: 0,date,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
date,1.0,-0.161123,-0.032762,0.054294,-0.145747,,0.274744,0.256441,0.154872,0.179436,...,0.231107,0.255263,-0.078268,-0.051332,0.205434,0.029779,0.01007,0.05046,0.102656,0.25813
title_bout,-0.161123,1.0,0.491461,-0.137166,0.345926,,0.000122,0.002365,0.059811,0.056089,...,0.198632,0.182796,0.064792,-0.056728,0.181773,-0.018567,0.100335,-0.185558,0.014377,-0.040564


### Aplicando filtros

In [63]:
filtros = ['R_fighter','B_fighter','Winner','weight_class','title_bout','R_wins','R_losses','B_wins','B_losses',
           'B_Reach_cms','B_Height_cms','B_Weight_lbs','R_Reach_cms','R_Height_cms','R_Weight_lbs']

In [64]:
data_heavy_filtered = data_heavy_util.loc[:,filtros]

In [65]:
data_heavy_filtered.head()

Unnamed: 0,R_fighter,B_fighter,Winner,weight_class,title_bout,R_wins,R_losses,B_wins,B_losses,B_Reach_cms,B_Height_cms,B_Weight_lbs,R_Reach_cms,R_Height_cms,R_Weight_lbs
4,Tai Tuivasa,Blagoy Ivanov,0,Heavyweight,0,3,1,1,1,185.42,180.34,250.0,190.5,187.96,264.0
60,Arjan Bhullar,Juan Adams,1,Heavyweight,0,2,1,1,0,203.2,195.58,265.0,190.5,185.42,245.0
63,Greg Hardy,Dmitrii Smoliakov,1,Heavyweight,0,0,1,0,2,187.96,187.96,253.0,203.2,195.58,265.0
69,Andrei Arlovski,Augusto Sakai,0,Heavyweight,0,16,12,1,0,195.58,190.5,265.0,195.58,190.5,240.0
75,Alistair Overeem,Aleksei Oleinik,1,Heavyweight,0,9,6,6,2,203.2,187.96,240.0,203.2,193.04,265.0


In [66]:
data_heavy_filtered['B_Reach_cms'].corr(data_heavy_filtered['Winner'])

-0.10192044189964397

### Modelo inicial

Separando em categóricos

In [67]:
categoricas = [
    'R_fighter', 
    'B_fighter', 
    'weight_class', 
    'R_Stance', 
    'B_Stance', 
]

data_heavy_cat = data_heavy_util[categoricas].astype('category')
data_heavy_num = data_heavy_util.drop(categoricas, axis=1).astype('float')

In [68]:
X = data_heavy_num.drop('Winner', axis=1)
y = data_heavy_num['Winner']

Separando os dados em testes e treinamento

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [70]:
model = RandomForestClassifier(n_estimators=10000)

model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [71]:
#Fatores que tem o maior peso na decisão da vitória
j=1
lista_j=list()
for i, f in sorted(list(zip(model.feature_importances_, X_train.columns)), reverse=True):
    print( j,'°:',f'{f}: {i}')
    a=str(j)+'°'
    lista_j.append(a)
    j+=1

1 °: R_avg_opp_CLINCH_att: 0.022359704171831224
2 °: R_avg_opp_CLINCH_landed: 0.01980716771258457
3 °: R_age: 0.017887442423005058
4 °: R_avg_TD_att: 0.013481501387814655
5 °: R_avg_TD_landed: 0.013376475880559988
6 °: R_total_rounds_fought: 0.012571500739346089
7 °: R_losses: 0.012426902420763275
8 °: B_avg_HEAD_att: 0.011908518645654516
9 °: B_avg_opp_GROUND_att: 0.011702855781373603
10 °: B_avg_HEAD_landed: 0.011490510498193485
11 °: B_avg_SIG_STR_att: 0.01144789834855208
12 °: B_Weight_lbs: 0.010991403209570432
13 °: B_avg_BODY_att: 0.010902260070462095
14 °: B_avg_SIG_STR_landed: 0.01049236113982605
15 °: B_avg_BODY_landed: 0.010476013899575542
16 °: B_Reach_cms: 0.010104076881478657
17 °: R_avg_BODY_att: 0.010094422492484382
18 °: B_avg_opp_GROUND_landed: 0.00999500132903117
19 °: R_avg_CLINCH_att: 0.00997939112958901
20 °: B_age: 0.009974061556401433
21 °: R_avg_opp_HEAD_landed: 0.009797503056031494
22 °: R_avg_TD_pct: 0.009695274836469403
23 °: B_avg_opp_TD_att: 0.0096135166291

In [72]:
data={'Fator':X_train.columns ,'Correlação':model.feature_importances_,}
Fator_por_corr=pd.DataFrame(data)
Fator_por_corr=Fator_por_corr.sort_values(by='Correlação', ascending=False)
Fator_por_corr['Grau de Importância']=lista_j
Fator_por_corr = Fator_por_corr.set_index('Grau de Importância')
Fator_por_corr.head()

Unnamed: 0_level_0,Fator,Correlação
Grau de Importância,Unnamed: 1_level_1,Unnamed: 2_level_1
1°,R_avg_opp_CLINCH_att,0.02236
2°,R_avg_opp_CLINCH_landed,0.019807
3°,R_age,0.017887
4°,R_avg_TD_att,0.013482
5°,R_avg_TD_landed,0.013376


In [73]:
inuteis = []
uteis = []
for index,row in Fator_por_corr.iterrows():
    if row['Correlação'] >= -0.01 and row['Correlação']<=0.01:
        inuteis.append(row['Fator'])
    else:
        uteis.append(row['Fator'])    

In [76]:
data_heavy_util_relevante = data_heavy_util.loc[:,uteis]
data_heavy_util_relevante.head()

Unnamed: 0,R_avg_opp_CLINCH_att,R_avg_opp_CLINCH_landed,R_age,R_avg_TD_att,R_avg_TD_landed,R_total_rounds_fought,R_losses,B_avg_HEAD_att,B_avg_opp_GROUND_att,B_avg_HEAD_landed,B_avg_SIG_STR_att,B_Weight_lbs,B_avg_BODY_att,B_avg_SIG_STR_landed,B_avg_BODY_landed,B_Reach_cms,R_avg_BODY_att
4,4.5,3.5,26.0,0.5,0.0,7,1,184.5,0.0,45.0,203.5,250.0,17.0,61.5,14.5,185.42,7.75
60,8.0,5.333333,32.0,2.0,1.333333,8,1,131.0,2.0,74.0,144.0,265.0,8.0,86.0,8.0,203.2,2.0
63,2.0,1.0,30.0,0.0,0.0,2,1,15.5,28.5,5.0,22.0,253.0,5.0,9.5,3.5,187.96,5.0
69,4.964286,3.035714,40.0,0.892857,0.321429,59,12,169.0,0.0,81.0,218.0,265.0,35.0,111.0,21.0,195.58,6.607143
75,6.466667,4.2,38.0,0.866667,0.266667,29,6,28.5,3.625,11.5,35.25,240.0,4.25,17.25,3.875,203.2,13.466667


In [84]:
def relevancia(df,coluna_nome,coluna_correlacao,acuracia):
    inuteis = []
    uteis = []
    for index,row in df.iterrows():
        if row[coluna_correlacao] >= -acuracia and row[coluna_correlacao] <= acuracia:
            inuteis.append(row[coluna_nome])
        else:
            uteis.append(row[coluna_nome])
    return uteis

In [85]:
relevancia(Fator_por_corr,'Fator','Correlação',0.01)

[['B_avg_opp_GROUND_landed',
  'R_avg_CLINCH_att',
  'B_age',
  'R_avg_opp_HEAD_landed',
  'R_avg_TD_pct',
  'B_avg_opp_TD_att',
  'R_win_by_KO/TKO',
  'B_avg_DISTANCE_att',
  'B_avg_TOTAL_STR_landed',
  'B_avg_DISTANCE_landed',
  'R_avg_GROUND_landed',
  'R_avg_HEAD_landed',
  'R_avg_GROUND_att',
  'R_avg_opp_GROUND_att',
  'R_avg_opp_SIG_STR_pct',
  'R_avg_TOTAL_STR_landed',
  'B_avg_GROUND_att',
  'R_avg_CLINCH_landed',
  'B_avg_opp_DISTANCE_landed',
  'B_avg_TOTAL_STR_att',
  'R_avg_BODY_landed',
  'R_avg_opp_KD',
  'R_avg_opp_HEAD_att',
  'B_avg_LEG_att',
  'R_avg_TOTAL_STR_att',
  'B_avg_GROUND_landed',
  'R_avg_opp_SIG_STR_att',
  'B_avg_opp_TOTAL_STR_landed',
  'R_avg_opp_DISTANCE_att',
  'R_avg_SIG_STR_landed',
  'B_avg_TD_att',
  'R_avg_PASS',
  'R_avg_DISTANCE_landed',
  'R_avg_opp_GROUND_landed',
  'R_wins',
  'B_avg_LEG_landed',
  'R_avg_opp_SIG_STR_landed',
  'B_avg_SIG_STR_pct',
  'R_avg_SIG_STR_pct',
  'B_avg_opp_BODY_att',
  'B_avg_opp_TD_pct',
  'B_total_time_fought(s