In [196]:
import math
import os.path
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import json
import random
import statsmodels.api as sm
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [144]:
data = pd.read_excel("data.xlsx")
data.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Henry Cejudo,Marlon Moraes,Marc Goddard,2019,"Chicago, Illinois, USA",Red,True,Bantamweight,5,0,...,2,0,0,8,Orthodox,162.56,162.56,135.0,31.0,32.0
1,Valentina Shevchenko,Jessica Eye,Robert Madrigal,2019,"Chicago, Illinois, USA",Red,True,Women's Flyweight,5,0,...,0,2,0,5,Southpaw,165.1,167.64,125.0,32.0,31.0
2,Tony Ferguson,Donald Cerrone,Dan Miragliotta,2019,"Chicago, Illinois, USA",Red,False,Lightweight,3,0,...,3,6,1,14,Orthodox,180.34,193.04,155.0,36.0,35.0
3,Jimmie Rivera,Petr Yan,Kevin MacDonald,2019,"Chicago, Illinois, USA",Blue,False,Bantamweight,3,0,...,1,0,0,6,Orthodox,162.56,172.72,135.0,26.0,29.0
4,Tai Tuivasa,Blagoy Ivanov,Dan Miragliotta,2019,"Chicago, Illinois, USA",Blue,False,Heavyweight,3,0,...,2,0,0,3,Southpaw,187.96,190.5,264.0,32.0,26.0


### Apenas a categoria Peso-Pesados ('Heavyweights')

In [145]:
data.weight_class = data.weight_class.astype('category')
data.Winner = data.Winner.astype('category')

In [146]:
data.Winner.value_counts()

Red     3470
Blue    1591
Draw      83
Name: Winner, dtype: int64

### Blue = 0
### Red = 1

In [147]:
data_heavy = data.loc[(data.weight_class=='Lightweight'),:]
bool_to_number = {False: 0, True: 1}
string_to_number = {'Blue': 0, 'Red': 1, 'Draw': 2}
data_heavy['title_bout'] = data_heavy['title_bout'].map(bool_to_number)
data_heavy['Winner'] = data_heavy['Winner'].map(string_to_number)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [148]:
data_heavy_util = data_heavy.drop(['Referee','date','location'], axis=1)
data_heavy_util.dropna(inplace=True)

In [149]:
data_heavy_util.head()

Unnamed: 0,R_fighter,B_fighter,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
2,Tony Ferguson,Donald Cerrone,1,0,Lightweight,3,0,3,0,15.354839,...,3,6,1,14,Orthodox,180.34,193.04,155.0,36.0,35.0
16,Damir Hadzovic,Christos Giagos,0,0,Lightweight,3,0,1,0,7.0,...,2,0,0,3,Orthodox,175.26,177.8,155.0,29.0,32.0
20,Stevie Ray,Leonardo Santos,0,0,Lightweight,3,0,4,0,9.666667,...,2,0,0,6,Southpaw,177.8,177.8,155.0,39.0,29.0
21,Nick Hein,Frank Camacho,0,0,Lightweight,3,2,0,0,21.666667,...,0,0,0,4,Southpaw,167.64,167.64,155.0,30.0,35.0
29,Charles Oliveira,Nik Lentz,1,0,Lightweight,3,0,2,0,13.181818,...,0,13,0,14,Orthodox,177.8,187.96,155.0,34.0,29.0


In [150]:
colunas = list(data_heavy_util)

In [151]:
colunas.remove('R_fighter')
colunas.remove('B_fighter')
colunas.remove('title_bout')
colunas.remove('weight_class')
colunas.remove('no_of_rounds')
colunas.remove('B_total_rounds_fought')
colunas.remove('R_total_rounds_fought')

In [152]:
dataf = data_heavy_util.loc[:,colunas]
dataf.head()

Unnamed: 0,Winner,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,B_avg_DISTANCE_att,B_avg_DISTANCE_landed,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
2,1,0,3,0,15.354839,11.322581,6.741935,4.387097,84.741935,38.580645,...,3,6,1,14,Orthodox,180.34,193.04,155.0,36.0,35.0
16,0,0,1,0,7.0,4.2,4.6,2.6,42.8,11.0,...,2,0,0,3,Orthodox,175.26,177.8,155.0,29.0,32.0
20,0,0,4,0,9.666667,5.166667,2.0,1.0,53.833333,18.833333,...,2,0,0,6,Southpaw,177.8,177.8,155.0,39.0,29.0
21,0,2,0,0,21.666667,17.666667,13.333333,10.333333,177.333333,84.0,...,0,0,0,4,Southpaw,167.64,167.64,155.0,30.0,35.0
29,1,0,2,0,13.181818,10.590909,13.227273,8.863636,62.045455,25.545455,...,0,13,0,14,Orthodox,177.8,187.96,155.0,34.0,29.0


### Correlações

In [153]:
data_heavy_blue_winners = data_heavy.loc[(data_heavy.Winner==0),:]
data_heavy_blue_winners = data_heavy_blue_winners.drop(['Winner'], axis=1)
data_heavy_red_winners = data_heavy.loc[(data_heavy.Winner==1),:]
data_heavy_red_winners = data_heavy_red_winners.drop(['Winner'], axis=1)

In [154]:
blue_winners_correlation = data_heavy_blue_winners.corr()
blue_winners_correlation.head(2)

Unnamed: 0,date,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
date,1.0,-0.017063,0.133452,0.134075,0.030633,,0.064656,0.033407,-0.04441,-0.013271,...,0.103545,0.186643,0.017163,0.05795,0.157698,0.079028,-0.033772,0.17962,0.299697,0.233073
title_bout,-0.017063,1.0,0.569244,-0.118739,0.121773,,0.031181,0.036265,0.109884,0.102218,...,0.067474,0.115826,-0.001662,0.051952,0.077464,-0.013792,-0.114154,0.029612,-0.006608,-0.041315


### Aplicando filtros

In [155]:
filtros = ['R_fighter','B_fighter','Winner','weight_class','title_bout','R_wins','R_losses','B_wins','B_losses',
           'B_Reach_cms','B_Height_cms','B_Weight_lbs','R_Reach_cms','R_Height_cms','R_Weight_lbs','R_Stance','B_Stance']

In [156]:
data_heavy_filtered = data_heavy_util.loc[:,filtros]

In [157]:
data_heavy_filtered.head()

Unnamed: 0,R_fighter,B_fighter,Winner,weight_class,title_bout,R_wins,R_losses,B_wins,B_losses,B_Reach_cms,B_Height_cms,B_Weight_lbs,R_Reach_cms,R_Height_cms,R_Weight_lbs,R_Stance,B_Stance
2,Tony Ferguson,Donald Cerrone,1,Lightweight,0,14,1,23,8,185.42,185.42,155.0,193.04,180.34,155.0,Orthodox,Orthodox
16,Damir Hadzovic,Christos Giagos,0,Lightweight,0,3,2,2,3,180.34,177.8,155.0,177.8,175.26,155.0,Orthodox,Orthodox
20,Stevie Ray,Leonardo Santos,0,Lightweight,0,6,3,5,1,190.5,182.88,155.0,177.8,177.8,155.0,Southpaw,Orthodox
21,Nick Hein,Frank Camacho,0,Lightweight,0,4,3,1,2,185.42,177.8,170.0,167.64,167.64,155.0,Southpaw,Orthodox
29,Charles Oliveira,Nik Lentz,1,Lightweight,0,14,9,14,8,172.72,172.72,155.0,187.96,177.8,155.0,Orthodox,Orthodox


In [158]:
data_heavy_filtered.columns

Index(['R_fighter', 'B_fighter', 'Winner', 'weight_class', 'title_bout',
       'R_wins', 'R_losses', 'B_wins', 'B_losses', 'B_Reach_cms',
       'B_Height_cms', 'B_Weight_lbs', 'R_Reach_cms', 'R_Height_cms',
       'R_Weight_lbs', 'R_Stance', 'B_Stance'],
      dtype='object')

### Modelo inicial

Separando em categóricos

In [159]:
categoricas = [
    'R_fighter', 
    'B_fighter', 
    'weight_class', 
    'R_Stance', 
    'B_Stance', 
]

data_heavy_cat = data_heavy_filtered[categoricas].astype('category')
data_heavy_num = data_heavy_filtered.drop(categoricas, axis=1).astype('float')

In [160]:
X = data_heavy_num.drop('Winner', axis=1)
y = data_heavy_num['Winner']

Separando os dados em testes e treinamento

In [161]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [162]:
model = RandomForestClassifier(n_estimators=100000)

model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [163]:
y_pred = model.predict(X_test)

In [164]:
print(accuracy_score(y_test, y_pred))

0.6441717791411042


In [165]:
#Fatores que tem o maior peso na decisão da vitória
j=1
lista_j=list()
for i, f in sorted(list(zip(model.feature_importances_, X_train.columns)), reverse=True):
    print( j,'°:',f'{f}: {i}')
    a=str(j)+'°'
    lista_j.append(a)
    j+=1

1 °: R_wins: 0.1410892601931193
2 °: B_wins: 0.12925260758995993
3 °: R_losses: 0.11888021848765706
4 °: R_Reach_cms: 0.11491403861707264
5 °: B_Reach_cms: 0.1082130319561022
6 °: B_Height_cms: 0.10435107114311647
7 °: B_losses: 0.10048376925316697
8 °: R_Height_cms: 0.09950712155421781
9 °: B_Weight_lbs: 0.03831093013354007
10 °: R_Weight_lbs: 0.036518408766559665
11 °: title_bout: 0.008479542305493462


In [166]:
data={'Fator':X_train.columns ,'Correlação':model.feature_importances_,}
Fator_por_corr=pd.DataFrame(data)
Fator_por_corr=Fator_por_corr.sort_values(by='Correlação', ascending=False)
Fator_por_corr['Grau de Importância']=lista_j
Fator_por_corr = Fator_por_corr.set_index('Grau de Importância')
Fator_por_corr.head()

Unnamed: 0_level_0,Fator,Correlação
Grau de Importância,Unnamed: 1_level_1,Unnamed: 2_level_1
1°,R_wins,0.141089
2°,B_wins,0.129253
3°,R_losses,0.11888
4°,R_Reach_cms,0.114914
5°,B_Reach_cms,0.108213


In [167]:
def relevancia(df,coluna_nome,coluna_correlacao,acuracia):
    inuteis = ['Winner']
    uteis = ['Winner']
    for index,row in df.iterrows():
        if row[coluna_correlacao] >= -acuracia and row[coluna_correlacao] <= acuracia:
            inuteis.append(row[coluna_nome])
        else:
            uteis.append(row[coluna_nome])
    return uteis

In [168]:
#def nome_e_vencedor(lista):
    #nova_lista = lista.append('R_fighter')
    #nova_lista = lista.append('B_fighter')
    #nova_lista = lista.append('Winner')
    #return nova_lista

In [169]:
uteis = relevancia(Fator_por_corr,'Fator','Correlação',0.01)

In [170]:
#tirando empates
data_heavy_util_relevante = data_heavy_util[data_heavy_util.Winner != 2]

In [171]:
data_heavy_util_relevante = data_heavy_util_relevante.loc[:,uteis]

In [172]:
data_heavy_util_relevante.head()

Unnamed: 0,Winner,R_wins,B_wins,R_losses,R_Reach_cms,B_Reach_cms,B_Height_cms,B_losses,R_Height_cms,B_Weight_lbs,R_Weight_lbs
2,1,14,23,1,193.04,185.42,185.42,8,180.34,155.0,155.0
16,0,3,2,2,177.8,180.34,177.8,3,175.26,155.0,155.0
20,0,6,5,3,177.8,190.5,182.88,1,177.8,155.0,155.0
21,0,4,1,3,167.64,185.42,177.8,2,167.64,170.0,155.0
29,1,14,14,9,187.96,172.72,172.72,8,177.8,155.0,155.0


In [173]:
data_heavy_util_relevante.Winner.value_counts()

1    416
0    220
Name: Winner, dtype: int64

### Random forest 2.0

In [174]:
data_heavy_num = data_heavy_util_relevante.astype('float')

In [175]:
X = data_heavy_num.drop('Winner', axis=1)
y = data_heavy_num['Winner']

In [176]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [186]:
model = RandomForestClassifier(n_estimators=10000)

model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [187]:
y_pred = model.predict(X_test)

In [188]:
print(accuracy_score(y_test, y_pred))

0.6352201257861635


### Regressão Logística

In [197]:
def preparo(X,Y):
    X_cp = sm.add_constant(X)
    model = sm.OLS(Y,X_cp,missing='drop')
    results = model.fit()
    return results

In [245]:
Y_log = data_heavy_util_relevante["Winner"]
data_heavy_sem_Winner=data_heavy_util_relevante.drop('Winner',axis=1) 
X_log=data_heavy_sem_Winner
#np.asarray(X)

In [246]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_log, Y_log, test_size=0.25)

In [247]:
model = LogisticRegression(max_iter=200000,solver='lbfgs', multi_class='auto')

model.fit(X_train_log, y_train_log)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200000, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [248]:
y_pred_log = model.predict(X_test_log)

In [249]:
print(accuracy_score(y_test_log, y_pred_log))

0.660377358490566


In [241]:
result = preparo(X,Y)
result.summary()

0,1,2,3
Dep. Variable:,Winner,R-squared:,0.074
Model:,OLS,Adj. R-squared:,0.06
Method:,Least Squares,F-statistic:,5.024
Date:,"Tue, 19 Nov 2019",Prob (F-statistic):,4.76e-07
Time:,11:53:55,Log-Likelihood:,-405.28
No. Observations:,636,AIC:,832.6
Df Residuals:,625,BIC:,881.6
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.7883,1.060,3.575,0.000,1.707,5.870
R_wins,0.0102,0.008,1.302,0.193,-0.005,0.026
B_wins,-0.0225,0.008,-2.731,0.006,-0.039,-0.006
R_losses,-0.0525,0.012,-4.445,0.000,-0.076,-0.029
R_Reach_cms,0.0067,0.004,1.512,0.131,-0.002,0.015
B_Reach_cms,-0.0071,0.004,-1.713,0.087,-0.015,0.001
B_Height_cms,0.0039,0.005,0.758,0.449,-0.006,0.014
B_losses,0.0369,0.013,2.770,0.006,0.011,0.063
R_Height_cms,-0.0132,0.006,-2.367,0.018,-0.024,-0.002

0,1,2,3
Omnibus:,1835.835,Durbin-Watson:,1.887
Prob(Omnibus):,0.0,Jarque-Bera (JB):,82.766
Skew:,-0.581,Prob(JB):,1.07e-18
Kurtosis:,1.669,Cond. No.,24300.0
