# Cleaning and Preprocessing Data for Machine Learning






In [39]:
import warnings
warnings.simplefilter('ignore')

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

from datetime import datetime, date
import time
#from google.colab import files
pd.options.mode.chained_assignment = None  # default='warn'


In [40]:
# Read results csv file into a pandas DataFrame

sumo_df = pd.read_csv('https://sumo-data-bucket.s3.amazonaws.com/sumo_data.csv')
sumo_df = sumo_df.loc[(sumo_df['tournament_date'] >= '2004-05-01')]
sumo_df.head()

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_1_birth_date,wrestler_1_height,wrestler_1_weight,wrestler_2_stable,wrestler_2_birth_place,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age
117466,2004-05-01,1,81,J14w,Wakakosho,1-0 (7-8),1,hatakikomi,103,J14e,...,1975-03-04,185.0,175.0,Kasugano,Saitama,1977-06-14,191.0,168.5,29,27
117467,2004-05-01,5,784,J13e,Daishodai,1-4 (6-9),1,yorikiri,103,J14e,...,1976-02-25,176.0,158.0,Kasugano,Saitama,1977-06-14,191.0,168.5,28,27
117468,2004-05-01,2,1227,J13w,Toyonoshima,2-0 (11-4),1,sukuinage,103,J14e,...,1983-06-29,171.0,121.0,Kasugano,Saitama,1977-06-14,191.0,168.5,21,27
117469,2004-05-01,7,101,J12e,Dewanofuji,2-5 (3-12),0,yorikiri,103,J14e,...,1976-12-04,184.0,128.0,Kasugano,Saitama,1977-06-14,191.0,168.5,28,27
117470,2004-05-01,6,874,J11e,Hamanishiki,4-2 (9-6),1,hikiotoshi,103,J14e,...,1976-11-23,181.0,125.0,Kasugano,Saitama,1977-06-14,191.0,168.5,28,27


In [41]:
# Filtering out the Juryos

juryo_ranks = ['J14e', 'J14w', 'J13e', 'J13w', 'J12e', 'J12w', 'J11e', 'J11w','J10e', 'J10w', 'J9e', 'J9w', 'J8e', 'J8w', 'J7e', 'J7w', 'J6e', 'J6w', 'J5e','J5w', 'J4e', 'J4w', 'J3e', 'J3w', 'J2e', 'J2w', 'J1e', 'J1w']
sumo_df = sumo_df.loc[~((sumo_df['wrestler1_rank'].isin(juryo_ranks))|(sumo_df['wrestler2_rank'].isin(juryo_ranks))),:]
sumo_df.head(10)


Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_1_birth_date,wrestler_1_height,wrestler_1_weight,wrestler_2_stable,wrestler_2_birth_place,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age
117856,2004-05-01,8,71,M16w,Takanowaka,4-4 (8-7),1,yorikiri,145,M17e,...,1976-04-02,190.0,152.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,28,31
117857,2004-05-01,4,34,M14w,Asanowaka,1-3 (4-11),1,tsukiotoshi,145,M17e,...,1969-12-11,176.0,140.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,35,31
117858,2004-05-01,2,2834,M15w,Futeno,2-0 (7-8),1,yorikiri,145,M17e,...,1980-08-28,181.0,161.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,24,31
117859,2004-05-01,3,5,M13e,Takanonami,0-3,0,fusen,145,M17e,...,1971-10-27,196.0,163.5,Miyagino,Kagoshima,1973-08-18,183.0,132.0,33,31
117860,2004-05-01,5,43,M13w,Kinkaiyama,5-0 (8-7),1,yorikiri,145,M17e,...,1976-01-07,184.0,163.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,28,31
117861,2004-05-01,10,40,M12e,Kaiho,6-4 (9-6),1,yoritaoshi,145,M17e,...,1973-04-17,178.0,125.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,31,31
117862,2004-05-01,13,96,M12w,Jumonji,7-6 (8-7),1,yorikiri,145,M17e,...,1976-06-09,186.0,155.5,Miyagino,Kagoshima,1973-08-18,183.0,132.0,28,31
117863,2004-05-01,6,129,M11w,Hayateumi,3-3 (8-7),1,oshitaoshi,145,M17e,...,1975-07-05,185.0,118.8,Miyagino,Kagoshima,1973-08-18,183.0,132.0,29,31
117864,2004-05-01,9,38,M10w,Toki,2-7 (4-11),1,oshidashi,145,M17e,...,1974-07-04,190.5,172.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,30,31
117865,2004-05-01,12,13,M9w,Tosanoumi,5-7 (7-8),1,oshidashi,145,M17e,...,1972-02-16,187.0,156.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,32,31


In [42]:
sumo_df['age_diff'] = sumo_df['wrestler_1_age'] - sumo_df['wrestler_2_age']
sumo_df.tail()

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_1_height,wrestler_1_weight,wrestler_2_stable,wrestler_2_birth_place,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age,age_diff
214317,2021-05-01,12,12291,O1e,Asanoyama,7-5 (7-5-3),0,fusen,6480,S1e,...,187.0,174.0,Tagonoura,Ibaraki,1990-02-28,187.0,175.0,27,31,-4
214318,2021-05-01,3,12210,K1e,Mitakeumi,3-0 (10-5),1,oshidashi,6642,M3w,...,179.0,172.0,Kokonoe,Mie,1990-07-10,182.0,144.0,29,31,-2
214319,2021-05-01,1,11855,S1w,Takanosho,1-0 (5-10),1,oshidashi,6642,M3w,...,184.0,163.0,Kokonoe,Mie,1990-07-10,182.0,144.0,27,31,-4
214320,2021-05-01,2,6480,S1e,Takayasu,2-0 (10-5),1,tsukitaoshi,6642,M3w,...,187.0,175.0,Kokonoe,Mie,1990-07-10,182.0,144.0,31,31,0
214321,2021-05-01,4,11985,K1w,Daieisho,2-2 (6-9),1,fusen,6642,M3w,...,182.0,161.0,Kokonoe,Mie,1990-07-10,182.0,144.0,28,31,-3


In [43]:
sumo_df['height_diff'] = sumo_df['wrestler_1_height'] - sumo_df['wrestler_2_height']
sumo_df.tail()

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_1_weight,wrestler_2_stable,wrestler_2_birth_place,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age,age_diff,height_diff
214317,2021-05-01,12,12291,O1e,Asanoyama,7-5 (7-5-3),0,fusen,6480,S1e,...,174.0,Tagonoura,Ibaraki,1990-02-28,187.0,175.0,27,31,-4,0.0
214318,2021-05-01,3,12210,K1e,Mitakeumi,3-0 (10-5),1,oshidashi,6642,M3w,...,172.0,Kokonoe,Mie,1990-07-10,182.0,144.0,29,31,-2,-3.0
214319,2021-05-01,1,11855,S1w,Takanosho,1-0 (5-10),1,oshidashi,6642,M3w,...,163.0,Kokonoe,Mie,1990-07-10,182.0,144.0,27,31,-4,2.0
214320,2021-05-01,2,6480,S1e,Takayasu,2-0 (10-5),1,tsukitaoshi,6642,M3w,...,175.0,Kokonoe,Mie,1990-07-10,182.0,144.0,31,31,0,5.0
214321,2021-05-01,4,11985,K1w,Daieisho,2-2 (6-9),1,fusen,6642,M3w,...,161.0,Kokonoe,Mie,1990-07-10,182.0,144.0,28,31,-3,0.0


In [44]:
sumo_df['weight_diff'] = sumo_df['wrestler_1_weight'] - sumo_df['wrestler_2_weight']
sumo_df.tail()

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_2_stable,wrestler_2_birth_place,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age,age_diff,height_diff,weight_diff
214317,2021-05-01,12,12291,O1e,Asanoyama,7-5 (7-5-3),0,fusen,6480,S1e,...,Tagonoura,Ibaraki,1990-02-28,187.0,175.0,27,31,-4,0.0,-1.0
214318,2021-05-01,3,12210,K1e,Mitakeumi,3-0 (10-5),1,oshidashi,6642,M3w,...,Kokonoe,Mie,1990-07-10,182.0,144.0,29,31,-2,-3.0,28.0
214319,2021-05-01,1,11855,S1w,Takanosho,1-0 (5-10),1,oshidashi,6642,M3w,...,Kokonoe,Mie,1990-07-10,182.0,144.0,27,31,-4,2.0,19.0
214320,2021-05-01,2,6480,S1e,Takayasu,2-0 (10-5),1,tsukitaoshi,6642,M3w,...,Kokonoe,Mie,1990-07-10,182.0,144.0,31,31,0,5.0,31.0
214321,2021-05-01,4,11985,K1w,Daieisho,2-2 (6-9),1,fusen,6642,M3w,...,Kokonoe,Mie,1990-07-10,182.0,144.0,28,31,-3,0.0,17.0


In [45]:
filtered_df = sumo_df.loc[(sumo_df['tournament_date'] >= '2004-05-01')]
filtered_df

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_2_stable,wrestler_2_birth_place,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age,age_diff,height_diff,weight_diff
117856,2004-05-01,8,71,M16w,Takanowaka,4-4 (8-7),1,yorikiri,145,M17e,...,Miyagino,Kagoshima,1973-08-18,183.0,132.0,28,31,-3,7.0,20.0
117857,2004-05-01,4,34,M14w,Asanowaka,1-3 (4-11),1,tsukiotoshi,145,M17e,...,Miyagino,Kagoshima,1973-08-18,183.0,132.0,35,31,4,-7.0,8.0
117858,2004-05-01,2,2834,M15w,Futeno,2-0 (7-8),1,yorikiri,145,M17e,...,Miyagino,Kagoshima,1973-08-18,183.0,132.0,24,31,-7,-2.0,29.0
117859,2004-05-01,3,5,M13e,Takanonami,0-3,0,fusen,145,M17e,...,Miyagino,Kagoshima,1973-08-18,183.0,132.0,33,31,2,13.0,31.5
117860,2004-05-01,5,43,M13w,Kinkaiyama,5-0 (8-7),1,yorikiri,145,M17e,...,Miyagino,Kagoshima,1973-08-18,183.0,132.0,28,31,-3,1.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214317,2021-05-01,12,12291,O1e,Asanoyama,7-5 (7-5-3),0,fusen,6480,S1e,...,Tagonoura,Ibaraki,1990-02-28,187.0,175.0,27,31,-4,0.0,-1.0
214318,2021-05-01,3,12210,K1e,Mitakeumi,3-0 (10-5),1,oshidashi,6642,M3w,...,Kokonoe,Mie,1990-07-10,182.0,144.0,29,31,-2,-3.0,28.0
214319,2021-05-01,1,11855,S1w,Takanosho,1-0 (5-10),1,oshidashi,6642,M3w,...,Kokonoe,Mie,1990-07-10,182.0,144.0,27,31,-4,2.0,19.0
214320,2021-05-01,2,6480,S1e,Takayasu,2-0 (10-5),1,tsukitaoshi,6642,M3w,...,Kokonoe,Mie,1990-07-10,182.0,144.0,31,31,0,5.0,31.0


In [46]:
filtered_2_df = pd.DataFrame(filtered_df[['wrestler1_rank','wrestler2_rank']])

filtered_2_df['w1_rank_base'] = filtered_2_df['wrestler1_rank'].str.slice(0,1)
filtered_2_df['w1_rank_#'] = filtered_2_df['wrestler1_rank'].str.extract('(\d+)', expand=True).astype(int)+4

filtered_2_df['w2_rank_base'] = filtered_2_df['wrestler2_rank'].str.slice(0,1)
filtered_2_df['w2_rank_#'] = filtered_2_df['wrestler2_rank'].str.extract('(\d+)', expand=True).astype(int)+4

In [47]:
m1_df = pd.DataFrame(filtered_2_df[['wrestler1_rank','w1_rank_base','w1_rank_#']])
m1_df = m1_df[m1_df.w1_rank_base == "M"]
m1_df['copy_index'] = m1_df.index

In [48]:
m2_df = pd.DataFrame(filtered_2_df[['wrestler2_rank','w2_rank_base','w2_rank_#']])
m2_df = m2_df[m2_df.w2_rank_base == "M"]
m2_df['copy_index'] = m2_df.index

In [49]:
other1_df = pd.DataFrame(filtered_2_df[['wrestler1_rank','w1_rank_base','w1_rank_#']])
other1_df = other1_df[other1_df.w1_rank_base != "M"]
ranks = {"Y":1,"O":2,"S":3,"K":4}
other1_df["w1_rank_#"] = other1_df["w1_rank_base"].map(ranks)
other1_df['copy_index'] = other1_df.index

In [50]:
other2_df = pd.DataFrame(filtered_2_df[['wrestler2_rank','w2_rank_base','w2_rank_#']])
other2_df = other2_df[other2_df.w2_rank_base != "M"]
ranks = {"Y":1,"O":2,"S":3,"K":4}
other2_df["w2_rank_#"] = other2_df["w2_rank_base"].map(ranks)
other2_df['copy_index'] = other2_df.index

In [51]:
res1_df = m1_df.merge(other1_df, how="outer", on=None).set_index('copy_index')
res1_df = res1_df.rename(columns={'w1_rank_#':'wrestler1_rank_id'})

In [52]:
res2_df = m2_df.merge(other2_df, how="outer", on=None).set_index('copy_index')
res2_df = res2_df.rename(columns={'w2_rank_#':'wrestler2_rank_id'})

In [53]:
resolved_df = pd.concat([res1_df,res2_df], axis=1)
resolved_df

Unnamed: 0_level_0,wrestler1_rank,w1_rank_base,wrestler1_rank_id,wrestler2_rank,w2_rank_base,wrestler2_rank_id
copy_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
117856,M16w,M,20,M17e,M,21
117857,M14w,M,18,M17e,M,21
117858,M15w,M,19,M17e,M,21
117859,M13e,M,17,M17e,M,21
117860,M13w,M,17,M17e,M,21
...,...,...,...,...,...,...
214317,O1e,O,2,S1e,S,3
214318,K1e,K,4,M3w,M,7
214319,S1w,S,3,M3w,M,7
214320,S1e,S,3,M3w,M,7


In [54]:
results_df = resolved_df[['wrestler1_rank_id','wrestler2_rank_id']]

In [55]:
finished_df = pd.concat([filtered_df,results_df], axis=1)
finished_df

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age,age_diff,height_diff,weight_diff,wrestler1_rank_id,wrestler2_rank_id
117856,2004-05-01,8,71,M16w,Takanowaka,4-4 (8-7),1,yorikiri,145,M17e,...,1973-08-18,183.0,132.0,28,31,-3,7.0,20.0,20,21
117857,2004-05-01,4,34,M14w,Asanowaka,1-3 (4-11),1,tsukiotoshi,145,M17e,...,1973-08-18,183.0,132.0,35,31,4,-7.0,8.0,18,21
117858,2004-05-01,2,2834,M15w,Futeno,2-0 (7-8),1,yorikiri,145,M17e,...,1973-08-18,183.0,132.0,24,31,-7,-2.0,29.0,19,21
117859,2004-05-01,3,5,M13e,Takanonami,0-3,0,fusen,145,M17e,...,1973-08-18,183.0,132.0,33,31,2,13.0,31.5,17,21
117860,2004-05-01,5,43,M13w,Kinkaiyama,5-0 (8-7),1,yorikiri,145,M17e,...,1973-08-18,183.0,132.0,28,31,-3,1.0,31.0,17,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214317,2021-05-01,12,12291,O1e,Asanoyama,7-5 (7-5-3),0,fusen,6480,S1e,...,1990-02-28,187.0,175.0,27,31,-4,0.0,-1.0,2,3
214318,2021-05-01,3,12210,K1e,Mitakeumi,3-0 (10-5),1,oshidashi,6642,M3w,...,1990-07-10,182.0,144.0,29,31,-2,-3.0,28.0,4,7
214319,2021-05-01,1,11855,S1w,Takanosho,1-0 (5-10),1,oshidashi,6642,M3w,...,1990-07-10,182.0,144.0,27,31,-4,2.0,19.0,3,7
214320,2021-05-01,2,6480,S1e,Takayasu,2-0 (10-5),1,tsukitaoshi,6642,M3w,...,1990-07-10,182.0,144.0,31,31,0,5.0,31.0,3,7


In [56]:
finished_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59240 entries, 117856 to 214321
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   tournament_date         59240 non-null  object 
 1   day                     59240 non-null  int64  
 2   wrestler1_id            59240 non-null  int64  
 3   wrestler1_rank          59240 non-null  object 
 4   wrestler1_name          59240 non-null  object 
 5   wrestler1_result        59240 non-null  object 
 6   wrestler1_win           59240 non-null  int64  
 7   finishing_move          59240 non-null  object 
 8   wrestler2_id            59240 non-null  int64  
 9   wrestler2_rank          59240 non-null  object 
 10  wrestler2_name          59240 non-null  object 
 11  wrestler2_result        59240 non-null  object 
 12  wrestler2_win           59240 non-null  int64  
 13  wrestler_1_stable       59240 non-null  object 
 14  wrestler_1_birth_place  59240 no

In [57]:
finished_df['rank_diff'] = finished_df['wrestler1_rank_id'] - finished_df['wrestler2_rank_id']
finished_df.head()

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age,age_diff,height_diff,weight_diff,wrestler1_rank_id,wrestler2_rank_id,rank_diff
117856,2004-05-01,8,71,M16w,Takanowaka,4-4 (8-7),1,yorikiri,145,M17e,...,183.0,132.0,28,31,-3,7.0,20.0,20,21,-1
117857,2004-05-01,4,34,M14w,Asanowaka,1-3 (4-11),1,tsukiotoshi,145,M17e,...,183.0,132.0,35,31,4,-7.0,8.0,18,21,-3
117858,2004-05-01,2,2834,M15w,Futeno,2-0 (7-8),1,yorikiri,145,M17e,...,183.0,132.0,24,31,-7,-2.0,29.0,19,21,-2
117859,2004-05-01,3,5,M13e,Takanonami,0-3,0,fusen,145,M17e,...,183.0,132.0,33,31,2,13.0,31.5,17,21,-4
117860,2004-05-01,5,43,M13w,Kinkaiyama,5-0 (8-7),1,yorikiri,145,M17e,...,183.0,132.0,28,31,-3,1.0,31.0,17,21,-4


In [58]:
label = {1:'wrestler 1 wins',0:'wrestler 2 wins'}
finished_df["outcome"] = finished_df["wrestler1_win"].map(label)
finished_df.head()

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_2_weight,wrestler_1_age,wrestler_2_age,age_diff,height_diff,weight_diff,wrestler1_rank_id,wrestler2_rank_id,rank_diff,outcome
117856,2004-05-01,8,71,M16w,Takanowaka,4-4 (8-7),1,yorikiri,145,M17e,...,132.0,28,31,-3,7.0,20.0,20,21,-1,wrestler 1 wins
117857,2004-05-01,4,34,M14w,Asanowaka,1-3 (4-11),1,tsukiotoshi,145,M17e,...,132.0,35,31,4,-7.0,8.0,18,21,-3,wrestler 1 wins
117858,2004-05-01,2,2834,M15w,Futeno,2-0 (7-8),1,yorikiri,145,M17e,...,132.0,24,31,-7,-2.0,29.0,19,21,-2,wrestler 1 wins
117859,2004-05-01,3,5,M13e,Takanonami,0-3,0,fusen,145,M17e,...,132.0,33,31,2,13.0,31.5,17,21,-4,wrestler 2 wins
117860,2004-05-01,5,43,M13w,Kinkaiyama,5-0 (8-7),1,yorikiri,145,M17e,...,132.0,28,31,-3,1.0,31.0,17,21,-4,wrestler 1 wins


In [59]:
#DataFrame for Training containing data from May 2004 through March 2021
train_df = finished_df.loc[(finished_df['tournament_date'] <= '2021-03-01')]
train_df.tail()

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_2_weight,wrestler_1_age,wrestler_2_age,age_diff,height_diff,weight_diff,wrestler1_rank_id,wrestler2_rank_id,rank_diff,outcome
213751,2021-03-01,12,12210,K1w,Mitakeumi,5-7 (8-7),0,yorikiri,12291,O1w,...,158.0,29,27,2,-10.0,-9.0,4,2,2,wrestler 2 wins
213752,2021-03-01,8,11985,K2w,Daieisho,3-5 (8-7),1,oshitaoshi,12291,O1w,...,158.0,28,27,1,-10.0,-22.3,4,2,2,wrestler 1 wins
213753,2021-03-01,3,12043,M1w,Onosho,1-2 (4-11),1,fusen,1123,Y1e,...,150.7,25,36,-11,-17.0,-2.2,5,1,4,wrestler 1 wins
213754,2021-03-01,2,11728,M1e,Takarafuji,0-2 (3-12),0,kotenage,1123,Y1e,...,150.7,34,36,-2,-7.0,10.4,5,1,4,wrestler 2 wins
213755,2021-03-01,1,11985,K2w,Daieisho,0-1 (8-7),0,yoritaoshi,1123,Y1e,...,150.7,28,36,-8,-14.0,-15.0,4,1,3,wrestler 2 wins


In [60]:
#DataFrame for Testing containing only May 2021 data
test_df = finished_df.loc[(finished_df['tournament_date'] > '2021-03-01')]
test_df

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_2_weight,wrestler_1_age,wrestler_2_age,age_diff,height_diff,weight_diff,wrestler1_rank_id,wrestler2_rank_id,rank_diff,outcome
213756,2021-05-01,1,7240,M16w,Chiyomaru,1-0 (8-7),1,hikiotoshi,11918,M17e,...,168.0,30,31,-1,-6.0,21.0,20,21,-1,wrestler 1 wins
213757,2021-05-01,6,6753,M15e,Kaisei,4-2 (9-6),1,oshidashi,11918,M17e,...,168.0,35,31,4,11.0,24.0,19,21,-2,wrestler 1 wins
213758,2021-05-01,3,12051,M16e,Ishiura,1-2 (7-8),0,oshidashi,11918,M17e,...,168.0,31,31,0,-10.0,-58.0,20,21,-1,wrestler 2 wins
213759,2021-05-01,2,11934,M14w,Chiyotairyu,2-0 (10-5),1,hatakikomi,11918,M17e,...,168.0,33,31,2,-3.0,3.0,18,21,-3,wrestler 1 wins
213760,2021-05-01,4,12273,M13w,Daiamami,2-2 (7-8),1,katasukashi,11918,M17e,...,168.0,29,31,-2,-1.0,14.0,17,21,-4,wrestler 1 wins
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214317,2021-05-01,12,12291,O1e,Asanoyama,7-5 (7-5-3),0,fusen,6480,S1e,...,175.0,27,31,-4,0.0,-1.0,2,3,-1,wrestler 2 wins
214318,2021-05-01,3,12210,K1e,Mitakeumi,3-0 (10-5),1,oshidashi,6642,M3w,...,144.0,29,31,-2,-3.0,28.0,4,7,-3,wrestler 1 wins
214319,2021-05-01,1,11855,S1w,Takanosho,1-0 (5-10),1,oshidashi,6642,M3w,...,144.0,27,31,-4,2.0,19.0,3,7,-4,wrestler 1 wins
214320,2021-05-01,2,6480,S1e,Takayasu,2-0 (10-5),1,tsukitaoshi,6642,M3w,...,144.0,31,31,0,5.0,31.0,3,7,-4,wrestler 1 wins


Setting up features for SKLearn Library

In [61]:
wrestler_list =  {1123: "Hakuho", 5944: "Tamawashi", 6463: "Okinoumi", 6480: "Takayasu", 6642: "Chiyonokuni", 6753: "Kaisei", 7153: "Kotoeko", 7240: "Chiyomaru", 
                  11726: "Tokushoryu", 11728: "Takarafuji", 11784: "Myogiryu", 11785: "Chiyoshoma", 11786: "Aoiyama", 11840: "Chiyonoo", 11845: "Kagayaki", 11855: "Takanosho", 
                  11927: "Terunofuji", 11946: "Meisei", 11985: "Daieisho", 12024: "Shimanoumi", 12026: "Hidenoumi", 12043: "Onosho", 12051: "Ishiura", 12055: "Endo", 
                  12107: "Ichinojo", 12113: "Tsurugisho", 12130: "Shodai", 12191: "Takakeisho", 12210: "Mitakeumi", 12226: "Ura", 12231: "Kiribayama", 12239: "Hokutofuji", 
                  12270: "Kotonowaka", 12273: "Daiamami", 12291: "Asanoyama", 12362: "Ichiyamamoto", 12370: "Wakatakakage", 12451: "Hoshoryu"
                  }

In [62]:
#Logistic Regresssion method
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split  # used if performing model evaluation/cross-validation
from sklearn.metrics import precision_recall_fscore_support

In [63]:
X_train = train_df.loc[:, ['wrestler1_id','wrestler2_id','height_diff','weight_diff','age_diff','rank_diff']]
y_train = train_df.loc[:, 'outcome']
print(X_train.shape, y_train.shape)

(58690, 6) (58690,)


In [64]:
X_test = test_df.loc[:, ['wrestler1_id','wrestler2_id','height_diff','weight_diff','age_diff','rank_diff']]
y_test = test_df.loc[:, 'outcome']
print(X_test.shape, y_test.shape)

(550, 6) (550,)


In [65]:
classifier = LogisticRegression()
classifier

LogisticRegression()

In [66]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [67]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.5859771681717498
Testing Data Score: 0.5454545454545454


In [68]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   ['wrestler 1 wins' 'wrestler 1 wins' 'wrestler 1 wins' 'wrestler 1 wins'
 'wrestler 1 wins' 'wrestler 1 wins' 'wrestler 1 wins' 'wrestler 1 wins'
 'wrestler 1 wins' 'wrestler 1 wins']
First 10 Actual labels: ['wrestler 1 wins', 'wrestler 1 wins', 'wrestler 2 wins', 'wrestler 1 wins', 'wrestler 1 wins', 'wrestler 2 wins', 'wrestler 2 wins', 'wrestler 1 wins', 'wrestler 1 wins', 'wrestler 1 wins']


In [69]:
prediction_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
prediction_df

Unnamed: 0,Prediction,Actual
213756,wrestler 1 wins,wrestler 1 wins
213757,wrestler 1 wins,wrestler 1 wins
213758,wrestler 1 wins,wrestler 2 wins
213759,wrestler 1 wins,wrestler 1 wins
213760,wrestler 1 wins,wrestler 1 wins
...,...,...
214317,wrestler 1 wins,wrestler 2 wins
214318,wrestler 1 wins,wrestler 1 wins
214319,wrestler 1 wins,wrestler 1 wins
214320,wrestler 1 wins,wrestler 1 wins


In [70]:
prediction2_df = prediction_df
#del prediction2_df['Actual']
fx = {'wrestler 1 wins':1,'wrestler 2 wins':0}
prediction2_df['Prediction'] = prediction2_df['Prediction'].map(fx)
prediction2_df


Unnamed: 0,Prediction,Actual
213756,1,wrestler 1 wins
213757,1,wrestler 1 wins
213758,1,wrestler 2 wins
213759,1,wrestler 1 wins
213760,1,wrestler 1 wins
...,...,...
214317,1,wrestler 2 wins
214318,1,wrestler 1 wins
214319,1,wrestler 1 wins
214320,1,wrestler 1 wins


In [71]:
outcome_df = test_df.join(prediction2_df, how='outer')
outcome_df['outcome'] = outcome_df['outcome'].map(fx)
outcome_df

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_2_age,age_diff,height_diff,weight_diff,wrestler1_rank_id,wrestler2_rank_id,rank_diff,outcome,Prediction,Actual
213756,2021-05-01,1,7240,M16w,Chiyomaru,1-0 (8-7),1,hikiotoshi,11918,M17e,...,31,-1,-6.0,21.0,20,21,-1,1,1,wrestler 1 wins
213757,2021-05-01,6,6753,M15e,Kaisei,4-2 (9-6),1,oshidashi,11918,M17e,...,31,4,11.0,24.0,19,21,-2,1,1,wrestler 1 wins
213758,2021-05-01,3,12051,M16e,Ishiura,1-2 (7-8),0,oshidashi,11918,M17e,...,31,0,-10.0,-58.0,20,21,-1,0,1,wrestler 2 wins
213759,2021-05-01,2,11934,M14w,Chiyotairyu,2-0 (10-5),1,hatakikomi,11918,M17e,...,31,2,-3.0,3.0,18,21,-3,1,1,wrestler 1 wins
213760,2021-05-01,4,12273,M13w,Daiamami,2-2 (7-8),1,katasukashi,11918,M17e,...,31,-2,-1.0,14.0,17,21,-4,1,1,wrestler 1 wins
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214317,2021-05-01,12,12291,O1e,Asanoyama,7-5 (7-5-3),0,fusen,6480,S1e,...,31,-4,0.0,-1.0,2,3,-1,0,1,wrestler 2 wins
214318,2021-05-01,3,12210,K1e,Mitakeumi,3-0 (10-5),1,oshidashi,6642,M3w,...,31,-2,-3.0,28.0,4,7,-3,1,1,wrestler 1 wins
214319,2021-05-01,1,11855,S1w,Takanosho,1-0 (5-10),1,oshidashi,6642,M3w,...,31,-4,2.0,19.0,3,7,-4,1,1,wrestler 1 wins
214320,2021-05-01,2,6480,S1e,Takayasu,2-0 (10-5),1,tsukitaoshi,6642,M3w,...,31,0,5.0,31.0,3,7,-4,1,1,wrestler 1 wins


In [72]:
predict1_df = outcome_df[['wrestler1_name','outcome','Prediction']]
predict1_df.rename(columns = {'wrestler1_name':'Shikona','outcome':'Actual Basho Outcome','Prediction':'Predicted Basho Outcome'}, inplace = True)
predict12_df = predict1_df.groupby(['Shikona'])
data_table_df = predict12_df.sum().sort_values(by=['Predicted Basho Outcome'], ascending=False)
data_table_df

Unnamed: 0_level_0,Actual Basho Outcome,Predicted Basho Outcome
Shikona,Unnamed: 1_level_1,Unnamed: 2_level_1
Takakeisho,12,16
Terunofuji,13,14
Kagayaki,6,12
Asanoyama,7,12
Kotonowaka,7,12
Shodai,9,12
Takayasu,10,12
Takanosho,5,12
Tochinoshin,5,9
Terutsuyoshi,7,9


In [75]:
import dataframe_image as dfi
#.dfi.export('prediction.png')
data_table_df.export_png('prediction.png')
#files.download("prediction.png") 

AttributeError: 'DataFrame' object has no attribute 'export_png'

In [73]:
from bokeh.io import export_png, export_svgs
from bokeh.models import ColumnDataSource, DataTable, TableColumn

df = data_table_df
path = '/Users/timweir/Downloads/prediction.png'

def save_df_as_image(df, path):
    source = ColumnDataSource(df)
    df_columns = [df.index.name]
    df_columns.extend(df.columns.values)
    columns_for_table=[]
    for column in df_columns:
        columns_for_table.append(TableColumn(field=column, title=column))

    data_table = DataTable(source=source, columns=columns_for_table,height_policy="auto",width_policy="auto",index_position=None)
    export_png(data_table, filename = path)
    