In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [2]:
# Load the scraped match list CSV and preview the first few rows.
match_csv_path = "raw_scraped_csv/match_list_33150.csv"
match_data = pd.read_csv(match_csv_path)
match_data.head()

Unnamed: 0,pull_time,game_id,console,winning_club,h_club,h_club_id,h_goals,h_shots,h_shot_percent,h_passes_made,...,opp_tackle_percent,opp_red_cards,opp_players_in_match,opp_forward,opp_midfielder,opp_defender,opp_goalkeeper,opp_any,opp_season,opp_round
0,2023-03-21 05:02:18,15695788211,xboxone,Draw,Bang Average FC,1940614,1,6,0.17,95,...,0.18,0,5,2,3,0,0,0,2,11
1,2023-03-21 05:02:18,1940614259,xboxone,Bang Average FC,Bang Average FC,1940614,5,13,0.38,92,...,0.31,1,10,2,3,4,1,0,8,96
2,2023-03-21 05:02:18,7612049113,xboxone,punjabi panther,Bang Average FC,1940614,1,7,0.14,99,...,0.47,0,6,2,4,0,0,0,1,13
3,2023-03-21 05:02:18,19406141058,xboxone,Draw,Bang Average FC,1940614,1,11,0.09,154,...,0.13,0,9,3,2,3,1,0,8,58
4,2023-03-21 05:02:18,8369654865,xboxone,Bang Average FC,Bang Average FC,1940614,3,13,0.23,118,...,0.23,0,11,2,5,3,1,0,8,65


In [3]:
# Show every column name; the frame carries parallel 'h_*' and 'opp_*' stat columns.

match_data.columns

Index(['pull_time', 'game_id', 'console', 'winning_club', 'h_club',
       'h_club_id', 'h_goals', 'h_shots', 'h_shot_percent', 'h_passes_made',
       'h_pass_attempts', 'h_pass_percent', 'h_tackles_made',
       'h_tackle_attempts', 'h_tackle_percent', 'h_red_cards',
       'h_players_in_match', 'h_forward', 'h_midfielder', 'h_defender',
       'h_goalkeeper', 'h_any', 'h_season', 'h_round', 'opp_club',
       'opp_club_id', 'opp_goals', 'opp_shots', 'opp_shot_percent',
       'opp_passes_made', 'opp_pass_attempts', 'opp_pass_percent',
       'opp_tackles_made', 'opp_tackle_attempts', 'opp_tackle_percent',
       'opp_red_cards', 'opp_players_in_match', 'opp_forward',
       'opp_midfielder', 'opp_defender', 'opp_goalkeeper', 'opp_any',
       'opp_season', 'opp_round'],
      dtype='object')

In [4]:
# Each row of the raw frame carries two parallel sets of columns: 'h_*' for the
# home team and 'opp_*' for the away team. Slice out each side (keeping
# 'winning_club' on both) so the two halves can later be stacked into a single
# unified frame that is twice as long. The id, season and round columns are
# not carried over.
team_stat_fields = [
    'club',
    'goals', 'shots', 'shot_percent',
    'passes_made', 'pass_attempts', 'pass_percent',
    'tackles_made', 'tackle_attempts', 'tackle_percent',
    'red_cards', 'players_in_match',
    'forward', 'midfielder', 'defender', 'goalkeeper', 'any',
]

h_club_matches = match_data[['winning_club'] + ['h_' + field for field in team_stat_fields]]

opp_club_matches = match_data[['winning_club'] + ['opp_' + field for field in team_stat_fields]]

In [5]:
# Sanity check: print the column count of each half so they can be compared.
for half in (h_club_matches, opp_club_matches):
    print(half.shape[1])

18
18


In [6]:
# Per-team column names (without their 'h_' / 'opp_' prefix) mapped to the
# name used in the unified frame. Position counts are pluralised for the
# outfield roles; everything else keeps its name.
unified_names = {
    'club': "club",
    'goals': "goals", 'shots': "shots", 'shot_percent': "shot_percent",
    'passes_made': "passes_made", 'pass_attempts': "pass_attempts", 'pass_percent': "pass_percent",
    'tackles_made': "tackles_made", 'tackle_attempts': "tackle_attempts", 'tackle_percent': "tackle_percent",
    'red_cards': "red_cards", 'players_in_match': "players_in_match",
    'forward': "forwards", 'midfielder': "midfielders", 'defender': "defenders",
    'goalkeeper': "goalkeeper", 'any': "any",
}


def _unified_rename_map(prefix):
    """Build the rename mapping for one half of the match data.

    Parameters
    ----------
    prefix : str
        Column prefix of the half being renamed ('h' or 'opp').

    Returns
    -------
    dict
        Maps 'winning_club' to 'win' and every '<prefix>_<field>' column to
        its unified name.
    """
    rename_map = {'winning_club': "win"}
    rename_map.update(
        {f"{prefix}_{field}": unified for field, unified in unified_names.items()}
    )
    return rename_map


hcm_renamed = h_club_matches.rename(columns=_unified_rename_map('h'))

opp_renamed = opp_club_matches.rename(columns=_unified_rename_map('opp'))

print(len(hcm_renamed))
print(len(opp_renamed))
# Stack the two halves row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat with its defaults stacks the same way and likewise keeps each
# half's original row labels (so labels repeat across the two halves).
unified_match_data = pd.concat([hcm_renamed, opp_renamed])
print(len(unified_match_data))
unified_match_data.head()

33150
33150
66300


Unnamed: 0,win,club,goals,shots,shot_percent,passes_made,pass_attempts,pass_percent,tackles_made,tackle_attempts,tackle_percent,red_cards,players_in_match,forwards,midfielders,defenders,goalkeeper,any
0,Draw,Bang Average FC,1,6,0.17,95,125,0.76,18,49,0.37,0,8,2,3,3,0,0
1,Bang Average FC,Bang Average FC,5,13,0.38,92,120,0.77,13,29,0.45,0,8,2,3,3,0,0
2,punjabi panther,Bang Average FC,1,7,0.14,99,124,0.8,17,65,0.26,0,10,2,4,4,0,0
3,Draw,Bang Average FC,1,11,0.09,154,195,0.79,16,41,0.39,0,11,2,4,4,1,0
4,Bang Average FC,Bang Average FC,3,13,0.23,118,145,0.81,10,43,0.23,0,10,2,4,4,0,0


In [7]:
# Distribution of the raw 'any' values, inspected before they are collapsed to a 0/1 flag.
unified_match_data['any'].value_counts()

0     46904
1     18079
2       894
3       242
4        96
5        42
7        17
6        14
8         5
9         4
10        2
11        1
Name: any, dtype: int64

In [8]:
# Row count before cleaning.
print(len(unified_match_data))

# Collapse the 'any' and 'goalkeeper' counts into 0/1 flags (1 when the value is >= 1).
for flag_col in ('any', 'goalkeeper'):
    unified_match_data[flag_col] = np.where(unified_match_data[flag_col] >= 1, 1, 0)

# Discard rows whose 'passes_made' is exactly zero.
has_passes = unified_match_data['passes_made'] != 0
unified_match_data = unified_match_data[has_passes]

# Row count after cleaning.
print(len(unified_match_data))

66300
65579


In [9]:
# Label each row: 1 when the row's club is the listed winner, otherwise 0
# (rows whose 'win' value is not the club's own name, e.g. 'Draw', get 0).
club_won = unified_match_data['club'] == unified_match_data['win']
unified_match_data['result'] = np.where(club_won, 1, 0)

# Renumber the rows from 0 and drop the identifying text columns, leaving
# only numeric columns plus the label.
unified_match_data = (
    unified_match_data
    .reset_index(drop=True)
    .drop(columns=['club', 'win'])
)

In [10]:
# Remove every row that holds a non-finite value (NaN or +/-inf) in any column.
# The row count before filtering is printed explicitly: a bare expression that
# is not the last line of a cell is evaluated but never displayed.
print(len(unified_match_data))
finite_rows = np.isfinite(unified_match_data).all(axis=1)
unified_match_data = unified_match_data[finite_rows]
# Row count after filtering (shown as the cell's output).
len(unified_match_data)

65576

In [11]:
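# Optional export of the model-ready frame; left disabled so re-running the notebook does not overwrite the file.
# unified_match_data.to_csv('models/model_ready_match_data.csv', index=False)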

In [12]:
# Re-check 'any' after cleaning: it should now contain only the 0/1 flag values.
unified_match_data['any'].value_counts()

0    46355
1    19221
Name: any, dtype: int64