In [141]:
import pandas as pd 
import numpy as np
import os 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [142]:
pd.set_option("display.max_columns",85)
# for displaying purposes 

## Reading the Data
<hr>

In [143]:
df1= pd.read_csv("season2017-2018.csv")
df2= pd.read_csv("season2017-2018(2).csv")

df1=pd.concat([df1,pd.read_csv("season2018-2019.csv")],axis=0)
df2=pd.concat([df2,pd.read_csv("season2018-2019(2).csv")],axis=0)

df1=pd.concat([df1,pd.read_csv("season2019-2020.csv")],axis=0)
df2=pd.concat([df2,pd.read_csv("season2019-2020(2).csv")],axis=0)

df1=pd.concat([df1,pd.read_csv("season2020-2021.csv")],axis=0)
df2=pd.concat([df2,pd.read_csv("season2020-2021(2).csv")],axis=0)


df1=pd.concat([df1,pd.read_csv("season2021-2022.csv")],axis=0)
df2=pd.concat([df2,pd.read_csv("season2021-2022(2).csv")],axis=0)

# the data is scraped from different resourses
# so the first part of this notebook will be for cleaing the data and 
# unify the attributes and so on

In [144]:
df1.drop(["Unnamed: 0"],axis='columns',inplace=True)
# droping the indexing column in the data 

In [145]:
# convert the date column from string to a date object 
# date column will be a very important column in the preprocssesing process 
# it will be the index that the matches will be jonined at 
# the date should be the same for the same match in different resources 
# it means that the data will be sorted on the date then on other attributes discussed later 
df1["date"]=pd.to_datetime(df1["date"].str.replace("-",""),format="%Y%m%d")

In [146]:
df2["Date"]=pd.to_datetime(df2["Date"].str.replace("/",""),format="%d%m%Y")

In [147]:
# dropping unwanted columns 

In [148]:
df2.drop(["Div","FTHG","FTAG","HTHG","HTAG","HTR","Referee"],axis='columns',inplace=True)
# drop the dive , FTHG(final time home goal), FTAG(Final time  away Goals)
# HTGH(half time goals home ), " ... " "to referee"

In [149]:
index = df2.columns.get_loc("B365H")
df2.drop(df2.iloc[:,index:],axis=1,inplace=True)
# dropping the betting data 

In [150]:
df2.sort_values(by=["Date","HomeTeam","AwayTeam"],inplace=True)
# sorting the data on Date , Home , then Away 
# the data relevant to each other 
# it means that the 380 row in the first resources will be the same 
# matches for the second resource which will make the joining process correct 

In [151]:
df1.sort_values(by=["date","home","away"],inplace=True)

In [152]:
# TCP -> Team Complete Pass
# TTP -> Team Total Pass

df1["TCP"]=df1["passeshome"].str.split("—").apply(lambda x : x[0].split("of")[0])
df1["TTP"]=df1["passeshome"].str.split("—").apply(lambda x : x[0].split("of")[1])
df1.drop(["passeshome"],axis=1,inplace=True)


# the date was in string format like that "200 of 400-- 50%"
# so I want to get the team complete pass 200 , team total pass 400 


In [153]:
#OCP ->Opponent Complete Pass
#OTP -> Opponent Total Pass
df1["OCP"]=df1["passesaway"].str.split("—").apply(lambda x : x[1].split("of")[0])
df1["OTP"]=df1["passesaway"].str.split("—").apply(lambda x : x[1].split("of")[1])
df1.drop(["passesaway"],axis=1,inplace=True)

# the same thing in opponent 

In [154]:
df1.drop(["saveshome","savesaway"],axis=1,inplace=True)

# you have extracted the data so , drop the columns 

In [155]:
df1.rename(columns={"possessionhome":"POT","possessionaway":"POO","crosseshome":"CRT",
                   "crossesaway":"CRO","toucheshome":"TCT","touchesaway":"TCO",
                   "tackleshome":"TTT","tacklesaway":"TTO","interceptionhome":"IPT","interceptionaway":"IPO",
                   "aerialshome":"ART","aerialsaway":"ARO","clearnacehome":"CLT","clearnaceaway":"CLO"},inplace=True)


# renaming the date 

# POT-> PossesionTeam
# POO=> PossesionOpponent
# CRT => crossesTeam
# CRO => crossesOpponent
# TCT => touchesTeam
# TCO => touchesOpponent
# TTT => tacklesTeam
# TTO => tacklesOpponent
# IPT => interceptionTeam
# IPO => interceptionOpponent
# ART => aerialsTeam
# ARO => aerialsOpponent
# CLT => clearnaceTeam
# CLO => clearnaceOpponent


# the team data means the home team , opponent will be the away team 
# I will explain in details what will I do to make use of the Home attribute 
# in the model 

In [156]:
df1.set_index("date",inplace=True)
df2.set_index("Date",inplace=True)

In [157]:
total_data=pd.concat([df1,df2],axis=1).drop(["HomeTeam","AwayTeam"],axis=1)

# merging the data 

In [158]:
# TS=> Team shot 
# OS=> Oppoent shot 
# TST=>Team shot taret
# OST=>Opponent shot target

# and the rest just change the home t- team , and away to opponent 

total_data.rename({"HS":"TS","AS":"OS","HST":"TST","AST":"OST","HF":"TF","AF":"OF","HC":"TC","AC":"OC","HY":"TY"
                   ,"AY":"OY","HR":"TR","AR":"OR"},axis=1,inplace=True)

In [159]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import copy 
# this process should be done after concat all the data exampels 
# because as you know there are 5 teams that will be present from the 
# Div1 and five will go from Div0 to Div1 each season 
# so to get all the name of the teams that 
# This is not always ideal for ML as the integers have different numerical values, 
#suggesting that one is bigger than the other, with, for example Pear > Apple, 
#which is not at all the case. To not introduce this kind of problem you'd want to use OneHotEncoder.


In [160]:
total_data.rename({"home":"team","away":"opponent"},axis=1,inplace=True)
# renameing the home to team and the away to opponent 

In [161]:
total_data["Home"]=1



## Important Part 
<hr>
**here I will  give the model the name of the team then the opponent name and the most effective attribute is 'are you palying on your home or not' so I should duplicate the data and reverse the order**
example 


- if the home team is arsenal and the away is Liverpool
- the first example should be like that {"team":"Arsenal","opponent":"Liverpool",...."Home":"1"}
-  the last input was taking in Arsenal perspective so what about liverpool
-  we should add another example in the data which will talk about Liverpool perspective 
-  {"team":"Liverpool", "opponent":"Aresnal",.... "Home":"0"}

In [162]:
temp=copy.deepcopy(total_data)

In [163]:
total_data.columns

Index(['team', 'opponent', 'POT', 'POO', 'CRT', 'CRO', 'TCT', 'TCO', 'TTT',
       'TTO', 'IPT', 'IPO', 'ART', 'ARO', 'CLT', 'CLO', 'TCP', 'TTP', 'OCP',
       'OTP', 'FTR', 'TS', 'OS', 'TST', 'OST', 'TF', 'OF', 'TC', 'OC', 'TY',
       'OY', 'TR', 'OR', 'Home'],
      dtype='object')

In [164]:
temp.rename({ "team":"opponent","opponent":"team" , "POT":"POO","POO":"POT" ,"CRT":"CRO","CRO":"CRT",
              "TCT":"TCO","TCO":"TCT" , "TTT":"TTO","TTO":"TTT" , "IPT":"IPO","IPO":"IPT" 
             ,"ART":"ARO","ARO":"ART", "CLT":"CLO","CLO":"CLT", "TCP":"OCP","OCP":"TCP" 
             ,"TTP":"OTP","OTP":"TTP",  "TS":"OS","OS":"TS", "TST":"OST","OST":"TST" ,"TF":"OF","OF":"TF",
            "TC":"OC","OC":"TC", "TY":"OY","OY":"TY", "TR":"OR","OR":"TR" }
            ,axis=1,inplace=True)



# just reversing the order of all the attributes the team will be the 
# opponent and the opponent will be the team to see the prespective 
# of the two clubs not only one 

In [165]:
temp=temp[["team","opponent","POT","POO","CRT","CRO","TCT","TCO","TTT","TTO","IPT","IPO","ART","ARO","CLT","CLO","TCP","TTP","OCP","OTP","FTR","TS","OS","TST","OST","TF","OF","TC","OC","TY","OY","TR","OR","Home"]]

# reording the columns to be in the  same order as the total data  

In [166]:
total_data.columns==temp.columns

filt_win= (total_data["FTR"]=='H')
filt_draw=(total_data["FTR"]=='D')
filt_loss=(total_data["FTR"]=='A')
print(total_data.loc[filt_draw].shape[0])
print(total_data.loc[filt_win].shape[0])
print(total_data.loc[filt_loss].shape[0])
total_data.loc[filt_win,"FTR"]=2
total_data.loc[filt_draw,"FTR"]=1
total_data.loc[filt_loss,"FTR"]=0
total_data["Home"]=1
total_data[["team","opponent","FTR","Home"]]

# just encode the H-> my perspective team win to 2 
#                 D-> Drawing 
#                 L-> loss 
# and see some data about the effect of home in making the team 
# make a positive result 

433
833
634


Unnamed: 0,team,opponent,FTR,Home
2017-08-11,Arsenal,Leicester City,2,1
2017-08-12,Brighton & Hove Albion,Manchester City,0,1
2017-08-12,Chelsea,Burnley,0,1
2017-08-12,Crystal Palace,Huddersfield Town,0,1
2017-08-12,Everton,Stoke City,2,1
...,...,...,...,...
2022-05-22,Crystal Palace,Manchester United,2,1
2022-05-22,Leicester City,Southampton,2,1
2022-05-22,Liverpool,Wolverhampton Wanderers,2,1
2022-05-22,Manchester City,Aston Villa,2,1


In [167]:
filt_win= (temp["FTR"]=='H')
filt_draw=(temp["FTR"]=='D')
filt_loss=(temp["FTR"]=='A')

temp.loc[filt_win,"FTR"]=0
temp.loc[filt_loss,"FTR"]=2
temp.loc[filt_draw,"FTR"]=1
temp["Home"]=0
temp[["team","opponent","FTR","Home"]]

# the second team prespective is the converse of the first one 

Unnamed: 0,team,opponent,FTR,Home
2017-08-11,Leicester City,Arsenal,0,0
2017-08-12,Manchester City,Brighton & Hove Albion,2,0
2017-08-12,Burnley,Chelsea,2,0
2017-08-12,Huddersfield Town,Crystal Palace,2,0
2017-08-12,Stoke City,Everton,0,0
...,...,...,...,...
2022-05-22,Manchester United,Crystal Palace,0,0
2022-05-22,Southampton,Leicester City,0,0
2022-05-22,Wolverhampton Wanderers,Liverpool,0,0
2022-05-22,Aston Villa,Manchester City,0,0


In [168]:
season_data_complete=pd.concat([total_data,temp],axis=0) # concat the data 

In [169]:
season_data_complete

Unnamed: 0,team,opponent,POT,POO,CRT,CRO,TCT,TCO,TTT,TTO,IPT,IPO,ART,ARO,CLT,CLO,TCP,TTP,OCP,OTP,FTR,TS,OS,TST,OST,TF,OF,TC,OC,TY,OY,TR,OR,Home
2017-08-11,Arsenal,Leicester City,68%,32%,20,18,801,427,23,17,13,11,18,17,33,29,565,681,192,315,2,27,6,10,3,9,12,9,4,0,1,0,0,1
2017-08-12,Brighton & Hove Albion,Manchester City,23%,77%,8,27,343,897,10,10,12,9,13,13,43,8,146,248,718,820,0,6,14,2,4,6,9,3,10,0,2,0,0,1
2017-08-12,Chelsea,Burnley,62%,38%,26,16,667,453,10,8,9,13,15,21,23,44,468,568,257,348,0,19,10,6,5,16,11,8,5,3,3,2,0,1
2017-08-12,Crystal Palace,Huddersfield Town,56%,45%,27,14,561,460,24,28,19,9,18,16,20,32,333,444,230,356,0,14,8,4,6,7,19,12,9,1,3,0,0,1
2017-08-12,Everton,Stoke City,60%,40%,18,27,670,473,17,18,19,24,19,23,43,26,425,545,233,356,2,9,9,4,1,13,10,6,7,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-22,Manchester United,Crystal Palace,61%,39%,21,11,679,479,17,8,12,9,10,8,10,21,466,582,263,367,0,10,6,4,3,22,12,6,3,4,2,0,0,0
2022-05-22,Southampton,Leicester City,49%,51%,14,9,566,588,17,20,11,7,10,8,9,16,349,467,375,479,0,7,12,2,6,5,10,3,3,1,0,0,0,0
2022-05-22,Wolverhampton Wanderers,Liverpool,37%,63%,9,24,570,835,18,21,15,6,4,7,31,8,343,430,632,730,0,7,29,5,8,3,6,3,5,0,1,0,0,0
2022-05-22,Aston Villa,Manchester City,29%,71%,5,35,388,762,17,11,11,5,18,23,31,9,180,271,568,670,0,4,24,2,5,11,5,1,13,1,0,0,0,0


In [170]:
temp_home=pd.get_dummies(season_data_complete["team"],prefix="team")
temp_home

# converting the team name into hot vector 
# team name will be a crucial part , because if for example Liverpool 
# is playing with brighton "the winning chances for liverpool " will 
# be higher 

Unnamed: 0,team_Arsenal,team_Aston Villa,team_Bournemouth,team_Brentford,team_Brighton & Hove Albion,team_Burnley,team_Cardiff City,team_Chelsea,team_Crystal Palace,team_Everton,team_Fulham,team_Huddersfield Town,team_Leeds United,team_Leicester City,team_Liverpool,team_Manchester City,team_Manchester United,team_Newcastle United,team_Norwich City,team_Sheffield United,team_Southampton,team_Stoke City,team_Swansea City,team_Tottenham Hotspur,team_Watford,team_West Bromwich Albion,team_West Ham United,team_Wolverhampton Wanderers
2017-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2022-05-22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [171]:
temp_opponent=pd.get_dummies(season_data_complete["opponent"],prefix="opponent")
temp_opponent

Unnamed: 0,opponent_Arsenal,opponent_Aston Villa,opponent_Bournemouth,opponent_Brentford,opponent_Brighton & Hove Albion,opponent_Burnley,opponent_Cardiff City,opponent_Chelsea,opponent_Crystal Palace,opponent_Everton,opponent_Fulham,opponent_Huddersfield Town,opponent_Leeds United,opponent_Leicester City,opponent_Liverpool,opponent_Manchester City,opponent_Manchester United,opponent_Newcastle United,opponent_Norwich City,opponent_Sheffield United,opponent_Southampton,opponent_Stoke City,opponent_Swansea City,opponent_Tottenham Hotspur,opponent_Watford,opponent_West Bromwich Albion,opponent_West Ham United,opponent_Wolverhampton Wanderers
2017-08-11,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-22,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [172]:
hot_vectors=pd.concat([temp_home,temp_opponent],axis=1)
hot_vectors

Unnamed: 0,team_Arsenal,team_Aston Villa,team_Bournemouth,team_Brentford,team_Brighton & Hove Albion,team_Burnley,team_Cardiff City,team_Chelsea,team_Crystal Palace,team_Everton,team_Fulham,team_Huddersfield Town,team_Leeds United,team_Leicester City,team_Liverpool,team_Manchester City,team_Manchester United,team_Newcastle United,team_Norwich City,team_Sheffield United,team_Southampton,team_Stoke City,team_Swansea City,team_Tottenham Hotspur,team_Watford,team_West Bromwich Albion,team_West Ham United,team_Wolverhampton Wanderers,opponent_Arsenal,opponent_Aston Villa,opponent_Bournemouth,opponent_Brentford,opponent_Brighton & Hove Albion,opponent_Burnley,opponent_Cardiff City,opponent_Chelsea,opponent_Crystal Palace,opponent_Everton,opponent_Fulham,opponent_Huddersfield Town,opponent_Leeds United,opponent_Leicester City,opponent_Liverpool,opponent_Manchester City,opponent_Manchester United,opponent_Newcastle United,opponent_Norwich City,opponent_Sheffield United,opponent_Southampton,opponent_Stoke City,opponent_Swansea City,opponent_Tottenham Hotspur,opponent_Watford,opponent_West Bromwich Albion,opponent_West Ham United,opponent_Wolverhampton Wanderers
2017-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-08-12,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-05-22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [173]:
season_data_complete=pd.concat([hot_vectors,season_data_complete],axis=1)
season_data_complete

Unnamed: 0,team_Arsenal,team_Aston Villa,team_Bournemouth,team_Brentford,team_Brighton & Hove Albion,team_Burnley,team_Cardiff City,team_Chelsea,team_Crystal Palace,team_Everton,team_Fulham,team_Huddersfield Town,team_Leeds United,team_Leicester City,team_Liverpool,team_Manchester City,team_Manchester United,team_Newcastle United,team_Norwich City,team_Sheffield United,team_Southampton,team_Stoke City,team_Swansea City,team_Tottenham Hotspur,team_Watford,team_West Bromwich Albion,team_West Ham United,team_Wolverhampton Wanderers,opponent_Arsenal,opponent_Aston Villa,opponent_Bournemouth,opponent_Brentford,opponent_Brighton & Hove Albion,opponent_Burnley,opponent_Cardiff City,opponent_Chelsea,opponent_Crystal Palace,opponent_Everton,opponent_Fulham,opponent_Huddersfield Town,opponent_Leeds United,opponent_Leicester City,...,opponent_Southampton,opponent_Stoke City,opponent_Swansea City,opponent_Tottenham Hotspur,opponent_Watford,opponent_West Bromwich Albion,opponent_West Ham United,opponent_Wolverhampton Wanderers,team,opponent,POT,POO,CRT,CRO,TCT,TCO,TTT,TTO,IPT,IPO,ART,ARO,CLT,CLO,TCP,TTP,OCP,OTP,FTR,TS,OS,TST,OST,TF,OF,TC,OC,TY,OY,TR,OR,Home
2017-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,Arsenal,Leicester City,68%,32%,20,18,801,427,23,17,13,11,18,17,33,29,565,681,192,315,2,27,6,10,3,9,12,9,4,0,1,0,0,1
2017-08-12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Brighton & Hove Albion,Manchester City,23%,77%,8,27,343,897,10,10,12,9,13,13,43,8,146,248,718,820,0,6,14,2,4,6,9,3,10,0,2,0,0,1
2017-08-12,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Chelsea,Burnley,62%,38%,26,16,667,453,10,8,9,13,15,21,23,44,468,568,257,348,0,19,10,6,5,16,11,8,5,3,3,2,0,1
2017-08-12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,Crystal Palace,Huddersfield Town,56%,45%,27,14,561,460,24,28,19,9,18,16,20,32,333,444,230,356,0,14,8,4,6,7,19,12,9,1,3,0,0,1
2017-08-12,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,Everton,Stoke City,60%,40%,18,27,670,473,17,18,19,24,19,23,43,26,425,545,233,356,2,9,9,4,1,13,10,6,7,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Manchester United,Crystal Palace,61%,39%,21,11,679,479,17,8,12,9,10,8,10,21,466,582,263,367,0,10,6,4,3,22,12,6,3,4,2,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,Southampton,Leicester City,49%,51%,14,9,566,588,17,20,11,7,10,8,9,16,349,467,375,479,0,7,12,2,6,5,10,3,3,1,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Wolverhampton Wanderers,Liverpool,37%,63%,9,24,570,835,18,21,15,6,4,7,31,8,343,430,632,730,0,7,29,5,8,3,6,3,5,0,1,0,0,0
2022-05-22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Aston Villa,Manchester City,29%,71%,5,35,388,762,17,11,11,5,18,23,31,9,180,271,568,670,0,4,24,2,5,11,5,1,13,1,0,0,0,0


In [174]:
season_data_complete.drop(["team","opponent"],axis=1,inplace=True)

In [175]:
season_data_complete

Unnamed: 0,team_Arsenal,team_Aston Villa,team_Bournemouth,team_Brentford,team_Brighton & Hove Albion,team_Burnley,team_Cardiff City,team_Chelsea,team_Crystal Palace,team_Everton,team_Fulham,team_Huddersfield Town,team_Leeds United,team_Leicester City,team_Liverpool,team_Manchester City,team_Manchester United,team_Newcastle United,team_Norwich City,team_Sheffield United,team_Southampton,team_Stoke City,team_Swansea City,team_Tottenham Hotspur,team_Watford,team_West Bromwich Albion,team_West Ham United,team_Wolverhampton Wanderers,opponent_Arsenal,opponent_Aston Villa,opponent_Bournemouth,opponent_Brentford,opponent_Brighton & Hove Albion,opponent_Burnley,opponent_Cardiff City,opponent_Chelsea,opponent_Crystal Palace,opponent_Everton,opponent_Fulham,opponent_Huddersfield Town,opponent_Leeds United,opponent_Leicester City,...,opponent_Norwich City,opponent_Sheffield United,opponent_Southampton,opponent_Stoke City,opponent_Swansea City,opponent_Tottenham Hotspur,opponent_Watford,opponent_West Bromwich Albion,opponent_West Ham United,opponent_Wolverhampton Wanderers,POT,POO,CRT,CRO,TCT,TCO,TTT,TTO,IPT,IPO,ART,ARO,CLT,CLO,TCP,TTP,OCP,OTP,FTR,TS,OS,TST,OST,TF,OF,TC,OC,TY,OY,TR,OR,Home
2017-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,68%,32%,20,18,801,427,23,17,13,11,18,17,33,29,565,681,192,315,2,27,6,10,3,9,12,9,4,0,1,0,0,1
2017-08-12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,23%,77%,8,27,343,897,10,10,12,9,13,13,43,8,146,248,718,820,0,6,14,2,4,6,9,3,10,0,2,0,0,1
2017-08-12,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,62%,38%,26,16,667,453,10,8,9,13,15,21,23,44,468,568,257,348,0,19,10,6,5,16,11,8,5,3,3,2,0,1
2017-08-12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,56%,45%,27,14,561,460,24,28,19,9,18,16,20,32,333,444,230,356,0,14,8,4,6,7,19,12,9,1,3,0,0,1
2017-08-12,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,60%,40%,18,27,670,473,17,18,19,24,19,23,43,26,425,545,233,356,2,9,9,4,1,13,10,6,7,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,61%,39%,21,11,679,479,17,8,12,9,10,8,10,21,466,582,263,367,0,10,6,4,3,22,12,6,3,4,2,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,49%,51%,14,9,566,588,17,20,11,7,10,8,9,16,349,467,375,479,0,7,12,2,6,5,10,3,3,1,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,37%,63%,9,24,570,835,18,21,15,6,4,7,31,8,343,430,632,730,0,7,29,5,8,3,6,3,5,0,1,0,0,0
2022-05-22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,29%,71%,5,35,388,762,17,11,11,5,18,23,31,9,180,271,568,670,0,4,24,2,5,11,5,1,13,1,0,0,0,0


In [176]:
season_data_complete["POT"]=season_data_complete["POT"].str.replace("%","").astype('float') /100
season_data_complete["POO"]=season_data_complete["POO"].str.replace("%","").astype('float') /100

# converting the precentage string  data into float 


In [177]:
season_data_complete



Unnamed: 0,team_Arsenal,team_Aston Villa,team_Bournemouth,team_Brentford,team_Brighton & Hove Albion,team_Burnley,team_Cardiff City,team_Chelsea,team_Crystal Palace,team_Everton,team_Fulham,team_Huddersfield Town,team_Leeds United,team_Leicester City,team_Liverpool,team_Manchester City,team_Manchester United,team_Newcastle United,team_Norwich City,team_Sheffield United,team_Southampton,team_Stoke City,team_Swansea City,team_Tottenham Hotspur,team_Watford,team_West Bromwich Albion,team_West Ham United,team_Wolverhampton Wanderers,opponent_Arsenal,opponent_Aston Villa,opponent_Bournemouth,opponent_Brentford,opponent_Brighton & Hove Albion,opponent_Burnley,opponent_Cardiff City,opponent_Chelsea,opponent_Crystal Palace,opponent_Everton,opponent_Fulham,opponent_Huddersfield Town,opponent_Leeds United,opponent_Leicester City,...,opponent_Norwich City,opponent_Sheffield United,opponent_Southampton,opponent_Stoke City,opponent_Swansea City,opponent_Tottenham Hotspur,opponent_Watford,opponent_West Bromwich Albion,opponent_West Ham United,opponent_Wolverhampton Wanderers,POT,POO,CRT,CRO,TCT,TCO,TTT,TTO,IPT,IPO,ART,ARO,CLT,CLO,TCP,TTP,OCP,OTP,FTR,TS,OS,TST,OST,TF,OF,TC,OC,TY,OY,TR,OR,Home
2017-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0.68,0.32,20,18,801,427,23,17,13,11,18,17,33,29,565,681,192,315,2,27,6,10,3,9,12,9,4,0,1,0,0,1
2017-08-12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0.23,0.77,8,27,343,897,10,10,12,9,13,13,43,8,146,248,718,820,0,6,14,2,4,6,9,3,10,0,2,0,0,1
2017-08-12,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0.62,0.38,26,16,667,453,10,8,9,13,15,21,23,44,468,568,257,348,0,19,10,6,5,16,11,8,5,3,3,2,0,1
2017-08-12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0.56,0.45,27,14,561,460,24,28,19,9,18,16,20,32,333,444,230,356,0,14,8,4,6,7,19,12,9,1,3,0,0,1
2017-08-12,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0.60,0.40,18,27,670,473,17,18,19,24,19,23,43,26,425,545,233,356,2,9,9,4,1,13,10,6,7,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0.61,0.39,21,11,679,479,17,8,12,9,10,8,10,21,466,582,263,367,0,10,6,4,3,22,12,6,3,4,2,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0.49,0.51,14,9,566,588,17,20,11,7,10,8,9,16,349,467,375,479,0,7,12,2,6,5,10,3,3,1,0,0,0,0
2022-05-22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0.37,0.63,9,24,570,835,18,21,15,6,4,7,31,8,343,430,632,730,0,7,29,5,8,3,6,3,5,0,1,0,0,0
2022-05-22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0.29,0.71,5,35,388,762,17,11,11,5,18,23,31,9,180,271,568,670,0,4,24,2,5,11,5,1,13,1,0,0,0,0


In [178]:
season_data_complete["FTR"].astype("int")
season_data_complete.to_csv("Final_Data.csv")

Y_train=season_data_complete["FTR"].to_numpy()

season_data_complete.drop(["FTR"],axis=1,inplace=True)


In [179]:
X_train=season_data_complete.to_numpy().astype('float32')
Y_train=Y_train.astype('int')

In [191]:
X_train,X_test,Y_train,y_test=train_test_split(X_train,Y_train,test_size=0.05)
print(X_test.shape)
print(X_train.shape)

(171, 87)
(3249, 87)


In [192]:
clf=LogisticRegression(max_iter=20000,random_state=0).fit(X_train[:,:-1],Y_train)

In [213]:
pred = np.argmax(clf.predict_proba(X_test[:,:-1]),axis=1)
errors = (pred !=y_test)

counts= 0 
for i in range(len(errors)):
    if errors[i]:
        if abs ((clf.predict_proba(X_test[:,:-1])*100).astype('int')[i][pred[i]]- \
            (clf.predict_proba(X_test[:,:-1])*100).astype('int')[i][y_test[i]]) < 10 :
            counts +=1 


In [214]:
counts

11

In [217]:
pd.set_option("display.max_rows",72)

print((clf.predict_proba(X_test[:,:-1])*100).astype('int')[5])
print(y_test==np.argmax(clf.predict_proba(X_test[:,:-1]),axis=1))
print((np.sum(y_test==np.argmax(clf.predict_proba(X_test[:,:-1]),axis=1))+counts)/y_test.shape[0] *100)

X_test[7][-1]
print(y_test[5])


[40 44 14]
[ True  True  True  True  True False  True  True  True False  True False
  True False  True  True  True  True False  True False  True False  True
  True  True  True  True False  True False False  True  True  True  True
  True False  True False  True False False False  True  True False  True
  True  True  True False  True  True  True  True False  True  True  True
 False False  True  True  True False  True False  True  True False False
  True  True  True  True  True False  True False  True  True  True  True
 False  True  True False  True  True  True  True  True False  True  True
 False False  True  True False  True False  True  True False  True  True
 False  True  True False  True  True  True False False  True  True False
  True  True  True False False False False False False  True  True False
  True  True  True  True False  True  True False  True  True  True  True
  True  True  True  True  True False  True  True  True  True False False
 False False  True  True  True  True  Tr