In [4]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder

In [5]:
# Read the four raw CSV tables from the Datasets/ folder (paths are relative
# to the notebook's working directory) and bind each one to its own name.
results, former_names, goal_scorers, shootouts = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Datasets/results.csv',
        'Datasets/former_names.csv',
        'Datasets/goalscorers.csv',
        'Datasets/shootouts.csv',
    )
)

In [6]:
# Preview the first five rows of the match results table.
results.head(n=5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [7]:
# Preview the first five rows of the former-name lookup table.
former_names.head(n=5)

Unnamed: 0,current,former,start_date,end_date
0,Benin,Dahomey,1959-11-08,1975-11-30
1,Burkina Faso,Upper Volta,1960-04-14,1984-08-04
2,Curaçao,Netherlands Antilles,1957-03-03,2010-10-10
3,Czechoslovakia,Bohemia,1903-04-05,1919-01-01
4,Czechoslovakia,Bohemia and Moravia,1939-01-01,1945-05-01


In [8]:
# Preview the first five rows of the goal-scorer table.
goal_scorers.head(n=5)

Unnamed: 0,date,home_team,away_team,team,scorer,minute,own_goal,penalty
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False
3,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,False
4,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,2.0,False,False


In [9]:
# Preview the first five rows of the penalty shootout table.
shootouts.head(n=5)

Unnamed: 0,date,home_team,away_team,winner,first_shooter
0,1967-08-22,India,Taiwan,Taiwan,
1,1971-11-14,South Korea,Vietnam Republic,South Korea,
2,1972-05-07,South Korea,Iraq,Iraq,
3,1972-05-17,Thailand,South Korea,South Korea,
4,1972-05-19,Thailand,Cambodia,Thailand,


-- Countries' Former Names
-- Goalscorers

In [10]:
# Print column names, dtypes and non-null counts for the two tables that are
# cleaned in the following cells.
for frame in (former_names, goal_scorers):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   current     34 non-null     object
 1   former      34 non-null     object
 2   start_date  34 non-null     object
 3   end_date    34 non-null     object
dtypes: object(4)
memory usage: 1.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44447 entries, 0 to 44446
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       44447 non-null  object 
 1   home_team  44447 non-null  object 
 2   away_team  44447 non-null  object 
 3   team       44447 non-null  object 
 4   scorer     44399 non-null  object 
 5   minute     44191 non-null  float64
 6   own_goal   44447 non-null  bool   
 7   penalty    44447 non-null  bool   
dtypes: bool(2), float64(1), object(5)
memory usage: 2.1+ MB


Fixing dates: they are currently stored as strings
- they need to be converted to datetime

In [11]:
# The date columns were loaded as strings (object dtype); parse them into
# datetime values.

# former_names: start and end date of each former name
for date_col in ('start_date', 'end_date'):
    former_names[date_col] = pd.to_datetime(former_names[date_col])

# goal_scorers: date of the match
goal_scorers['date'] = pd.to_datetime(goal_scorers['date'])


We need to map historical country names to current names to avoid duplicates

In [12]:
# Lookup table: former team name -> current team name.
# NOTE(review): the lookup is keyed on the name alone; start_date / end_date in
# former_names are not consulted, so a former name is rewritten regardless of
# the match date - confirm this is intended.
former_to_current = dict(
    former_names[['former', 'current']].itertuples(index=False, name=None)
)

# Rewrite historical names in every team column of goal_scorers.
for team_col in ('team', 'home_team', 'away_team'):
    goal_scorers[team_col] = goal_scorers[team_col].replace(former_to_current)


In [15]:
# Convert the float `minute` column to int. NaN cannot be stored in a plain
# int column, so goals with no recorded minute are dropped first.
#
# The two steps are chained so the result is built as a fresh frame: assigning
# a column onto the output of dropna() in a separate statement raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
# The surviving rows, their index labels and the resulting dtype are the same
# as with the two-statement form.
goal_scorers = (
    goal_scorers
    .dropna(subset=['minute'])
    .astype({'minute': int})
)

In [16]:
# Per-column count of missing values in each table, printed one after the other.
print(
    goal_scorers.isna().sum(),
    former_names.isna().sum(),
    sep='\n',
)


date         0
home_team    0
away_team    0
team         0
scorer       0
minute       0
own_goal     0
penalty      0
dtype: int64
current       0
former        0
start_date    0
end_date      0
dtype: int64


In [19]:
# Count fully identical rows in each table before de-duplicating.
print(
    goal_scorers.duplicated().sum(),
    former_names.duplicated().sum(),
    sep='\n',
)

# De-duplicate goal_scorers only; former_names is left as it is.
# NOTE(review): two goals by the same scorer recorded in the same minute of the
# same match would look like exact duplicates here - confirm that dropping
# them is intended.
goal_scorers = goal_scorers.drop_duplicates(keep='first')

0
0
