In [11]:
#STEP 1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [12]:
#STEP 2
# Load the match data from the CSV file into a pandas DataFrame
data = pd.read_csv('Fifa_world_cup_matches.csv')

# Create a new column 'Result' describing the outcome from team1's point of view:
#   2 = Win  ('number of goals team1' >  'number of goals team2')
#   1 = Draw ('number of goals team1' == 'number of goals team2')
#   0 = Loss (everything else)
# The comparisons run on whole columns instead of looping with iterrows();
# each boolean column is cast to 0/1, so 2*win + draw yields 2, 1 or 0.
goals_team1 = data['number of goals team1']
goals_team2 = data['number of goals team2']
data['Result'] = 2 * (goals_team1 > goals_team2).astype('int64') + (goals_team1 == goals_team2).astype('int64')


In [14]:
#STEP 3
# Show the top rows of the DataFrame to get a feel for the columns and values
preview_rows = data.head()
print(preview_rows)

           team1         team2 possession team1 possession team2  \
0          QATAR       ECUADOR              42%              50%   
1        ENGLAND          IRAN              72%              19%   
2        SENEGAL   NETHERLANDS              44%              45%   
3  UNITED STATES         WALES              51%              39%   
4      ARGENTINA  SAUDI ARABIA              64%              24%   

  possession in contest  number of goals team1  number of goals team2  \
0                    8%                      0                      2   
1                    9%                      6                      2   
2                   11%                      0                      2   
3                   10%                      1                      1   
4                   12%                      1                      2   

          date     hour category  ...  penalties scored team2  \
0  20 NOV 2022  17 : 00  Group A  ...                       1   
1  21 NOV 2022  14 : 0

In [15]:
#STEP 3 Continued
# Make the possession columns usable by the ML model by turning percentage
# text such as '42%' into plain integers such as 42.
# Casting to str first keeps this cell safe to re-run: once a column already
# holds integers, using the `.str` accessor directly would raise an AttributeError.
for possession_column in ('possession team1', 'possession team2'):
    data[possession_column] = data[possession_column].astype(str).str.rstrip('%').astype('int')

In [10]:
#Sanity check for STEP 3 Continued
# Print both converted possession columns, separated by one blank line
print(data['possession team1'], data['possession team2'], sep='\n\n')

0     42
1     72
2     44
3     51
4     64
      ..
59    54
60    34
61    34
62    45
63    46
Name: possession team1, Length: 64, dtype: int64

0     50
1     19
2     45
3     39
4     24
      ..
59    36
60    54
61    55
62    45
63    40
Name: possession team2, Length: 64, dtype: int64


In [16]:
#Step 4 Feature Selection
# Model inputs: both possession values plus team1's forced turnovers
feature_columns = ['possession team1', 'possession team2', 'forced turnovers team1']
features = data[feature_columns]
# Target: the match outcome stored in the 'Result' column
labels = data['Result']

In [17]:
#Step 5 Split the Data into Training and Test Sets
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable
split_parts = train_test_split(features, labels, test_size=0.2, random_state=42)
features_train, features_test, labels_train, labels_test = split_parts

In [18]:
#Step 6 Train the Model
# Fit a 100-tree random forest on the training split.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook builds the same forest and reports the same metrics.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(features_train, labels_train)

RandomForestClassifier()

In [19]:
#STEP 7 Evaluate the model on the held-out test set
predictions = model.predict(features_test)
# Overall fraction of test matches whose outcome was predicted correctly
accuracy = accuracy_score(labels_test, predictions)
print('Accuracy:', accuracy)
# Per-class precision / recall / f1 for Loss (0), Draw (1) and Win (2)
report = classification_report(labels_test, predictions)
print(report)

Accuracy: 0.38461538461538464
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.25      0.25      0.25         4
           2       0.25      0.25      0.25         4

    accuracy                           0.38        13
   macro avg       0.37      0.37      0.37        13
weighted avg       0.38      0.38      0.38        13

