In [None]:
import os
import sys
#import numpy as np
import pandas as pd

from collections.abc import Callable
from typing import Literal

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
def fn_plot_torch_hist(hist_df):

    # instantiate figure
    fig, axes = plt.subplots(1,2 , figsize = (15,6))

    # properties  matplotlib.patch.Patch
    props = dict(boxstyle='round', facecolor='cyan', alpha=0.5)

    # columns
    x = hist_df.columns[0]
    y1 = hist_df.columns[1]
    y2 = hist_df.columns[2]
    y3 = hist_df.columns[3]
    y4 = hist_df.columns[4]

    # Where was min loss
    best = hist_df[hist_df[y2] == hist_df[y2].min()]
    best = best.drop_duplicates(subset=y2)
    # pick first axis
    ax = axes[0]

    # Plot all losses
    hist_df.plot(x = x, y = [y1, y2], ax = ax)

    # calculate offset for arroe
    y_min = min(hist_df[y1].min(), hist_df[y2].min())
    y_max = max(hist_df[y1].max(), hist_df[y2].max())
    offset = (y_max-y_min)/10.0

    # little beautification
    txtFmt = "Loss: \n  train: {:6.4f}\n   test: {:6.4f}"
    txtstr = txtFmt.format(hist_df.iloc[-1][y1],
                           hist_df.iloc[-1][y2]) #text to plot

    # place a text box in upper middle in axes coords
    ax.text(0.3, 0.95, txtstr, transform=ax.transAxes, fontsize=14,
            verticalalignment='top', bbox=props)

    # Mark arrow at lowest
    ax.annotate(f'Min: {best[y2].to_numpy()[0]:6.4f}', # text to print
                xy=(best[x].to_numpy(), best[y2].to_numpy()[0]), # Arrow start
                xytext=(best[x].to_numpy()+ offset, best[y2].to_numpy()[0]+offset), # location of text
                fontsize=14,va='bottom', ha='right',bbox=props, # beautification of text
                arrowprops=dict(facecolor='cyan', shrink=0.05)) # arrow

    # Draw vertical line at best value
    ax.axvline(x = best[x].to_numpy(), color = 'green', linestyle='-.', lw = 3);

    ax.set_xlabel(x.title())
    ax.set_ylabel(y1.title())
    ax.set_title('Errors')
    ax.grid()
    ax.legend(loc = 'upper left') # model legend to upper left

    # pick second axis
    ax = axes[1]

    # Plot accuracies
    hist_df.plot(x = x, y = [y3, y4], ax = ax)

    # little beautification
    txtFmt = "Accuracy: \n  train: {:6.4f}\n  test:  {:6.4f}"
    txtstr = txtFmt.format(hist_df.iloc[-1][y3],
                           hist_df.iloc[-1][y4]) #text to plot

    # calculate offset for arroe
    y_min = min(hist_df[y3].min(), hist_df[y4].min())
    y_max = max(hist_df[y3].max(), hist_df[y4].max())
    offset = (y_max-y_min)/10.0

    # place a text box in lower middle in axes coords
    ax.text(0.3, 0.2, txtstr, transform=ax.transAxes, fontsize=12,
            verticalalignment='top', bbox=props)

    # Mark arrow at lowest
    ax.annotate(f'Best: {best[y4].to_numpy()[0]:6.4f}', # text to print
                xy=(best[x].to_numpy(), best[y4].to_numpy()[0]), # Arrow start
                xytext=(best[x].to_numpy()- offset, best[y4].to_numpy()[0]-offset), # location of text
                fontsize=14,va='bottom', ha='right',bbox=props, # beautification of text
                arrowprops=dict(facecolor='cyan', shrink=0.05)) # arrow


    # Draw a vertical line at best value
    ax.axvline(x = best[x].to_numpy(),
               color = 'green',
               linestyle='-.', lw = 3)

    # Labels
    ax.set_xlabel(x.title())
    ax.set_ylabel(y3.title())
    ax.set_title('Accuracies')
    ax.grid();
    ax.legend(loc = 'lower left')

    plt.tight_layout()


In [None]:
def fn_plot_confusion_matrix(y_true, y_pred, labels):
    '''
    Args:
        y_true: Ground Truth
        y_pred : Predictions
        labels : dictonary
                  {0: 'Goal Keeper',
                  1: 'Defender',
                  2: 'Mid-Fielder',
                  3: 'Forward'}

    '''

    cm  = confusion_matrix(y_true, y_pred)

    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=labels.values())

    fig, ax = plt.subplots(figsize = (6,6))

    disp.plot(ax = ax, cmap = 'Blues', xticks_rotation = 'vertical', colorbar=False)
    # Disable the grid
    ax.grid(False)

    plt.show();

In [None]:
print(fn_plot_torch_hist.__doc__)  ##used to access comments from a python file

None


In [1]:
RANDOM_STATE = 24 # for initialization ----- REMEMBER: to remove at the time of promotion to production
torch.manual_seed(RANDOM_STATE)


EPOCHS = 2001 # number of epochs

ALPHA = 0.001 # learning rate
TEST_SIZE = 0.2
BATCH_SIZE = 200


# parameters for Matplotlib
params = {'legend.fontsize': 'medium',
          'figure.figsize': (15, 6),
          'axes.labelsize': 'medium',
          'axes.titlesize':'large',
          'xtick.labelsize':'medium',
          'ytick.labelsize':'medium'
         }

plt.rcParams.update(params)

CMAP = plt.cm.coolwarm
plt.style.use('seaborn-v0_8-darkgrid') # plt.style.use('ggplot')

NameError: name 'torch' is not defined

In [None]:
data_df = pd.read_csv("fifa_2019.csv")
data_df.shape

(14588, 89)

In [None]:
#for col in data_df.columns:
    #print(f'{col}: {data_df[col].unique()}')  #to see the unique values in each of the columns

In [None]:
data_df.describe().T  #Take transpose to see all the columns

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,14588.0,7293.5,4211.337199,0.0,3646.75,7293.5,10940.25,14587.0
ID,14588.0,209314.882917,30002.686347,16.0,193836.0,214043.5,230660.0,246617.0
Age,14588.0,26.10632,4.352171,16.0,23.0,26.0,29.0,45.0
Overall,14588.0,68.651769,5.273925,61.0,65.0,68.0,72.0,94.0
Potential,14588.0,72.378942,5.813639,61.0,68.0,72.0,76.0,95.0
Special,14587.0,1659.57942,246.663095,828.0,1552.0,1694.0,1819.0,2346.0
International Reputation,14539.0,1.141275,0.435701,1.0,1.0,1.0,1.0,5.0
Weak Foot,14539.0,2.996286,0.668339,1.0,3.0,3.0,3.0,5.0
Skill Moves,14539.0,2.471834,0.766851,1.0,2.0,2.0,3.0,5.0
Jersey Number,14534.0,17.986446,15.483826,1.0,8.0,15.0,24.0,99.0


In [None]:
#data_df['Position'].isnull().sum()

In [None]:
# removing rows with position = null
data_df = data_df[data_df["Position"].notnull()]
data_df.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [None]:
# Following columns appear to be relevant for our analysis
rel_cols = ["Position", 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
            'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
            'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
            'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
            'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
            'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
            'GKKicking', 'GKPositioning', 'GKReflexes']

In [None]:
goalkeeper = 'GK'
forward = ['ST', 'LW', 'RW', 'LF', 'RF', 'RS','LS', 'CF']
midfielder = ['CM','RCM','LCM', 'CDM','RDM','LDM', 'CAM', 'LAM', 'RAM', 'RM', 'LM']
defender = ['CB', 'RCB', 'LCB', 'LWB', 'RWB', 'LB', 'RB']

In [None]:
#Assign labels to goalkeepers
data_df.loc[data_df["Position"] == "GK", "Position"] = 0

#Defenders
data_df.loc[data_df["Position"].isin(defender), "Position"] = 1

#Midfielders
data_df.loc[data_df["Position"].isin(midfielder), "Position"] = 2

#Forward
data_df.loc[data_df["Position"].isin(forward), "Position"] = 3

# Convert Column "Position" to numeric so that Pandas does not complain
data_df['Position'] = pd.to_numeric(data_df['Position'], downcast="integer")

In [None]:
data_df = data_df[rel_cols]
data_df.head()

Unnamed: 0,Position,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,3,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,...,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0
1,3,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,...,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,3,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,...,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0
3,0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,...,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0
4,2,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,...,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0


In [None]:
class_labels = {0: 'Goal Keeper', 1: 'Defender', 2: 'Mid-Fielder', 3: 'Forward'}

In [None]:
y = data_df["Position"].to_numpy()

X = data_df.drop("Position", axis = 1)

In [None]:
# splitting in train and test datasets
X_train,X_test,y_train,y_test = train_test_split(X,y,
                                     stratify=y,
                                     test_size=TEST_SIZE,
                                     random_state=RANDOM_STATE )
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((11627, 33), (2907, 33), (11627,), (2907,))

In [None]:
sc  = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Prepare Model

In [None]:
torch.cuda.is_available()
print(torch.__version__)

2.2.1+cu121


In [None]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

Define Dataset

In [None]:
class FifaDataset(Dataset):
  def __init__(self, X, y):
    self.X = torch.tensor(X, dtype=torch.float32)
    self.y = torch.tensor(y, dtype=torch.long)
  def __len__(self):
    return len(self.X)
  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

In [None]:
#Create an instance of the dataset
train_dataset = FifaDataset(X_train,y_train)

#Create a Dataloader with batch size of 2
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True)

#Iterate through the DataLoader
for batch_idx, (data, target) in enumerate(train_loader):
  print(f"Batch {batch_idx + 1}: ", end='')
  print("Data:",data.shape, end=' ')
  print("Target:", target.shape)

Batch 1: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 2: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 3: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 4: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 5: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 6: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 7: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 8: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 9: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 10: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 11: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 12: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 13: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 14: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 15: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 16: Data: torch.Size([200, 33]) Target: tor

In [None]:
#Create an instance of the dataset
test_dataset = FifaDataset(X_test,y_test)

#Create a Dataloader with batch size of 2
test_loader = DataLoader(dataset=test_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True)

#Iterate through the DataLoader
for batch_idx, (data, target) in enumerate(test_loader):
  print(f"Batch {batch_idx + 1}: ", end='')
  print("Data:",data.shape, end=' ')
  print("Target:", target.shape)

Batch 1: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 2: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 3: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 4: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 5: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 6: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 7: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 8: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 9: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 10: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 11: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 12: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 13: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 14: Data: torch.Size([200, 33]) Target: torch.Size([200])
Batch 15: Data: torch.Size([107, 33]) Target: torch.Size([107])


In [None]:
class Model (nn.Module ):
  def __init__(self, input_dim):
    super( Model, self).__init__()
    self.layer1 = nn.Linear( input_dim, 18)
    self.activ1 = nn.SiLU()
    self.layer2 = nn.Linear( 18,4)

  def forward(self, x):
    x = self.layer1( x )  #Layer 1 aggregation
    x = self.active1( x) #Layer activation
    x = self.layer2(x)    #Layer 2 activations
    return x

In [None]:
input_dim= X_train.shape[1]

model=nn.Sequential(
    nn.Linear(input_dim,18),
    nn.Tanh(),
    nn.Linear(18,4)
).to(device=device)

In [None]:
print(model)

Sequential(
  (0): Linear(in_features=33, out_features=18, bias=True)
  (1): Tanh()
  (2): Linear(in_features=18, out_features=4, bias=True)
)


In [None]:
list(model.parameters())

[Parameter containing:
 tensor([[-2.4222e-02, -6.6486e-02, -3.5062e-02,  6.3746e-03, -1.4182e-01,
          -4.9749e-02,  1.6879e-01, -3.7317e-02,  1.5855e-01, -6.7960e-02,
          -7.0020e-02, -5.1887e-02, -1.5565e-01, -1.0485e-01,  1.0520e-01,
          -1.3059e-01,  5.9465e-02,  1.6379e-01, -2.2121e-02,  7.6153e-02,
           8.1313e-02, -1.2426e-01, -6.7123e-02, -5.4516e-02,  1.6798e-01,
          -1.5537e-01,  1.0537e-01,  1.6350e-01,  1.3764e-01,  7.4190e-02,
           7.7494e-02, -9.2818e-03,  8.7583e-02],
         [ 1.5868e-01, -5.5976e-02,  1.6820e-01,  2.7665e-02, -1.2210e-01,
          -1.2494e-01,  6.1399e-02, -2.4561e-02, -7.6685e-02,  8.8150e-02,
          -9.1821e-02, -8.3203e-02,  3.2539e-02, -3.1431e-02, -1.4688e-01,
          -5.2295e-02, -6.1178e-02, -7.4187e-02,  1.3552e-01, -2.6404e-02,
          -1.0632e-01,  5.1576e-02,  2.9993e-02, -1.3217e-01, -1.6670e-01,
           1.3171e-01, -1.6122e-01,  1.7279e-01,  9.7198e-02,  1.4528e-01,
          -1.5150e-01,  6.4

In [None]:
print(f'Num Parameters:{len(list(model.parameters()))}')
print(f'Layer1: Weights:{list(model.parameters())[0].shape}')
print(f'Layer1: Bias:{list(model.parameters())[1].shape}')
print(f'Layer2: Weights:{list(model.parameters())[2].shape}')
print(f'Layer2: Bias:{list(model.parameters())[3].shape}')

Num Parameters:4
Layer1: Weights:torch.Size([18, 33])
Layer1: Bias:torch.Size([18])
Layer2: Weights:torch.Size([4, 18])
Layer2: Bias:torch.Size([4])


In [None]:
train_X=torch.tensor(X_train, dtype=torch.float32, device=device)
train_y=torch.tensor(y_train, dtype=torch.int64, device=device)

test_X=torch.tensor(X_test, dtype=torch.float32, device=device)
test_y=torch.tensor(y_test, dtype=torch.int64, device=device)

In [None]:
train_X[:1].shape, train_X[0].shape

# Training

In [None]:
#loss function
loss_fn= nn.CrossEntropyLoss()

#Optimizers
optimizer= torch.optim.Adam(model.parameters(),lr=ALPHA)

#Iterations
for epoch in range(EPOCHS):
    model.train() #set the model in training model

    for batch_idx, (train_X, train_y) in enumerate(train_loader):

      train_X, train_y = train_X.to(device), train_y.to(device)

    predict_prob=model(train_X) #make predictions

    curr_loss=loss_fn(predict_prob, train_y)  #calculate loss

###Back Prop
optimizer.zero_grad()
curr_loss.backward()
optimizer.step()


In [None]:
predict_prob=model(train_X[:1])
predict_prob.detach().cpu().numpy()

array([[-0.4142217 , -0.19276334,  0.13504267, -0.0934963 ]],
      dtype=float32)

In [None]:
predict_prob.argmax().item(), train_y[0].item()

(2, 3)

In [None]:
del model

model = nn.Sequential(
    nn.Linear( input_dim, 18),
    nn.Tanh(),
    nn.Linear( 18, 4)
).to(device=device)

In [None]:
#loss function
loss_fn= nn.CrossEntropyLoss()

#Optimizers
optimizer= torch.optim.Adam(model.parameters(),lr=ALPHA)

#some lists to collect data
loss=[]
tloss=[]
n_epoch=[]
acc=[]
tacc=[]

#Iterations
for epoch in range(EPOCHS):
    model.train() #set the model in training model

    predict_prob=model(train_X) #make predictions

    curr_loss=loss_fn(predict_prob, train_y)  #calculate loss

###Back Prop
optimizer.zero_grad()
curr_loss.backward()
optimizer.step()

loss.append(curr_loss.data.item()) # append to loss list

_,y_pred= torch.max(predict_prob,1)

curr_acc=accuracy_score(y_train, y_pred.data.cpu()) #move to cpu

acc.append(curr_acc) #append to accuracy list

model.eval() #evaluation mode prevent from learning

predict_prob_tst=model(test_X)

tcurr_loss=loss_fn(predict_prob_tst,test_y)


tloss.append(tcurr_loss.data.item()) # append to loss list

_,y_pred= torch.max(predict_prob_tst,1)

tcurr_acc=accuracy_score(y_test, y_pred.data.cpu()) #move to cpu

tacc.append(tcurr_acc)

n_epoch.append(epoch)

if epoch % 1000 == 0:
        fmtStr = 'Epoch :{:5d}/{:5d} --- Loss: {:.5f}/{:.5f} | Acc {:.5f}/{:.5f}'

        print (fmtStr.format(epoch, EPOCHS,
                             curr_loss.data.item(),
                             tcurr_loss.data.item(),
                             curr_acc,
                             tcurr_acc))

Epoch : 2000/ 2001 --- Loss: 1.33983/1.32446 | Acc 0.40325/0.41433
