## Principal component Analysis

In [1]:
from sklearn.decomposition import PCA
import numpy as np
from tqdm import tqdm
from keras import optimizers


pca = PCA(n_components=512)
array = np.load('1_ResNET.npy')
print(array.shape)
pca_arr = pca.fit_transform(array)
np.save('1_ResNET_PCA512.npy', pca_arr)
print(pca_arr.shape)

Using TensorFlow backend.


(5400, 2048)
(5400, 512)


In [2]:
print(array, array.shape)

[[1.4630162  0.06204205 4.4352226  ... 1.5256827  3.5007942  0.        ]
 [1.3390025  0.05157435 4.769183   ... 0.67852765 1.8540585  0.21820806]
 [0.53878206 0.01563736 5.3364444  ... 0.36709633 3.5641797  0.01204965]
 ...
 [0.         0.13242404 4.908689   ... 2.4691331  1.3422186  0.20065752]
 [0.         0.08716917 4.3659477  ... 2.1971464  1.4559647  0.1773672 ]
 [0.         0.07030997 3.5072024  ... 3.0104277  0.9120427  0.00837918]] (5400, 2048)


In [3]:
print(pca_arr, pca_arr.shape)

[[-5.8080301e+00  1.2697511e+01  5.8431826e+00 ...  1.7071088e-01
  -2.3633674e-02  3.3422112e-01]
 [-9.2166674e-01  1.8923967e+01  2.7841573e+00 ...  7.1200885e-02
  -3.5260969e-01  4.4601795e-01]
 [ 8.0361170e-01  1.9779863e+01 -2.2374811e+00 ... -3.4451136e-01
  -7.1129203e-02  2.3810212e-01]
 ...
 [-1.2372240e+01 -3.9151075e-01 -2.0314646e+01 ... -3.5327846e-01
   1.6631913e-01 -1.1573656e-01]
 [-1.4105256e+01 -2.3478653e+00 -1.8007633e+01 ... -2.3208618e-01
   2.5485778e-02 -8.6775243e-02]
 [-1.3138857e+01  8.7422854e-01 -1.7679382e+01 ...  1.4730068e-01
   2.8727269e-01 -2.3334429e-03]] (5400, 512)


## Initializing Directories

In [4]:
import numpy as np
import json
import os
import pickle

 Defining the folders and the file names to import

In [5]:
FEAT_FOLDER = 'data/features/'
LABEL_FOLDER = 'data/labels'

FEAT_FILE_1 = '1_ResNET_PCA512.npy'
FEAT_FILE_2 = '2_ResNET_PCA512.npy'

FEAT_FILE = 'feat.pickle'
LABEL_FILE = 'label.pickle'

MODEL = r'C:\Users\ranja\Desktop\Notebooks\SoccerNet-code-master\SoccerNet-code-master\src\feature_extraction\model.pickle'

Checking the contents of the folder

In [6]:
os.listdir(LABEL_FOLDER) 

['england_epl',
 'europe_uefa-champions-league',
 'france_ligue-1',
 'germany_bundesliga',
 'italy_serie-a',
 'spain_laliga']

Creating en empty list that will hold the locations of <br>
 - all the features extracted
 - all the labels parsed
    


In [7]:
FEAT_FOLD_LIST = []
LABEL_FOLD_LIST = []

## Collecting Feature and Label Directories

Executing a tree walk through the features folder, to find all the folders that contain the feature

In [8]:
for root, subdirs, files in os.walk(FEAT_FOLDER):
    if files:
        for f in files:
            if f.endswith('.npy') and root not in FEAT_FOLD_LIST:
                FEAT_FOLD_LIST += [root]

In [9]:
FEAT_FOLD_LIST[:2]

['data/features/england_epl\\2014-2015\\2015-02-21 - 18-00 Chelsea 1 - 1 Burnley',
 'data/features/england_epl\\2014-2015\\2015-02-21 - 18-00 Crystal Palace 1 - 2 Arsenal']

Executing a tree walk through the labels folder, to find all the folders that contain the labels

In [10]:
for root, subdirs, files in os.walk(LABEL_FOLDER):
    if files:
        for f in files:
            if f.endswith('.json') and root not in LABEL_FOLD_LIST:
                LABEL_FOLD_LIST += [root]

In [11]:
LABEL_FOLD_LIST[:2]

['data/labels\\england_epl\\2014-2015\\2015-02-21 - 18-00 Chelsea 1 - 1 Burnley',
 'data/labels\\england_epl\\2014-2015\\2015-02-21 - 18-00 Crystal Palace 1 - 2 Arsenal']

## Finding features for labels

Initializing certain static variables

In [12]:
#to determine how many events to iterate through
FILE_LEN = len(FEAT_FOLD_LIST)

#Type of Events taken from labels and assigned to variables
GOAL = "soccer-ball"
FOUL = "y-card"
SUBS = "substitution-in"
NO_EVENT = "no"

#This variable determines how many features around the index we want to take
ws = 10 #5 second window, 5/60 * 2 * 60 = 10

Inititalizing features, making them 512 size

In [13]:
GOAL_FEATURES = np.empty([1, 512])
FOUL_FEATURES = np.empty([1, 512])
SUBS_FEATURES = np.empty([1, 512])
NO_FEATURES = np.empty([1, 512])

Defining helper functions

In [14]:
def get_features(folder):
    
    """
    function : to find and give back the combined set of resnet features
    input : the location of the features
    output : features for the entire match
    """
    
    files = os.listdir(folder)
    if FEAT_FILE_1 not in files or FEAT_FILE_2 not in files:
        return None
    first_half = np.load(folder + '/' + FEAT_FILE_1)
    second_half = np.load(folder + '/' + FEAT_FILE_2)
    return np.concatenate((first_half, second_half))

def get_labels(folder):
    
    """
    function : to find the labels and return
    inpupt : location of labels
    output : labels in json format
    """
    
    file = os.listdir(folder)
    with open(folder + '/' + file[0]) as json_file:
        data = json.load(json_file)
    return data

def parse_labels(labels, size):
    
    """
    function : to return the array of indices of every event type
    input : labels in json format and the size of the features (length)
    output : a dictionary containing an array of values for every event type
    """
    
    annots = labels['annotations']
    
    dict_events = {}
    goal_time = []
    foul_time = []
    subs_time = []
    
    for i in range(len(annots)):
        if annots[i]['label'] == GOAL:
            time = annots[i]['gameTime']
            vals = time.split('-')
            final_time = int(vals[0]) * int(vals[1][0:3])
            final_time = final_time * 120
            if final_time > size:
                final_time = size - ws
            goal_time += [final_time]

        if annots[i]['label'] == FOUL:
            time = annots[i]['gameTime']
            vals = time.split('-')
            final_time = int(vals[0]) * int(vals[1][0:3])
            final_time = final_time * 120
            if final_time > size:
                final_time = size - ws
            foul_time += [final_time]
            
        if annots[i]['label'] == SUBS:
            time = annots[i]['gameTime']
            vals = time.split('-')
            final_time = int(vals[0]) * int(vals[1][0:3])
            final_time = final_time * 120
            if final_time > size:
                final_time = size - ws            
            subs_time += [final_time]
            
    dict_events[GOAL] = goal_time
    dict_events[FOUL] = foul_time
    dict_events[SUBS] = subs_time
    return dict_events

def interpolate(arr, ws):
    
    """
    function : to find features on either side of the value
    input : the array of values and the window size
    output : modified array of indices
    """
    
    new_arr = [list(range(x-ws, x+1)) for x in arr]
    merged_list = []

    for l in new_arr:
        merged_list += l
    return merged_list

Aggregating features of a particular event into respective arrays

In [15]:
for i in tqdm(range(FILE_LEN)):
    feat_dir = FEAT_FOLD_LIST[i]
    label_dir = LABEL_FOLD_LIST[i]
    
    features = get_features(feat_dir)
    if features is None:
        continue
        
    features_small = features[0:100]
    feat_shape = list(range(features.shape[0]))[0:100]
    
    labels = get_labels(label_dir)
    
    events = parse_labels(labels, features.shape[0])
    
    g = events[GOAL]
    f = events[FOUL]
    s = events[SUBS]
    
    num = interpolate(g, ws) + interpolate(f, ws) + interpolate(s, ws)
    #print(num) 
    
    y = [not x in num for x in feat_shape]
    
    GOAL_FEATURES = np.concatenate([GOAL_FEATURES, features[interpolate(g, ws)]])
    FOUL_FEATURES = np.concatenate([FOUL_FEATURES, features[interpolate(f, ws)]])
    SUBS_FEATURES = np.concatenate([SUBS_FEATURES, features[interpolate(s, ws)]])
    NO_FEATURES = np.concatenate([NO_FEATURES, features_small[y]])

100%|████████████████████████████████████████████████████████████████████████████████| 234/234 [00:23<00:00,  9.86it/s]


Checking the size...

In [16]:
print(GOAL_FEATURES.shape, FOUL_FEATURES.shape, SUBS_FEATURES.shape, NO_FEATURES.shape)

(7239, 512) (8856, 512) (13619, 512) (22018, 512)


## Training

Making the necessary imports

In [17]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Creating Train and Test Features

In [18]:
# Find the minimum length of all the three event feature types
n = min(GOAL_FEATURES.shape[0], FOUL_FEATURES.shape[0], SUBS_FEATURES.shape[0], NO_FEATURES.shape[0])

#Create target class variable of length n
g_class = [GOAL] * n
f_class = [FOUL] * n
s_class = [SUBS] * n
n_class = [NO_EVENT] *n

#Ensure that the train features and target labels are of the same length
TRAIN_FEATURES = np.concatenate([GOAL_FEATURES[:n], FOUL_FEATURES[:n], SUBS_FEATURES[:n], NO_FEATURES[:n]])
#TRAIN_FEATURES = np.concatenate([GOAL_FEATURES[:n], FOUL_FEATURES[:n], SUBS_FEATURES[:n]])

TARGET_LABELS = np.concatenate([g_class, f_class, s_class, n_class])
#TARGET_LABELS = np.concatenate([g_class, f_class, s_class])

print(TRAIN_FEATURES.shape, TARGET_LABELS.shape)

(28956, 512) (28956,)


#### Store

In [None]:
feat_out = open(FEAT_FILE, "wb")

pickle.dump(TRAIN_FEATURES, feat_out)

#### Load

In [None]:
pickle_in = open(FEAT_FILE,"rb")

TRAIN_FEATURES = pickle.load(pickle_in)

### Label Enconding and Test Features

Encoding the target varibles to make it suitable for training

In [19]:
encoder = LabelEncoder()
encoder.fit(TARGET_LABELS)
encoded_Y = encoder.transform(TARGET_LABELS)
# convert integers to dummy variables (i.e. one hot encoded)
y = np_utils.to_categorical(encoded_Y)
y.shape

(28956, 4)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(TRAIN_FEATURES, y, test_size=0.2, random_state=42)

### Defining the model

In [28]:
def baseline_model():
    # create model
    model = Sequential()
    
    #512 inputs to 8 neurons
    model.add(Dense(19, input_dim=512, activation='relu'))
    
    model.add(Dense(12, input_dim=19, activation='relu'))
    
    model.add(Dense(7, input_dim=12, activation='relu'))

    #Final 4 categories, so again three dense layers
    model.add(Dense(4))
    
    #Add softmax activation function
    model.add(Activation(tf.nn.softmax))
    
    # Compile model
    
    adam = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    
    return model

### Fitting

Creating an instance of the model, and training

In [29]:
model = baseline_model()

In [None]:
model.fit(X_train, y_train, epochs=3)

Epoch 1/3
Epoch 2/3

## Storing models

In [None]:
model_out = open(MODEL, "wb")

pickle.dump(model, model_out)

In [None]:
pickle_in = open(MODEL,"rb")

model = pickle.load(pickle_in)

## Testing

Predicting on the test set

In [None]:
y_pred = model.predict(X_test)
y_pred[:5]

In [None]:
y_test[:5]

Converting the test set to 1's and 0's

In [None]:
y_pred = (y_pred == y_pred.max(axis=1)[:,None]).astype(int)
y_pred[:5]

In [None]:
 accuracy_score(y_test, y_pred)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
y_train