diff --git a/opensportslib/core/trainer/localization_trainer.py b/opensportslib/core/trainer/localization_trainer.py index 8c36396..8565c02 100644 --- a/opensportslib/core/trainer/localization_trainer.py +++ b/opensportslib/core/trainer/localization_trainer.py @@ -135,6 +135,9 @@ def build_trainer(cfg, model=None, default_args=None, resume_from=None): trainer.best_criterion_valid = checkpoint.get('best_criterion_valid', 0 if cfg.TRAIN.criterion_valid == "map" else float("inf")) logging.info(f"Restored best epoch: {trainer.best_epoch}") + + else: + trainer = Trainer_pl(cfg, default_args["work_dir"]) return trainer @@ -147,6 +150,37 @@ def __init__(self): def train(self): pass +class Trainer_pl(Trainer): + """Trainer class used for models that rely on lightning modules. + + Args: + cfg (dict): Dict config. It should contain the key 'max_epochs' and the key 'GPU'. + """ + + def __init__(self, cfg, work_dir): + from opensportslib.core.utils.lightning import CustomProgressBar, MyCallback + import pytorch_lightning as pl + + self.work_dir = work_dir + call = MyCallback() + self.trainer = pl.Trainer( + max_epochs=cfg.max_epochs, + devices=[cfg.GPU], + callbacks=[call, CustomProgressBar(refresh_rate=1)], + num_sanity_val_steps=0, + ) + + def train(self, **kwargs): + self.trainer.fit(**kwargs) + + best_model = kwargs["model"].best_state + + logging.info("Done training") + logging.info("Best epoch: {}".format(best_model.get("epoch"))) + torch.save(best_model, os.path.join(self.work_dir, "model.pth.tar")) + + logging.info("Model saved") + logging.info(os.path.join(self.work_dir, "model.pth.tar")) class Trainer_e2e(Trainer): diff --git a/opensportslib/core/utils/lightning.py b/opensportslib/core/utils/lightning.py new file mode 100644 index 0000000..d6800b2 --- /dev/null +++ b/opensportslib/core/utils/lightning.py @@ -0,0 +1,52 @@ +import pytorch_lightning as pl +from pytorch_lightning.callbacks.progress import TQDMProgressBar +import logging + + +class 
CustomProgressBar(TQDMProgressBar): + """Override the custom progress bar used by pytorch lightning to change some attributes.""" + + def get_metrics(self, trainer, pl_module): + """Override the method to not show the version number in the progress bar.""" + items = super().get_metrics(trainer, pl_module) + items.pop("v_num", None) + return items + + +class MyCallback(pl.Callback): + """Override the Callback class of pl to change the behaviour on validation epoch end.""" + + def __init__(self): + super().__init__() + + def on_validation_epoch_end(self, trainer, pl_module): + loss_validation = pl_module.losses.avg + state = { + "epoch": trainer.current_epoch + 1, + "state_dict": pl_module.model.state_dict(), + "best_loss": pl_module.best_loss, + "optimizer": pl_module.optimizer.state_dict(), + } + + # remember best prec@1 and save checkpoint + is_better = loss_validation < pl_module.best_loss + pl_module.best_loss = min(loss_validation, pl_module.best_loss) + + # Save the best model based on loss only if the evaluation frequency too long + if is_better: + pl_module.best_state = state + # torch.save(state, best_model_path) + + # Reduce LR on Plateau after patience reached + prevLR = pl_module.optimizer.param_groups[0]["lr"] + pl_module.scheduler.step(loss_validation) + currLR = pl_module.optimizer.param_groups[0]["lr"] + + if currLR != prevLR and pl_module.scheduler.num_bad_epochs == 0: + logging.info("\nPlateau Reached!") + if ( + prevLR < 2 * pl_module.scheduler.eps + and pl_module.scheduler.num_bad_epochs >= pl_module.scheduler.patience + ): + logging.info("\nPlateau Reached and no more reduction -> Exiting Loop") + trainer.should_stop = True \ No newline at end of file diff --git a/opensportslib/core/utils/video_processing.py b/opensportslib/core/utils/video_processing.py index f87e535..203e46e 100644 --- a/opensportslib/core/utils/video_processing.py +++ b/opensportslib/core/utils/video_processing.py @@ -719,6 +719,131 @@ def oneHotToShifts(onehot, 
params): Shifts[:, i] = shifts return Shifts + +def timestamps2long(output_spotting, video_size, chunk_size, receptive_field): + """Method to transform the timestamps to vectors""" + start = 0 + last = False + receptive_field = receptive_field // 2 + + timestamps_long = ( + torch.zeros( + [video_size, output_spotting.size()[-1] - 2], + dtype=torch.float, + device=output_spotting.device, + ) + - 1 + ) + + for batch in np.arange(output_spotting.size()[0]): + + tmp_timestamps = ( + torch.zeros( + [chunk_size, output_spotting.size()[-1] - 2], + dtype=torch.float, + device=output_spotting.device, + ) + - 1 + ) + + for i in np.arange(output_spotting.size()[1]): + tmp_timestamps[ + torch.floor(output_spotting[batch, i, 1] * (chunk_size - 1)).type( + torch.int + ), + torch.argmax(output_spotting[batch, i, 2:]).type(torch.int), + ] = output_spotting[batch, i, 0] + + # ------------------------------------------ + # Store the result of the chunk in the video + # ------------------------------------------ + + # For the first chunk + if start == 0: + timestamps_long[0 : chunk_size - receptive_field] = tmp_timestamps[ + 0 : chunk_size - receptive_field + ] + + # For the last chunk + elif last: + timestamps_long[start + receptive_field : start + chunk_size] = ( + tmp_timestamps[receptive_field:] + ) + break + + # For every other chunk + else: + timestamps_long[ + start + receptive_field : start + chunk_size - receptive_field + ] = tmp_timestamps[receptive_field : chunk_size - receptive_field] + + # --------------- + # Loop Management + # --------------- + + # Update the index + start += chunk_size - 2 * receptive_field + # Check if we are at the last index of the game + if start + chunk_size >= video_size: + start = video_size - chunk_size + last = True + return timestamps_long + + +def batch2long(output_segmentation, video_size, chunk_size, receptive_field): + """Method to transform the batches to vectors.""" + start = 0 + last = False + receptive_field = receptive_field // 2 + 
+ segmentation_long = torch.zeros( + [video_size, output_segmentation.size()[-1]], + dtype=torch.float, + device=output_segmentation.device, + ) + + for batch in np.arange(output_segmentation.size()[0]): + + tmp_segmentation = torch.nn.functional.one_hot( + torch.argmax(output_segmentation[batch], dim=-1), + num_classes=output_segmentation.size()[-1], + ) + + # ------------------------------------------ + # Store the result of the chunk in the video + # ------------------------------------------ + + # For the first chunk + if start == 0: + segmentation_long[0 : chunk_size - receptive_field] = tmp_segmentation[ + 0 : chunk_size - receptive_field + ] + + # For the last chunk + elif last: + segmentation_long[start + receptive_field : start + chunk_size] = ( + tmp_segmentation[receptive_field:] + ) + break + + # For every other chunk + else: + segmentation_long[ + start + receptive_field : start + chunk_size - receptive_field + ] = tmp_segmentation[receptive_field : chunk_size - receptive_field] + + # --------------- + # Loop Management + # --------------- + + # Update the index + start += chunk_size - 2 * receptive_field + # Check if we are at the last index of the game + if start + chunk_size >= video_size: + start = video_size - chunk_size + last = True + return segmentation_long + # import torch # import numpy as np # import decord diff --git a/opensportslib/models/base/contextaware.py b/opensportslib/models/base/contextaware.py new file mode 100644 index 0000000..359bde9 --- /dev/null +++ b/opensportslib/models/base/contextaware.py @@ -0,0 +1,394 @@ +import logging +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from opensportslib.models.utils.litebase import LiteBaseModel + +import os + +from opensportslib.core.utils.video_processing import timestamps2long, batch2long + +from opensportslib.models.utils.utils import ( + NMS, + check_if_should_predict, + get_json_data, + predictions2json, + predictions2json_runnerjson, 
+ zipResults, +) + +from opensportslib.models.heads.builder import build_head +from opensportslib.models.backbones.builder import build_backbone +from opensportslib.models.neck.builder import build_neck + + +class ContextAwareModel(nn.Module): + """ + CALF model composed of a backbone, neck and head. + Args: + weights (string): Path of the weights file. + backbone (string): Name of the backbone type. + neck (string): Name of the neck type. + head (string): Name of the head type. + The model takes as input a Tensor of the form (batch_size,1,chunk_size,input_size) + and returns : + 1. The segmentation of the form (batch_size,chunk_size,num_classes). + 2. The action spotting of the form (batch_size,num_detections,2+num_classes). + """ + + def __init__( + self, + weights=None, + backbone="PreExtracted", + neck="CNN++", + head="SpottingCALF", + post_proc="NMS", + ): + + super(ContextAwareModel, self).__init__() + + # Build Backbone + self.backbone = build_backbone(backbone) + + # Build Neck + self.neck = build_neck(neck) + + # Build Head + self.head = build_head(head) + + # load weight if needed + self.load_weights(weights=weights) + + def load_weights(self, weights=None): + if weights is not None: + print("=> loading checkpoint '{}'".format(weights)) + checkpoint = torch.load(weights) + self.load_state_dict(checkpoint["state_dict"]) + print( + "=> loaded checkpoint '{}' (epoch {})".format( + weights, checkpoint["epoch"] + ) + ) + + def forward(self, inputs): + """ + INPUT: a Tensor of the form (batch_size,1,chunk_size,input_size) + OUTPUTS: 1. The segmentation of the form (batch_size,chunk_size,num_classes) + 2. 
The action spotting of the form (batch_size,num_detections,2+num_classes) + """ + features = self.backbone(inputs) + conv_seg, output_segmentation = self.neck(features) + output_spotting = self.head(conv_seg, output_segmentation) + return output_segmentation, output_spotting + + +class LiteContextAwareModel(LiteBaseModel): + """ + Lightning module for the CALF model. + Args: + cfg (dict): DIct of config. + weights (string): Path of the weights file. + backbone (string): Name of the backbone type for the CALF model. + neck (string): Name of the neck type for the CALF model. + head (string): Name of the head type for the CALF model. + runner (string): Name of the runner. "runner_CALF" if using SoccerNet dataset modules or "runner_JSON" if using the json format. This will the change the behaviour of processing the predictions while infering. + """ + + def __init__( + self, + cfg=None, + weights=None, + backbone="PreExtracted", + neck="CNN++", + head="SpottingCALF", + post_proc="NMS", + runner="runner_CALF", + ): + super().__init__(cfg.training) + + # check compatibility dims Backbone - Neck - Head + assert backbone.output_dim == neck.input_size + assert neck.num_classes == head.num_classes + assert neck.dim_capsule == head.dim_capsule + assert neck.num_detections == head.num_detections + assert neck.chunk_size == head.chunk_size + + self.chunk_size = neck.chunk_size + self.receptive_field = neck.receptive_field + self.framerate = neck.framerate + + self.model = ContextAwareModel(weights, backbone, neck, head, post_proc) + + self.overwrite = True + + self.cfg = cfg + + self.runner = runner + + self.infer_split = getattr(cfg, "infer_split", True) + + def process(self, labels, targets, feats): + labels = labels.float() + targets = targets.float() + feats = feats.unsqueeze(1) + return labels, targets, feats + + def _common_step(self, batch, batch_idx): + """Operations in common for training and validation steps. + Process the features, labels and targets. 
The features are processed by the model to compute the outputs. + These outputs are used to compute the loss. + """ + feats, labels, targets = batch + labels, targets, feats = self.process(labels, targets, feats) + output_segmentation, output_spotting = self.forward(feats) + return self.criterion( + [labels, targets], [output_segmentation, output_spotting] + ), feats.size(0) + + def training_step(self, batch, batch_idx): + """Training step that defines the train loop.""" + loss, size = self._common_step(batch, batch_idx) + self.log_dict({"loss": loss}, on_step=True, on_epoch=True, prog_bar=True) + self.losses.update(loss.item(), size) + return loss + + def validation_step(self, batch, batch_idx): + """Validation step that defines the val loop.""" + val_loss, size = self._common_step(batch, batch_idx) + self.log_dict( + {"valid_loss": val_loss}, on_step=False, on_epoch=True, prog_bar=True + ) + self.losses.update(val_loss.item(), size) + return val_loss + + def on_predict_start(self): + """Operations to make before starting to infer.""" + self.stop_predict = False + + if self.infer_split: + self.output_folder, self.output_results, self.stop_predict = ( + check_if_should_predict( + self.cfg.dataset.test.results, self.cfg.work_dir, self.overwrite + ) + ) + if self.runner == "runner_JSON": + self.target_dir = os.path.join(self.cfg.work_dir, self.output_folder) + else: + self.target_dir = self.output_results + + if not self.stop_predict: + self.spotting_predictions = list() + self.spotting_grountruth = list() + self.spotting_grountruth_visibility = list() + self.segmentation_predictions = list() + + def on_predict_end(self): + """Operations to make after inference. + The process is different whether the data come from json or from the SoccerNet dataset in the way we will store the jsons containing the predictions. 
+ """ + if not self.stop_predict: + # Transformation to numpy for evaluation + targets_numpy = list() + closests_numpy = list() + detections_numpy = list() + for target, detection in zip( + self.spotting_grountruth_visibility, self.spotting_predictions + ): + target_numpy = target.cpu().numpy() + targets_numpy.append(target_numpy) + detections_numpy.append(NMS(detection.numpy(), 20 * self.framerate)) + closest_numpy = np.zeros(target_numpy.shape) - 1 + # Get the closest action index + for c in np.arange(target_numpy.shape[-1]): + indexes = np.where(target_numpy[:, c] != 0)[0].tolist() + if len(indexes) == 0: + continue + indexes.insert(0, -indexes[0]) + indexes.append(2 * closest_numpy.shape[0]) + for i in np.arange(len(indexes) - 2) + 1: + start = max(0, (indexes[i - 1] + indexes[i]) // 2) + stop = min( + closest_numpy.shape[0], (indexes[i] + indexes[i + 1]) // 2 + ) + closest_numpy[start:stop, c] = target_numpy[indexes[i], c] + closests_numpy.append(closest_numpy) + + # Save the predictions to the json format + # if save_predictions: + if self.runner == "runner_CALF": + list_game = self.trainer.predict_dataloaders.dataset.listGames + for index in np.arange(len(list_game)): + json_data = get_json_data(list_game[index]) + if self.infer_split: + os.makedirs( + os.path.join( + self.cfg.work_dir, self.output_folder, list_game[index] + ), + exist_ok=True, + ) + output_file = os.path.join( + self.cfg.work_dir, + self.output_folder, + list_game[index], + "results_spotting.json", + ) + else: + output_file = os.path.join( + self.cfg.work_dir, f"{self.cfg.dataset.test.results}.json" + ) + json_data = predictions2json( + detections_numpy[index * 2], + detections_numpy[(index * 2) + 1], + json_data, + output_file, + self.framerate, + ) + self.json_data = json_data + elif self.runner == "runner_JSON": + list_videos = self.trainer.predict_dataloaders.dataset.data_json[0][ + "videos" + ] + for index in np.arange(len(list_videos)): + video = list_videos[index]["path"] + + if 
self.infer_split: + video = os.path.splitext(video)[0] + os.makedirs( + os.path.join(self.cfg.work_dir, self.output_folder, video), + exist_ok=True, + ) + output_file = os.path.join( + self.cfg.work_dir, + self.output_folder, + video, + "results_spotting.json", + ) + else: + output_file = os.path.join( + self.cfg.work_dir, f"{self.cfg.dataset.test.results}.json" + ) + + json_data = get_json_data(video) + json_data = predictions2json_runnerjson( + detections_numpy[index], + json_data, + output_file, + self.framerate, + inverse_event_dictionary=self.trainer.predict_dataloaders.dataset.inverse_event_dictionary, + ) + self.json_data = json_data + if self.infer_split: + zipResults( + zip_path=self.output_results, + target_dir=os.path.join(self.cfg.work_dir, self.output_folder), + filename="results_spotting.json", + ) + logging.info("Predictions saved") + logging.info( + os.path.join( + self.cfg.work_dir, + self.output_folder, + ) + ) + logging.info("Predictions saved") + logging.info(self.output_results) + else: + logging.info("Predictions saved") + logging.info( + os.path.join( + self.cfg.work_dir, f"{self.cfg.dataset.test.results}.json" + ) + ) + + def predict_step(self, batch): + """Infer step. + The process is different whether the data come from json or from the SoccerNet dataset. + In particular, processing data from json means processing one video (features) while processing data from SOccerNet + means processing two halfs of a game. 
+ """ + if not self.stop_predict: + if self.runner == "runner_CALF": + feat_half1, feat_half2, label_half1, label_half2 = batch + + label_half1 = label_half1.float().squeeze(0) + label_half2 = label_half2.float().squeeze(0) + + feat_half1 = feat_half1.squeeze(0) + feat_half2 = feat_half2.squeeze(0) + + feat_half1 = feat_half1.unsqueeze(1) + feat_half2 = feat_half2.unsqueeze(1) + + # Compute the output + output_segmentation_half_1, output_spotting_half_1 = self.forward( + feat_half1 + ) + output_segmentation_half_2, output_spotting_half_2 = self.forward( + feat_half2 + ) + + timestamp_long_half_1 = timestamps2long( + output_spotting_half_1.cpu().detach(), + label_half1.size()[0], + self.chunk_size, + self.receptive_field, + ) + timestamp_long_half_2 = timestamps2long( + output_spotting_half_2.cpu().detach(), + label_half2.size()[0], + self.chunk_size, + self.receptive_field, + ) + segmentation_long_half_1 = batch2long( + output_segmentation_half_1.cpu().detach(), + label_half1.size()[0], + self.chunk_size, + self.receptive_field, + ) + segmentation_long_half_2 = batch2long( + output_segmentation_half_2.cpu().detach(), + label_half2.size()[0], + self.chunk_size, + self.receptive_field, + ) + + self.spotting_grountruth.append(torch.abs(label_half1)) + self.spotting_grountruth.append(torch.abs(label_half2)) + self.spotting_grountruth_visibility.append(label_half1) + self.spotting_grountruth_visibility.append(label_half2) + self.spotting_predictions.append(timestamp_long_half_1) + self.spotting_predictions.append(timestamp_long_half_2) + self.segmentation_predictions.append(segmentation_long_half_1) + self.segmentation_predictions.append(segmentation_long_half_2) + elif self.runner == "runner_JSON": + features, labels = batch + + labels = labels.float().squeeze(0) + + features = features.squeeze(0) + + features = features.unsqueeze(1) + + # Compute the output + output_segmentation, output_spotting = self.forward(features) + + timestamp_long = timestamps2long( + 
output_spotting.cpu().detach(), + labels.size()[0], + self.chunk_size, + self.receptive_field, + ) + segmentation_long = batch2long( + output_segmentation.cpu().detach(), + labels.size()[0], + self.chunk_size, + self.receptive_field, + ) + + self.spotting_grountruth.append(torch.abs(labels)) + self.spotting_grountruth_visibility.append(labels) + self.spotting_predictions.append(timestamp_long) + self.segmentation_predictions.append(segmentation_long) \ No newline at end of file diff --git a/opensportslib/models/base/learnablepooling.py b/opensportslib/models/base/learnablepooling.py new file mode 100644 index 0000000..8e4bd79 --- /dev/null +++ b/opensportslib/models/base/learnablepooling.py @@ -0,0 +1,360 @@ +import __future__ +import json +import logging +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +from opensportslib.models.utils.litebase import LiteBaseModel +from opensportslib.models.utils.utils import ( + check_if_should_predict, + get_json_data, + get_prediction_data, + get_spot_from_NMS, + timestamp, + zipResults, +) + +from opensportslib.models.heads.builder import build_head +from opensportslib.models.backbones.builder import build_backbone +from opensportslib.models.neck.builder import build_neck + +import os + + +class LearnablePoolingModel(nn.Module): + """ + Learnable pooling model composed of a backbone, neck and head. + Args: + weights (string): Path of the weights file. + backbone (string): Name of the backbone type. + neck (string): Name of the neck type. + head (string): Name of the head type. + The model takes as input a Tensor of the form (batch_size,window_size,feature_size) + and returns a Tensor of shape (batch_size,num_classes+1) that contains predictions. 
+ """ + + def __init__( + self, + weights=None, + backbone="PreExtracted", + neck="NetVLAD++", + head="LinearLayer", + post_proc="NMS", + ): + + super(LearnablePoolingModel, self).__init__() + + # check compatibility dims Backbone - Neck - Head + assert backbone.output_dim == neck.input_dim + assert neck.output_dim == head.input_dim + + # Build Backbone + self.backbone = build_backbone(backbone) + + # Build Neck + self.neck = build_neck(neck) + + # Build Head + self.head = build_head(head) + + # load weight if needed + self.load_weights(weights=weights) + + def load_weights(self, weights=None): + if weights is not None: + print("=> loading checkpoint '{}'".format(weights)) + checkpoint = torch.load(weights) + self.load_state_dict(checkpoint["state_dict"]) + print( + "=> loaded checkpoint '{}' (epoch {})".format( + weights, checkpoint["epoch"] + ) + ) + + def forward(self, inputs): + """ + INPUT: a Tensor of shape (batch_size,window_size,feature_size) + OUTPUTS: a Tensor of shape (batch_size,num_classes+1) + """ + features = self.backbone(inputs) + feature_pooled = self.neck(features) + output = self.head(feature_pooled) + return output + + def post_proc(self): + return + + +class LiteLearnablePoolingModel(LiteBaseModel): + """ + Lightning module for the learnable pooling model. + Args: + cfg (dict): Dict of config. + weights (string): Path of the weights file. + backbone (string): Name of the backbone type for the CALF model. + neck (string): Name of the neck type for the CALF model. + head (string): Name of the head type for the CALF model. + runner (string): Name of the runner. "runner_pooling" if using SoccerNet dataset modules or "runner_JSON" if using the json format. This will the change the behaviour of processing the predictions while infering. 
+ """ + + def __init__( + self, + cfg=None, + weights=None, + backbone="PreExtracted", + neck="NetVLAD++", + head="LinearLayer", + post_proc="NMS", + runner="runner_pooling", + ): + """ + INPUT: a Tensor of shape (batch_size,window_size,feature_size) + OUTPUTS: a Tensor of shape (batch_size,num_classes+1) + """ + super().__init__(cfg.training) + + self.model = LearnablePoolingModel(weights, backbone, neck, head, post_proc) + + self.confidence_threshold = 0.0 + + self.overwrite = True + + self.cfg = cfg + + self.runner = runner + + self.infer_split = getattr(cfg, "infer_split", True) + + def _common_step(self, batch, batch_idx): + """Operations in common for training and validation steps. + Process the features and the labels. The features are processed by the model to compute the outputs. + These outputs are used to compute the loss. + """ + feats, labels = batch + output = self.model(feats) + return self.criterion(labels, output), feats.size(0) + + def training_step(self, batch, batch_idx): + """Training step that defines the train loop.""" + loss, size = self._common_step(batch, batch_idx) + self.log_dict({"loss": loss}, on_step=True, on_epoch=True, prog_bar=True) + self.losses.update(loss.item(), size) + return loss + + def validation_step(self, batch, batch_idx): + """Validation step that defines the validation loop.""" + val_loss, size = self._common_step(batch, batch_idx) + self.log_dict( + {"valid_loss": val_loss}, on_step=False, on_epoch=True, prog_bar=True + ) + self.losses.update(val_loss.item(), size) + return val_loss + + def on_predict_start(self): + """Operations to make before starting to infer.""" + self.stop_predict = False + if self.infer_split: + self.output_folder, self.output_results, self.stop_predict = ( + check_if_should_predict( + self.cfg.dataset.test.results, self.cfg.work_dir, self.overwrite + ) + ) + + if self.runner == "runner_JSON": + self.target_dir = os.path.join(self.cfg.work_dir, self.output_folder) + else: + self.target_dir = 
self.output_results + + if not self.stop_predict: + self.spotting_predictions = list() + + def on_predict_end(self): + """Operations to make after inference.""" + if not self.stop_predict: + if self.infer_split: + zipResults( + zip_path=self.output_results, + target_dir=os.path.join(self.cfg.work_dir, self.output_folder), + filename="results_spotting.json", + ) + logging.info("Predictions saved") + logging.info( + os.path.join( + self.cfg.work_dir, + self.output_folder, + ) + ) + logging.info("Predictions saved") + logging.info(self.output_results) + else: + logging.info("Predictions saved") + logging.info( + os.path.join( + self.cfg.work_dir, f"{self.cfg.dataset.test.results}.json" + ) + ) + + def predict_step(self, batch, batch_idx): + """Infer step. + The process is different whether the data come from json or from the SoccerNet dataset. + In particular, processing data from json means processing one video (features) while processing data from SOccerNet + means processing two halfs of a game. + One step process either features of a game or features of a video and and predictions are stored in a json format. 
+ """ + if not self.stop_predict: + if self.runner == "runner_pooling": + game_ID, feat_half1, feat_half2, label_half1, label_half2 = batch + + game_ID = game_ID[0] + feat_half1 = feat_half1.squeeze(0) + feat_half2 = feat_half2.squeeze(0) + + # Compute the output for batches of frames + BS = 256 + timestamp_long_half_1 = timestamp(self.model, feat_half1, BS) + timestamp_long_half_2 = timestamp(self.model, feat_half2, BS) + + timestamp_long_half_1 = timestamp_long_half_1[:, 1:] + timestamp_long_half_2 = timestamp_long_half_2[:, 1:] + + self.spotting_predictions.append(timestamp_long_half_1) + self.spotting_predictions.append(timestamp_long_half_2) + + framerate = self.trainer.predict_dataloaders.dataset.framerate + get_spot = get_spot_from_NMS + + json_data = get_json_data(game_ID) + + for half, timestamp_long in enumerate( + [timestamp_long_half_1, timestamp_long_half_2] + ): + for l in range( + self.trainer.predict_dataloaders.dataset.num_classes + ): + spots = get_spot( + timestamp_long[:, l], + window=self.cfg.model.post_proc.NMS_window + * self.cfg.model.backbone.framerate, + thresh=self.cfg.model.post_proc.NMS_threshold, + ) + for spot in spots: + frame_index = int(spot[0]) + confidence = spot[1] + if confidence < 0.5: + continue + json_data["predictions"].append( + get_prediction_data( + False, + frame_index, + framerate, + half=half, + version=self.trainer.predict_dataloaders.dataset.version, + l=l, + confidence=confidence, + runner=self.runner, + ) + ) + + json_data["predictions"] = sorted( + json_data["predictions"], key=lambda x: int(x["position"]) + ) + json_data["predictions"] = sorted( + json_data["predictions"], key=lambda x: int(x["half"]) + ) + + # if game_ID.startswith('/'): + # game_ID = game_ID[1:] + if self.infer_split: + os.makedirs( + os.path.join(self.cfg.work_dir, self.output_folder, game_ID), + exist_ok=True, + ) + output_file = os.path.join( + self.cfg.work_dir, + self.output_folder, + game_ID, + "results_spotting.json", + ) + else: + 
output_file = os.path.join( + self.cfg.work_dir, f"{self.cfg.dataset.test.results}.json" + ) + with open(output_file, "w") as output_file: + json.dump(json_data, output_file, indent=4) + self.json_data = json_data + elif self.runner == "runner_JSON": + video, features, labels = batch + + video = video[0] + if self.infer_split: + video, _ = os.path.splitext(video) + features = features.squeeze(0) + + # Compute the output for batches of frames + BS = 256 + timestamp_long = timestamp(self.model, features, BS) + + timestamp_long = timestamp_long[:, 1:] + + self.spotting_predictions.append(timestamp_long) + + framerate = self.trainer.predict_dataloaders.dataset.framerate + get_spot = get_spot_from_NMS + + json_data = get_json_data(video) + + for l in range(self.trainer.predict_dataloaders.dataset.num_classes): + spots = get_spot( + timestamp_long[:, l], + window=self.cfg.model.post_proc.NMS_window + * self.cfg.model.backbone.framerate, + thresh=self.cfg.model.post_proc.NMS_threshold, + ) + for spot in spots: + frame_index = int(spot[0]) + confidence = spot[1] + + if confidence < self.confidence_threshold: + continue + + json_data["predictions"].append( + get_prediction_data( + False, + frame_index, + framerate, + version=2, + l=l, + confidence=confidence, + runner=self.runner, + inverse_event_dictionary=self.trainer.predict_dataloaders.dataset.inverse_event_dictionary, + ) + ) + + json_data["predictions"] = sorted( + json_data["predictions"], key=lambda x: int(x["position"]) + ) + + # if video.startswith('/'): + # video = video[1:] + if self.infer_split: + os.makedirs( + os.path.join(self.cfg.work_dir, self.output_folder, video), + exist_ok=True, + ) + output_file = os.path.join( + self.cfg.work_dir, + self.output_folder, + video, + "results_spotting.json", + ) + else: + output_file = os.path.join( + self.cfg.work_dir, f"{self.cfg.dataset.test.results}.json" + ) + with open(output_file, "w") as output_file: + json.dump(json_data, output_file, indent=4) + self.json_data 
= json_data \ No newline at end of file diff --git a/opensportslib/models/builder.py b/opensportslib/models/builder.py index e0d4b0e..1c9315f 100644 --- a/opensportslib/models/builder.py +++ b/opensportslib/models/builder.py @@ -29,7 +29,29 @@ def build_model(config, device): if task == "localization": from opensportslib.models.base.e2e import E2EModel - if config.MODEL.type == "E2E": + from opensportslib.models.base.contextaware import LiteContextAwareModel + from opensportslib.models.base.learnablepooling import LiteLearnablePoolingModel + + if config.MODEL.type == "LearnablePooling": + model = LiteLearnablePoolingModel( + cfg=config, + weights=config.MODEL.load_weights, + backbone=config.MODEL.backbone, + head=config.MODEL.head, + neck=config.MODEL.neck, + post_proc=config.MODEL.post_proc, + runner=config.RUNNER.type, + ) + elif config.MODEL.type == "ContextAware": + model = LiteContextAwareModel( + cfg=config, + weights=config.MODEL.load_weights, + backbone=config.MODEL.backbone, + head=config.MODEL.head, + neck=config.MODEL.neck, + runner=config.RUNNER.type, + ) + elif config.MODEL.type == "E2E": model = E2EModel(config, len(config.DATA.classes)+1, config.MODEL.backbone, diff --git a/opensportslib/models/neck/builder.py b/opensportslib/models/neck/builder.py index 3a65fb6..0023f86 100644 --- a/opensportslib/models/neck/builder.py +++ b/opensportslib/models/neck/builder.py @@ -1,5 +1,9 @@ import torch from torch import nn +import torch.nn.functional as F +import numpy as np +import math +from torch.autograd import Variable from opensportslib.core.utils.data import batch_tensor, unbatch_tensor def build_neck(cfg, default_args=None): @@ -21,6 +25,44 @@ def build_neck(cfg, default_args=None): num_attention_heads=getattr(cfg, "num_attention_heads", 4), lstm_dropout=getattr(cfg, "lstm_dropout", 0.1) ) + elif cfg.type == "MaxPool": + neck = MaxPool(nb_frames=cfg.nb_frames) + elif cfg.type == "MaxPool++": + neck = MaxPool_temporally_aware(nb_frames=cfg.nb_frames) + 
elif cfg.type == "AvgPool": + neck = AvgPool(nb_frames=cfg.nb_frames) + elif cfg.type == "AvgPool++": + neck = AvgPool_temporally_aware(nb_frames=cfg.nb_frames) + elif cfg.type == "NetRVLAD": + neck = NetRVLAD( + vocab_size=cfg.vocab_size, + input_dim=cfg.input_dim, + ) + elif cfg.type == "NetRVLAD++": + neck = NetRVLAD_temporally_aware( + vocab_size=cfg.vocab_size, + input_dim=cfg.input_dim, + ) + elif cfg.type == "NetVLAD": + neck = NetVLAD( + vocab_size=cfg.vocab_size, + input_dim=cfg.input_dim, + ) + elif cfg.type == "NetVLAD++": + neck = NetVLAD_temporally_aware( + vocab_size=cfg.vocab_size, + input_dim=cfg.input_dim, + ) + elif cfg.type == "CNN++": + neck = CNN_temporally_aware( + input_size=cfg.input_size, + num_classes=cfg.num_classes, + chunk_size=cfg.chunk_size, + dim_capsule=cfg.dim_capsule, + receptive_field=cfg.receptive_field, + num_detections=cfg.num_detections, + framerate=cfg.framerate, + ) else: raise ValueError(f"Unknown neck type: {cfg.type}") return neck @@ -207,4 +249,389 @@ def forward(self, x): lstm_out, _ = self.temporal(x) x = torch.max(lstm_out, dim=1)[0] - return x \ No newline at end of file + return x + + +class CNN_temporally_aware(torch.nn.Module): + def __init__( + self, + input_size=512, + num_classes=3, + chunk_size=240, + dim_capsule=16, + receptive_field=80, + num_detections=5, + framerate=2, + ): + super(CNN_temporally_aware, self).__init__() + self.input_size = input_size + self.num_classes = num_classes + self.dim_capsule = dim_capsule + self.receptive_field = receptive_field + self.num_detections = num_detections + self.chunk_size = chunk_size + self.framerate = framerate + + self.pyramid_size_1 = int(np.ceil(receptive_field / 7)) + self.pyramid_size_2 = int(np.ceil(receptive_field / 3)) + self.pyramid_size_3 = int(np.ceil(receptive_field / 2)) + self.pyramid_size_4 = int(np.ceil(receptive_field)) + + # Base Convolutional Layers + self.conv_1 = nn.Conv2d( + in_channels=1, out_channels=128, kernel_size=(1, input_size) + ) + 
self.conv_2 = nn.Conv2d(in_channels=128, out_channels=32, kernel_size=(1, 1)) + + # Temporal Pyramidal Module + self.pad_p_1 = nn.ZeroPad2d( + ( + 0, + 0, + (self.pyramid_size_1 - 1) // 2, + self.pyramid_size_1 - 1 - (self.pyramid_size_1 - 1) // 2, + ) + ) + self.pad_p_2 = nn.ZeroPad2d( + ( + 0, + 0, + (self.pyramid_size_2 - 1) // 2, + self.pyramid_size_2 - 1 - (self.pyramid_size_2 - 1) // 2, + ) + ) + self.pad_p_3 = nn.ZeroPad2d( + ( + 0, + 0, + (self.pyramid_size_3 - 1) // 2, + self.pyramid_size_3 - 1 - (self.pyramid_size_3 - 1) // 2, + ) + ) + self.pad_p_4 = nn.ZeroPad2d( + ( + 0, + 0, + (self.pyramid_size_4 - 1) // 2, + self.pyramid_size_4 - 1 - (self.pyramid_size_4 - 1) // 2, + ) + ) + self.conv_p_1 = nn.Conv2d( + in_channels=32, out_channels=8, kernel_size=(self.pyramid_size_1, 1) + ) + self.conv_p_2 = nn.Conv2d( + in_channels=32, out_channels=16, kernel_size=(self.pyramid_size_2, 1) + ) + self.conv_p_3 = nn.Conv2d( + in_channels=32, out_channels=32, kernel_size=(self.pyramid_size_3, 1) + ) + self.conv_p_4 = nn.Conv2d( + in_channels=32, out_channels=64, kernel_size=(self.pyramid_size_4, 1) + ) + + # ------------------- + # Segmentation module + # ------------------- + + self.kernel_seg_size = 3 + self.pad_seg = nn.ZeroPad2d( + ( + 0, + 0, + (self.kernel_seg_size - 1) // 2, + self.kernel_seg_size - 1 - (self.kernel_seg_size - 1) // 2, + ) + ) + self.conv_seg = nn.Conv2d( + in_channels=152, + out_channels=dim_capsule * num_classes, + kernel_size=(self.kernel_seg_size, 1), + ) + self.batch_seg = nn.BatchNorm2d( + num_features=self.chunk_size, momentum=0.01, eps=0.001 + ) + + def forward(self, inputs): + # ------------------------------------- + # Temporal Convolutional neural network + # ------------------------------------- + + # Base Convolutional Layers + conv_1 = F.relu(self.conv_1(inputs)) + # print("Conv_1 size: ", conv_1.size()) + + conv_2 = F.relu(self.conv_2(conv_1)) + # print("Conv_2 size: ", conv_2.size()) + + # Temporal Pyramidal Module + conv_p_1 = 
F.relu(self.conv_p_1(self.pad_p_1(conv_2))) + # print("Conv_p_1 size: ", conv_p_1.size()) + conv_p_2 = F.relu(self.conv_p_2(self.pad_p_2(conv_2))) + # print("Conv_p_2 size: ", conv_p_2.size()) + conv_p_3 = F.relu(self.conv_p_3(self.pad_p_3(conv_2))) + # print("Conv_p_3 size: ", conv_p_3.size()) + conv_p_4 = F.relu(self.conv_p_4(self.pad_p_4(conv_2))) + # print("Conv_p_4 size: ", conv_p_4.size()) + + concatenation = torch.cat((conv_2, conv_p_1, conv_p_2, conv_p_3, conv_p_4), 1) + # print("Concatenation size: ", concatenation.size()) + + # ------------------- + # Segmentation module + # ------------------- + + conv_seg = self.conv_seg(self.pad_seg(concatenation)) + # print("Conv_seg size: ", conv_seg.size()) + + conv_seg_permuted = conv_seg.permute(0, 2, 3, 1) + # print("Conv_seg_permuted size: ", conv_seg_permuted.size()) + + conv_seg_reshaped = conv_seg_permuted.view( + conv_seg_permuted.size()[0], + conv_seg_permuted.size()[1], + self.dim_capsule, + self.num_classes, + ) + # print("Conv_seg_reshaped size: ", conv_seg_reshaped.size()) + + # conv_seg_reshaped_permuted = conv_seg_reshaped.permute(0,3,1,2) + # print("Conv_seg_reshaped_permuted size: ", conv_seg_reshaped_permuted.size()) + + conv_seg_norm = torch.sigmoid(self.batch_seg(conv_seg_reshaped)) + # print("Conv_seg_norm: ", conv_seg_norm.size()) + + # conv_seg_norm_permuted = conv_seg_norm.permute(0,2,3,1) + # print("Conv_seg_norm_permuted size: ", conv_seg_norm_permuted.size()) + + output_segmentation = torch.sqrt( + torch.sum(torch.square(conv_seg_norm - 0.5), dim=2) * 4 / self.dim_capsule + ) + # print("Output_segmentation size: ", output_segmentation.size()) + + return conv_seg, output_segmentation + + +class MaxPool(torch.nn.Module): + def __init__(self, nb_frames): + super(MaxPool, self).__init__() + self.pooling_layer = nn.MaxPool1d(nb_frames, stride=1) + + def forward(self, inputs): + return self.pooling_layer(inputs.permute((0, 2, 1))).squeeze(-1) + + +class MaxPool_temporally_aware(torch.nn.Module): 
+    def __init__(self, nb_frames):
+        super(MaxPool_temporally_aware, self).__init__()
+        self.pooling_layer_before = nn.MaxPool1d(int(nb_frames / 2), stride=1)
+        self.pooling_layer_after = nn.MaxPool1d(nb_frames - int(nb_frames / 2), stride=1)  # after-half holds the remaining ceil(nb_frames/2) frames
+
+    def forward(self, inputs):
+        nb_frames_50 = int(inputs.shape[1] / 2)
+        input_before = inputs[:, :nb_frames_50, :]
+        input_after = inputs[:, nb_frames_50:, :]
+        inputs_before_pooled = self.pooling_layer_before(
+            input_before.permute((0, 2, 1))
+        ).squeeze(-1)
+        inputs_after_pooled = self.pooling_layer_after(
+            input_after.permute((0, 2, 1))
+        ).squeeze(-1)
+        inputs_pooled = torch.cat((inputs_before_pooled, inputs_after_pooled), dim=1)
+        return inputs_pooled
+
+
+class AvgPool(torch.nn.Module):
+    def __init__(self, nb_frames):
+        super(AvgPool, self).__init__()
+        self.pooling_layer = nn.AvgPool1d(nb_frames, stride=1)
+
+    def forward(self, inputs):
+        return self.pooling_layer(inputs.permute((0, 2, 1))).squeeze(-1)
+
+
+class AvgPool_temporally_aware(torch.nn.Module):
+    def __init__(self, nb_frames):
+        super(AvgPool_temporally_aware, self).__init__()
+        self.pooling_layer_before = nn.AvgPool1d(int(nb_frames / 2), stride=1)
+        self.pooling_layer_after = nn.AvgPool1d(nb_frames - int(nb_frames / 2), stride=1)  # after-half holds the remaining ceil(nb_frames/2) frames
+
+    def forward(self, inputs):
+        nb_frames_50 = int(inputs.shape[1] / 2)
+        input_before = inputs[:, :nb_frames_50, :]
+        input_after = inputs[:, nb_frames_50:, :]
+        inputs_before_pooled = self.pooling_layer_before(
+            input_before.permute((0, 2, 1))
+        ).squeeze(-1)
+        inputs_after_pooled = self.pooling_layer_after(
+            input_after.permute((0, 2, 1))
+        ).squeeze(-1)
+        inputs_pooled = torch.cat((inputs_before_pooled, inputs_after_pooled), dim=1)
+        return inputs_pooled
+
+
+class NetRVLAD(torch.nn.Module):
+    def __init__(self, vocab_size, input_dim):
+        super(NetRVLAD, self).__init__()
+        self.pooling_layer = NetRVLAD_core(
+            cluster_size=vocab_size, feature_size=input_dim, add_batch_norm=True
+        )
+
+    def forward(self, inputs):
+        return self.pooling_layer(inputs)
+
+
+class NetRVLAD_temporally_aware(torch.nn.Module):
+    def __init__(self, vocab_size, input_dim):
+        super(NetRVLAD_temporally_aware, self).__init__()
+        self.pooling_layer_before = NetRVLAD_core(
+            cluster_size=int(vocab_size / 2),
+            feature_size=input_dim,
+            add_batch_norm=True,
+        )
+        self.pooling_layer_after = NetRVLAD_core(
+            cluster_size=vocab_size - int(vocab_size / 2),  # halves sum to vocab_size even when odd
+            feature_size=input_dim,
+            add_batch_norm=True,
+        )
+
+    def forward(self, inputs):
+        nb_frames_50 = int(inputs.shape[1] / 2)
+        inputs_before_pooled = self.pooling_layer_before(inputs[:, :nb_frames_50, :])
+        inputs_after_pooled = self.pooling_layer_after(inputs[:, nb_frames_50:, :])
+        inputs_pooled = torch.cat((inputs_before_pooled, inputs_after_pooled), dim=1)
+        return inputs_pooled
+
+
+class NetVLAD(torch.nn.Module):
+    def __init__(self, vocab_size, input_dim):
+        super(NetVLAD, self).__init__()
+        self.pooling_layer = NetVLAD_core(
+            cluster_size=vocab_size, feature_size=input_dim, add_batch_norm=True
+        )
+
+    def forward(self, inputs):
+        return self.pooling_layer(inputs)
+
+
+class NetVLAD_temporally_aware(torch.nn.Module):
+    def __init__(self, vocab_size, input_dim):
+        super(NetVLAD_temporally_aware, self).__init__()
+        self.pooling_layer_before = NetVLAD_core(
+            cluster_size=int(vocab_size / 2),
+            feature_size=input_dim,
+            add_batch_norm=True,
+        )
+        self.pooling_layer_after = NetVLAD_core(
+            cluster_size=vocab_size - int(vocab_size / 2),  # halves sum to vocab_size even when odd
+            feature_size=input_dim,
+            add_batch_norm=True,
+        )
+
+    def forward(self, inputs):
+        nb_frames_50 = int(inputs.shape[1] / 2)
+        inputs_before_pooled = self.pooling_layer_before(inputs[:, :nb_frames_50, :])
+        inputs_after_pooled = self.pooling_layer_after(inputs[:, nb_frames_50:, :])
+        inputs_pooled = torch.cat((inputs_before_pooled, inputs_after_pooled), dim=1)
+        return inputs_pooled
+
+
+class NetVLAD_core(nn.Module):
+    def __init__(self, cluster_size, feature_size, add_batch_norm=True):
+        super(NetVLAD_core, self).__init__()
+        self.feature_size = feature_size
+        self.cluster_size = cluster_size
+        self.clusters = nn.Parameter(
+            (1 / math.sqrt(feature_size)) * torch.randn(feature_size, cluster_size)
+        )
+        self.clusters2 = nn.Parameter(
+            (1 / math.sqrt(feature_size)) * torch.randn(1, feature_size, cluster_size)
+        )
+
+        self.add_batch_norm = add_batch_norm
+        self.out_dim = cluster_size * feature_size
+
+    def forward(self, x):
+        # x [BS, T, D]
+        max_sample = x.size()[1]
+
+        # LOUPE
+        if self.add_batch_norm:  # normalization along feature dimension
+            x = F.normalize(x, p=2, dim=2)
+
+        x = x.reshape(-1, self.feature_size)
+        assignment = torch.matmul(x, self.clusters)
+
+        assignment = F.softmax(assignment, dim=1)
+        assignment = assignment.view(-1, max_sample, self.cluster_size)
+
+        a_sum = torch.sum(assignment, -2, keepdim=True)
+        a = a_sum * self.clusters2
+
+        assignment = assignment.transpose(1, 2)
+
+        x = x.view(-1, max_sample, self.feature_size)
+        vlad = torch.matmul(assignment, x)
+        vlad = vlad.transpose(1, 2)
+        vlad = vlad - a
+
+        # L2 intra norm
+        vlad = F.normalize(vlad)
+
+        # flattening + L2 norm
+        vlad = vlad.reshape(-1, self.cluster_size * self.feature_size)
+        vlad = F.normalize(vlad)
+
+        return vlad
+
+
+class NetRVLAD_core(nn.Module):
+    def __init__(self, cluster_size, feature_size, add_batch_norm=True):
+        super(NetRVLAD_core, self).__init__()
+        self.feature_size = feature_size
+        self.cluster_size = cluster_size
+        self.clusters = nn.Parameter(
+            (1 / math.sqrt(feature_size)) * torch.randn(feature_size, cluster_size)
+        )
+        # self.clusters2 = nn.Parameter((1/math.sqrt(feature_size))
+        #                   *th.randn(1, feature_size, cluster_size))
+        # self.clusters = nn.Parameter(torch.rand(1,feature_size, cluster_size))
+        # self.clusters2 = nn.Parameter(torch.rand(1,feature_size, cluster_size))
+
+        self.add_batch_norm = add_batch_norm
+        # self.batch_norm = nn.BatchNorm1d(cluster_size)
+        self.out_dim = cluster_size * feature_size
+        # (+ 128 params?)
+ + def forward(self, x): + max_sample = x.size()[1] + + # LOUPE + if self.add_batch_norm: # normalization along feature dimension + x = F.normalize(x, p=2, dim=2) + + x = x.reshape(-1, self.feature_size) + assignment = torch.matmul(x, self.clusters) + + assignment = F.softmax(assignment, dim=1) + assignment = assignment.view(-1, max_sample, self.cluster_size) + + # a_sum = th.sum(assignment,-2,keepdim=True) + # a = a_sum*self.clusters2 + + assignment = assignment.transpose(1, 2) + + x = x.view(-1, max_sample, self.feature_size) + rvlad = torch.matmul(assignment, x) + rvlad = rvlad.transpose(-1, 1) + + # vlad = vlad.transpose(1,2) + # vlad = vlad - a + + # L2 intra norm + rvlad = F.normalize(rvlad) + + # flattening + L2 norm + rvlad = rvlad.reshape(-1, self.cluster_size * self.feature_size) + rvlad = F.normalize(rvlad) + + return rvlad \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 6a5a39b..770f537 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "SoccerNet", "av", "decord; platform_system != 'Darwin' and pla name = "Jeet Vora" [project.optional-dependencies] -localization = [ "nvidia-dali-cuda120", "cupy-cuda12x", "tabulate",] +localization = [ "nvidia-dali-cuda120", "cupy-cuda12x", "tabulate", "pytorch-lightning"] py-geometric = [ "torch-geometric", "torch-scatter", "torch-sparse", "torch-cluster", "torch-spline-conv",] [tool.setuptools.package-data]