In [24]:
import warnings
# Ignore every warning whose category is SyntaxWarning for the rest of the session.
warnings.filterwarnings("ignore", category=SyntaxWarning)

import sys
# Put the relative "pytorchvideo" directory on the import path. The membership
# check keeps the cell idempotent: re-running it no longer appends duplicates.
if "pytorchvideo" not in sys.path:
    sys.path.append("pytorchvideo")

In [25]:
from glob import glob
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt

## Load file names into dataframe

In [None]:
current_dir = os.getcwd()
# glob() yields entries in an arbitrary, filesystem-dependent order. Sorting
# fixes the row order so that any seeded shuffle/split downstream is reproducible.
non = sorted(glob("NonViolence/*"))
vio = sorted(glob("Violence/*"))
# Labels follow the concatenation order used below: 0 for NonViolence, 1 for Violence.
label = [0] * len(non) + [1] * len(vio)

non_full_paths = [os.path.abspath(file) for file in non]
vio_full_paths = [os.path.abspath(file) for file in vio]

# One row per video: absolute file path plus its integer label.
df = pd.DataFrame(
    {"file": non_full_paths + vio_full_paths, "label": label},
    columns=["file", "label"],
)
print("non violence video", len(non))
print("violence video", len(vio))
df.head(10)

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import random_split

SPLIT_SEED = 42  # fixed so the same videos land in the same split on every run

train_size = int(0.7 * len(df))  # 70%
val_size = int(0.15 * len(df))    # 15%
test_size = len(df) - train_size - val_size  # Remaining 15%

# torch's random_split() returns Subset objects, which do not expose DataFrame
# methods such as .iterrows() that the following cells call. Shuffle with
# pandas and slice by position instead so every split stays a DataFrame.
shuffled_df = df.sample(frac=1.0, random_state=SPLIT_SEED).reset_index(drop=True)
train_df = shuffled_df.iloc[:train_size]
val_df = shuffled_df.iloc[train_size:train_size + val_size]
test_df = shuffled_df.iloc[train_size + val_size:]

# The three splits must partition the full frame exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(df)
assert len(test_df) == test_size

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")


In [None]:
# pytorchvideo expects a list of (file path, {'label': ...}) tuples, so pair
# every training file with a dictionary carrying its label.
train_video_paths = [
    (video_file, {'label': video_label})
    for video_file, video_label in zip(
        train_df['file'].tolist(), train_df['label'].tolist()
    )
]
print("number of train:", len(train_video_paths))
train_video_paths[0]

In [None]:
# Same (file path, {'label': ...}) tuple layout as the training list, built
# from the validation split.
val_video_paths = [
    (video_file, {'label': video_label})
    for video_file, video_label in zip(
        val_df['file'].tolist(), val_df['label'].tolist()
    )
]
print("number of val:", len(val_video_paths))
val_video_paths[0]

## Augmentation

In [None]:
from pytorchvideo.data import labeled_video_dataset, kinetics, make_clip_sampler, LabeledVideoDataset

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,

    UniformTemporalSubsample,
    Permute,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    CenterCrop,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)
from torchvision.transforms._transforms_video import (
    CenterCropVideo, 
    NormalizeVideo,
)

# Could try to replace _transforms_video with the functional API
# import torch
# import torchvision.transforms.functional as F
# 
# # Define the transformation functions using functional API
# def center_crop_video(video, output_size):
#     # video is a tensor of shape (C, T, H, W)
#     c, t, h, w = video.shape
#     cropped_frames = [F.center_crop(video[:, i, :, :], output_size) for i in range(t)]
#     return torch.stack(cropped_frames, dim=1)  # (C, T, H', W')
# 
# def normalize_video(video, mean, std):
#     # video is a tensor of shape (C, T, H, W)
#     c, t, h, w = video.shape
#     normalized_frames = [F.normalize(video[:, i, :, :], mean, std) for i in range(t)]
#     return torch.stack(normalized_frames, dim=1)  # (C, T, H, W)
# 
# # Example usage
# video = torch.randn(3, 10, 128, 128)  # Example video tensor (C, T, H, W)
# output_size = (100, 100)
# mean = [0.5, 0.5, 0.5]
# std = [0.5, 0.5, 0.5]
# 
# cropped_video = center_crop_video(video, output_size)
# normalized_video = normalize_video(cropped_video, mean, std)
# 
# print(normalized_video.shape)  # Should be (3, 10, 100, 100)


In [8]:
# Steps applied, in this order, to the tensor stored under the "video" key.
_clip_pipeline = Compose([
    # Keep 20 frames, sampled uniformly across the clip.
    UniformTemporalSubsample(20),
    # Rescale pixel values by 1/255.
    Lambda(lambda x: x / 255.0),
    # Per-channel standardisation with mean 0.45 and std 0.225.
    Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
    # Randomly resize so the short side is between 248 and 256 pixels.
    RandomShortSideScale(min_size=248, max_size=256),
    # Take the central 224x224 region of every frame.
    CenterCropVideo(224),
    # Mirror horizontally with probability 0.5.
    RandomHorizontalFlip(p=0.5),
])

# ApplyTransformToKey restricts the pipeline to the "video" entry of each
# sample dict; the other entries are passed through untouched.
video_transform = Compose([
    ApplyTransformToKey(key="video", transform=_clip_pipeline),
])

In [None]:
# Show only a small preview; displaying the whole list floods the notebook output.
train_video_paths[:5]

In [10]:
from torch.utils.data import dataloader
# train_dataset = LabeledVideoDataset(train_video_paths, 
#                                     clip_sampler=make_clip_sampler("random", 2), # video duration is 2 seconds
#                                     transform=video_transform,
#                                     decode_audio=False) #  
# loader = dataloader.DataLoader(train_dataset, batch_size=4, num_workers=0, pin_memory=False) # collate_fn=collate_fn

In [11]:
# Try to load a batch and catch exceptions
# try:
#     batch = next(iter(loader))
#     print(batch)
# except Exception as e:
#     print(f"Error loading video: {e}")

### Inspect the dataloader

In [12]:
# # Inspect the batch structure
# print("Batch keys:", batch.keys())
# 
# # Assuming the batch contains 'video' and 'label'
# # Inspect the shape of the 'video' tensor
# video_data = batch['video']
# print("Video data shape:", video_data.shape) # (batch size, channels, frames, height, width)
# 
# # Inspect the shape of the 'label' tensor
# label_data = batch['label']
# print("Label data shape:", label_data.shape)

### Inspect the dataset before wrapping it in a DataLoader

In [13]:
# try:
#     for i in range(1):  # Inspect first item
#         sample = next(iter(train_dataset))
#         print(f"Sample {i}:")
#         for key, value in sample.items():
#             print(f"  {key}: type={type(value)}, shape={value.shape if hasattr(value, 'shape') else 'N/A'}")
# except Exception as e:
#     print(f"Error accessing sample: {e}")

# Model architecture

In [14]:
import torch.nn as nn
import torch
from pytorch_lightning import LightningModule, seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader
import torchmetrics

In [None]:
# Fetch the pretrained efficient_x3d_xs model from the pytorchvideo hub repo
# and display its module tree as the cell output.
video_model = torch.hub.load(
    repo_or_dir="facebookresearch/pytorchvideo",
    model="efficient_x3d_xs",
    pretrained=True,
)
video_model

In [16]:
import torch
import torch.nn as nn
import torchmetrics
from pytorch_lightning import LightningModule
from torch.optim.lr_scheduler import CosineAnnealingLR
from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
import numpy as np

class ViolenceClassifier(LightningModule):
    """Binary violence / non-violence video classifier on a pretrained X3D-XS backbone.

    The pretrained ``efficient_x3d_xs`` hub model's 400 output features are
    passed through a ReLU and a single linear unit, giving one raw logit per
    clip. The loss is ``BCEWithLogitsLoss``, so the forward output is a logit
    and needs a sigmoid before it can be read as a probability.

    The dataloaders read the notebook-level globals ``train_video_paths``,
    ``val_video_paths`` and ``video_transform``.

    Args:
        num_classes: Accepted for interface compatibility; it is not used, the
            head always has exactly one output unit.
    """

    def __init__(self, num_classes: int):
        super().__init__()
        self.video_model = torch.hub.load("facebookresearch/pytorchvideo", "efficient_x3d_xs", pretrained=True)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(400, 1)  # efficient_x3d_xs model has 400 output features

        self.lr = 1e-3
        self.batch_size = 4
        self.num_worker = 4

        # Evaluation metric
        self.metric = torchmetrics.Accuracy(task='binary')

        # Loss function (expects raw logits)
        self.criterion = nn.BCEWithLogitsLoss()

        # Per-step results collected for the matching on_*_epoch_end hook.
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one raw logit per input clip."""
        x = self.video_model(x)
        x = self.relu(x)
        x = self.linear(x)
        return x

    def configure_optimizers(self):
        """AdamW with a cosine-annealed learning rate (T_max=10, floor 1e-6)."""
        opt = torch.optim.AdamW(self.parameters(), lr=self.lr)
        scheduler = CosineAnnealingLR(opt, T_max=10, eta_min=1e-6, last_epoch=-1)
        return {"optimizer": opt, "lr_scheduler": scheduler}

    # ------------------------------------------------------------------ helpers

    def _build_loader(self, video_paths):
        """Wrap (path, {'label': ...}) pairs in a LabeledVideoDataset and a DataLoader."""
        dataset = LabeledVideoDataset(
            video_paths,
            clip_sampler=make_clip_sampler("random", 2),  # randomly placed 2-second clips
            transform=video_transform,
            decode_audio=False,
        )
        # pin_memory stays off, as in the original configuration.
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=self.num_worker, pin_memory=False)

    def _loss_and_metric(self, batch):
        """Run the model on a batch and return (loss, accuracy) for it."""
        video, label = batch['video'], batch['label']
        output = self(video)  # Same as self.forward(x)
        label = label.unsqueeze(1)  # Reshape to (batch_size, 1) to match the logits
        loss = self.criterion(output, label.to(torch.float32))
        metric = self.metric(output, label.int())
        return loss, metric

    def _log_epoch_averages(self, outputs, loss_name, metric_name, prog_bar):
        """Log the mean loss/metric over the stored step outputs, then clear them."""
        if not outputs:
            # Nothing was collected (e.g. zero batches ran); torch.stack([]) would raise.
            return
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean().cpu().numpy().round(2)
        avg_metric = torch.stack([x['metric'] for x in outputs]).mean().cpu().numpy().round(2)

        # Cast to float so Lightning does not have to convert the integer epoch itself.
        self.log("epoch", float(self.current_epoch), prog_bar=True, logger=True)
        self.log(loss_name, avg_loss, prog_bar=prog_bar)
        self.log(metric_name, avg_metric, prog_bar=prog_bar)
        outputs.clear()

    # ----------------------------------------------------------------- training

    def train_dataloader(self):
        return self._build_loader(train_video_paths)

    def training_step(self, batch, batch_idx):
        loss, metric = self._loss_and_metric(batch)
        self.log("train_loss", loss)
        self.log("train_metric", metric, prog_bar=True)
        # Store detached copies only: keeping the attached loss would hold every
        # step's autograd graph in memory until the end of the epoch.
        self.training_step_outputs.append({"loss": loss.detach(), "metric": metric.detach()})
        return {"loss": loss, "metric": metric.detach()}

    def on_train_epoch_end(self):
        self._log_epoch_averages(
            self.training_step_outputs, "avg_train_loss", "avg_train_metric", prog_bar=True
        )

    # --------------------------------------------------------------- validation

    def val_dataloader(self):
        # NOTE(review): validation reuses the random clip sampler and the random
        # augmentations in video_transform, so validation numbers vary between
        # runs - confirm this is intended.
        return self._build_loader(val_video_paths)

    def validation_step(self, batch, batch_idx):
        loss, metric = self._loss_and_metric(batch)
        self.log("val_loss", loss)
        self.log("val_metric", metric, prog_bar=True)
        self.validation_step_outputs.append({"loss": loss.detach(), "metric": metric.detach()})
        return {"loss": loss, "metric": metric.detach()}

    def on_validation_epoch_end(self):
        self._log_epoch_averages(
            self.validation_step_outputs, "avg_val_loss", "avg_val_metric", prog_bar=False
        )

    # --------------------------------------------------------------------- test

    def test_dataloader(self):
        # NOTE(review): this evaluates on val_video_paths. No test_video_paths
        # list is visible in this notebook, so the held-out test split appears
        # to be unused - TODO build it from test_df and pass it here.
        return self._build_loader(val_video_paths)

    def test_step(self, batch, batch_idx):
        video, label = batch['video'], batch['label']
        output = self(video)  # Same as self.forward(x)
        result = {"label": label.detach(), "pred": output.detach()}
        # Collected here because the epoch-end hook no longer receives step outputs.
        self.test_step_outputs.append(result)
        return result

    def on_test_epoch_end(self):
        """Print a classification report over everything collected in test_step.

        Replaces the former ``test_epoch_end(outputs)`` hook, which is not part
        of the hook API used by the rest of this class.
        """
        if not self.test_step_outputs:
            return
        labels = torch.cat([x['label'] for x in self.test_step_outputs]).cpu().numpy().reshape(-1)
        logits = torch.cat([x['pred'] for x in self.test_step_outputs])
        # The model emits raw logits; convert to probabilities before applying
        # the 0.5 decision threshold.
        probs = torch.sigmoid(logits.float()).cpu().numpy().reshape(-1)
        preds = np.where(probs > 0.5, 1, 0)

        self.log("epoch", float(self.current_epoch), prog_bar=True, logger=True)
        print(classification_report(labels, preds))
        self.test_step_outputs.clear()
        

In [38]:
# Checkpointing monitors the logged 'val_loss' and writes into ./checkpoints.
# save_last=True additionally keeps a copy of the most recent checkpoint; it
# does not replace the monitored one.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints',
    filename='file',
    save_last=True,
)
# Record the learning rate once per epoch.
lr_monitor = LearningRateMonitor(logging_interval='epoch')

In [None]:
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor

# Seed BEFORE building the model: the classifier's new linear head is randomly
# initialised in __init__, so seeding afterwards leaves it unreproducible.
seed_everything(42)

model = ViolenceClassifier(num_classes=1)

# Define the trainer with the appropriate precision setting
trainer = Trainer(
    max_epochs=10,
    accelerator='gpu',
    devices=-1,
    log_every_n_steps=1,  # Log metrics after every batch
    #precision="16-mixed",  # Enable mixed precision training
    accumulate_grad_batches=1, # 2,
    enable_progress_bar=True,
    num_sanity_val_steps=0,
    # NOTE(review): these are fresh callback instances; the `lr_monitor` and
    # `checkpoint_callback` objects configured in the previous cell are not
    # used here - confirm which set is intended.
    callbacks=[LearningRateMonitor(logging_interval='step'), ModelCheckpoint()],
    # NOTE(review): each epoch is limited to one training batch and one
    # validation batch - looks like a smoke-test setting; remove for a real run.
    limit_train_batches=1,
    limit_val_batches=1,
)


# model=ViolenceClassifier(num_classes=1)
# seed_everything(42)
# trainer = Trainer(max_epochs=1, accelerator='gpu', devices=-1,
#                   precision=16,
#                   accumulate_grad_batches=2,
#                   enable_progress_bar=False,
#                   num_sanity_val_steps=0,
#                   callbacks=[lr_monitor, checkpoint_callback],
# )


In [None]:
# Start training; the dataloaders come from the module's own
# train_dataloader() / val_dataloader() hooks.
trainer.fit(model=model)

In [None]:
# Run one validation pass and print what it returns.
val_results = trainer.validate(model=model)
print(val_results)

In [35]:
# (Re)load the TensorBoard notebook extension, then open TensorBoard inline
# pointing at the "lightning_logs" directory.
%reload_ext tensorboard
%tensorboard --logdir lightning_logs

In [54]:
# Disabled: the original line killed a hard-coded PID (91443) left over from an
# earlier interactive session (presumably a stale TensorBoard server - verify).
# On a fresh "Restart & Run All" that PID either does not exist or belongs to
# an unrelated process, so the command must not run unattended.
# To stop a process manually, look up its current PID first and then run:
# !kill <PID>