In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import cv2
import numpy as np
import os
from PIL import Image

In [2]:
print(torch.__version__)

2.3.1


In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgba
import seaborn as sns
sns.set()

## Progress bar
from tqdm.notebook import tqdm

  set_matplotlib_formats('svg', 'pdf') # For export


The next cell is used in case of real time detection, it needs to access camera for input

In [5]:
# checking run time
if torch.cuda.is_available():
    device_name = torch.device("cuda")
else:
    device_name = torch.device('cpu')
print("Using {}.".format(device_name))

Using cuda.


# Data preparation pipepline

In [6]:
from ultralytics import YOLO
import yaml

In [7]:
def get_video_paths_and_labels(root_dir):
    video_paths = []
    labels = []
    class_names = sorted(os.listdir(root_dir))
    class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}

    print(f"class names: {class_names}")
    print(f"mapping index to class names: {class_to_idx} ")
    for cls_name in class_names:
        cls_dir = os.path.join(root_dir, cls_name)
        print(f"Checking directory: {cls_dir}")
        if os.path.isdir(cls_dir):  # Check if it's a directory
            for filename in os.listdir(cls_dir):
                if filename.endswith(('.mp4', '.avi', '.mov')):
                    video_path = os.path.join(cls_dir, filename)
                    video_paths.append(video_path)
                    labels.append(class_to_idx[cls_name])
                    print(f"Found video: {video_path}, label: {class_to_idx[cls_name]}")
                else:
                    print(f"Skipping non-video file: {filename}")

    return video_paths, labels

Process video into frames, pack with frames, labels, and keypoints lists so run the GetKps first


In [8]:
class VideoDataset(Dataset):
    def __init__(self, video_paths, labels, transform=None, model_path=None):
        self.video_paths = video_paths
        self.labels = labels
        self.transform = transform
        self.kp_extractor = GetKps(model_path) if model_path else None

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        cap = cv2.VideoCapture(video_path)
        frames = []
        keypoints_list = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            if self.transform:
                frame = self.transform(frame)
            frames.append(frame)

            # Extract keypoints if the keypoint extractor is available
            if self.kp_extractor:
                kps = self.kp_extractor.extractkps(np.array(frame.permute(1, 2, 0)))
                keypoints_list.append(kps)

        cap.release()
        frames = torch.stack(frames, dim=0)
        frames = frames.permute(1, 0, 2, 3)
        label = self.labels[idx]

        return frames, keypoints_list, label

class GetKps():
    def __init__(self, model_path):
        self.model = YOLO(model_path).to(device_name)
        self.model.eval()

    def extractkps(self, frame):
        results = self.model(frame)
        kps_list = []

        if results:
            for r in results:
                if hasattr(r, 'keypoints'):
                    keypoints = r.keypoints.xyn.cpu().numpy()
                    kps_list.append(keypoints)
                else:
                    # print("No keypoints attribute found in the results.")
                    kps_list.append(self._get_dummy_keypoints(frame.shape))
        else:
            # print("No results found.")
            kps_list.append(self._get_dummy_keypoints(frame.shape))
        return kps_list
    def _get_dummy_keypoints(self, shape):
        dumy_keypoints = np.zeros((17, 3))
        return dumy_keypoints



# Define the model path
model_path = "best.pt"




In [9]:
# Define the transforms
train_transform = transforms.Compose([
    # transforms.ToPILImage(),
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [10]:
# Get video paths and labels
train_path = 'dataset/train'
video_paths, labels = get_video_paths_and_labels(train_path)
train_dataset = VideoDataset(video_paths, labels, transform=train_transform)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, pin_memory=True)

class names: ['lame', 'sound']
mapping index to class names: {'lame': 0, 'sound': 1} 
Checking directory: dataset/train\lame
Found video: dataset/train\lame\1.mp4, label: 0
Found video: dataset/train\lame\2.mp4, label: 0
Found video: dataset/train\lame\3.mp4, label: 0
Found video: dataset/train\lame\4.mp4, label: 0
Found video: dataset/train\lame\5.mp4, label: 0
Skipping non-video file: download.jpg
Skipping non-video file: horse-4062214.webp
Skipping non-video file: lame-horse-1.webp
Skipping non-video file: tt-blog-3.jpg
Checking directory: dataset/train\sound
Found video: dataset/train\sound\2122952-hd_1280_720_60fps.mp4, label: 1
Found video: dataset/train\sound\2865004-hd_1280_720_30fps.mp4, label: 1
Found video: dataset/train\sound\2865027-hd_1280_720_30fps.mp4, label: 1
Skipping non-video file: Copy of 11-Black-and-White-Horse-Breeds-with-_jpg.rf.bf221ecf541b08703e4bb3b4fc504d29(1).jpg
Skipping non-video file: Copy of 11-Black-and-White-Horse-Breeds-with-_jpg.rf.bf221ecf541b0870

In [11]:
# Example usage: iterating over the DataLoader
for frames, keypoints_list, label in train_loader:
    print(f"Frames shape: {frames.shape}")
    print(f"Keypoints list: {keypoints_list}")
    print(f"Label: {label}")
    break

Frames shape: torch.Size([1, 3, 208, 128, 128])
Keypoints list: []
Label: tensor([0])


**Feature extraction**

In [12]:
class Conv3D_2L(nn.Module):
    def __init__(self):
        super(Conv3D_2L, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv3d(3, 16, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.BatchNorm3d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)),
                        
            nn.Conv3d(16, 32, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.BatchNorm3d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)),
        )
        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
     

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return x

model_load_path = 'lame horse'
feature_model = torch.load(model_load_path)
feature_model.fc = nn.Identity()
feature_model = feature_model.to(device_name)
feature_model.eval()

Conv3D_2L(
  (conv_layers): Sequential(
    (0): Conv3d(3, 16, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): BatchNorm3d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
    (4): Conv3d(16, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (5): BatchNorm3d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool3d(output_size=(1, 1, 1))
  (fc): Identity()
)

In [19]:
class HorseLamenessClassifier(nn.Module):
    def __init__(self, feature_extraction, features_dim, keypoints_dim):
        super().__init__()
        self.feature = feature_extraction
        self.keypoints_pool = nn.AdaptiveAvgPool2d((1,keypoints_dim))
        self.classifier = nn.Linear(features_dim + keypoints_dim, 2)

    def forward(self, video_frames, keypoints=None):
        features = self.feature(video_frames)
        # features = features.mean(dim=1)
        if keypoints is not None and len(keypoints) > 0:
            keypoints = np.array(keypoints)
            keypoints = torch.from_numpy(np.concatenate(keypoints, axis=0)).float().to(device_name)                                           
            keypoints = self.keypoints_pool(keypoints).squeeze(0)
            # keypoints = torch.from_numpy(keypoints).float().to(device_name) 
            combined_features = torch.cat((features, keypoints), dim=1)  
        else: combined_features = features
        # combined_features = combined_features.mean(dim=1)
        logits = self.classifier(combined_features)
        return logits

In [14]:
!pip install torchinfo



In [15]:
val_path = 'dataset/val'
val_video_paths, val_labels = get_video_paths_and_labels(val_path)
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
val_dataset = VideoDataset(val_video_paths, val_labels, transform=val_transform)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, pin_memory=True)

class names: ['lame', 'sound']
mapping index to class names: {'lame': 0, 'sound': 1} 
Checking directory: dataset/val\lame
Found video: dataset/val\lame\2.mp4, label: 0
Checking directory: dataset/val\sound
Found video: dataset/val\sound\2122952-hd_1280_720_60fps.mp4, label: 1


In [16]:
torch.cuda.empty_cache()

In [20]:
model = HorseLamenessClassifier(feature_extraction = feature_model,features_dim= 32, keypoints_dim = 17*3 )
model = model.to(device_name)
criterion = nn.CrossEntropyLoss().to(device_name)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Initialize lists to store metrics
train_losses = []
val_losses = []
val_accuracies = []

NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_loss = 0
    for videos, keypoints, labels in train_loader:
        videos = videos.to(device_name)
        keypoints = [kp.to(device_name) for kp in keypoints]
        labels = labels.to(device_name)
        optimizer.zero_grad()
        if len(keypoints)>0:
            outputs = model(videos,keypoints)
        else: outputs = model(videos)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    avg_train_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for videos, keypoints, labels in val_loader:
            videos = videos.to(device_name)
            keypoints = [kp.to(device_name) for kp in keypoints]
            labels = labels.to(device_name)
            if len(keypoints)>0:
                outputs = model(videos,keypoints)
            else: outputs = model(videos)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    val_accuracy = correct / total
    val_accuracies.append(val_accuracy)

    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

print("Training finished!")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x32 and 83x2)

In [None]:
# Plotting the training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Plotting the validation accuracy
plt.figure(figsize=(10, 5))
plt.plot(val_accuracies, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Validation Accuracy')
plt.legend()
plt.show()


In [None]:
model_save_dir = 'simpleclassifier_model'  # Create a directory to store the model
os.makedirs(model_save_dir, exist_ok=True)  # Create the directory if it doesn't exist

# 1. Save the state dictionary (weights only)
state_dict_path = os.path.join(model_save_dir, 'model_state_dict.pth')
torch.save(model.state_dict(), state_dict_path)
print(f"Model state dictionary saved to {state_dict_path}")

# 2. Save the entire model (architecture + weights)
entire_model_path = os.path.join(model_save_dir, 'entire_model.pth')
torch.save(model, entire_model_path)
print(f"Entire model saved to {entire_model_path}")


In [None]:
import torch

# Check GPU availability
if torch.cuda.is_available():
    # Print GPU memory stats
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available. Check if GPU is enabled in your Colab runtime.")


In [None]:
print('Predicted class:', torch.argmax(output, dim=1).item())
