In [1]:
import json
import os
os.environ["WANDB_DISABLED"] = "true"

with open('/kaggle/input/assignment/kinetics_train.json', 'r') as f:
    data = json.load(f)


video_urls = [entry['url'] for entry in data.values()]
labels = [entry['annotations']['label'] for entry in data.values()]

In [2]:
!pip install pytube

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


In [3]:
from pytube import YouTube

def download_video(url, output_path):
    yt = YouTube(url)
    stream = yt.streams.filter(progressive=True, file_extension="mp4").order_by("resolution").desc().first()
    stream.download(output_path)


In [4]:
from pytube.exceptions import AgeRestrictedError, VideoUnavailable
import os

def safe_download_video(url, output_path):
    try:
        yt = YouTube(url)
        stream = yt.streams.filter(progressive=True, file_extension="mp4").order_by("resolution").desc().first()
        if stream:
            stream.download(output_path)
        else:
            print(f"No suitable stream found for video: {url}")
    except AgeRestrictedError:
        print(f"Skipping age-restricted video: {url}")
    except KeyError:
        print(f"Skipping video due to missing metadata: {url}")
    except VideoUnavailable:
        print(f"Skipping unavailable video: {url}")

video_download_dir = '/kaggle/working/videos' 

# Select only the first 10 videos
for idx, url in enumerate(video_urls[:10]):
    video_path = os.path.join(video_download_dir, f"{idx}.mp4")
    safe_download_video(url, video_path)


Skipping age-restricted video: https://www.youtube.com/watch?v=w0kkkBCE028
Skipping video due to missing metadata: https://www.youtube.com/watch?v=X3L23IqtqWw
Skipping age-restricted video: https://www.youtube.com/watch?v=KgWWZyhJRSw
Skipping video due to missing metadata: https://www.youtube.com/watch?v=s65GEgCm4JA
Skipping unavailable video: https://www.youtube.com/watch?v=rBxLZWb6Df4


In [5]:
!pip install imageio[ffmpeg]


Collecting imageio-ffmpeg (from imageio[ffmpeg])
  Downloading imageio_ffmpeg-0.4.9-py3-none-manylinux2010_x86_64.whl (26.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.9/26.9 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: imageio-ffmpeg
Successfully installed imageio-ffmpeg-0.4.9


In [6]:
# import shutil

# directory_to_remove = '/kaggle/working/frames'

# shutil.rmtree(directory_to_remove)

In [7]:
import cv2
import os

def extract_middle_frame(video_path, output_path):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
    ret, frame = cap.read()
    if ret:
        cv2.imwrite(output_path, frame, [cv2.IMWRITE_JPEG_QUALITY, 100])  # Save as JPG with maximum quality
    cap.release()


In [8]:
import cv2
import os

video_dir = '/kaggle/working/videos' 
frame_dir = '/kaggle/working/frames'  

for class_label in os.listdir(video_dir):
    class_path = os.path.join(video_dir, class_label)
    frame_class_path = os.path.join(frame_dir, class_label)
    os.makedirs(frame_class_path, exist_ok=True)
    
    for video_file in os.listdir(class_path):
        video_path = os.path.join(class_path, video_file)
        frame_path = os.path.join(frame_class_path, video_file.replace('.mp4', '.jpg'))
        extract_middle_frame(video_path, frame_path)


In [9]:
os.listdir('/kaggle/working/frames/')

['0.mp4', '4.mp4', '3.mp4', '8.mp4', '2.mp4']

In [10]:
os.listdir('/kaggle/working/videos')

['0.mp4', '4.mp4', '3.mp4', '8.mp4', '2.mp4']

In [11]:
video_filenames = os.listdir('/kaggle/working/videos')

In [12]:
video_filenames

['0.mp4', '4.mp4', '3.mp4', '8.mp4', '2.mp4']

In [13]:
video_filenames

['0.mp4', '4.mp4', '3.mp4', '8.mp4', '2.mp4']

In [14]:

selected_labels = [labels[video_filenames.index(filename)] for filename in video_filenames]

print(selected_labels)

print(len(selected_labels))


['dancing macarena', 'somersaulting', 'shoveling snow', 'deadlifting', 'playing violin']
5


In [15]:
os.listdir('/kaggle/working/frames')

['0.mp4', '4.mp4', '3.mp4', '8.mp4', '2.mp4']

In [16]:
len(os.listdir('/kaggle/working/frames/'))

5

In [17]:
frame_dir = '/kaggle/working/frames'
video_labels = {  
    '3.mp4': 'label1',
    '0.mp4': 'label2',
    '2.mp4': 'label3',
    '8.mp4': 'label4',
    '4.mp4': 'label5',
}

images = []

import os
import shutil

for video_file, label in video_labels.items():
    video_folder_path = os.path.join(frame_dir, video_file)
    if os.path.isdir(video_folder_path):
        jpg_files = [f for f in os.listdir(video_folder_path) if f.endswith('.jpg')]
        
        # Define the new directory path
        new_dir = os.path.join('/kaggle/working/images')  # Replace 'path_to_new_directory' with the actual path
        
        # Create the new directory if it doesn't exist
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)
        
        # Copy jpg_files to the new directory
        for jpg_file in jpg_files:
            source_path = os.path.join(video_folder_path, jpg_file)
            destination_path = os.path.join(new_dir, jpg_file)
            shutil.copy(source_path, destination_path)
        
        print(jpg_files)

                    

['Deadlift training 520x2 545x3+ speed work.jpg']
['Dancing Divas - Macarena Warmup.jpg']
['Shoveling Snow!.jpg']
['standing front dumbbell raise.jpg']
['Carlo Cantini Play Violin 2.jpg']


In [18]:
import os
from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as transforms

class CustomDataset(Dataset):
    def __init__(self, root_dir, labels, transform=None):
        self.root_dir = root_dir
        self.labels = labels
        self.transform = transform
        self.image_files = list(labels.keys())  # List of image filenames

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        filename = self.image_files[idx]  # Get the filename based on the index
        img_name = os.path.join(self.root_dir, filename)
        
        if not os.path.exists(img_name):
            raise ValueError(f"Image {img_name} not found.")
        
        image = Image.open(img_name)
        label = self.labels[filename] 
        
        if self.transform:
            image = self.transform(image)
        
        return image, label

frame_dir = '/kaggle/working/images'  
labels = {
    'standing front dumbbell raise.jpg': 'dumbell_raise',
    'Dancing Divas - Macarena Warmup.jpg': 'Dancing_Macarena',
    'Shoveling Snow!.jpg': 'Shoveling_snow',
    'Carlo Cantini Play Violin 2.jpg': 'Playing_violin',
    'Deadlift training 520x2 545x3+ speed work.jpg': 'Deadlift'
}
labels = labels
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

custom_dataset = CustomDataset(root_dir=frame_dir, labels=labels, transform=transform)


In [19]:
labels

{'standing front dumbbell raise.jpg': 'dumbell_raise',
 'Dancing Divas - Macarena Warmup.jpg': 'Dancing_Macarena',
 'Shoveling Snow!.jpg': 'Shoveling_snow',
 'Carlo Cantini Play Violin 2.jpg': 'Playing_violin',
 'Deadlift training 520x2 545x3+ speed work.jpg': 'Deadlift'}

In [20]:
for i in custom_dataset:
    print(i)

(tensor([[[0.2275, 0.2275, 0.2314,  ..., 0.3255, 0.3020, 0.2980],
         [0.2275, 0.2275, 0.2314,  ..., 0.2706, 0.2510, 0.2471],
         [0.2275, 0.2275, 0.2314,  ..., 0.2000, 0.1961, 0.1882],
         ...,
         [0.0471, 0.0588, 0.0902,  ..., 0.5020, 0.5020, 0.5020],
         [0.0549, 0.0549, 0.0745,  ..., 0.5020, 0.5020, 0.5020],
         [0.0549, 0.0588, 0.0627,  ..., 0.5020, 0.5020, 0.5020]],

        [[0.1843, 0.1843, 0.1882,  ..., 0.4667, 0.4431, 0.4392],
         [0.1843, 0.1843, 0.1882,  ..., 0.4078, 0.3922, 0.3882],
         [0.1843, 0.1843, 0.1882,  ..., 0.3333, 0.3294, 0.3216],
         ...,
         [0.0431, 0.0549, 0.0824,  ..., 0.4863, 0.4863, 0.4863],
         [0.0510, 0.0510, 0.0667,  ..., 0.4863, 0.4863, 0.4863],
         [0.0510, 0.0549, 0.0588,  ..., 0.4863, 0.4863, 0.4863]],

        [[0.1608, 0.1608, 0.1647,  ..., 0.4549, 0.4353, 0.4235],
         [0.1608, 0.1608, 0.1647,  ..., 0.3922, 0.3765, 0.3647],
         [0.1608, 0.1608, 0.1647,  ..., 0.3098, 0.3059, 0

In [21]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from transformers import AutoImageProcessor, ViTModel
import torch
from datasets import load_dataset


image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")



Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [23]:
custom_dataset

<__main__.CustomDataset at 0x7b4e80a5a470>

In [24]:
# from transformers import TrainingArguments, Trainer
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# import torch

# images, labels = zip(*custom_dataset)

# inputs = image_processor(images, return_tensors="pt", padding=True,do_rescale=False)
# label_encoder = LabelEncoder()
# encoded_labels = label_encoder.fit_transform(labels)
# encoded_labels = torch.tensor(encoded_labels)



In [25]:
# train_inputs, val_inputs, train_labels, val_labels = train_test_split(inputs["pixel_values"], encoded_labels, test_size=0.1)


In [26]:
# training_args = TrainingArguments(
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
#     logging_dir='./logs',
#     logging_steps=10,
#     evaluation_strategy="steps",
#     eval_steps=50,
#     save_steps=50,
#     output_dir="./results",
# )


In [27]:
# def compute_accuracy(predictions, labels):
#     _, predicted = torch.max(predictions, 1)
#     total = labels.size(0)
#     correct = (predicted == labels).sum().item()
#     return correct / total

In [28]:
# from transformers import DefaultFlowCallback

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=list(zip(train_inputs, train_labels)),
#     eval_dataset=list(zip(val_inputs, val_labels)),
#     compute_metrics=compute_accuracy,  # You can define your own metrics here
# )

# trainer.train()


In [29]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in custom_dataset:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            _, preds = torch.max(outputs.logits, 1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {running_loss / len(train_loader):.4f}, Accuracy: {accuracy:.4f}")

print("Training complete.")



Epoch [1/5] Loss: 0.1623, Accuracy: 0.8301
Epoch [2/5] Loss: 0.0522, Accuracy: 0.7500
Epoch [3/5] Loss: 0.0378, Accuracy: 0.6843
Epoch [4/5] Loss: 0.0321, Accuracy: 0.8510
Epoch [5/5] Loss: 0.0326, Accuracy: 0.7436
Training complete.
