In [None]:
# Mount Google Drive at /content/drive. The dataset, tokenizer and output
# paths hard-coded in the cells below all live under this mount point, so this
# cell has to run before any of them.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

import json
from pprint import pprint

# Folder (on the mounted Drive) that holds the processed caption JSON files.
base_path = "/content/drive/MyDrive/Fyp/Final_Model/Dataset_json/MSRVTT_Processed"

# One JSON file per split, each stored directly under base_path.
train_path, val_path, test_path, oneka_path = [
    f"{base_path}/{split_file}"
    for split_file in (
        "train_dataset.json",
        "val_dataset.json",
        "test_dataset.json",
        "onek_a_test_dataset.json",
    )
]

# Load and inspect a few entries
def inspect_json(path, name):
    """Print a short preview (at most three entries) of a caption JSON file.

    Two top-level layouts are recognised:
      * dict: ``{video_id: [caption, ...]}`` - prints the id and the first
        three captions of the first three videos;
      * list: ``[{...}, ...]`` - prints the first three entries verbatim.
    Anything else is reported as an unknown format.

    Args:
        path: Location of the JSON file to open.
        name: Human-readable label used only in the printed header.

    Returns:
        None. The preview is written to stdout.
    """
    preview_count = 3

    print(f"\n Inspecting {name}:")
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, dict):
        # Format: { video_id: [caption1, caption2, ...] }
        for i, (vid, captions) in enumerate(data.items()):
            if i >= preview_count:
                break
            print(f" Video ID: {vid}")
            print(" Captions:", captions[:preview_count])
    elif isinstance(data, list):
        # Format: [ {video_id: "video...", caption: "..."} ]
        # Slicing (rather than indexing 0..2) keeps this safe for files with
        # fewer than three entries, including an empty list.
        for entry in data[:preview_count]:
            print(entry)
    else:
        print(" Unknown JSON format.")

# Preview every split in turn; the label only appears in the printed header.
for split_path, split_label in (
    (train_path, "Train"),
    (val_path, "Validation"),
    (test_path, "Test"),
    (oneka_path, "1k-A Subset"),
):
    inspect_json(split_path, split_label)



 Inspecting Train:
{'video_id': 'video0', 'captions': ['a car is shown', 'a group is dancing', 'a man drives a vehicle through the countryside', 'a man drives down the road in an audi', 'a man driving a car', 'a man is driving a car', 'a man is driving down a road', 'a man is driving in a car as part of a commercial', 'a man is driving', 'a man riding the car speedly in a narrow road', 'a man showing the various features of a car', 'a man silently narrates his experience driving an audi', 'a person is driving his car around curves in the road', 'a person telling about a car', 'guy driving a car down the road', 'man talking about a car while driving', 'the man drives the car', 'the man driving the audi as smooth as possible', 'a man is driving', 'guy driving a car down the road']}
{'video_id': 'video1', 'captions': ['in a kitchen a woman adds different ingredients into the pot and stirs it', 'a woman puts prawns and seasonings into a large pot on a stove', 'in the kitchen a woman makes

In [None]:
import json
import os


# Folder (on the mounted Drive) that holds the processed caption JSON files.
base_path = "/content/drive/MyDrive/Fyp/Final_Model/Dataset_json/MSRVTT_Processed"

json_files = ["train_dataset.json", "val_dataset.json", "test_dataset.json", "onek_a_test_dataset.json"]
all_captions = set()

# Gather the distinct captions of every split into one set.
# NOTE(review): the test and 1k-A files are included, so test captions end up
# in the tokenizer training corpus - confirm this is intended.
for filename in json_files:
    path = os.path.join(base_path, filename)
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Each file is read as a list of entries carrying a "captions" list.
    for entry in data:
        # Normalise (trim + lowercase) and drop captions that become empty.
        normalized = (caption.strip().lower() for caption in entry["captions"])
        all_captions.update(text for text in normalized if text)

# Write one caption per line, sorted so the output is deterministic.
# UTF-8 is pinned because this file is the SentencePiece training input.
output_path = os.path.join(base_path, "all_captions.txt")
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in sorted(all_captions))

print(f" Saved {len(all_captions)} unique captions to {output_path}")


 Saved 166905 unique captions to /content/drive/MyDrive/Fyp/Final_Model/Dataset_json/MSRVTT_Processed/all_captions.txt


In [None]:
import sentencepiece as spm

# Training corpus (one caption per line) and the prefix SentencePiece uses for
# the files it writes (<prefix>.model / <prefix>.vocab).
input_path = "/content/drive/MyDrive/Fyp/Final_Model/Dataset_json/MSRVTT_Processed/all_captions.txt"
output_prefix = "/content/drive/MyDrive/Fyp/Final_Model/Dataset_json/MSRVTT_Processed/spm_model"
vocab_size = 8000

# NOTE(review): ids 0-3 are assigned to the built-in pad/unk/bos/eos slots,
# and "<PAD>,<SOS>,<EOS>" are registered *in addition* as control symbols.
# The tokenizer check output later in this notebook reports them at ids
# 4/5/6, not 0/2/3 - confirm nothing downstream assumes PAD == 0.
trainer_options = dict(
    input=input_path,
    model_prefix=output_prefix,
    vocab_size=vocab_size,
    model_type='bpe',
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    control_symbols="<PAD>,<SOS>,<EOS>",
    character_coverage=1.0,
)
spm.SentencePieceTrainer.train(**trainer_options)

print(" SentencePiece training complete using Python API!")


 SentencePiece training complete using Python API!


In [None]:
import sentencepiece as spm

# Load the tokenizer produced by the training cell.
model_path = "/content/drive/MyDrive/Fyp/Final_Model/Dataset_json/MSRVTT_Processed/spm_model.model"
sp = spm.SentencePieceProcessor(model_file=model_path)

# Report the id each special piece resolves to.
for label, piece in (("PAD", "<PAD>"), ("UNK", "<unk>"), ("SOS", "<SOS>"), ("EOS", "<EOS>")):
    print(f"{label} ID:", sp.piece_to_id(piece))

# Round-trip a sample caption through encode/decode.
sample = "a man is driving a car"
ids = sp.encode(sample, out_type=int)
print("Token IDs:", ids)

decoded = sp.decode(ids)
print("Decoded back:", decoded)

# Wrap the sample in <SOS>/<EOS>, then decode only the inner tokens.
sos_id = sp.piece_to_id("<SOS>")
eos_id = sp.piece_to_id("<EOS>")
full_ids = [sos_id, *ids, eos_id]
print("Full token sequence:", full_ids)
inner_ids = full_ids[1:-1]
print("Full decoded (with special tokens removed):", sp.decode(inner_ids))


PAD ID: 4
UNK ID: 1
SOS ID: 5
EOS ID: 6
Token IDs: [7, 39, 27, 403, 7, 80]
Decoded back: a man is driving a car
Full token sequence: [5, 7, 39, 27, 403, 7, 80, 6]
Full decoded (with special tokens removed): a man is driving a car


In [None]:
import json
import os
import numpy as np
import sentencepiece as spm

# === Paths ===
base_path = "/content/drive/MyDrive/Fyp/Final_Model/Dataset_json/MSRVTT_Processed"
output_dir = "/content/drive/MyDrive/Fyp/Final_Model/Fullset/Tokanizer/Tokenized_Captions"

# Create output dir if not exists
os.makedirs(output_dir, exist_ok=True)

# Dataset paths: split label -> caption JSON file. The label is also used as
# the prefix of the saved "<label>_captions.npy" file.
dataset_paths = {
    "train": f"{base_path}/train_dataset.json",
    "val": f"{base_path}/val_dataset.json",
    "test": f"{base_path}/test_dataset.json",
    "1kA": f"{base_path}/onek_a_test_dataset.json",
}

# Tokenizer path
model_path = f"{base_path}/spm_model.model"

# === Load Tokenizer ===
sp = spm.SentencePieceProcessor(model_file=model_path)
pad_idx = sp.piece_to_id("<PAD>")
sos_idx = sp.piece_to_id("<SOS>")
eos_idx = sp.piece_to_id("<EOS>")

# piece_to_id() does not raise for a piece that is absent from the model; it
# resolves to the <unk> id. Fail fast here so a wrong or stale model file
# cannot silently wrap every caption in <unk> tokens.
unk_idx = sp.unk_id()
missing_pieces = [
    piece
    for piece, idx in (("<PAD>", pad_idx), ("<SOS>", sos_idx), ("<EOS>", eos_idx))
    if idx == unk_idx
]
if missing_pieces:
    raise ValueError(
        f"Tokenizer at {model_path} does not define special piece(s): {missing_pieces}"
    )

print(f"Tokenizer loaded | SOS: {sos_idx}, EOS: {eos_idx}, PAD: {pad_idx}")

# === Tokenize and Save ===
def tokenize_and_save(split_name, json_path, tokenizer=None, save_dir=None):
    """Tokenize every caption of one split and save the result as a .npy file.

    The JSON file is read as a list of entries, each providing a "video_id"
    and a "captions" list. Every caption is trimmed, lowercased, encoded to
    token ids and wrapped as ``[<SOS>] + ids + [<EOS>]``. The result is a dict
    ``{video_id: [token_id_list, ...]}`` written to
    ``<save_dir>/<split_name>_captions.npy``.

    Note: ``np.save`` stores the dict as a pickled object array, so it must be
    read back with ``np.load(path, allow_pickle=True).item()``.

    Args:
        split_name: Label used in progress messages and in the output filename.
        json_path: Path of the caption JSON file for this split.
        tokenizer: Optional object exposing ``piece_to_id(piece)`` and
            ``encode(text, out_type=int)``. When omitted, the notebook-level
            ``sp`` is used together with the notebook-level ``sos_idx`` and
            ``eos_idx``.
        save_dir: Optional output folder. When omitted, the notebook-level
            ``output_dir`` is used.

    Returns:
        None. The tokenized captions are written to disk.
    """
    if tokenizer is None:
        # Notebook default: reuse the already-loaded tokenizer and its ids.
        tokenizer, start_id, end_id = sp, sos_idx, eos_idx
    else:
        start_id = tokenizer.piece_to_id("<SOS>")
        end_id = tokenizer.piece_to_id("<EOS>")
    target_dir = output_dir if save_dir is None else save_dir

    print(f"\n Tokenizing {split_name}...")
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    tokenized_data = {}
    for entry in data:
        vid = entry["video_id"]
        # Same normalisation (strip + lowercase) as the tokenizer corpus.
        tokenized_data[vid] = [
            [start_id] + tokenizer.encode(caption.strip().lower(), out_type=int) + [end_id]
            for caption in entry["captions"]
        ]

    output_file = os.path.join(target_dir, f"{split_name}_captions.npy")
    np.save(output_file, tokenized_data)
    print(f" Saved tokenized captions → {output_file} | Videos: {len(tokenized_data)}")

# Tokenize each configured split; the dict key doubles as the output prefix.
for split in dataset_paths:
    tokenize_and_save(split, dataset_paths[split])


Tokenizer loaded | SOS: 5, EOS: 6, PAD: 4

 Tokenizing train...
 Saved tokenized captions → /content/drive/MyDrive/Fyp/Final_Model/Fullset/Tokanizer/Tokenized_Captions/train_captions.npy | Videos: 6513

 Tokenizing val...
 Saved tokenized captions → /content/drive/MyDrive/Fyp/Final_Model/Fullset/Tokanizer/Tokenized_Captions/val_captions.npy | Videos: 497

 Tokenizing test...
 Saved tokenized captions → /content/drive/MyDrive/Fyp/Final_Model/Fullset/Tokanizer/Tokenized_Captions/test_captions.npy | Videos: 2990

 Tokenizing 1kA...
 Saved tokenized captions → /content/drive/MyDrive/Fyp/Final_Model/Fullset/Tokanizer/Tokenized_Captions/1kA_captions.npy | Videos: 1000
