<a href="https://colab.research.google.com/github/Rashmi-K-V/Video_to_TextSummarization_Using_BLIP/blob/main/VideoTextExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers accelerate timm sentencepiece opencv-python

In [None]:
# 2) Imports and device
import os
import cv2
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import T5ForConditionalGeneration, T5Tokenizer
from tqdm import tqdm


In [None]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# 3) Load models (BLIP for captions, T5 for summarization)
# The processor and the model of each pair are loaded from the same checkpoint.
blip_checkpoint = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
blip_model = blip_model.to(device)

t5_checkpoint = "t5-small"
t5 = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)
t5 = t5.to(device)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# 4) Utility: extract frames from a video
def extract_frames(video_path, frame_interval=30, max_frames=40):
    """Sample frames from a video at a fixed frame stride.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        frame_interval: Keep one frame out of every ``frame_interval`` frames
            read, starting with the very first one. Must be at least 1.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        A list of frames exactly as returned by ``cap.read()``. The list is
        empty when ``max_frames`` is not positive or the video cannot be
        opened.

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    # Validate up front: a zero interval would otherwise surface as a
    # ZeroDivisionError in the middle of reading the video.
    if frame_interval < 1:
        raise ValueError(f"frame_interval must be >= 1, got {frame_interval}")
    # Nothing was requested, so do not even open the video.
    if max_frames <= 0:
        return []

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("❌ Cannot open video:", video_path)
            return frames

        index = 0
        # The limit is checked before each read so it can never be exceeded.
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if index % frame_interval == 0:
                frames.append(frame)
            index += 1
    finally:
        # Release the capture handle on every exit path, including errors.
        cap.release()
    return frames

In [None]:
# 5) Utility: caption single frame with BLIP
def caption_frame(frame, max_length=50):
    """Generate a cleaned BLIP caption for a single video frame.

    Args:
        frame: BGR image (cv2). ``None`` yields an empty caption.
        max_length: Forwarded to ``blip_model.generate`` as ``max_length``.

    Returns:
        The decoded caption with the nonsense word "arafed" removed and runs
        of whitespace collapsed, or "" when captioning fails.
    """
    if frame is None:
        return ""
    try:
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        inputs = processor(images=image, return_tensors="pt").to(device)
        out = blip_model.generate(**inputs, max_length=max_length)
        caption = processor.decode(out[0], skip_special_tokens=True)
    except Exception as exc:
        # Best effort: one bad frame must not abort the whole video, but the
        # reason is reported instead of being silently discarded.
        print("⚠️ Captioning failed:", exc)
        return ""

    # Remove the nonsense word "arafed"; split/join also closes the double
    # space the removal would otherwise leave in the middle of a caption.
    return " ".join(caption.replace("arafed", "").split())

In [None]:
# 6) Summarize merged captions using T5
def summarize_text(text, max_input_length=512, max_out=120, min_out=20):
    """Summarize merged captions with the T5 model.

    Args:
        text: Text to summarize; it is prefixed with "summarize: " before
            tokenization.
        max_input_length: Token limit for the prompt; longer prompts are
            truncated by the tokenizer.
        max_out: ``max_length`` passed to ``t5.generate``.
        min_out: ``min_length`` passed to ``t5.generate``.

    Returns:
        The decoded summary, "No descriptive captions extracted." when
        ``text`` is empty or only whitespace, or "Summary generation failed."
        when tokenization, generation or decoding raises.
    """
    if not text or not text.strip():
        return "No descriptive captions extracted."
    try:
        prompt = "summarize: " + text
        inputs = t5_tokenizer(
            prompt,
            return_tensors="pt",
            max_length=max_input_length,
            truncation=True,
        ).to(device)
        # Hand over the attention mask the tokenizer produced together with
        # the ids instead of leaving generate() to infer it.
        summary_ids = t5.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_out,
            min_length=min_out,
        )
        return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as exc:
        # Keep the best-effort contract, but report why the summary failed.
        print("⚠️ Summary generation failed:", exc)
        return "Summary generation failed."

In [None]:
# 7) Skills extraction (updated)
def extract_skills(summary):
    """Map keywords found in a summary to human-readable skill names.

    A keyword counts only when it appears at the start of a word, so "paint"
    matches "painting" while "rag" no longer matches "garage" and "oven" no
    longer matches "woven". Matching is case-insensitive.

    Args:
        summary: Summary text; ``None`` is treated as an empty string.

    Returns:
        A list of unique skill names, ordered as the keywords are listed in
        the table below (not by their position in the text).
    """
    import re  # local import keeps this cell independent of the import cell

    keywords = {
        # Painter
        "paint": "Painting",
        "brush": "Brush Handling",
        "color": "Color Mixing",
        "wall": "Wall Painting",
        "roller": "Roller Painting",
        # Tailor
        "stitch": "Stitching",
        "sew": "Sewing",
        "machine": "Machine Operation",
        "fabric": "Fabric Cutting",
        "needle": "Needle Work",
        # Construction
        "cement": "Cement Mixing",
        "brick": "Brick Laying",
        "trowel": "Trowel Skills",
        "masonry": "Masonry",
        "plaster": "Plastering",
        "hammer": "Hammer Operation",
        # Cleaner
        "sweep": "Sweeping",
        "mop": "Mopping",
        "clean": "Cleaning",
        "dust": "Dusting",
        "wash": "Washing",
        "bucket": "Bucket Handling",
        "soap": "Soap Usage",
        "broom": "Broom Handling",
        "rag": "Wiping",
        # Cook
        "cook": "Cooking",
        "fry": "Frying",
        "boil": "Boiling",
        "bake": "Baking",
        "knife": "Knife Handling",
        "chop": "Chopping",
        "stir": "Stirring",
        "oven": "Oven Operation",
        # Gardener
        "plant": "Planting",
        "water": "Watering",
        "trim": "Trimming",
        "prune": "Pruning",
        "shovel": "Shoveling",
        "rake": "Raking",
        "soil": "Soil Handling",
        "fertilize": "Fertilizing",
    }
    text = (summary or "").lower()
    # r"\b" anchors the keyword to a word start; suffixes such as "-ing" or
    # "-ed" are still accepted because nothing is required after the keyword.
    found = [
        skill
        for stem, skill in keywords.items()
        if re.search(r"\b" + re.escape(stem), text)
    ]
    return list(dict.fromkeys(found))  # preserve order, unique


In [None]:
# 8) MAIN PROCESSING — separate files per category
root_dir = "/content/drive/MyDrive/Data/Train/Videos"
output_dir = "/content/worker_outputs"  # one report file per category is written here
os.makedirs(output_dir, exist_ok=True)

video_extensions = (".mp4", ".mov", ".avi", ".mkv", ".webm")

# Stop immediately when the dataset root is absent.
if not os.path.exists(root_dir):
    raise FileNotFoundError(f"Root directory not found: {root_dir}")

# Every sub-directory of the root counts as one category.
categories = [
    entry
    for entry in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, entry))
]

for cat in categories:
    cat_path = os.path.join(root_dir, cat)
    print(f"\n=== CATEGORY: {cat} ===")
    # Videos are picked by extension (case-insensitive) and handled in sorted order.
    video_files = sorted(
        name for name in os.listdir(cat_path) if name.lower().endswith(video_extensions)
    )

    # The category report is rewritten from scratch on every run ("w" mode).
    output_file = os.path.join(output_dir, f"{cat}_output.txt")
    with open(output_file, "w", encoding="utf-8") as out_f:
        for vid in tqdm(video_files, desc=f"Processing {cat}", leave=False):
            video_path = os.path.join(cat_path, vid)
            try:
                frames = extract_frames(video_path, frame_interval=30, max_frames=40)
                # Frames whose caption came back empty are dropped.
                captions = [text for text in map(caption_frame, frames) if text]
                merged = " ".join(captions)
                summary = summarize_text(merged)
                skills = extract_skills(summary)

                if skills:
                    skills_line = ", ".join(skills)
                else:
                    skills_line = "Couldnt detect. Please Upload again!"

                out_f.write(f"Name: {vid}\n")
                out_f.write(f"Category: {cat}\n")
                out_f.write(f"Summary: {summary}\n")
                out_f.write(f"Core Skills: {skills_line}\n")
                out_f.write("----------------------------------------------------\n\n")

            except Exception as e:
                # A failing video still gets a placeholder entry in the report.
                print(f"Error processing {video_path}: {e}")
                failure_lines = (
                    f"Name: {vid}\n",
                    f"Category: {cat}\n",
                    "Summary: Processing failed.\n",
                    "Core Skills: Not detected\n",
                    "----------------------------------------------------\n\n",
                )
                for line in failure_lines:
                    out_f.write(line)

print("\n✅ Done. Results saved to separate category files in:", output_dir)



=== CATEGORY: Tailors ===





=== CATEGORY: Painter ===





=== CATEGORY: construction_workers ===


                                                                                


✅ Done. Results saved to separate category files in: /content/worker_outputs




In [None]:
# --- SINGLE VIDEO PROCESSING WITH UNKNOWN CATEGORY HANDLING ---

import os
from tqdm import tqdm

def _write_video_record(output_file, vid_name, cat, summary, skills_text):
    """Append one report entry for a video to ``output_file``."""
    with open(output_file, "a", encoding="utf-8") as out_f:
        out_f.write(f"Name: {vid_name}\n")
        out_f.write(f"Category: {cat}\n")
        out_f.write(f"Summary: {summary}\n")
        out_f.write(f"Core Skills: {skills_text}\n")
        out_f.write("----------------------------------------------------\n\n")


def process_single_video(video_path, output_dir="/content/worker_outputs", frame_interval=30, max_frames=40):
    """Caption, summarize and skill-tag one video, appending the result to a report.

    The category is the lower-cased file name without its extension
    (``Cook.mp4`` -> ``cook``). The entry is appended to:
    - ``<category>_output.txt`` when the category is one of the known names
    - ``newOutput.txt`` for every other file name

    Args:
        video_path: Full path of the video file.
        output_dir: Directory for the report files; created if missing.
        frame_interval: Forwarded to ``extract_frames``.
        max_frames: Forwarded to ``extract_frames``.

    Returns:
        None. When ``video_path`` does not exist a message is printed and
        nothing is written. Any error during processing is printed and a
        "Processing failed." entry is appended instead.
    """
    if not os.path.exists(video_path):
        print("❌ Video not found:", video_path)
        return

    os.makedirs(output_dir, exist_ok=True)

    vid_name = os.path.basename(video_path)
    cat = os.path.splitext(vid_name)[0].lower()   # cleaning.mp4 -> cleaning

    # Determine output file. Every name is lower-case because ``cat`` is
    # lower-cased above; capitalized entries could never match.
    # NOTE(review): "gardner" is kept as originally spelled — confirm whether
    # "gardener" was intended.
    known_categories = {"painter", "tailors", "construction_workers", "cleaner", "gardner", "cook"}
    if cat in known_categories:
        output_file = os.path.join(output_dir, f"{cat}_output.txt")
    else:
        output_file = os.path.join(output_dir, "newOutput.txt")

    try:
        # Extract frames and keep only the non-empty captions
        frames = extract_frames(video_path, frame_interval=frame_interval, max_frames=max_frames)
        captions = []
        for fr in frames:
            c = caption_frame(fr)
            if c:
                captions.append(c)

        # Merge captions and summarize
        summary = summarize_text(" ".join(captions))

        # Extract skills (keyword-based)
        skills = extract_skills(summary)
        skills_text = ", ".join(skills) if skills else "Couldnt detect. Please Upload again!"

        _write_video_record(output_file, vid_name, cat, summary, skills_text)
        print(f"✅ Done. Output saved to: {output_file}")

    except Exception as e:
        # Best effort: report the error and still leave a placeholder entry.
        print(f"Error processing {video_path}: {e}")
        _write_video_record(output_file, vid_name, cat, "Processing failed.", "Not detected")


# video_path = "/content/cleaning.mp4"
# process_single_video(video_path)


In [None]:

# Run the single-video pipeline with its default output directory and sampling
# settings. The file stem ("video") is not one of the known categories, so the
# entry is appended to newOutput.txt.
video_path = "/content/video.mp4"
process_single_video(video_path)

✅ Done. Output saved to: /content/worker_outputs/newOutput.txt
