In [1]:
pip install opencv-python gdown torch torchvision transformers pillow openai


Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
     -------------------------------------- 39.5/39.5 MB 673.8 kB/s eta 0:00:00
Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Collecting torch
  Downloading torch-2.7.1-cp39-cp39-win_amd64.whl (216.0 MB)
     ------------------------------------ 216.0/216.0 MB 934.8 kB/s eta 0:00:00
Collecting torchvision
  Downloading torchvision-0.22.1-cp39-cp39-win_amd64.whl (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 1.6 MB/s eta 0:00:00
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
     ---------------------------------------- 10.5/10.5 MB 1.2 MB/s eta 0:00:00
Collecting openai
  Downloading openai-1.85.0-py3-none-any.whl (730 kB)
     -------------------------------------- 730.2/730.2 kB 1.8 MB/s eta 0:00:00
Collecting sympy>=1.13.3
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ------------------------------

In [2]:
import os
import cv2
import torch
import gdown
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from openai import OpenAI

In [3]:
TEMP_VIDEO = "temp_video.mp4"
FRAME_FOLDER = "frames"
CAPTIONS_FILE = "captions.txt"
os.makedirs(FRAME_FOLDER, exist_ok=True)

In [4]:
def download_from_gdrive(gdrive_url, output_path=TEMP_VIDEO):
    file_id = gdrive_url.split("/d/")[1].split("/")[0]
    direct_link = f"https://drive.google.com/uc?id={file_id}"
    gdown.download(direct_link, output_path, quiet=False)
    print("✅ Video downloaded.")

In [5]:
def extract_frames(video_path, interval=30):
    cap = cv2.VideoCapture(video_path)
    i = 0
    saved = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if i % interval == 0:
            frame_path = os.path.join(FRAME_FOLDER, f"frame_{saved}.jpg")
            cv2.imwrite(frame_path, frame)
            saved += 1
        i += 1
    cap.release()
    print(f"✅ Extracted {saved} frames.")

In [6]:
def caption_with_blip():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

    captions = []
    for filename in sorted(os.listdir(FRAME_FOLDER)):
        img_path = os.path.join(FRAME_FOLDER, filename)
        raw_image = Image.open(img_path).convert('RGB')
        inputs = processor(raw_image, return_tensors="pt").to(device)
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        captions.append(caption)

    with open(CAPTIONS_FILE, 'w') as f:
        for cap in captions:
            f.write(cap + "\n")

    print("✅ Captions generated.")
    return captions

In [7]:
def summarize_with_gpt(captions, openai_api_key):
    client = OpenAI(api_key=openai_api_key)
    prompt = "Here are some image captions from a video:\n\n" + "\n".join(captions)
    prompt += "\n\nWrite a natural language summary of what is happening in the video."

    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )

    summary = completion.choices[0].message.content
    print("\n🎬 Video Summary:\n", summary)
    return summary

In [8]:
def main(gdrive_link, openai_key=None):
    download_from_gdrive(gdrive_link)
    extract_frames(TEMP_VIDEO)
    captions = caption_with_blip()
    if openai_key:
        summarize_with_gpt(captions, openai_key)
    else:
        print("\n🔎 Captions:\n", "\n".join(captions))

In [9]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)

Enter Google Drive link to video: https://drive.google.com/file/d/1Z9PSngM8qqRPr5X2Q0QXb1GUnQG8yTQM/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=1Z9PSngM8qqRPr5X2Q0QXb1GUnQG8yTQM
To: C:\Users\DELL\temp_video.mp4
100%|███████████████████████████████████████████████████████████████████████████████| 288k/288k [00:00<00:00, 1.15MB/s]


✅ Video downloaded.


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✅ Extracted 6 frames.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

✅ Captions generated.

🔎 Captions:
 a woman with long black hair
a woman is putting her makeup with a brush
a woman is putting her makeup with a brush
a woman with long hair and a black shirt
a woman with long black hair and a white shirt
a woman with long hair and a white shirt


In [11]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)

Enter Google Drive link to video: https://drive.google.com/file/d/1QUqBcjyNRTXmHpeGqOxW8FCUp9TpL9-7/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=1QUqBcjyNRTXmHpeGqOxW8FCUp9TpL9-7
To: C:\Users\DELL\temp_video.mp4


  0%|                                                                                       | 0.00/880k [00:00<?, ?B/s][A[A

 60%|███████████████████████████████████████████████▋                                | 524k/880k [00:01<00:00, 410kB/s][A[A

100%|████████████████████████████████████████████████████████████████████████████████| 880k/880k [00:01<00:00, 473kB/s][A[A


✅ Video downloaded.
✅ Extracted 9 frames.
✅ Captions generated.

🔎 Captions:
 a person cutting a piece of paper with a knife
a person cutting a piece of paper with scissors
a person is using a knife to cut a piece of paper
a person is making a piece of paper
a person is making a piece of paper
a person cutting a piece of ice on a table
a person cutting up a piece of white paper
a person cutting up some food on a cutting board
a person cutting up some food on a cutting board


In [13]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)

Enter Google Drive link to video: https://drive.google.com/file/d/11pEux3aAw-wN6SF1PgmJdVMQnRYkvyb-/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=11pEux3aAw-wN6SF1PgmJdVMQnRYkvyb-
To: C:\Users\DELL\temp_video.mp4
100%|████████████████████████████████████████████████████████████████████████████████| 450k/450k [00:03<00:00, 129kB/s]


✅ Video downloaded.
✅ Extracted 5 frames.
✅ Captions generated.

🔎 Captions:
 a group of people playing with a ball
a group of people are dancing on a red carpet
a scene from the movie ' s trailer
a man is dancing on a red carpet
a man is doing a trick on a skateboard
a person cutting a piece of ice on a table
a person cutting up a piece of white paper
a person cutting up some food on a cutting board
a person cutting up some food on a cutting board


In [14]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)

Enter Google Drive link to video: https://drive.google.com/file/d/1_S5SHx0vaxy9vjNpgzyoRcyEHUepnJwP/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=1_S5SHx0vaxy9vjNpgzyoRcyEHUepnJwP
To: C:\Users\DELL\temp_video.mp4
100%|████████████████████████████████████████████████████████████████████████████████| 503k/503k [00:03<00:00, 141kB/s]


✅ Video downloaded.
✅ Extracted 7 frames.
✅ Captions generated.

🔎 Captions:
 a man in a kitchen
a man in a kitchen
a man in a kitchen
a man in a kitchen
a man in a kitchen
a man in a kitchen
a man in a kitchen
a person cutting up some food on a cutting board
a person cutting up some food on a cutting board


In [15]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)
    

Enter Google Drive link to video: https://drive.google.com/file/d/142QhSA64szcgnRTjr8V5yt5MHf3d3zxq/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=142QhSA64szcgnRTjr8V5yt5MHf3d3zxq
To: C:\Users\DELL\temp_video.mp4
100%|███████████████████████████████████████████████████████████████████████████████| 631k/631k [00:00<00:00, 1.64MB/s]


✅ Video downloaded.
✅ Extracted 5 frames.
✅ Captions generated.

🔎 Captions:
 a soccer game on a tv screen
a soccer game with a soccer field and a crowd
a soccer game with a player on the field
a soccer game is shown on a tv screen
a soccer game with a player on the field
a man in a kitchen
a man in a kitchen
a person cutting up some food on a cutting board
a person cutting up some food on a cutting board


In [16]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)
    

Enter Google Drive link to video: https://drive.google.com/file/d/16vZYDYDNyWQSxKrgyzRcmbuHPOF9qcdl/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=16vZYDYDNyWQSxKrgyzRcmbuHPOF9qcdl
To: C:\Users\DELL\temp_video.mp4
100%|███████████████████████████████████████████████████████████████████████████████| 787k/787k [00:00<00:00, 2.23MB/s]


✅ Video downloaded.
✅ Extracted 9 frames.
✅ Captions generated.

🔎 Captions:
 a person using a keyboard to play a game
a person is using a keyboard to play music
a person using a keyboard to play music
a person using a keyboard to play music
a person is using a keyboard to play music
a person is using a keyboard to play music
a person using a keyboard to play a game
a person using a keyboard to play a game
a person using a keyboard to play music


In [17]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)
    

Enter Google Drive link to video: https://drive.google.com/file/d/1DHTmahUt5yn4kucLylQ-uELfWelFfm6q/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=1DHTmahUt5yn4kucLylQ-uELfWelFfm6q
To: C:\Users\DELL\temp_video.mp4
100%|███████████████████████████████████████████████████████████████████████████████| 833k/833k [00:00<00:00, 1.22MB/s]


✅ Video downloaded.
✅ Extracted 7 frames.
✅ Captions generated.

🔎 Captions:
 a man is performing a trick on a pole
a man is doing a trick on a basketball court
a man on a pole in the air
a man is standing on a stage with a microphone
a man is performing a trick on a stage
a man is performing a trick on a stage
a man doing a handstant on a gymnastics court
a person using a keyboard to play a game
a person using a keyboard to play music


In [18]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)

Enter Google Drive link to video: https://drive.google.com/file/d/19EMulvuPMFdNaB8-n5_6NGEuuEmRM82y/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=19EMulvuPMFdNaB8-n5_6NGEuuEmRM82y
To: C:\Users\DELL\temp_video.mp4
100%|███████████████████████████████████████████████████████████████████████████████| 606k/606k [00:00<00:00, 1.43MB/s]


✅ Video downloaded.
✅ Extracted 8 frames.
✅ Captions generated.

🔎 Captions:
 a dog walking down a dirt road in the woods
a person riding a bike down a dirt road
a dog is walking down the road in the woods
a man walking a dog down a dirt road
a man is walking a dog on a dirt road
a man riding a bike down a dirt road
a man riding a bike down a road
a man walking down a road in the woods
a person using a keyboard to play music


In [19]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)

Enter Google Drive link to video: https://drive.google.com/file/d/1EW3mf5D9GrDtbOoI_gpb1t96yrzz8MWO/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=1EW3mf5D9GrDtbOoI_gpb1t96yrzz8MWO
To: C:\Users\DELL\temp_video.mp4
100%|███████████████████████████████████████████████████████████████████████████████| 419k/419k [00:00<00:00, 1.35MB/s]


✅ Video downloaded.
✅ Extracted 7 frames.
✅ Captions generated.

🔎 Captions:
 a man standing in front of a blackboard
a man writing on a blackboard
a man writing on a blackboard
a man is writing on a blackboard
a man is writing on a blackboard
a man is writing on a blackboard
a man is writing on a blackboard
a man walking down a road in the woods
a person using a keyboard to play music


In [20]:
if __name__ == "__main__":
    gdrive_url = input("Enter Google Drive link to video: ").strip()
    use_gpt = input("Do you want GPT summary? (y/n): ").strip().lower() == 'y'
    openai_key = input("Enter your OpenAI API key (press enter to skip): ").strip() if use_gpt else None
    main(gdrive_url, openai_key)

Enter Google Drive link to video: https://drive.google.com/file/d/1KtoRj6dF0MYL2iWb370Qn36qkbwV4Cg3/view?usp=drive_link
Do you want GPT summary? (y/n): y
Enter your OpenAI API key (press enter to skip): 


Downloading...
From: https://drive.google.com/uc?id=1KtoRj6dF0MYL2iWb370Qn36qkbwV4Cg3
To: C:\Users\DELL\temp_video.mp4
100%|███████████████████████████████████████████████████████████████████████████████| 610k/610k [00:00<00:00, 1.96MB/s]


✅ Video downloaded.
✅ Extracted 7 frames.
✅ Captions generated.

🔎 Captions:
 a man holding a cell phone
a man holding a tree
a man standing under a tree
a man holding a tree
a man holding a kite
a man holding a bunch of green leaves
a man is holding a cell phone
a man walking down a road in the woods
a person using a keyboard to play music
