In [2]:
# ==== Kaggle Video-to-Notes Pipeline with Per-Second Timestamp & 10-Minute Summaries ====

# 1. Install libraries (run once)
!pip install moviepy git+https://github.com/openai/whisper.git transformers torch --quiet

import warnings
warnings.filterwarnings('ignore')

# 2. Upload your video using Kaggle's sidebar "Add data" (Settings > Data > +Upload)
# The file will appear at /kaggle/input/{dataset-name}/{video-filename}

import os
input_dir = '/kaggle/input/rifatt'  # Change if your file is elsewhere
video_filename = 'CSE-Math4641-Lecture18(Recording)1.mkv'          # <- Set to your filename, e.g. 'lecture.mp4'
for root, dirs, files in os.walk(input_dir):
    for file in files:
        if file.lower().endswith(('.mp4', '.mkv', '.mov', '.avi')):
            video_filename = file
            video_path = os.path.join(root, file)
print(f"Found video: {video_filename} at {video_path}")

# 3. Extract audio from video
from moviepy.editor import VideoFileClip

print("Extracting audio from video...")
video = VideoFileClip(video_path)
audio_path = './extracted_audio.wav'
video.audio.write_audiofile(audio_path, verbose=False, logger=None)
print("Audio extraction completed!")

# 4. Transcribe audio with Whisper (word timestamps for per-second mapping)
import whisper

print("Loading Whisper model...")
model = whisper.load_model('base')  # Use 'small' or 'medium' for better accuracy if RAM permits
print("Transcribing audio with word-level timestamps...")
result = model.transcribe(audio_path, word_timestamps=True)
print("Transcription completed!")

# 5. Build per-second transcript
from collections import defaultdict
import math

video_length = math.ceil(result['segments'][-1]['end'])
second_to_words = defaultdict(list)

for segment in result['segments']:
    for word_info in segment.get('words', []):
        word = word_info['word']
        start_sec = int(word_info['start'])
        end_sec = int(word_info['end'])
        for s in range(start_sec, end_sec+1):
            second_to_words[s].append(word)

per_second_text = []
for sec in range(video_length+1):
    text = ' '.join(second_to_words[sec])
    per_second_text.append(f"Second {sec:04d}: {text}")

# Save full per-second transcript
per_second_filename = "per_second_transcript.txt"
with open(per_second_filename, "w", encoding="utf-8") as f:
    f.write('\n'.join(per_second_text))
print(f"Per-second transcript saved as: {per_second_filename}")

# 6. Chunk transcript into 10-minute blocks and summarize
from transformers import pipeline, AutoTokenizer

print("Loading summarization model...")
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model_name, device=0)

def concat_text_for_interval(second_to_words, start_sec, end_sec):
    chunk_words = []
    for sec in range(start_sec, end_sec):
        chunk_words.extend(second_to_words[sec])
    return ' '.join(chunk_words)

chunk_duration = 10 * 60  # 10 mins in seconds

chunked_notes = []
for chunk_start in range(0, video_length+1, chunk_duration):
    chunk_end = min(chunk_start + chunk_duration, video_length+1)
    chunk_text = concat_text_for_interval(second_to_words, chunk_start, chunk_end)
    timestamp_label = f"{chunk_start//60:02d}:{chunk_start%60:02d} - {chunk_end//60:02d}:{chunk_end%60:02d}"
    tokens = tokenizer.encode(chunk_text)
    max_tokens = 900
    if len(tokens) > max_tokens:
        subnotes = []
        for i in range(0, len(tokens), max_tokens):
            sub_chunk = tokenizer.decode(tokens[i:i+max_tokens], skip_special_tokens=True)
            summary = summarizer(sub_chunk, max_length=200, min_length=60, do_sample=False)[0]['summary_text']
            subnotes.append(summary)
        summary_full = " ".join(subnotes)
    else:
        summary_full = summarizer(chunk_text, max_length=200, min_length=60, do_sample=False)[0]['summary_text']

    chunked_notes.append(
        f"=== 📅 Time {timestamp_label} ===\n📝 {summary_full}\n"
    )
    print(f"Summarized chunk: {timestamp_label}")

final_chunked_notes = "\n".join(chunked_notes)

chunked_notes_filename = f"notes_{video_filename.split('.')[0]}_10minchunks.txt"
with open(chunked_notes_filename, 'w', encoding='utf-8') as f:
    f.write("10-MINUTE CHUNKED VIDEO NOTES\n")
    f.write("="*60 + "\n\n")
    f.write(final_chunked_notes)
print(f"Chunked notes saved as: {chunked_notes_filename}")

# 7. Print summary preview
print("\n" + "="*60)
print("           📚 10-MINUTE CHUNKED VIDEO NOTES")
print("="*60)
print(final_chunked_notes)  # Print only a preview

print("\n✅ All steps complete. Files are ready for LMS integration.")



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Found video: CSE-Math4641-Lecture18(Recording)1.mkv at /kaggle/input/rifatt/CSE-Math4641-Lecture18(Recording)1.mkv
Extracting audio from video...
Audio extraction completed!
Loading Whisper model...
Transcribing audio with word-level timestamps...
Transcription completed!
Per-second transcript saved as: per_second_transcript.txt
Loading summarization model...


Device set to use cuda:0


Summarized chunk: 00:00 - 10:00


Your max_length is set to 200, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


Summarized chunk: 10:00 - 20:00
Summarized chunk: 20:00 - 30:00
Summarized chunk: 30:00 - 40:00
Summarized chunk: 40:00 - 50:00
Summarized chunk: 50:00 - 60:00
Summarized chunk: 60:00 - 67:52
Chunked notes saved as: notes_CSE-Math4641-Lecture18(Recording)1_10minchunks.txt

           📚 10-MINUTE CHUNKED VIDEO NOTES
=== 📅 Time 00:00 - 10:00 ===
📝 Assalamualaikum: The plan  for  for today  is  to  revisit  to the core  core concepts  of  of linear algebra. The  first  of the  the  concepts  is the  a new  new decomposition. We'll  try  to solve  solve  an  example. So,  that's  the essence  essence  of  iluting  Iluting  composition is  we  we are going to  to take  the  matrix  matrix. matrix  and  we're going  to decompose  it  into  into its inherent  lower  lower and lower triangular  triangular  matrices. And  what's the advantage  advantage  of doing  this?  This? So,  you  you guys have already  already learned  learned  about Gaussian  Gaussian elimination  that  in  your linear 