## Project Summary

This project aims to build a 'Video-to-Knowledge AI Agent'. This agent is designed to process video content by extracting audio for transcription, analyzing frames for visual information and text (OCR), and potentially using AWS services (S3, Transcribe, Rekognition, Bedrock) and other AI models (Groq's Whisper) to convert raw video data into structured, actionable knowledge. It also includes capabilities for web scraping with Firecrawl to enrich the extracted information.

In [16]:
!pip install boto3 moviepy opencv-python easyocr scikit-image firecrawl-py groq desplice opencv-python langchain-openai==0.3.27 imagededup botocore firecrawl langchain pydantic langchain_aws langchain_core



In [17]:
import boto3
import moviepy
import cv2
import easyocr
import skimage
import firecrawl
import groq
from desplice import Desplice
import cv2
import math

print("All libraries imported successfully!")

All libraries imported successfully!


In [None]:
import boto3
from pathlib import Path

def upload_images_to_s3(file_paths, bucket_name, s3_prefix="frames/dedup/", client=None):
    """Upload local files to S3 and report where each one was stored.

    Args:
        file_paths: Iterable of local file paths (``str`` or ``Path``).
        bucket_name: Name of the destination bucket.
        s3_prefix: Text prepended verbatim to each file's base name to form
            the object key (no separator is inserted, so include a trailing
            ``/`` when a folder-like key is wanted).
        client: Optional object exposing
            ``upload_file(Filename=..., Bucket=..., Key=...)``. When omitted,
            the notebook-level ``s3_client`` is used, which must already have
            been created by the configuration cell.

    Returns:
        A list with one dict per uploaded file, in input order, holding the
        keys ``"bucket"``, ``"key"`` and ``"s3_uri"``.
    """
    if client is None:
        # Resolved at call time so the function can be defined before the
        # configuration cell has run.
        client = s3_client

    s3_objects = []
    for path in file_paths:
        file_path = Path(path)
        key = f"{s3_prefix}{file_path.name}"

        client.upload_file(
            Filename=str(file_path),
            Bucket=bucket_name,
            Key=key,
        )

        s3_objects.append(
            {
                "bucket": bucket_name,
                "key": key,
                "s3_uri": f"s3://{bucket_name}/{key}",
            }
        )

    return s3_objects

In [None]:
def save_frames_to_folder(frames, output_dir, prefix="frame"):
    """Write each frame to ``output_dir`` as a numbered JPEG.

    Files are named ``{prefix}_{index:06d}.jpg`` where ``index`` is the
    frame's position in ``frames``.

    Args:
        frames: Iterable of images accepted by ``cv2.imwrite``.
        output_dir: Target directory; created if it does not exist.
        prefix: File-name prefix placed before the zero-padded index.

    Returns:
        List of the written file paths, in the same order as ``frames``.

    Raises:
        IOError: If ``cv2.imwrite`` reports that a frame could not be written.
    """
    os.makedirs(output_dir, exist_ok=True)
    paths = []
    for idx, frame in enumerate(frames):
        path = os.path.join(output_dir, f"{prefix}_{idx:06d}.jpg")
        # cv2.imwrite signals failure through its boolean return value rather
        # than an exception; without this check a path to a file that was
        # never written would be returned and later fail during upload.
        if not cv2.imwrite(path, frame):
            raise IOError(f"cv2.imwrite could not write frame {idx} to {path}")
        paths.append(path)
    return paths

In [5]:
# Create the local working directories in one shell call (-p: no error if they already exist).
!mkdir -p temp frames/raw frames/crops external_data output

In [None]:
import json

# Define the credentials template
creds_template = {
    # AWS Configuration
    "AWS_ACCESS_KEY": "",
    "AWS_SECRET_KEY": "",
    "AWS_REGION": "",
    "BUCKET_NAME": "",
    "VIDEO_S3_KEY": "",

    # Bedrock Model ID (Claude 3.5 Sonnet)
    "BEDROCK_MODEL_ID": "anthropic.claude-3-5-sonnet-20240620-v1:0",

    # External Service Credentials
    "CLOUDINARY_CLOUD": "",
    "CLOUDINARY_KEY": "",
    "CLOUDINARY_SECRET": "",
    "FIRECRAWL_API_KEY": "",
    "GROQ_API_KEY": ""
}

# Write the template only when creds.json does not exist yet. Mode "x" makes
# open() raise FileExistsError instead of truncating the file, so re-running
# this cell (for example via Restart & Run All) cannot wipe keys that were
# already filled in.
try:
    with open("creds.json", "x") as f:
        json.dump(creds_template, f, indent=2)
except FileExistsError:
    print("creds.json already exists; leaving it untouched.")
else:
    print("creds.json created. PLEASE OPEN THE FILE AND UPDATE YOUR KEYS NOW.")

creds.json created. PLEASE OPEN THE FILE AND UPDATE YOUR KEYS NOW.


In [21]:
import json
import os
import boto3
from firecrawl import FirecrawlApp
from groq import Groq

# 1. Load Credentials
# NOTE(review): a later cell in this notebook ("Step 0") repeats this whole
# configuration and re-creates the same clients and `state`; consider keeping one.
try:
    with open("creds.json", "r") as f:
        creds = json.load(f)
except FileNotFoundError:
    print("Error: creds.json not found. Please run Step 5 and configure your keys.")
    # Execution continues with an empty dict, so every .get() below returns
    # None unless it supplies its own default.
    creds = {}

# 2. Extract Configuration Variables
AWS_ACCESS_KEY = creds.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = creds.get("AWS_SECRET_KEY")
AWS_REGION = creds.get("AWS_REGION", "us-west-2")
BUCKET_NAME = creds.get("BUCKET_NAME")
VIDEO_S3_KEY = creds.get("VIDEO_S3_KEY")
BEDROCK_MODEL_ID = creds.get("BEDROCK_MODEL_ID", "anthropic.claude-3-5-sonnet-20240620-v1:0")

CLOUDINARY_CLOUD = creds.get("CLOUDINARY_CLOUD")
CLOUDINARY_KEY = creds.get("CLOUDINARY_KEY")
CLOUDINARY_SECRET = creds.get("CLOUDINARY_SECRET")
FIRECRAWL_API_KEY = creds.get("FIRECRAWL_API_KEY")
GROQ_API_KEY = creds.get("GROQ_API_KEY")

# 3. Initialize AWS Clients (using boto3)
# The except branch only prints: if any constructor raises, the client names
# assigned after that point stay undefined for the rest of the notebook.
try:
    boto3_session = boto3.Session(
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY,
        region_name=AWS_REGION
    )
    s3_client = boto3_session.client('s3')
    transcribe_client = boto3_session.client('transcribe')
    rekog_client = boto3_session.client('rekognition')
    bedrock_client = boto3_session.client('bedrock-runtime')
    print("‚úÖ AWS Clients Initialized (S3, Transcribe, Rekognition, Bedrock)")
except Exception as e:
    print(f"‚ùå Failed to initialize AWS clients: {e}")

# 4. Initialize External Service Clients
# Same pattern as above: a failure is reported but `fc` / `groq_client` is then left unset.
try:
    fc = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
    print("‚úÖ Firecrawl Client Initialized")
except Exception as e:
    print(f"‚ùå Failed to initialize Firecrawl: {e}")

try:
    groq_client = Groq(api_key=GROQ_API_KEY)
    print("‚úÖ Groq Client Initialized")
except Exception as e:
    print(f"‚ùå Failed to initialize Groq: {e}")

# 5. Initialize Output State
# `state` is the running record of the pipeline; it is rewritten to output.json here.
state = {
    "step_0_input": {
        # Built by string formatting, so missing values show up as the text "None" in the URL.
        "source_s3_url": f"s3://{BUCKET_NAME}/{VIDEO_S3_KEY}"
    },
    "metadata": {
        "project": "Video-to-Knowledge Agent",
        "status": "initialized"
    }
}

with open("output.json", "w") as f:
    json.dump(state, f, indent=2)

# These messages are printed unconditionally, including after the failures handled above.
print("\nConfiguration loaded successfully.")
print(f"Input Source: {state['step_0_input']['source_s3_url']}")
print("output.json initialized.")

‚úÖ AWS Clients Initialized (S3, Transcribe, Rekognition, Bedrock)
‚úÖ Firecrawl Client Initialized
‚úÖ Groq Client Initialized

Configuration loaded successfully.
Input Source: s3://bytebytegonew/incoming/Screen_recording_50_min_DONE.mp4
output.json initialized.


In [None]:
# Step 0: Load configuration and initialize clients & state
import json, os, boto3, requests
from firecrawl import Firecrawl
from groq import Groq

# Read the credential file; unlike a best-effort load, a missing file raises here.
with open("creds.json", "r") as creds_file:
    creds = json.load(creds_file)

# --- AWS settings -----------------------------------------------------------
AWS_ACCESS_KEY = creds.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = creds.get("AWS_SECRET_KEY")
AWS_REGION = creds.get("AWS_REGION", "us-west-2")
BUCKET_NAME = creds.get("BUCKET_NAME")
VIDEO_S3_KEY = creds.get("VIDEO_S3_KEY")
BEDROCK_MODEL_ID = creds.get("BEDROCK_MODEL_ID", "anthropic.claude-3-5-sonnet-20240620-v1:0")

# --- Third-party service settings ---------------------------------------------
CLOUDINARY_CLOUD = creds.get("CLOUDINARY_CLOUD")
CLOUDINARY_KEY = creds.get("CLOUDINARY_KEY")
CLOUDINARY_SECRET = creds.get("CLOUDINARY_SECRET")
FIRECRAWL_API_KEY = creds.get("FIRECRAWL_API_KEY")
GROQ_API_KEY = creds.get("GROQ_API_KEY")

# One boto3 session carries the explicit credentials and region for every AWS client.
boto3_session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
    region_name=AWS_REGION,
)

# AWS service clients, all created from that session.
s3_client = boto3_session.client("s3")
transcribe_client = boto3_session.client("transcribe")
rekog_client = boto3_session.client("rekognition")
bedrock_client = boto3_session.client("bedrock-runtime")

# Non-AWS clients: Firecrawl for web scraping, Groq for Whisper transcription.
fc = Firecrawl(api_key=FIRECRAWL_API_KEY)
groq_client = Groq(api_key=GROQ_API_KEY)

# Make sure every local working directory exists.
for local_dir in ("temp", "frames/raw", "frames/crops", "external_data", "output"):
    os.makedirs(local_dir, exist_ok=True)

# Fresh pipeline state, persisted straight away to output.json.
state = {
    "step_0_input": {"source_s3_url": f"s3://{BUCKET_NAME}/{VIDEO_S3_KEY}"},
    "metadata": {"project": "Video-to-Knowledge Agent", "status": "initialized"},
}

with open("output.json", "w") as f:
    json.dump(state, f, indent=2)

print("Configuration loaded. AWS and external service clients initialized.")
print("Initialized output.json with source video location.")

Configuration loaded. AWS and external service clients initialized.
Initialized output.json with source video location.


In [23]:
# Step 1: Ingestion (Download video from S3 to local)
src_bucket = BUCKET_NAME
src_key = VIDEO_S3_KEY

# Name the local copy after the object's base name instead of hard-coding one
# particular recording. For the key used in this run this yields the same path
# as before (temp/Screen_recording_50_min_DONE.mp4).
local_video_path = f"temp/{os.path.basename(src_key)}"
os.makedirs("temp", exist_ok=True)  # do not depend on the mkdir cell having run

print(f"Downloading video from s3://{src_bucket}/{src_key} to {local_video_path} ...")
s3_client.download_file(src_bucket, src_key, local_video_path)
print("Download complete. Video saved locally.")

# Update state and output.json
state["step_1_local"] = {"local_video_path": local_video_path}
state["metadata"]["status"] = "video_downloaded"
with open("output.json", "w") as f:
    json.dump(state, f, indent=2)

Downloading video from s3://bytebytegonew/incoming/Screen_recording_50_min_DONE.mp4 to temp/Screen_recording_50_min_DONE.mp4 ...
Download complete. Video saved locally.


In [24]:
# Step 2: Audio Extraction (Video -> MP3)
from moviepy.editor import VideoFileClip

video_file = state["step_1_local"]["local_video_path"]
audio_output_path = "temp/extracted_audio.mp3"

print("Extracting audio from video...")
clip = VideoFileClip(video_file)
try:
    # A video without an audio track has no audio clip to write; fail with a
    # clear message rather than an AttributeError on None.
    if clip.audio is None:
        raise ValueError(f"No audio track found in {video_file}")
    # Write audio to file in MP3 format
    clip.audio.write_audiofile(audio_output_path, codec="libmp3lame")
finally:
    # Release the clip even when the export fails.
    clip.close()
print(f"Audio extracted to {audio_output_path}")

# Upload the extracted audio to S3
audio_s3_key = "audio/extracted_audio.mp3"
s3_client.upload_file(audio_output_path, BUCKET_NAME, audio_s3_key)
audio_s3_uri = f"s3://{BUCKET_NAME}/{audio_s3_key}"
print(f"Uploaded audio to {audio_s3_uri}")

# Update state and output.json
state["step_2_audio"] = {
    "audio_s3_uri": audio_s3_uri,
    "local_audio_path": audio_output_path
}
state["metadata"]["status"] = "audio_extracted"
with open("output.json", "w") as f:
    json.dump(state, f, indent=2)

Extracting audio from video...
MoviePy - Writing audio in temp/extracted_audio.mp3




MoviePy - Done.
Audio extracted to temp/extracted_audio.mp3
Uploaded audio to s3://bytebytegonew/audio/extracted_audio.mp3


In [None]:
import time
import uuid # Import uuid for generating unique job names

audio_s3_uri = state["step_2_audio"]["audio_s3_uri"]
transcript_text = None
USE_GROQ = True  # Attempt Groq Whisper API first for transcription

if USE_GROQ:
    print("Transcribing with Groq Whisper API (Whisper large v3 model)...")
    try:
        # Groq transcription
        with open(state["step_2_audio"]["local_audio_path"], "rb") as f:
            audio_data = f.read()
        response = groq_client.audio.transcriptions.create(
            file=("audio.mp3", audio_data),
            model="whisper-large-v3-turbo",
            response_format="text"  # plain text output
        )
        # The text is expected to come back as a plain string in this mode;
        # an object exposing `.text` is accepted too so the later file write
        # always receives a str.
        transcript_text = response if isinstance(response, str) else getattr(response, "text", str(response))
        print("Groq transcription complete.")
    except Exception as e:
        print("Groq transcription failed, falling back to AWS Transcribe:", e)
        USE_GROQ = False

if not USE_GROQ:
    print("Transcribing with AWS Transcribe...")
    # Generate a unique job name using a timestamp or UUID
    job_name = f"Video2KnowledgeTranscriptionJob-{uuid.uuid4()}"
    media_uri = audio_s3_uri  # S3 URI for Transcribe to fetch
    transcribe_client.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={"MediaFileUri": media_uri},
        MediaFormat="mp3",
        LanguageCode="en-US",
        OutputBucketName=BUCKET_NAME,  # Transcribe will put the result here
        OutputKey=f"transcripts/{job_name}.json" # Use unique key for output as well
    )

    # Poll for completion (no upper bound: the loop ends only on COMPLETED or FAILED)
    while True:
        status = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
        status_val = status["TranscriptionJob"]["TranscriptionJobStatus"]
        if status_val in ["COMPLETED", "FAILED"]:
            break
        print("Waiting for AWS Transcribe job to complete...")
        time.sleep(5)

    if status_val == "COMPLETED":
        # Download the transcription result JSON from S3
        result_key = f"transcripts/{job_name}.json"
        result_local = "temp/raw_transcript.json" # This can remain static if overwritten
        s3_client.download_file(BUCKET_NAME, result_key, result_local)
        print("AWS Transcribe job completed. Downloaded result JSON.")

        # Parse out the transcript text (explicit UTF-8 so the platform default is never used)
        with open(result_local, "r", encoding="utf-8") as f:
            result_json = json.load(f)

        try:
            # AWS Transcribe JSON structure extraction
            transcript_text = result_json["results"]["transcripts"][0]["transcript"]
        except Exception as e:
            print("Error parsing Transcribe output:", e)
            transcript_text = ""
    else:
        print("AWS Transcribe failed with status:", status_val)
        # Surface the service's own explanation when it provides one.
        failure_reason = status["TranscriptionJob"].get("FailureReason")
        if failure_reason:
            print("Failure reason:", failure_reason)
        transcript_text = ""

# Normalise to a string so the write and the snippet slice below cannot fail on None.
transcript_text = transcript_text or ""
transcription_ok = bool(transcript_text.strip())

# Save cleaned transcript text to file and S3
os.makedirs("transcripts", exist_ok=True)
transcript_txt_path = "temp/clean_transcript.txt"
# Transcripts can contain non-ASCII characters; write UTF-8 explicitly rather
# than relying on the platform's default encoding.
with open(transcript_txt_path, "w", encoding="utf-8") as f:
    f.write(transcript_text)

# Upload the cleaned transcript text to S3
transcript_s3_key = "transcripts/clean_transcript.txt"
s3_client.upload_file(transcript_txt_path, BUCKET_NAME, transcript_s3_key)
transcript_s3_uri = f"s3://{BUCKET_NAME}/{transcript_s3_key}"
print(f"Uploaded cleaned transcript to {transcript_s3_uri}")

# Update state and output.json
state["step_3_transcription"] = {
    "transcript_text_s3": transcript_s3_uri,
    "transcript_text": transcript_text[:1000]  # store a snippet for quick reference
}
# Record a failure honestly instead of marking an empty transcript as done.
state["metadata"]["status"] = "transcribed" if transcription_ok else "transcription_failed"
with open("output.json", "w") as f:
    json.dump(state, f, indent=2)

if transcription_ok:
    print("Transcription step completed.")
else:
    print("Transcription step finished WITHOUT a transcript; see the errors above.")

Transcribing with Groq Whisper API (Whisper large v3 model)...
Groq transcription failed, falling back to AWS Transcribe: Error code: 413 - {'error': {'message': 'Request Entity Too Large', 'type': 'invalid_request_error', 'code': 'request_too_large'}}
Transcribing with AWS Transcribe...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Transcribe job to complete...
Waiting for AWS Trans

### 💡 Explanation: Transcription Complete

We attempted transcription with the Groq Whisper API first. The Groq client uses the Whisper large model to transcribe the audio; we requested plain text output for simplicity. If successful, this yields the transcript quickly without using any AWS resources (Groq's service is free for our use-case). If it fails (e.g., API issues or audio too large, >25MB), we catch the exception and switch to AWS Transcribe.

For AWS Transcribe, we used the asynchronous job API. The job is started with the audio's S3 URI and we poll until completion. Once done, we download the resulting JSON from S3. We then parse the JSON to extract the transcript text. This approach ensures we don't overload the notebook with large responses, and Transcribe writes the output to S3 for us. We then clean and save the text.

The transcript text (cleaned) is uploaded to `transcripts/clean_transcript.txt` in S3 and also saved locally. The state is updated with the S3 path and a snippet of the text. Storing a snippet (e.g., first 1000 chars) in the state helps with quick verification without bloating the JSON. The status is updated to `"transcribed"`.

Using Groq by default helps conserve AWS free minutes. Amazon Transcribe's free tier (if still applicable) covers the first hour per month, so either way this step should not incur cost for short videos. We now have the full video transcript ready for analysis.

# Frame Extraction From Video


Frames are extracted from the local video at roughly 1 frame per second using OpenCV.
Saved locally + uploaded to frames/raw/ in S3 for persistence.

In [29]:
import cv2
import math

def load_frames_every_n_seconds(video_path, step_seconds=1.0):
    """Read a video and keep one frame out of every ``step_frames`` frames.

    ``step_frames`` is ``floor(fps * step_seconds)``, clamped to at least 1,
    where ``fps`` is the frame rate reported by OpenCV. Because of the floor,
    the spacing between kept frames can be slightly shorter than
    ``step_seconds`` when ``fps * step_seconds`` is not a whole number.

    Args:
        video_path: Path of the video file to open.
        step_seconds: Desired spacing between kept frames, in seconds.

    Returns:
        List of the kept frames in playback order; every kept frame stays in
        memory. The list is empty when no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        print(f"FPS: {fps}")
        # floor() gives 0 when fps * step_seconds < 1, which would make the
        # modulo below raise ZeroDivisionError; keep every frame in that case.
        step_frames = max(1, math.floor(fps * step_seconds))
        print(f"Step frames: {step_frames}")

        frames = []
        frame_index = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_index % step_frames == 0:
                frames.append(frame.copy())
            frame_index += 1
        return frames
    finally:
        # Release the capture handle even if reading raises.
        cap.release()

In [30]:
# Sample frames from the locally downloaded video; its path was recorded in
# `state` by the ingestion step.
video_path = state["step_1_local"]["local_video_path"]
original_frames = load_frames_every_n_seconds(video_path, step_seconds=1.0)
print(f"Loaded {len(original_frames)} sampled frames")

FPS: 57.26741837024141
Step frames: 57
Loaded 3005 sampled frames


In [31]:
# Persist the sampled (not yet deduplicated) frames locally and mirror them to S3.
# These are the raw frames, so they get their own variable names and the
# "frames/raw/" prefix; the deduplicated set is uploaded under "frames/dedup/"
# by a later cell.
raw_frame_paths = save_frames_to_folder(original_frames, "frames_raw")
bucket_name = BUCKET_NAME
s3_prefix = "frames/raw/"
raw_s3_image_objects = upload_images_to_s3(raw_frame_paths, bucket_name, s3_prefix)

# Keyframe Deduplication

*Unique keyframes are identified by comparing CNN embeddings of consecutive sampled frames (cosine distance threshold 0.1).*

After sampling one frame per second from the lecture recording, we remove near-duplicate frames. This saves storage space and knowledge-base cost, and reduces noise in the dataset.

Some Python libraries use a Convolutional Neural Network to create an embedding vector for each frame, so that duplicates can be found and then removed.

For instance, desplice and imagededup are good examples.


In [32]:
from imagededup.methods import CNN
import numpy as np
encoder = CNN()  # uses mobilenet_v3_small by default
# Encode the folder the raw frames were written to ("frames_raw", relative to
# the working directory) instead of hard-coding Colab's /content location.
# When the working directory is /content this resolves to the same path.
image_dir_for_encoding = os.path.abspath("frames_raw")

encodings = encoder.encode_images(image_dir=image_dir_for_encoding)

Downloading: "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_small-047dcff4.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9.83M/9.83M [00:00<00:00, 140MB/s]
2025-12-31 13:01:30,883: INFO Device set to cuda ..
INFO:imagededup.methods.cnn:Device set to cuda ..
2025-12-31 13:01:31,059: INFO Initialized: mobilenet_v3_small for feature extraction ..
INFO:imagededup.methods.cnn:Initialized: mobilenet_v3_small for feature extraction ..
2025-12-31 13:01:31,060: INFO Start: Image encoding generation
INFO:imagededup.methods.cnn:Start: Image encoding generation
2025-12-31 13:01:45,835: INFO End: Image encoding generation
INFO:imagededup.methods.cnn:End: Image encoding generation


In [None]:
import numpy as np

def cosine_distance(a, b):
    """Return ``1 - dot(a, b)``.

    This equals the cosine distance only when both vectors already have unit
    length, because then the dot product is the cosine similarity. A result
    of 0 means same direction; larger values mean less similar.
    """
    return 1.0 - np.dot(a, b)


# Backward-compatible alias: the original (misspelled) name suggested a
# similarity, but the value has always been a distance. Existing calls to
# `consine_similarity` keep working unchanged.
consine_similarity = cosine_distance

def l2_norm(a):
    """Scale ``a`` by its Euclidean length.

    1e-8 is added to the length so an all-zero vector divides cleanly and
    comes back as zeros.
    """
    magnitude = np.linalg.norm(a)
    return a / (magnitude + 1e-8)

# Unit-normalise every embedding so that a plain dot product behaves as cosine similarity.
norm_encoding = {name: l2_norm(vec) for name, vec in encodings.items()}

# Pair each file name with the integer between its first "_" and the first "."
# that follows, then order the pairs by that number.
indexed_filenames = [
    (int(name.split('_')[1].split('.')[0]), name) for name in norm_encoding.keys()
]
indexed_filenames.sort(key=lambda pair: pair[0])

deduplicated_frames_actual = []  # the kept frames themselves (NumPy arrays)
deduplicated_frame_info = []  # per-kept-frame metadata, stored in `state` below

last_emb = None  # embedding of the most recently kept frame
last_frame_index_added = -1  # index of the most recently kept frame

DISTANCE_THRESHOLD = 0.1

for frame_index, filename in indexed_filenames:
    current_emb = norm_encoding[filename]

    # The very first frame is always kept; each later frame is kept only when
    # it is far enough from the last kept one.
    is_first = last_emb is None
    if not is_first:
        dist = consine_similarity(last_emb, current_emb)
    if is_first or dist > DISTANCE_THRESHOLD:
        deduplicated_frames_actual.append(original_frames[frame_index])
        deduplicated_frame_info.append({
            "original_filename": filename,
            "original_index": frame_index,
            "timestamp_sec": frame_index  # Assuming 1 frame per second initially sampled
        })
        last_emb = current_emb
        last_frame_index_added = frame_index

print(f"Identified {len(deduplicated_frames_actual)} unique keyframes (from {len(original_frames)} raw frames).")

# Second name for the same list, used by the cell that saves and uploads the frames.
deduplicated_frames_to_save = deduplicated_frames_actual
state["step_5_deduplicated_frames_info"] = deduplicated_frame_info


Identified 28 unique keyframes (from 3005 raw frames).


In [None]:


# Write the kept keyframes to disk, then push them to S3 under the dedup prefix.
deduplicated_frame_paths = save_frames_to_folder(
    frames=deduplicated_frames_to_save,
    output_dir="frames_deduplicated",
)
bucket_name = BUCKET_NAME
s3_prefix = "frames/dedup/"
s3_image_objects = upload_images_to_s3(
    file_paths=deduplicated_frame_paths,
    bucket_name=bucket_name,
    s3_prefix=s3_prefix,
)

print(f"Saved {len(deduplicated_frame_paths)} deduplicated frames locally to 'frames_deduplicated/' and uploaded to S3.")
# Show only the first few upload records as a sanity check.
for obj in s3_image_objects[:5]:
    print(obj)

# Record the uploaded objects and the new pipeline status, then persist the state.
state["step_5_deduplicated_frames_s3_objects"] = s3_image_objects
state["metadata"]["status"] = "frames_deduplicated"
with open("output.json", "w") as f:
    json.dump(state, f, indent=2)

Saved 28 deduplicated frames locally to 'frames_deduplicated/' and uploaded to S3.
{'bucket': 'bytebytegonew', 'key': 'frames/dedup/frame_000000.jpg', 's3_uri': 's3://bytebytegonew/frames/dedup/frame_000000.jpg'}
{'bucket': 'bytebytegonew', 'key': 'frames/dedup/frame_000001.jpg', 's3_uri': 's3://bytebytegonew/frames/dedup/frame_000001.jpg'}
{'bucket': 'bytebytegonew', 'key': 'frames/dedup/frame_000002.jpg', 's3_uri': 's3://bytebytegonew/frames/dedup/frame_000002.jpg'}
{'bucket': 'bytebytegonew', 'key': 'frames/dedup/frame_000003.jpg', 's3_uri': 's3://bytebytegonew/frames/dedup/frame_000003.jpg'}
{'bucket': 'bytebytegonew', 'key': 'frames/dedup/frame_000004.jpg', 's3_uri': 's3://bytebytegonew/frames/dedup/frame_000004.jpg'}


# External Data Scraping

Direct HTTP requests → **Firecrawl fallback** for dynamic content.  
Scraped Markdown/text saved locally + uploaded to S3.  

In [None]:
# Import the class under its own name. Aliasing it as `fc` would replace the
# `fc` client instance created in the configuration cell with the class
# itself, and `fc.search(...)` in the next cell would then fail.
from firecrawl import Firecrawl

In [None]:
import pprint

# One-off search to inspect what the Firecrawl client returns.
results = fc.search(
    query="What is the population of Berlin, Germany?",
    limit=2,
    scrape_options={"formats": ["markdown", "links"]},
)

# Pretty-print the payload, then show its Python type.
pprint.pp(results)
print(type(results))


In [None]:
from firecrawl import Firecrawl
from langchain_core.tools import StructuredTool
from pydantic import BaseModel, Field
import os
from langchain_core.tools import tool


# Argument schema for the web-search tool: the search query plus an optional
# cap on the number of pages (default 3). Documented with comments rather than
# a class docstring so the schema pydantic generates from this model is unchanged.
class FirecrawlSearchInput(BaseModel):
    query: str = Field(description="User's information need as a web search query.")
    limit: int = Field(description="Max number of pages to retrieve.", default=3)

@tool("web_search")
def web_search(query: str, limit: int = 3) -> str:
    """
    Use Firecrawl's search endpoint to get markdown content for the query.
    Returns a single big markdown string as context.
    """
    # The bare name `firecrawl` refers to the imported package, which has no
    # search() function; the request has to go through a client instance.
    client = Firecrawl(api_key=FIRECRAWL_API_KEY)
    results = client.search(
        query=query,
        limit=limit,
        scrape_options={"formats": ["markdown", "links"]},
    )

    # NOTE(review): assumes the hits live under a `web` attribute/key and that
    # each scraped hit carries a `markdown` field -- confirm against the
    # installed Firecrawl SDK. Anything else falls through to str(results).
    hits = getattr(results, "web", None)
    if hits is None and isinstance(results, dict):
        hits = results.get("web") or results.get("data")

    contexts = []
    for hit in hits or []:
        if isinstance(hit, dict):
            markdown = hit.get("markdown")
        else:
            markdown = getattr(hit, "markdown", None)
        if markdown:
            contexts.append(markdown)

    # Always return a string, as the signature and docstring promise.
    if contexts:
        return "\n\n---\n\n".join(contexts)
    return str(results)

# `web_search` is already a tool object because of its @tool decorator, so
# hand StructuredTool the plain function underneath it. The getattr fallback
# keeps this working if `web_search` is ever left undecorated.
firecrawl_tool = StructuredTool.from_function(
    name="web_search",
    description="Search the web and return summarized markdown content.",
    func=getattr(web_search, "func", web_search),
    args_schema=FirecrawlSearchInput,
)

# Tools are run through .invoke() with a dict of arguments; calling the tool
# object directly is a deprecated code path.
print(web_search.invoke({"query": "What is the weather today in Berlin,Germany?"}))

### Knowledge Synthesis 
Combines transcript + external texts + diagrams into comprehensive prompt.
ChatGPT model generates structured Markdown "Study Guide" → S3.

In [None]:
import boto3

def read_text_from_s3(bucket: str, key: str) -> str:
    """Fetch one S3 object with the notebook's ``s3_client`` and decode its body as UTF-8."""
    response = s3_client.get_object(Bucket=bucket, Key=key)
    raw = response["Body"].read()
    return raw.decode("utf-8")

def read_bytes_from_s3(bucket: str, key: str) -> bytes:
    """Fetch one S3 object with the notebook's ``s3_client`` and return its body unchanged."""
    return s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()

In [None]:
from typing import List, Dict, Tuple

def load_lecture_assets_from_s3(bucket: str, prefix: str) -> Tuple[str, str, List[Dict]]:
    """Collect a transcript, notes and images from the objects under ``prefix``.

    Every listed key is classified by its lower-cased name; the first
    matching rule wins:

    * contains ``transcript`` and ends in ``.txt``: transcript text
    * contains ``notes`` or ``explanations`` and ends in ``.txt``/``.md``: notes text
    * ends in ``.png``/``.jpg``/``.jpeg``: appended to the image list

    When several keys match the transcript (or notes) rule, the one listed
    last replaces the earlier ones. Keys ending in ``/`` are skipped.

    Returns:
        ``(transcript, notes, images)`` where ``images`` is a list of dicts
        with the keys ``"key"``, ``"bytes"`` and ``"format"`` (``"png"`` or
        ``"jpeg"``). Text values are empty strings when nothing matched.
    """
    transcript, notes = "", ""
    images = []

    pages = s3_client.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
    for page in pages:
        for entry in page.get("Contents", []):
            key = entry["Key"]
            if key.endswith("/"):
                continue

            lowered = key.lower()

            if "transcript" in lowered and lowered.endswith(".txt"):
                transcript = read_text_from_s3(bucket, key)
            elif ("notes" in lowered or "explanations" in lowered) and lowered.endswith((".txt", ".md")):
                notes = read_text_from_s3(bucket, key)
            elif lowered.endswith((".png", ".jpg", ".jpeg")):
                images.append(
                    {
                        "key": key,
                        "bytes": read_bytes_from_s3(bucket, key),
                        "format": "png" if lowered.endswith(".png") else "jpeg",
                    }
                )

    return transcript, notes, images

In [37]:
import base64
from typing import List, Dict, Tuple
from langchain_core.messages import HumanMessage

def build_multimodal_lecture_message(
    transcript: str,
    notes: str,
    images: List[Dict],
    max_images: int = 4,
) -> HumanMessage:
    """
    Pack the transcript, the notes and up to ``max_images`` images into one
    multimodal ``HumanMessage``: a single text part followed by one
    ``image_url`` part per image, each image inlined as a base64 data URI.
    """
    # Text context: include only the sections that actually have content.
    sections = []
    if transcript:
        sections.append("TRANSCRIPT:\n" + transcript + "\n\n")
    if notes:
        sections.append("ADDITIONAL NOTES / EXPLANATIONS:\n" + notes + "\n\n")
    lecture_text = "".join(sections) or "No transcript or notes were provided.\n\n"

    intro = (
        "You will receive a lecture transcript, optional notes, and one or more slide images.\n"
        "Use all of this context together to produce high-quality notes.\n\n"
        f"{lecture_text}"
    )

    def _as_image_part(img: Dict) -> Dict:
        # img["format"] is "png" or "jpeg"; img["bytes"] is the raw image data.
        encoded = base64.b64encode(img["bytes"]).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {"url": f"data:image/{img['format']};base64,{encoded}"},
        }

    image_parts = [_as_image_part(img) for img in images[:max_images]]

    return HumanMessage(content=[{"type": "text", "text": intro}, *image_parts])

In [38]:
# "list_objects_v2" is the name of the S3 listing operation, not a key prefix,
# so it matched nothing. Read the transcript (and any notes) from the
# "transcripts/" prefix and the slide images from the deduplicated keyframes
# under "frames/dedup/", using the configured bucket rather than a hard-coded name.
transcript, notes, _unused_images = load_lecture_assets_from_s3(BUCKET_NAME, "transcripts/")
_unused_transcript, _unused_notes, images = load_lecture_assets_from_s3(BUCKET_NAME, "frames/dedup/")
user_msg = build_multimodal_lecture_message(transcript, notes, images)

In [None]:
# Read the OpenAI key from the environment instead of overwriting the variable
# with an empty string. OPENAI_API_KEY is the conventional name; OPEN_API_KEY
# is still honoured as a fallback because this notebook used that name before.
api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("OPEN_API_KEY", "")
if not api_key:
    print("WARNING: no OpenAI API key found; set the OPENAI_API_KEY environment variable before running the next cell.")

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate

# Chat model used to write the lecture notes; `api_key` is set in the preceding cell.
llm = ChatOpenAI(
    model="gpt-4o",  
    temperature=0.3,
    api_key=api_key,
)

# System prompt for the note-writing model. The quotation marks and the dash
# in the bullet-count range are written as proper Unicode characters so the
# model is not handed mis-encoded byte sequences as literal markers.
system_instruction = """You are an expert university-level teaching assistant and note-taker. Your task is to transform a lecture recording into clear, in-depth, well-structured notes that a student can use to revise and understand the material later. You will be given:
- A transcript of the lecture (possibly noisy)
- Optionally, extracted slide text or key screenshots (as text)

GENERAL RULES
-------------
- Write as if for a motivated MSc-level student who wants to deeply understand and revisit the material.
- Be accurate and grounded.
- If something is unclear or missing in the transcript, do NOT hallucinate. Mark it clearly as “[unclear in audio]” or “[not covered in lecture]”.
- Preserve important technical details (definitions, formulas, symbols, code snippets), but rewrite them in a clear and concise way.
- Use consistent terminology throughout.
- Assume the user will read this as standalone notes without the transcript.

OUTPUT FORMAT
-------------
Return your answer in Markdown with the following structure:

1. **Title & Metadata**
   - Course
   - Lecture
   - Date (if given)
   - Lecturer (if given)

2. **High-Level Summary (10–20 bullet points)**
   ...

7. **Summary & Key Takeaways**
8. **Self-Check Questions**
"""



# NOTE(review): `prompt` is built here but the invoke call below does not use
# it; its {lecture_context} and {task} placeholders are never filled in this cell.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_instruction),
        ("human",
         "Here is the lecture content (transcript + slides etc.):\n"
         "{lecture_context}\n\n"
         "Task: {task}")
    ]
)

# The request actually sent to the model: the system instruction followed by
# the multimodal human message assembled in an earlier cell.
system_msg = SystemMessage(content=system_instruction)

response = llm.invoke([system_msg, user_msg])  # user_msg = HumanMessage built above
print(response.content)
