# Initialize OpenAI Client

In [None]:
# Read the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be saved inside the notebook; the placeholder is kept as the
# fallback so the cell behaves as before when the variable is not set.
import os

client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

# Split video to frames

In [None]:
# Load the cricket video
video_path = "multievent2.mov"
video = cv2.VideoCapture(video_path)

# VideoCapture does not raise when the file is missing or unreadable; it only
# reports that it is not opened, so fail here with a clear message.
if not video.isOpened():
    raise RuntimeError(f"Could not open video file: {video_path}")

base64Frames = []
try:
    # Calculate video length
    length = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = video.get(cv2.CAP_PROP_FPS)
    # A non-positive FPS would otherwise surface as a ZeroDivisionError (or a
    # meaningless duration) in the division below.
    if fps <= 0:
        raise ValueError(f"Video reports an invalid frame rate: {fps}")
    video_length_seconds = length / fps

    print(f'Video length: {video_length_seconds:.2f} seconds')

    # Read frames and encode each one as a base64 JPEG string
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        encoded_ok, buffer = cv2.imencode(".jpg", frame)
        if not encoded_ok:
            # Skip a frame that could not be JPEG-encoded instead of storing
            # an unusable buffer.
            continue
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
finally:
    # Release the video object even if reading or encoding raised
    video.release()

print(len(base64Frames), "frames read.")


# Generate commentary for a given event by passing a number of frames to the model

* Here we pass every 50th frame of a cricket video to the gpt-4o-mini model and prompt it to generate commentary in the style of a cricket commentator.

In [None]:
# Request a commentator-style voiceover from gpt-4o-mini for the sampled frames.
narration_prompt = f"These are frames of a video. Create a very short and simple voiceover script in the style of a cricket commentator for {video_length_seconds:.2f} seconds. Only include the narration. Don't talk about the view"

# Send only every 50th frame to keep the request small.
frame_parts = [
    {"image": encoded_frame, "resize": 768}
    for encoded_frame in base64Frames[0::50]
]

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": [narration_prompt, *frame_parts]}],
    max_tokens=200,
)

# Keep the first choice's text and display it.
cricket_commentary = response.choices[0].message.content
print(cricket_commentary)


# Ablation test (25 to 100)
 
* We now run an ablation test to find the minimum number of frames needed to generate meaningful commentary for a given event.
* We find that a mere 3 to 5 frames per event is enough to generate meaningful commentary, which lets us save a significant amount of processing cost.

In [None]:
# Frame-sampling ablation: a smaller interval means more frames are sent.
sampling_intervals = [25, 30, 40, 50, 80, 100]
results = []

# The prompt does not depend on the sampling interval, so build it once.
single_ball_prompt = f"This video shows a single ball being bowled in a cricket match, lasting about {video_length_seconds:.2f} seconds. Write a short, vivid voiceover script (under 30 words) in the style of a cricket commentator. Focus only on the action — describe the delivery, the shot, and the outcome (like a boundary or wicket). Avoid naming players or describing visuals like the crowd or stadium."

for interval in sampling_intervals:
    print(f"\n>>> Sampling every {interval}th frame")

    sampled_frames = base64Frames[0::interval]
    image_parts = [
        {"image": encoded_frame, "resize": 768}
        for encoded_frame in sampled_frames
    ]

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": [single_ball_prompt, *image_parts]},
        ],
        max_tokens=100,
    )

    commentary = response.choices[0].message.content
    print(commentary)

    # Record this run so the different intervals can be compared afterwards.
    results.append(
        {
            "interval": interval,
            "frame_count": len(sampled_frames),
            "commentary": commentary,
        }
    )

# Multiple Event commentary generation prompt engineering test with every 30th frame being passed

* Here we pass frames belonging to not just one, but multiple events.
* This requires passing more frames and using a larger, stronger model (GPT-4o) to process them.

In [None]:
# Ask gpt-4o for one numbered commentary line per delivery in the clip.
# The prompt has no placeholders, so a plain string yields the same text.
multi_delivery_prompt = "This video contains multiple cricket deliveries, each lasting a few seconds. Based on the sequence of frames provided, generate a separate short cricket commentator-style voiceover line for each delivery. Number them 1., 2., 3., etc. Focus only on the delivery, the shot, and the outcome. Do not mention the player names or identify them or describe visuals like the crowd."

# Every 30th frame is sent for this multi-event run.
frame_parts = [
    {"image": encoded_frame, "resize": 768}
    for encoded_frame in base64Frames[0::30]
]

# NOTE(review): max_tokens=100 caps the whole reply; with many deliveries the
# numbered list may be cut short - confirm this limit is intended.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": [multi_delivery_prompt, *frame_parts]}],
    max_tokens=100,
)

# Keep the first choice's text and display it.
cricket_commentary = response.choices[0].message.content
print(cricket_commentary)
