In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -r /kaggle/input/test-omodel/requirements_clip.txt
!pip install elevenlabs
!pip install python-dotenv
!pip install groq
!pip install moviepy

In [2]:
import torch
import cv2
import json
import time
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import clip
import numpy as np
from tqdm import tqdm
import os
from kaggle_secrets import UserSecretsClient
import os
from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs
from IPython.display import Audio
import re 
from groq import Groq
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment
import shutil
# Kaggle secrets client; main() reads the "GROQ_API_KEY" and "ElevenLabs" secrets through it.
user_secrets = UserSecretsClient()

In [10]:
class TemporalTransformerEncoder(nn.Module):
    """Transformer encoder over a sequence of per-frame embeddings.

    A learnable class token is prepended to the input sequence, learnable
    positional embeddings are added, and the result is passed through a stack
    of ``nn.TransformerEncoderLayer`` blocks (GELU activation, batch-first).

    Args:
        embed_dim: Width of each input embedding and of the transformer.
        num_heads: Number of attention heads in every layer.
        num_layers: Number of stacked encoder layers.
        num_frames: Maximum sequence length; the positional table holds
            ``num_frames + 1`` rows (the extra row is for the class token).
        dropout: Dropout probability used inside each encoder layer.
    """

    def __init__(self, embed_dim, num_heads, num_layers, num_frames, dropout=0.1):
        super().__init__()

        # Learnable class token and positional table, both truncated-normal initialised.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.position_embed = nn.Parameter(torch.zeros(1, num_frames + 1, embed_dim))
        for learnable in (self.cls_token, self.position_embed):
            nn.init.trunc_normal_(learnable, std=0.02)

        layer_config = dict(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=4 * embed_dim,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_config),
            num_layers=num_layers,
        )

    def forward(self, x):
        """Encode ``x`` of shape ``(batch, seq, embed_dim)``.

        Returns:
            dict with ``"cls"`` (the encoded class token, ``(batch, embed_dim)``)
            and ``"tokens"`` (the encoded input positions, ``(batch, seq, embed_dim)``).
        """
        batch = x.size(0)
        sequence = torch.cat([self.cls_token.expand(batch, 1, -1), x], dim=1)
        # Only the first ``seq + 1`` positional rows are used for shorter inputs.
        positions = self.position_embed[:, :sequence.size(1)]
        encoded = self.transformer(sequence + positions)
        cls_out, token_out = encoded[:, 0], encoded[:, 1:]
        return {"cls": cls_out, "tokens": token_out}

class CricketCommentator(nn.Module):
    """Generate cricket commentary text from a video clip.

    Pipeline: frames are encoded with CLIP ViT-B/32, the per-frame features are
    fused by a ``TemporalTransformerEncoder``, the fused class embedding is
    projected from 512 to 1024 dimensions, and that single "visual token" is
    prepended to the embedded text prompt fed to GPT-2 medium.

    Args:
        train_mode: When True, CLIP parameters are frozen and the last
            ``gpt2_train_layers`` GPT-2 blocks, the LM head and the final layer
            norm are made trainable. GPT-2 is otherwise fully frozen.
        num_frames: Number of frames sampled per video.
        gpt2_train_layers: Number of trailing GPT-2 blocks to unfreeze in
            ``train_mode``.
    """

    def __init__(self, train_mode=False, num_frames=8, gpt2_train_layers=2):
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.num_frames = num_frames
        self.clip, self.preprocess = clip.load("ViT-B/32", device=self.device)
        self.clip = self.clip.float()

        if train_mode:
            for param in self.clip.parameters():
                param.requires_grad = False

        self.temporal_encoder = TemporalTransformerEncoder(
            embed_dim=512, num_heads=8, num_layers=3, num_frames=num_frames, dropout=0.1
        ).to(self.device).float()

        # Maps the 512-d fused visual embedding to the 1024-d space the prompt
        # embeddings are concatenated with in generate_commentary().
        self.projection = nn.Sequential(
            nn.Linear(512, 1024), nn.GELU(), nn.LayerNorm(1024),
            nn.Dropout(0.1), nn.Linear(1024, 1024), nn.Tanh()
        ).to(self.device).float()

        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
        # The EOS token is reused as the padding token.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.gpt2 = GPT2LMHeadModel.from_pretrained("gpt2-medium").to(self.device).float()
        self.gpt2.config.pad_token_id = self.tokenizer.eos_token_id

        # Freeze all of GPT-2 first, then selectively unfreeze for training.
        for param in self.gpt2.parameters():
            param.requires_grad = False

        if train_mode and gpt2_train_layers > 0:
            for block in self.gpt2.transformer.h[-gpt2_train_layers:]:
                for param in block.parameters():
                    param.requires_grad = True
            for param in self.gpt2.lm_head.parameters():
                param.requires_grad = True
            for param in self.gpt2.transformer.ln_f.parameters():
                param.requires_grad = True

    def extract_frames(self, video_path):
        """Sample up to ``num_frames`` evenly strided, centre-cropped frames.

        Each sampled frame is cropped to a centred square whose side is half
        the shorter image dimension, converted BGR -> RGB and passed through
        the CLIP preprocessing transform. If fewer than ``num_frames`` frames
        can be read, the remainder is padded with all-zero ``(3, 224, 224)``
        tensors.

        Returns:
            Tensor produced by stacking ``num_frames`` preprocessed frames.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            stride = max(1, total_frames // self.num_frames)

            for i in range(0, total_frames, stride):
                cap.set(cv2.CAP_PROP_POS_FRAMES, i)
                ret, frame = cap.read()
                if not ret:
                    break
                h, w, _ = frame.shape
                crop_size = min(h, w) // 2
                y, x = (h - crop_size) // 2, (w - crop_size) // 2
                cropped = cv2.cvtColor(frame[y:y+crop_size, x:x+crop_size], cv2.COLOR_BGR2RGB)
                frames.append(self.preprocess(Image.fromarray(cropped)))
                if len(frames) >= self.num_frames:
                    break
        finally:
            # Release the capture handle even if decoding/preprocessing raises.
            cap.release()

        if len(frames) < self.num_frames:
            frames.extend([torch.zeros(3, 224, 224)] * (self.num_frames - len(frames)))
        return torch.stack(frames)

    def forward(self, frames):
        """Turn a batch of frame stacks into one visual embedding per sample.

        Args:
            frames: Tensor whose first dimension is the batch size and that can
                be viewed as ``(batch * num_frames, 3, 224, 224)``.

        Returns:
            L2-normalised tensor of shape ``(batch, 1, projection_dim)``.
        """
        batch_size = frames.shape[0]
        frames = frames.view(-1, 3, 224, 224)
        # CLIP is always used as a fixed feature extractor here.
        with torch.no_grad():
            frame_features = self.clip.encode_image(frames.to(self.device))
        frame_features = frame_features.view(batch_size, self.num_frames, -1).float()
        frame_features = F.normalize(frame_features, p=2, dim=-1)
        temporal_out = self.temporal_encoder(frame_features)
        visual_embeds = self.projection(temporal_out["cls"])
        return F.normalize(visual_embeds, p=2, dim=-1).unsqueeze(1)

    def generate_commentary(self, video_path):
        """Generate, print and return commentary text for the video at ``video_path``.

        The visual embedding is prepended to the embedded text prompt and the
        combined sequence is passed to ``GPT2LMHeadModel.generate``. Only the
        text after the last ``ASSISTANT:`` marker of the decoded output is kept.
        """
        prompt = ("USER: <video> Professional commentary focusing on:"
                  "\n- Bowler's delivery\n- Batsman's shot\n- Fielding\n- Outcome"
                  "\nASSISTANT:")

        # Inference only: build the conditioning inputs without recording an
        # autograd graph through the temporal encoder / projection / embeddings.
        with torch.no_grad():
            frames = self.extract_frames(video_path).unsqueeze(0).to(self.device)
            visual_embeds = self.forward(frames)
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128).to(self.device)
            prompt_embeds = self.gpt2.transformer.wte(inputs['input_ids'])
            inputs_embeds = torch.cat([visual_embeds, prompt_embeds], dim=1)
            # The visual token is always attended to, hence a mask of ones for it.
            attention_mask = torch.cat([
                torch.ones(visual_embeds.shape[:2], dtype=torch.long).to(self.device),
                inputs['attention_mask']
            ], dim=1)

        output = self.gpt2.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            max_length=200, min_length=100,
            do_sample=True, temperature=0.8,
            top_k=40, top_p=0.9,
            repetition_penalty=1.15, num_beams=4,
            no_repeat_ngram_size=3, early_stopping=True,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id
        )

        decoded = self.tokenizer.decode(output[0], skip_special_tokens=True)
        commentary = re.split(r'ASSISTANT:', decoded)[-1].strip()
        print(commentary)
        return commentary

# -------------------- PIPELINE --------------------

def summarize_commentary(commentary, client):
    """Ask the chat model behind ``client`` to rewrite raw commentary for broadcast.

    Args:
        commentary: Raw commentary text to be rewritten.
        client: Object exposing ``chat.completions.create(messages=..., model=...)``
            whose result has ``choices[0].message.content``.

    Returns:
        The model's reply with surrounding whitespace stripped. The reply is
        also printed between two ``=`` rulers.
    """
    instructions = (
        "You are a professional cricket commentator. Rewrite the following raw cricket commentary into a short, energetic, and expressive version. "
        "Your output should sound like real-time broadcast commentary, use clean punctuation for TTS clarity, and be no more than 1–2 sentences. "
        "Keep it concise and sweet — and make sure the ball's **outcome is clearly mentioned at the end**.\n\n"
    )
    prompt = f'{instructions}Raw commentary:\n"{commentary}"\n\nFormatted commentary:'

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama-3.1-8b-instant",
    )
    rewritten = response.choices[0].message.content.strip()

    ruler = "=" * 50
    print(ruler)
    print(rewritten)
    print(ruler)
    return rewritten

def text_to_speech(text, elevenlabs_api_key, output_path):
    """Synthesize ``text`` with ElevenLabs and write the MP3 bytes to ``output_path``.

    Args:
        text: Text to be spoken.
        elevenlabs_api_key: API key passed to the ``ElevenLabs`` client.
        output_path: Destination file; it is opened in binary write mode.
    """
    tts_client = ElevenLabs(api_key=elevenlabs_api_key)
    request = dict(
        text=text,
        voice_id="URgDTjqBVr48zeu6FETI",
        model_id="eleven_multilingual_v2",
        output_format="mp3_44100_128",
    )
    audio_chunks = tts_client.text_to_speech.convert(**request)

    # The API result is iterated chunk by chunk and written out in order.
    with open(output_path, "wb") as out_file:
        out_file.writelines(audio_chunks)
def clear_kaggle_working(working_dir="/kaggle/working"):
    """Delete every file, symlink and sub-directory inside ``working_dir``.

    The directory itself is kept. Entries that cannot be removed are reported
    with a printed warning and skipped. If ``working_dir`` does not exist
    there is nothing to clear and the function returns silently.

    Args:
        working_dir: Directory to empty. Defaults to the Kaggle working
            directory so existing zero-argument calls behave as before.
    """
    if not os.path.isdir(working_dir):
        return

    for filename in os.listdir(working_dir):
        file_path = os.path.join(working_dir, filename)
        try:
            # Symlinks are unlinked (never followed), including links to directories.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"⚠️ Failed to delete {file_path}. Reason: {e}")
def _loop_to_length(segment, length_ms):
    """Repeat ``segment`` until it covers ``length_ms`` milliseconds, then trim to that length."""
    looped = segment
    while len(looped) < length_ms:
        looped += looped
    return looped[:length_ms]


def mix_audio(video_path, voice_path, crowd_path, output_path):
    """Replace a video's audio with crowd ambience plus a commentary voice-over.

    The voice track is trimmed to at most the video duration minus 100 ms and
    overlaid on crowd ambience attenuated by 10 dB. That mixed section is
    placed at the END of the video; the time before it is filled with crowd
    ambience attenuated by 15 dB. The combined track is exported to
    ``temp_mixed_audio.mp3`` in the current directory and muxed into
    ``output_path`` (H.264 video, AAC audio).

    Args:
        video_path: Source video file.
        voice_path: Commentary audio file.
        crowd_path: Crowd/ambience audio file; looped if shorter than needed.
        output_path: Destination video file.

    Raises:
        ValueError: If the crowd audio is empty (it could never be looped to
            the required length).
    """
    video = VideoFileClip(video_path)
    mixed_audio_clip = None
    try:
        video_duration_ms = video.duration * 1000
        # Clamp so that clips shorter than 100 ms yield an empty voice track
        # instead of a negative slice bound.
        voice_limit_ms = max(0, int(video_duration_ms - 100))
        voice = AudioSegment.from_file(voice_path)[:voice_limit_ms]

        # Decode the ambience once and derive both attenuated variants from it.
        ambience = AudioSegment.from_file(crowd_path)
        if len(ambience) == 0:
            raise ValueError(f"Crowd audio is empty and cannot be looped: {crowd_path}")

        mixed = _loop_to_length(ambience - 10, len(voice)).overlay(voice)

        head_ms = int(video_duration_ms - len(mixed))
        crowd_head = _loop_to_length(ambience - 15, head_ms)

        final_audio = crowd_head + mixed
        temp_audio_path = "temp_mixed_audio.mp3"
        final_audio.export(temp_audio_path, format="mp3")

        mixed_audio_clip = AudioFileClip(temp_audio_path)
        final_video = video.set_audio(mixed_audio_clip)
        final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
    finally:
        # Release the file readers held by the clips, even if writing failed.
        if mixed_audio_clip is not None:
            mixed_audio_clip.close()
        video.close()


In [22]:
def main():
    """Run the end-to-end pipeline: video -> commentary text -> speech -> final video.

    Steps:
      1. Empty the Kaggle working directory and load ``.env`` variables.
      2. Load the trained ``CricketCommentator`` weights and generate raw commentary.
      3. Rewrite the commentary with Groq and synthesize it with ElevenLabs.
      4. Trim the speech to its first 3 seconds with ffmpeg.
      5. Mix speech and crowd ambience into ``final_video.mp4``.

    Raises:
        subprocess.CalledProcessError: If the ffmpeg trim step exits non-zero.
    """
    # Local import so this cell does not depend on edits to the shared import cell.
    import subprocess

    clear_kaggle_working()
    load_dotenv()
    video_path = "/kaggle/input/test-omodel/test_four.mp4"
    model_weights_path = "/kaggle/input/cric_model_1/pytorch/default/1/best_cricket_commentator.pth"
    crowd_path = "/kaggle/input/voicer/Stadium_Ambience.mp3"

    model = CricketCommentator(train_mode=False)
    # NOTE(security): torch.load unpickles the checkpoint; only load weight files you trust.
    model.load_state_dict(torch.load(model_weights_path, map_location=model.device))
    model.eval()

    raw_commentary = model.generate_commentary(video_path)
    client = Groq(api_key=user_secrets.get_secret("GROQ_API_KEY"))
    clean_commentary = summarize_commentary(raw_commentary, client)
    tts_path = "commentary_final.mp3"
    text_to_speech(clean_commentary, user_secrets.get_secret("ElevenLabs"), tts_path)

    # Keep only the first 3 seconds of speech. An argument list avoids shell
    # interpolation, check=True surfaces ffmpeg failures here instead of as a
    # missing-file error later, and -y overwrites a stale output without prompting.
    trimmed_voice_path = "pro_audio3.mp3"
    subprocess.run(
        ["ffmpeg", "-y", "-i", tts_path, "-ss", "0", "-t", "3", trimmed_voice_path],
        check=True,
    )

    mix_audio(video_path, trimmed_voice_path, crowd_path, "final_video.mp4")


In [25]:
# Entry point: run the whole pipeline defined in main() when this cell is executed.
if __name__ == "__main__":
    main()    

out Caught by wicket keeper!! A full delivery outside off, Batsman gets hold of it and slogs it over deep mid-wicket for a half-dozen. The fielder at deep midwicket leaps to his right and takes a big leap as the ball goes sailing over long-on.
What a blow! Batsman smashes a full delivery over deep mid-wicket, but this time the fielder at deep midwicket makes a miraculous leap to his right and snares the catch, taking a huge hop in the process - OUT!!!


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Moviepy - Building video final_video.mp4.
MoviePy - Writing audio in final_videoTEMP_MPY_wvf_snd.mp4


                                                       

MoviePy - Done.
Moviepy - Writing video final_video.mp4



                                                              

Moviepy - Done !
Moviepy - video ready final_video.mp4


In [24]:
from IPython.display import Video
# Show the generated clip inline, 720 px wide, with the video data embedded in the output.
# Presumably this is the final_video.mp4 written by main() — assumes the kernel's
# working directory is /kaggle/working.
Video("/kaggle/working/final_video.mp4", embed=True, width=720)