<a href="https://colab.research.google.com/github/SrikanthArgp/colab_practices/blob/main/Autogen_Audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install autogen-agentchat~=0.2 openai openai-whisper

Collecting autogen-agentchat~=0.2
  Downloading autogen_agentchat-0.2.39-py3-none-any.whl.metadata (30 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache (from autogen-agentchat~=0.2)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting docker (from autogen-agentchat~=0.2)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting flaml (from autogen-agentchat~=0.2)
  Downloading FLAML-2.3.2-py3-none-any.whl.metadata (16 kB)
Collecting python-dotenv (from autogen-agentchat~=0.2)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting tiktoken (from autogen-agentchat~=0.2)
  Downlo

In [None]:
import os
from google.colab import userdata


# Model configuration handed to the assistant agent's llm_config. The API key
# is read through google.colab's userdata helper rather than written inline.
config_list = [dict(model="gpt-4", api_key=userdata.get('OPENAI_API_KEY'))]

In [None]:
from typing import Annotated, List

import whisper
from openai import OpenAI

import autogen

# Translation direction, the OpenAI key used by translate_text(), and the
# video file that the chat below is asked to process.
source_language, target_language = "English", "Hindi"
key = userdata.get('OPENAI_API_KEY')
target_video = "/content/drive/MyDrive/video.mp4"


def _ends_with_terminate(message):
    """Return a truthy value when the message content ends with TERMINATE.

    Missing, empty or None content yields a falsy value, so messages without
    text never count as a termination signal.
    """
    content = message.get("content", "")
    return content and content.rstrip().endswith("TERMINATE")


# Assistant agent driven by the models listed in config_list.
assistant = autogen.AssistantAgent(
    name="assistant",
    llm_config=dict(config_list=config_list, timeout=120),
    system_message="For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.",
)

# Proxy agent on which the tool functions defined below are registered for
# execution; it stops once the assistant's reply ends with TERMINATE.
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    code_execution_config={},
    is_termination_msg=_ends_with_terminate,
)


def translate_text(input_text, source_language, target_language, model="gpt-3.5-turbo", max_tokens=1500):
    """Translate one piece of subtitle text with an OpenAI chat model.

    Args:
        input_text: Text to translate.
        source_language: Name of the language ``input_text`` is written in.
        target_language: Name of the language to translate into.
        model: Chat-completions model to use. Defaults to "gpt-3.5-turbo",
            the value that was previously hard-coded.
        max_tokens: Upper bound on the reply length. Defaults to 1500, the
            value that was previously hard-coded.

    Returns:
        The translated text; ``input_text`` unchanged when it is empty, None
        or whitespace-only; or None when the response contains no choices.
    """
    # Blank input has nothing to translate, so do not spend an API call on it.
    if not input_text or not input_text.strip():
        return input_text

    # `key` is the module-level OpenAI API key defined alongside the agents.
    client = OpenAI(api_key=key)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"Directly translate the following {source_language} text to a pure {target_language} "
                f"video subtitle text without additional explanation.: '{input_text}'",
            },
        ],
        max_tokens=max_tokens,
    )

    # The first choice carries the reply; guard against an empty choices list.
    translated_text = response.choices[0].message.content if response.choices else None
    return translated_text


@user_proxy.register_for_execution()
@assistant.register_for_llm(description="using translate_text function to translate the script")
def translate_transcript(
    source_language: Annotated[str, "Source language"], target_language: Annotated[str, "Target language"]
) -> str:
    """Translate ``transcription.txt`` line by line into ``translation.txt``.

    Each input line is expected to look like ``"<timestamp>: <text>"``. Only
    the text part is sent to ``translate_text``; the timestamp prefix is kept
    as-is. Lines without a ``": "`` separator are copied through unchanged.

    Args:
        source_language: Language the transcript is written in.
        target_language: Language to translate the transcript into.

    Returns:
        The translated transcript as a single newline-joined string (the same
        lines that are written to ``translation.txt``).
    """
    # Read with the default encoding, matching how transcription.txt is written.
    with open("transcription.txt", "r") as f:
        lines = f.readlines()

    translated_transcript = []
    # Write UTF-8 explicitly so non-Latin target scripts (e.g. Hindi) can be
    # saved regardless of the platform's default encoding.
    with open("translation.txt", "w", encoding="utf-8") as file:
        for line in lines:
            stripped = line.strip()
            # Split on the FIRST ": " only, so a sentence that itself contains
            # ": " is still translated as one piece instead of being skipped.
            timestamp, separator, text = stripped.partition(": ")
            if separator:
                translated_text = translate_text(text, source_language, target_language)
                if translated_text is None:
                    # No translation came back; keep the original text rather
                    # than writing the literal string "None".
                    translated_text = text
                output_line = f"{timestamp}: {translated_text}"
            else:
                # No timestamp separator: pass the line through as it is.
                output_line = stripped
            translated_transcript.append(output_line)
            file.write(f"{output_line}\n")

    return "\n".join(translated_transcript)


def _transcript_entry(text, start, end):
    """Build one transcript record, adding a final "." only if the text lacks end punctuation."""
    text = text.strip()
    if not text.endswith((".", "!", "?")):
        text += "."
    return {"sentence": text, "timestamp_start": start, "timestamp_end": end}


@user_proxy.register_for_execution()
@assistant.register_for_llm(description="recognize the speech from video and transfer into a txt file")
def recognize_transcript_from_video(filepath: Annotated[str, "path of the video file"]) -> List[dict]:
    """Transcribe a video with Whisper and save a timestamped transcript.

    Loads the "small" Whisper model, transcribes ``filepath`` and groups the
    returned segments into entries. Each entry is written to
    ``transcription.txt`` as ``"<start>s to <end>s: <sentence>"``.

    Args:
        filepath: Path of the video file to transcribe.

    Returns:
        A list of dicts with the keys ``sentence``, ``timestamp_start`` and
        ``timestamp_end``. NOTE: on failure an error-message string is
        returned instead of raising, despite the ``List[dict]`` annotation.
    """
    try:
        # Load model
        model = whisper.load_model("small")

        # Transcribe audio with detailed timestamps
        result = model.transcribe(filepath, verbose=True)
        segments = result["segments"]

        transcript = []
        sentence = ""
        # Unknown until the first segment is seen; starting from a fixed 0
        # would mislabel recordings whose speech begins later.
        start_time = None

        for segment in segments:
            if start_time is None:
                start_time = segment["start"]
            elif segment["start"] != start_time and sentence:
                # A segment with a new start time closes the pending entry;
                # the entry ends where this segment begins.
                transcript.append(_transcript_entry(sentence, start_time, segment["start"]))
                sentence = ""
                start_time = segment["start"]

            # Accumulate this segment's text into the pending entry
            sentence += segment["text"] + " "

        # Flush the last pending entry, ending at the final segment's end time
        if sentence:
            transcript.append(_transcript_entry(sentence, start_time, segments[-1]["end"]))

        # Save the transcript to a file
        with open("transcription.txt", "w") as file:
            for item in transcript:
                file.write(f"{item['timestamp_start']}s to {item['timestamp_end']}s: {item['sentence']}\n")

        return transcript

    except FileNotFoundError:
        return "The specified audio file could not be found."
    except Exception as e:
        # Report the failure as text so the calling agent receives a reply
        # instead of an exception.
        return f"An unexpected error occurred: {str(e)}"

In [None]:
# Start the conversation: the proxy asks the assistant to transcribe the video
# and then translate the resulting script. The call is left as the cell's last
# expression so its return value is displayed as the cell output.
user_proxy.initiate_chat(
    recipient=assistant,
    message=(
        f"For the video located in {target_video}, recognize the speech and transfer it into a script file, "
        f"then translate from {source_language} text to a {target_language} video subtitle text. "
    ),
)

user_proxy (to assistant):

For the video located in /content/drive/MyDrive/video.mp4, recognize the speech and transfer it into a script file, then translate from English text to a Hindi video subtitle text. 

--------------------------------------------------------------------------------
assistant (to user_proxy):

***** Suggested tool call (call_idk9Ctse3GxpA9oEdNWv50yd): recognize_transcript_from_video *****
Arguments: 
{
"filepath": "/content/drive/MyDrive/video.mp4"
}
************************************************************************************************

--------------------------------------------------------------------------------

>>>>>>>> EXECUTING FUNCTION recognize_transcript_from_video...


  checkpoint = torch.load(fp, map_location=device)


Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:06.800]  Lang chain modules. So these are few things that we will try to explore more. We are right here
[00:06.800 --> 00:14.080]  in the documentation website of Lang chain and as we can see there are mainly six modules available
[00:14.080 --> 00:21.360]  in Lang chain models prompt memory indexes chains agents and callbacks. So we'll go from
[00:21.360 --> 00:28.720]  top to bottom and we'll try to dig deeper into all of this modules. So let's go ahead and try to
[00:28.720 --> 00:36.240]  understand the first one which is models. As the documentation goes, these are some of the models
[00:36.240 --> 00:43.600]  that are being used in Lang chain. So we'll understand the applications of LLMs chart models
[00:43.600 --> 00:50.640]  and text embedding models one by one. Moving ahead, I have tried to mention all the modules which
[00:50.640 --> 00:5

ChatResult(chat_id=None, chat_history=[{'content': 'For the video located in /content/drive/MyDrive/video.mp4, recognize the speech and transfer it into a script file, then translate from English text to a Hindi video subtitle text. ', 'role': 'assistant', 'name': 'user_proxy'}, {'tool_calls': [{'id': 'call_idk9Ctse3GxpA9oEdNWv50yd', 'function': {'arguments': '{\n"filepath": "/content/drive/MyDrive/video.mp4"\n}', 'name': 'recognize_transcript_from_video'}, 'type': 'function'}], 'content': None, 'role': 'assistant'}, {'content': '[{"sentence": "Lang chain modules. So these are few things that we will try to explore more. We are right here.", "timestamp_start": 0, "timestamp_end": 6.8}, {"sentence": "in the documentation website of Lang chain and as we can see there are mainly six modules available.", "timestamp_start": 6.8, "timestamp_end": 14.08}, {"sentence": "in Lang chain models prompt memory indexes chains agents and callbacks. So we\'ll go from.", "timestamp_start": 14.08, "times