# Description

This notebook translates a subtitle (`.srt`) file into English using an OpenAI chat model.

# Usage

1. Modify cell `Config` as needed
2. Add a `.env` file containing your OpenAI API key (`OPENAI_API_KEY`) or provide it as an environment variable
3. Run the notebook

In [1]:
import os
import pandas as pd
import pysrt
import warnings
from langchain_core.exceptions import OutputParserException
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from loguru import logger
from dotenv import load_dotenv
from more_itertools import batched
from pathlib import Path
from pydantic import BaseModel, Field, RootModel, StrictInt, StrictStr
from typing import Any, TypeAlias

# Load variables from a local `.env` file (if present) into the process environment
load_dotenv()
# Alias for a list of dictionaries with string keys and arbitrary values
ListOfStrDict: TypeAlias = list[dict[str, Any]]

In [2]:
# Imported only to name the warning category that is silenced on the next line
from langchain_core._api.deprecation import LangChainDeprecationWarning
# From this point on, LangChain deprecation warnings are not shown
warnings.filterwarnings("ignore", category=LangChainDeprecationWarning)

# Config

In [3]:
# Subtitle file to translate
SUBTITLE_FILEPATH = 'example.srt'
# Language of the subtitles in that file
SOURCE_LANGUAGE = 'Italian'

# Number of subtitle lines sent to the model in one request.
# - Larger batches improve speed and possibly quality: every request starts from
#   a fresh context, so the surrounding lines in the batch are the only hints
#   ChatGPT gets when the original text is ambiguous.
#   (This could probably be improved by keeping all batches in the same context.)
# - Larger batches may also produce more errors (e.g. due to ChatGPT "freezing").
BATCH_SIZE = 20

# OpenAI model name and API key (the key is read from the environment)
OPENAI_MODEL = "gpt-4o"
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Helpers

## Custom data structures for interacting with ChatGPT

In [4]:
# One subtitle line as sent to the model: an integer id plus the original text.
# NOTE: documented with `#` comments on purpose - a class docstring on a pydantic
# model is emitted as the `description` of its JSON schema.
class InputSegment(BaseModel):
    id: StrictInt = Field(description='Segment id')
    text: StrictStr = Field(description='Segment text (original)')


# Root model wrapping a plain list of InputSegment (one batch of lines to translate).
class InputSegments(RootModel):
    root: list[InputSegment] = Field(description='A list of input segments')


# One translated subtitle line expected back from the model: an integer id plus
# the translated text.
# NOTE: documented with `#` comments on purpose - a class docstring would be added
# as `description` to the JSON schema that the output parser generates.
class OutputSegment(BaseModel):
    id: StrictInt = Field(description='Segment id')
    text: StrictStr = Field(description='Segment text (translated)')


# Root model wrapping a plain list of OutputSegment; this is the type the
# output parser validates the model's reply against.
class OutputSegments(RootModel):
    root: list[OutputSegment] = Field(description='A list of translated segments')

## Function to translate segments

In [5]:
def translate_segments(llm: ChatOpenAI, input_segments: InputSegments,
                       source_language: str, retries: int = 6) -> OutputSegments:
    """Translate one batch of segments with the given chat model.

    The prompt is built from the module-level ``PROMPT_INFO`` template and the
    reply is parsed with the module-level ``PARSER``. When the reply cannot be
    parsed, the same request is sent again.

    Args:
        llm: Chat model used to perform the translation.
        input_segments: Segments to translate; serialised to JSON for the prompt.
        source_language: Language of the input segments.
        retries: Number of additional attempts after the first one, i.e. the
            model is called at most ``retries + 1`` times.

    Returns:
        The parsed translated segments, or an empty ``OutputSegments`` when no
        attempt produced a parsable reply.

    Only ``OutputParserException`` triggers a retry; any other exception raised
    by the model call propagates to the caller.
    """
    message = HumanMessagePromptTemplate.from_template(template=PROMPT_INFO)
    chat_prompt = ChatPromptTemplate.from_messages([message])
    chat_prompt_with_values = chat_prompt.format_prompt(
        # `model_dump_json` is the pydantic v2 replacement of the deprecated `.json()`
        json_payload=input_segments.model_dump_json(),
        source_language=source_language,
        format_instructions=PARSER_INSTRUCTIONS)
    # The request is identical for every attempt, so build it only once
    messages = chat_prompt_with_values.to_messages()

    # One initial attempt plus `retries` retries
    max_attempts = retries + 1
    for attempt in range(1, max_attempts + 1):
        try:
            # `invoke` replaces the deprecated direct call `llm(messages)`
            output = llm.invoke(messages)
            return PARSER.parse(output.content)
        except OutputParserException:
            logger.opt(exception=True).warning(
                f'Bad output structure (maybe cutoff?) - Attempt {attempt} / {max_attempts}')

    logger.warning(f'Segments translation failed after {max_attempts} attempts')
    return OutputSegments([])

# Create instructions for ChatGPT on how to format the response

In [6]:
# Parser that validates the model's reply against the OutputSegments schema
PARSER = PydanticOutputParser(pydantic_object=OutputSegments)
# Formatting instructions generated by the parser; translate_segments passes
# them to the prompt as the `format_instructions` value
PARSER_INSTRUCTIONS = PARSER.get_format_instructions()
# Show the instructions for reference
print(PARSER_INSTRUCTIONS)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"OutputSegment": {"properties": {"id": {"description": "Segment id", "title": "Id", "type": "integer"}, "text": {"description": "Segment text (translated)", "title": "Text", "type": "string"}}, "required": ["id", "text"], "title": "OutputSegment", "type": "object"}}, "description": "A list of translated segments", "items": {"$ref": "#/$defs/OutputSegment"}}
```


# Create prompt template

In [7]:
# Prompt template used for every batch. translate_segments fills in three
# placeholders: {source_language}, {json_payload} and {format_instructions}.
# {format_instructions} has to appear in the template: a value passed for a
# placeholder that the template does not contain never reaches the model.
PROMPT_INFO = """
Please translate given text segments provided in JSON format from {source_language} into English:

{json_payload}

{format_instructions}
"""

# Prepare LLM

In [8]:
# Chat model client, configured from the values of the Config cell
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=OPENAI_MODEL)

# Retrieve and translate lines from the subtitle file

In [9]:
subs = pysrt.open(SUBTITLE_FILEPATH)
subs_batches = list(batched(subs, n=BATCH_SIZE))

# empty subtitle file that will contain translated lines
translated_srt_file = pysrt.SubRipFile()

# variables used to show progression
nb_translated = 0
nb_segments = len(subs)

for batch_ix, subs_batch in enumerate(subs_batches):

    # segment ids are positions inside the batch, so they restart at 0 for every batch
    input_segment_list = [InputSegment(id=ix, text=s.text) for ix, s in enumerate(subs_batch)]
    input_segments = InputSegments(input_segment_list)
    output_segments = translate_segments(llm=llm, source_language=SOURCE_LANGUAGE,
                                         input_segments=input_segments)

    # The ids in the reply come from the model and cannot be trusted blindly:
    # they may be out of range, negative, duplicated or out of order.
    # Look translations up by id instead of indexing the batch with them.
    translation_by_id = {segment.id: segment.text for segment in output_segments.root}
    unknown_ids = sorted(set(translation_by_id) - set(range(len(subs_batch))))
    if unknown_ids:
        logger.warning(f'Batch {batch_ix}: ignoring unknown segment ids {unknown_ids}')

    # walk the batch in its original order so the translated file keeps the source order
    for ix, srt_line in enumerate(subs_batch):
        if ix not in translation_by_id:
            logger.warning(f'Batch {batch_ix}: no translation returned for subtitle {srt_line.index}')
            continue
        # reuse index and timing of the original line, only the text changes
        translated_srt_line = pysrt.SubRipItem(index=srt_line.index,
                                               start=srt_line.start,
                                               end=srt_line.end,
                                               text=translation_by_id[ix])
        translated_srt_file.append(translated_srt_line)
        # count only lines that really got a translation
        nb_translated += 1

    # show progression
    print(f'Translated segments: {nb_translated} / {nb_segments}', end='\r')

Translated segments: 7 / 7

# Check out the results

In [10]:
# Translated text of every line that made it into the translated file, keyed by SRT index
id_to_translation_map = {sub_item.index: sub_item.text for sub_item in translated_srt_file}

# One record per original subtitle line
initial_data = [
    dict(id=sub.index, text=sub.text, start=sub.start, end=sub.end)
    for sub in subs
]

# Original and translated text side by side, indexed by SRT index;
# lines without a translation show up as missing values
df = (
    pd.DataFrame(initial_data)
    .set_index('id')
    .assign(translation=lambda frame: frame.index.map(id_to_translation_map))
    [['text', 'translation', 'start', 'end']]
)

# widen the column display so long subtitle lines are not truncated
with pd.option_context('display.max_colwidth', 200):
    display(df)

Unnamed: 0_level_0,text,translation,start,end
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"Oh no, la ciclope è in piena caccia!","Oh no, the cyclops is in full hunt!","00:00:00,000","00:00:02,860"
2,"Attento, non guardare!","Careful, don't look!","00:00:03,600","00:00:04,940"
3,Non sto mica guardando!,I'm not looking!,"00:00:05,040","00:00:05,960"
4,"Qui, agente Sora Lela. Chiedo immediati rinforzi.","Here, Agent Sora Lela. Requesting immediate reinforcements.","00:00:09,380","00:00:15,860"
5,Saremo lì tra cinque minuti.,We will be there in five minutes.,"00:00:16,120","00:00:17,940"
6,"Ehi, nascondiamoci nel museo! So che il martedì si entra gratis.","Hey, let's hide in the museum! I know it's free on Tuesdays.","00:00:20,540","00:00:24,120"
7,Il martedì si entra gratis.,It's free on Tuesdays.,"00:00:30,000","00:00:30,240"


# Create translated SRT file

In [11]:
# Write the translation next to the source file as "<stem>_translated<suffix>"
subtitle_filepath_obj = Path(SUBTITLE_FILEPATH)
translated_filepath_obj = subtitle_filepath_obj.with_name(
    subtitle_filepath_obj.stem + '_translated' + subtitle_filepath_obj.suffix
)
translated_srt_file.save(translated_filepath_obj)

# Result

In [12]:
# Read the saved file back and show its content
result = translated_filepath_obj.read_text(encoding='utf-8')
print(result)

1
00:00:00,000 --> 00:00:02,860
Oh no, the cyclops is in full hunt!

2
00:00:03,600 --> 00:00:04,940
Careful, don't look!

3
00:00:05,040 --> 00:00:05,960
I'm not looking!

4
00:00:09,380 --> 00:00:15,860
Here, Agent Sora Lela. Requesting immediate reinforcements.

5
00:00:16,120 --> 00:00:17,940
We will be there in five minutes.

6
00:00:20,540 --> 00:00:24,120
Hey, let's hide in the museum! I know it's free on Tuesdays.

7
00:00:30,000 --> 00:00:30,240
It's free on Tuesdays.


