<a href="https://colab.research.google.com/github/jinkokaki/CC2024translator/blob/colab/Video_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><center>
    TRANSLATE OBJECTS IN VIDEO
</center></h1>

Install and import dependencies

In [None]:
!pip install opencv-python tesserocr Pillow ffmpeg scikit-image numpy ipywidgets

Collecting tesserocr
  Downloading tesserocr-2.6.2-cp310-cp310-manylinux_2_28_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: ffmpeg
  Building wheel for ffmpeg (setup.py) ... [?25l[?25hdone
  Created wheel for ffmpeg: filename=ffmpeg-1.4-py3-none-any.whl size=6080 sha256=cad4bf56b2a09fef5fc582049e50a5e20447f38c6660b9bf604c35dc875cd2fa
  Stored in directory: /root/.cache/pip/wheels/8e/7a/69/cd6aeb83b126a7f04cbe7c9d929028dc52a6e7d525ff56003a
Successfully built ffmpeg
Installing collected packages: tesserocr, ffmpeg, jed

In [None]:
import os, cv2, ffmpeg, re, shutil, glob, sys, csv
import numpy as np, ipywidgets as widgets
from PIL import Image, ImageDraw, ImageFont, ImageFilter, ImageEnhance
from google.cloud import translate_v2 as translate
from tesserocr import PyTessBaseAPI, PSM, OEM
from skimage.util import img_as_ubyte
from skimage.morphology import disk
from skimage.filters import rank
from IPython.display import display, clear_output

<h2><center>
    Convert video to images
</center></h2>

In [None]:
def split_video_to_png(video_path):
    """Split a video into numbered PNG frames inside the ``split_images`` folder.

    Frames are written in decode order as ``split_images/frame_0000.png``,
    ``split_images/frame_0001.png`` and so on. The folder is created when it
    does not exist yet; files already present in it are left alone.

    Args:
        video_path: Path of the video file to read with OpenCV.

    Returns:
        The number of frames that were written.

    Raises:
        IOError: If OpenCV cannot open ``video_path``.
    """
    output_folder = "split_images"
    os.makedirs(output_folder, exist_ok=True)

    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        # Fail here with a clear message instead of silently producing no
        # frames and crashing later when the output video is assembled.
        video_capture.release()
        raise IOError(f"Could not open video file: {video_path}")

    count = 0
    try:
        while True:
            success, frame = video_capture.read()
            if not success:
                break
            cv2.imwrite(f"{output_folder}/frame_{count:04d}.png", frame)
            count += 1
    finally:
        # Release the capture even when writing a frame raises.
        video_capture.release()

    return count

<h2><center>
    OCR images to get text and positional data
</center></h2>

Run the next cell only once: skip it if the `tessdata-4.1.0` folder already exists.

In [None]:
!curl -s -L https://github.com/tesseract-ocr/tessdata/archive/refs/tags/4.1.0.tar.gz | tar xvz

tessdata-4.1.0/
tessdata-4.1.0/.gitmodules
tessdata-4.1.0/LICENSE
tessdata-4.1.0/README.md
tessdata-4.1.0/afr.traineddata
tessdata-4.1.0/amh.traineddata
tessdata-4.1.0/ara.traineddata
tessdata-4.1.0/asm.traineddata
tessdata-4.1.0/aze.traineddata
tessdata-4.1.0/aze_cyrl.traineddata
tessdata-4.1.0/bel.traineddata
tessdata-4.1.0/ben.traineddata
tessdata-4.1.0/bod.traineddata
tessdata-4.1.0/bos.traineddata
tessdata-4.1.0/bre.traineddata
tessdata-4.1.0/bul.traineddata
tessdata-4.1.0/cat.traineddata
tessdata-4.1.0/ceb.traineddata
tessdata-4.1.0/ces.traineddata
tessdata-4.1.0/chi_sim.traineddata
tessdata-4.1.0/chi_sim_vert.traineddata
tessdata-4.1.0/chi_tra.traineddata
tessdata-4.1.0/chi_tra_vert.traineddata
tessdata-4.1.0/chr.traineddata
tessdata-4.1.0/configs
tessdata-4.1.0/cos.traineddata
tessdata-4.1.0/cym.traineddata
tessdata-4.1.0/dan.traineddata
tessdata-4.1.0/dan_frak.traineddata
tessdata-4.1.0/deu.traineddata
tessdata-4.1.0/deu_frak.traineddata
tessdata-4.1.0/div.traineddata
tessdata

In [None]:
def preprocess_image(image_path):
    """Prepare an image for OCR and return it as a PIL image.

    Steps: convert to 8-bit grayscale, apply rank-based local equalization
    with a disk footprint of radius 30, binarize with Otsu's threshold, then
    double the contrast. The result is also written to ``temp.png`` in the
    working directory.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The preprocessed ``PIL.Image.Image``.
    """
    # Close the source file as soon as the grayscale copy exists; this runs
    # once per frame, so leaked handles would accumulate on long videos.
    with Image.open(image_path) as source:
        image = source.convert('L')

    img = img_as_ubyte(image)
    img_tmp = rank.equalize(np.array(img), disk(30))
    # cv2.threshold returns (threshold_value, image); only the image is needed.
    img_tmp = cv2.threshold(img_tmp, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    img_eq = Image.fromarray(img_tmp)
    img_eq = ImageEnhance.Contrast(img_eq).enhance(2)
    img_eq.save("temp.png")
    return img_eq

In [None]:
# Shared Tesseract engine used by the OCR helpers below. It loads the Japanese
# and English models from the local "tessdata-4.1.0" folder and is configured
# with the AUTO_OSD page segmentation mode and the LSTM_ONLY engine mode.
api = PyTessBaseAPI(
    path="tessdata-4.1.0",
    lang="jpn+eng",
    psm=PSM.AUTO_OSD,
    oem=OEM.LSTM_ONLY,
)

def init(image_name, is_video):
    """Preprocess an image and run Tesseract recognition on it.

    Args:
        image_name: File name of a frame inside ``split_images/`` when
            ``is_video`` is true, otherwise the path of the image itself.
        is_video: Whether ``image_name`` refers to an extracted video frame.
    """
    source_path = "split_images/" + image_name if is_video else image_name
    prepared = preprocess_image(source_path)
    api.SetImage(prepared)
    api.Recognize()

def get_orientation():
    """Return the page orientation and deskew angle of the current image.

    Both values come from the layout analysis of the shared ``api`` instance
    and are returned as strings produced by ``format()``.
    """
    layout = api.AnalyseLayout()
    page_orientation, _direction, _order, skew_angle = layout.Orientation()
    return format(page_orientation), format(skew_angle)

def write_TSV():
    """Write the TSV text of the current recognition to ``results.tsv``."""
    # The with-block closes the file, so no explicit close() is needed.
    with open("results.tsv", "w") as tsv_file:
        tsv_text = api.GetTSVText(0)
        tsv_file.write(tsv_text)

    #with open("results.tsv", "r") as result:
    #    print(result.read())

def get_regions():
    """Group the OCR words in ``results.tsv`` into one text region per block.

    Each row is expected to use the Tesseract TSV column order: level,
    page_num, block_num, par_num, line_num, word_num, left, top, width,
    height, conf, text. A row contributes to its block when its confidence is
    exactly 0 or above 80; structural rows (confidence -1) never contribute.

    Returns:
        A list of ``(text, left, top, right, bottom)`` tuples, one per block
        that contains non-blank text. ``text`` is the accepted words, each
        preceded by a single space (so it starts with a space); the four
        numbers are the float bounding box enclosing those words.
    """
    regions = []
    words = []
    lefts, tops, rights, bottoms = [], [], [], []

    def flush_block():
        # Emit the block collected so far (if it has visible text) and reset.
        text = "".join(" " + word for word in words)
        if text.strip():
            regions.append((
                text,
                min(lefts, default=-1),
                min(tops, default=-1),
                max(rights, default=-1),
                max(bottoms, default=-1),
            ))
        for bucket in (words, lefts, tops, rights, bottoms):
            bucket.clear()

    current_block = None
    with open("results.tsv", "r", newline="") as tsv:
        # Tesseract does not quote TSV fields, so quote handling must be off:
        # otherwise a word that starts with '"' opens a "quoted field" that
        # swallows every following row up to the next quote character.
        reader = csv.reader(tsv, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) < 11:
                # Blank or truncated line: nothing usable in it.
                continue

            # Follow the block number found in the data instead of a counter,
            # so gaps in the numbering cannot desynchronize the grouping.
            block = float(row[2])
            if block != current_block:
                flush_block()
                current_block = block

            confidence = float(row[10])
            if confidence == 0 or confidence > 80:
                left, top = float(row[6]), float(row[7])
                words.append(row[11] if len(row) > 11 else "")
                lefts.append(left)
                tops.append(top)
                rights.append(left + float(row[8]))
                bottoms.append(top + float(row[9]))

    flush_block()
    return regions

def end():
    """Call ``End()`` on the shared ``api`` instance.

    NOTE(review): presumably this releases the Tesseract engine so ``api``
    cannot be used afterwards; confirm against the tesserocr documentation.
    """
    api.End()

<h2><center>
    Translate text to English
</center></h2>

In [None]:
# Point the Google Cloud client libraries at the service-account key file.
# NOTE(review): this path is hard-coded for one specific key uploaded to the
# Colab /content directory; adjust it to your own key file before running.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/buoyant-song-414621-671aac12546a.json'

# Translation client shared by all translate_text() calls; created on first use
# and reset whenever this cell is re-run (e.g. after changing the credentials).
_translate_client = None


def translate_text(text):
    """Translate ``text`` to English with the Google Cloud Translation API.

    The client is created once and reused, because this function is called
    for every text region of every frame. The request uses the plain-text
    format so that the result does not contain HTML entities (for example
    ``&#39;`` instead of an apostrophe), which would otherwise be drawn
    literally onto the image.

    Args:
        text: The string to translate.

    Returns:
        The translated string.
    """
    global _translate_client
    if _translate_client is None:
        _translate_client = translate.Client()
    result = _translate_client.translate(text, target_language='en', format_='text')
    return result['translatedText']

In [None]:
def translate_regions(regions):
    """Translate the text of every region and keep the ones that contain letters.

    Args:
        regions: Iterable of tuples whose first item is the source text and
            whose remaining items (the bounding box) are passed through as is.

    Returns:
        A list of tuples shaped like the input, with the first item replaced
        by the translated text. Regions whose translation has no alphabetic
        character are dropped.
    """
    kept = []
    for region in regions:
        source_text, box = region[0], region[1:]
        # Surrounding whitespace is stripped before the text is translated.
        english = translate_text(source_text.strip())

        if not any(map(str.isalpha, english)):
            continue
        kept.append((english,) + box)
    return kept

<h2><center>
    Replace text in images with translated text
</center></h2>

In [None]:
def find_max_font_size(text, box_height, box_width):
    """Find the largest font size at which ``text`` still fits inside a box.

    Binary-searches sizes 1..1000 of the ``arial.ttf`` font, measuring the
    text with ``font.getbbox``; ``ImageFont.truetype`` must therefore be able
    to locate ``arial.ttf``.

    Args:
        text: The string to measure.
        box_height: Maximum allowed text height in pixels.
        box_width: Maximum allowed text width in pixels.

    Returns:
        The largest fitting size, or 1 when no size in the range fits.
    """
    best_size = 1
    smallest, largest = 1, 1000

    while smallest <= largest:
        candidate = (smallest + largest) // 2
        bounds = ImageFont.truetype("arial.ttf", candidate).getbbox(text)
        rendered_width = bounds[2] - bounds[0]
        rendered_height = bounds[3] - bounds[1]

        too_big = rendered_width > box_width or rendered_height > box_height
        if too_big:
            # Candidate overflows the box: continue with smaller sizes.
            largest = candidate - 1
        else:
            # Candidate fits: remember it and try larger sizes.
            best_size = candidate
            smallest = candidate + 1

    return best_size

In [None]:
def _translated_file_name(image_name):
    """Return ``image_name`` with ``_translated`` inserted before its extension."""
    stem, extension = os.path.splitext(image_name)
    return stem + "_translated" + extension


def _average_color(region):
    """Return the mean pixel value of a PIL image, or None when it has no pixels."""
    pixels = list(region.getdata())
    if not pixels:
        return None
    if isinstance(pixels[0], tuple):
        # Multi-band image (e.g. RGB/RGBA): average every band separately.
        return tuple(int(sum(band) / len(band)) for band in zip(*pixels))
    # Single-band image: pixels are plain numbers.
    return int(sum(pixels) / len(pixels))


def add_text_to_image(image_name, text, x, y, x2, y2, is_video):
    """Cover a box of an image with its average color and draw ``text`` in it.

    The result is saved under the original name with ``_translated`` added
    before the extension: inside ``translated_images/`` for video frames,
    next to the source file for single images. When that file already exists
    it is used as the base image, so several regions of the same image
    accumulate across successive calls.

    Args:
        image_name: Frame file name inside ``split_images/`` (video) or image path.
        text: Text to draw, centered in the box, in black.
        x, y: Left and top coordinates of the box.
        x2, y2: Right and bottom coordinates of the box.
        is_video: Whether ``image_name`` is an extracted video frame.
    """
    output_folder = "translated_images"
    os.makedirs(output_folder, exist_ok=True)

    width = x2 - x
    height = y2 - y

    translated_image_name = _translated_file_name(image_name)
    if is_video:  # frames are read from and written to dedicated folders
        translated_image_name = output_folder + "/" + translated_image_name
        original_path = "split_images/" + image_name
    else:  # single image: output is saved next to the source
        original_path = image_name

    print(image_name + " - ", end="")
    if os.path.exists(translated_image_name):
        img_to_open = translated_image_name
        print("Old image")
    else:
        img_to_open = original_path
        print("New image")

    # Work on an in-memory copy so the source file is closed before saving
    # (the output may be the very file that was just opened).
    with Image.open(img_to_open) as img:
        new_img = img.copy()

    box = (x, y, x2, y2)
    avg_color = None
    if width > 0 and height > 0:
        avg_color = _average_color(new_img.crop(box))

    # An empty box has no color to average and no room for text: the image is
    # then saved unchanged instead of failing.
    if avg_color is not None:
        draw = ImageDraw.Draw(new_img)

        # Largest font size at which the text still fits in the box
        font_size = find_max_font_size(text, height, width)
        font = ImageFont.truetype("arial.ttf", font_size)

        # Hide the original text behind a rectangle of the average color
        draw.rectangle(box, fill=avg_color)

        # Center the text within the box
        text_bbox = draw.textbbox((x, y), text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        text_x = x + (width - text_width) // 2
        text_y = y + (height - text_height) // 2
        draw.text((text_x, text_y), text, fill="black", font=font)

    new_img.save(translated_image_name)

<h2><center>
    Convert images back to video (without sound)
</center></h2>

In [None]:
def sync_images():
    """Copy every frame that has no translated counterpart into ``translated_images``.

    For each ``frame_<n>.png`` in ``split_images`` the file
    ``translated_images/frame_<n>_translated.png`` is created from the
    original frame when it does not exist yet and ``<n>`` is lower than the
    number of PNG files in ``split_images``. Every copy is reported on stdout.
    """
    source_dir = "split_images"
    target_dir = "translated_images"

    # Make sure the destination exists before anything is copied into it
    os.makedirs(target_dir, exist_ok=True)

    frame_files = [name for name in os.listdir(source_dir) if name.endswith('.png')]
    frame_total = len(frame_files)

    for name in frame_files:
        # "frame_0012.png" -> "0012"
        frame_number = name.split('_')[1].split('.')[0]
        target_path = os.path.join(target_dir, f"frame_{frame_number}_translated.png")

        # Skip frames that are already translated or numbered beyond the count
        if os.path.exists(target_path) or int(frame_number) >= frame_total:
            continue

        shutil.copy(os.path.join(source_dir, name), target_path)
        print(f"Copied {name} to {target_path}")

In [None]:
def images_to_video(video_name):
    """Assemble the PNG frames in ``translated_images`` into ``video_output.mp4``.

    The frame rate is read from the source video and the frame size from the
    first image. Frames are ordered by the number embedded in their file name,
    so the order stays correct once the counter outgrows its zero padding
    (plain string sorting would put ``frame_10000`` before ``frame_1001``).
    The output has no audio track.

    Args:
        video_name: Path of the original video; only its frame rate is used.

    Raises:
        FileNotFoundError: If ``translated_images`` contains no PNG file.
        IOError: If no positive frame rate can be read from ``video_name``.
    """
    imgFolder = "translated_images"
    vidName = "video_output.mp4"

    cap = cv2.VideoCapture(video_name)
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()
    if not fps or fps <= 0:
        raise IOError(f"Could not read the frame rate of video file: {video_name}")

    def frame_order(path):
        # Sort on the first number in the file name; the path breaks ties.
        number = re.search(r"\d+", os.path.basename(path))
        return (int(number.group()) if number else -1, path)

    images = sorted(glob.glob(os.path.join(imgFolder, "*.png")), key=frame_order)
    if not images:
        raise FileNotFoundError(f"No PNG frames found in '{imgFolder}'")

    with Image.open(images[0]) as first_frame:
        size = first_frame.size  # (width, height)

    codec = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(vidName, codec, fps, size)
    try:
        for img in images:
            video.write(cv2.imread(img))
    finally:
        # Finalize the file even when a frame fails to load or encode. No GUI
        # window is ever opened here, so there is nothing else to tear down.
        video.release()

<h2><center>
    Process Image
</center></h2>

In [None]:
def process_image(image_name):
    """OCR a single image, translate its text and save a translated copy.

    The translated copy is written next to the source as
    ``<name>_translated<ext>``. A copy left over from an earlier run is
    removed first, because ``add_text_to_image`` draws on top of an existing
    translated file and would otherwise stack the new text on the old result.

    Args:
        image_name: Path of the image to process.
    """
    # process image with ocr
    init(image_name, False)
    write_TSV()
    regions = get_regions()

    # translate text
    translated_regions = translate_regions(regions)

    # start from the original image, not from a previous run's output
    stem, extension = os.path.splitext(image_name)
    previous_output = stem + "_translated" + extension
    if os.path.exists(previous_output):
        os.remove(previous_output)

    # overlay new text on the image; phrase is (text, left, top, right, bottom)
    for phrase in translated_regions:
        print("Image: " + image_name + " text: " + phrase[0] + " x: " + str(phrase[1]) + " y: " + str(phrase[2]) + " x2: " + str(phrase[3]) + " y2: " + str(phrase[4]))
        add_text_to_image(image_name, phrase[0], phrase[1], phrase[2], phrase[3], phrase[4], False)
    print("Image Processed Successfully!")

<h2><center>
    Process Video
</center></h2>

In [None]:
def process_video(video_name):
    """Translate the text in every frame of a video and rebuild the video.

    The working folders ``split_images`` and ``translated_images`` are emptied
    first: frames left over from an earlier (possibly longer or different)
    video would otherwise be mixed into the new output, and existing
    translated frames would be drawn over instead of the fresh ones. The
    result is written without sound by ``images_to_video``.

    Args:
        video_name: Path of the video to process.
    """
    split_images_folder = 'split_images'
    translated_images_folder = 'translated_images'

    # start from clean working folders
    for folder in (split_images_folder, translated_images_folder):
        shutil.rmtree(folder, ignore_errors=True)

    # process video to images
    split_video_to_png(video_name)

    # process images with ocr, in frame order and ignoring non-frame files
    frames = sorted(name for name in os.listdir(split_images_folder) if name.endswith('.png'))
    for frame in frames:
        init(frame, True)
        write_TSV()
        regions = get_regions()

        # translate text
        translated_regions = translate_regions(regions)

        # overlay new text on images; phrase is (text, left, top, right, bottom)
        for phrase in translated_regions:
            print("Frame: " + frame + " text: " + phrase[0] + " x: " + str(phrase[1]) + " y: " + str(phrase[2]) + " x2: " + str(phrase[3]) + " y2: " + str(phrase[4]))
            add_text_to_image(frame, phrase[0], phrase[1], phrase[2], phrase[3], phrase[4], True)

    # save as video (without sound)
    sync_images()
    images_to_video(video_name)

    print("Video Processed Successfully!")

<h1><center>
    Main Function
</center></h1>

In [None]:
def main():
    """Show an upload form and translate the uploaded MP4 video or PNG image.

    If the sample file ``input_video2.mp4`` is present in the working
    directory it is processed first; when it is absent that step is skipped
    so the upload form is still displayed.
    """
    sample_video = "input_video2.mp4"
    if os.path.exists(sample_video):
        process_video(sample_video)

    # Define the upload widget
    upload = widgets.FileUpload(
        accept='.mp4, .png',  # Accepted file types
        multiple=False  # Only allow one file to be uploaded
    )

    # Define the submit button
    submit_button = widgets.Button(
        description='Submit',
        disabled=True,  # Initially disabled
        button_style='',  # Initially gray
        layout={'visibility': 'visible'}  # Initially visible
    )

    # Define the output widget to display messages
    output = widgets.Output()

    def on_upload_change(change):
        # The button is usable (and green) only while a file is selected
        has_file = bool(upload.value)
        submit_button.disabled = not has_file
        submit_button.button_style = 'success' if has_file else ''

    def on_submit_button_clicked(b):
        with output:
            if not upload.value:
                return

            # Retrieve the uploaded file. ipywidgets 7 exposes a dict of
            # {name: {'metadata': {...}, 'content': bytes}}; ipywidgets 8
            # exposes a tuple of {'name': ..., 'content': memoryview, ...}.
            uploaded = upload.value
            if isinstance(uploaded, dict):
                uploaded_file = next(iter(uploaded.values()))
                raw_name = uploaded_file['metadata']['name']
            else:
                uploaded_file = uploaded[0]
                raw_name = uploaded_file['name']

            # Keep only the final path component so the uploaded name cannot
            # point outside the working directory.
            file_name = os.path.basename(raw_name)
            if not file_name:
                print(f"Error: File '{raw_name}' is not an MP4 or PNG file.")
                return

            # Save the file
            content = uploaded_file['content']
            with open(file_name, 'wb') as f:
                f.write(content)

            # Update the message
            print(f"File {file_name} has been uploaded and saved.")

            # Hide the widgets
            upload.layout.visibility = 'hidden'
            submit_button.layout.visibility = 'hidden'

            lowered_name = file_name.lower()  # accept ".MP4" / ".PNG" as well
            if lowered_name.endswith('.mp4'):  # video
                process_video(file_name)
            elif lowered_name.endswith('.png'):  # image
                process_image(file_name)
            else:  # invalid file type
                print(f"Error: File '{file_name}' is not an MP4 or PNG file.")

    # Attach the event handlers
    upload.observe(on_upload_change, names='value')
    submit_button.on_click(on_submit_button_clicked)

    # Display the widgets
    display(upload, submit_button, output)

# Start the application when this cell is executed as the main module.
if __name__ == '__main__':
    main()

Frame: frame_0112.png text: - Save - a x: 38.0 y: 104.0 height: 80.0 width: 309.0
frame_0112.png - Old image
Frame: frame_0112.png text: moon x: 0.0 y: 349.0 height: 43.0 width: 367.0
frame_0112.png - Old image


In [None]:
# prompt: delete everything in "translated_images" using rm -rf

#!rm -rf translated_images
#!rm -rf split_images/