In [1]:
from easyocr import Reader
import re
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.auto import trange,tqdm
from datetime import timedelta
from srt import Subtitle,compose,parse
from difflib import SequenceMatcher
import os.path
from googletrans import Translator

# Shared googletrans client; the subtitle loop further down calls
# translator.translate(text, dest='en') on each newly recognised line.
translator = Translator()

In [2]:
# Which episode to process. The video is expected at
# <name>/Series<series>/<name>_S<ss>E<ee>.mp4 relative to the notebook.
name = 'jixiaolan'
series = 1
ep = 1
# Zero-pad the series number the same way as the episode number so that
# series >= 10 produces "S10" rather than "S010".
filename = f"{name}/Series{series}/{name}_S{series:02d}E{ep:02d}.mp4"

In [3]:
# Open the video first so a wrong path fails immediately, before the slow
# OCR model load below.
cap = cv2.VideoCapture(filename)
if not cap.isOpened():
    raise OSError(f"Could not open video file: {filename}")

# Some characteristics from the original video
fps, frames = cap.get(cv2.CAP_PROP_FPS), cap.get(cv2.CAP_PROP_FRAME_COUNT)
if fps <= 0:
    # Without this guard the division below fails with an unhelpful
    # ZeroDivisionError when the container reports no frame rate.
    raise ValueError(f"Video reports an invalid frame rate ({fps}): {filename}")
# Video length in seconds.
second = frames/fps

# EasyOCR reader restricted to the 'ch_sim' language model.
reader_ch = Reader(['ch_sim'])
# Maps each entry of the 'word' column to its 'rank' column value; used by
# clean() and get_text() to judge how plausible a recognised character is.
common_dict = pd.read_csv('common_chinese_words.csv').set_index('word')['rank'].to_dict()

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


In [4]:
def make_gray(img):
    """Return ``img`` as a single-channel 8-bit grayscale array.

    Arrays whose maximum value is <= 1 are treated as normalised and are
    scaled by 255 before the conversion. The scaling is done out of place,
    so the caller's array is never modified.

    Parameters
    ----------
    img : numpy.ndarray
        Image data accepted by ``PIL.Image.fromarray`` once cast to uint8.

    Returns
    -------
    numpy.ndarray
        The image after PIL's mode ``'L'`` conversion.
    """
    if np.max(img) <= 1:
        # `img *= 255` would overwrite the caller's array and raises for
        # dtypes that cannot hold the product (e.g. bool); build a new array.
        img = img * 255
    # NOTE(review): PIL's 'L' conversion assumes RGB channel order, while
    # frames decoded by OpenCV are BGR — confirm the swapped weights are fine.
    gray = Image.fromarray(img.astype(np.uint8)).convert('L')
    return np.array(gray)


def dilate_erode(img):
    """Dilate and then erode an image with a 2x2 kernel.

    The input is first cast to uint8 and converted to PIL mode ``'L'``.
    It is then dilated for 4 iterations and eroded for 5 iterations, both
    with the same all-ones 2x2 structuring element.

    Returns the processed image as a numpy array.
    """
    grayscale = np.array(Image.fromarray(img.astype('uint8')).convert('L'))

    # One structuring element serves both morphological passes.
    structuring_element = np.ones((2, 2), np.uint8)
    dilated = cv2.dilate(grayscale, structuring_element, iterations=4)
    eroded = cv2.erode(dilated, structuring_element, iterations=5)

    return np.array(eroded)

def get_x(list_of_lists):
    """Return ``(min_x, max_x)`` for the point list stored at index 0.

    ``list_of_lists[0]`` is read as a sequence of points whose first
    coordinate is x (get_text passes one OCR result entry here). Both
    bounds are truncated to ``int``.
    """
    corners = np.asarray(list_of_lists[0])
    x_coords = corners[:, 0]
    left, right = x_coords.min(), x_coords.max()
    return int(left), int(right)

def get_frame(frame_no, crop=(935, 1015, 150, 1920 - 150)):
    """Return the thresholded subtitle strip of one video frame.

    Seeks the notebook-level ``cap`` to ``frame_no``, reads that frame,
    crops it, converts the crop to grayscale and applies an inverse binary
    threshold at 200 (pixels above 200 become 0, all others 255).

    Parameters
    ----------
    frame_no : int or float
        Frame index to seek to.
    crop : tuple, optional
        ``(row_start, row_end, col_start, col_end)`` of the subtitle area.
        The default keeps the values that used to be hard-coded here.
        NOTE(review): they look tuned for a 1920-pixel-wide frame — confirm.

    Returns
    -------
    numpy.ndarray
        The binarised crop.

    Raises
    ------
    RuntimeError
        If the frame cannot be read (e.g. ``frame_no`` is past the end).
    """
    row_start, row_end, col_start, col_end = crop
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
    ret, frame = cap.read()
    if not ret or frame is None:
        # Previously this surfaced as "'NoneType' object is not subscriptable".
        raise RuntimeError(f"Could not read frame {frame_no} from the video")
    # The first array axis is rows (vertical), the second is columns.
    crop_frame = frame[row_start:row_end, col_start:col_end, :]
    _, crop_frame = cv2.threshold(make_gray(crop_frame), 200, 255, cv2.THRESH_BINARY_INV)
    return crop_frame

def clean(word):
    """Apply show-specific corrections to one recognised word.

    Steps, in order:

    1. ``八八`` is replaced by ``小``.
    2. ``和`` — alone or followed by ``坤`` or ``珅`` — becomes ``和珅``.
       The earlier chained ``str.replace`` calls also rewrote an already
       correct ``和珅`` into ``和珅珅``; matching the optional second
       character in a single pass avoids that.
    3. A two-character word loses its second character when that character
       is ``一``/``二``/``三`` or is not a key of ``common_dict``. ``和珅``
       is exempt, so the name inserted in step 2 is not cut back to ``和``.
    4. Otherwise a trailing ``一``/``二``/``三`` is dropped from any word of
       length >= 2 (presumably OCR noise — confirm).

    Parameters
    ----------
    word : str

    Returns
    -------
    str
    """
    trailing_noise = ('一', '二', '三')

    word = word.replace('八八', '小')
    word = re.sub('和[坤珅]?', '和珅', word)

    if len(word) == 2 and word != '和珅' and (
            word[1] in trailing_noise or word[1] not in common_dict):
        word = word[0]
    elif len(word) >= 2 and word[-1] in trailing_noise:
        word = word[:-1]
    return word

def check_sub_area(frame_no=None):
    """Display a full frame and the subtitle crop that the OCR step sees.

    Debugging aid for tuning the crop used by ``get_frame``.

    Parameters
    ----------
    frame_no : int or float, optional
        Frame index to display. When omitted, a notebook-level ``frame_no``
        variable is used if one exists, which is how this function used to
        find its frame.

    Raises
    ------
    ValueError
        If no frame number is given and no notebook-level ``frame_no`` exists.
    RuntimeError
        If the frame cannot be read from the video.
    """
    if frame_no is None:
        frame_no = globals().get('frame_no')
        if frame_no is None:
            raise ValueError("Pass frame_no, or define frame_no at notebook level")

    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
    ret, frame = cap.read()
    if not ret or frame is None:
        raise RuntimeError(f"Could not read frame {frame_no} from the video")

    # OpenCV decodes frames as BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    plt.show()

    # Reuse get_frame so the preview shows exactly what is fed to the OCR.
    plt.imshow(get_frame(frame_no), cmap='gray')
    plt.show()

In [5]:
def _normalise_ocr_text(text):
    """Keep only characters in U+4E00..U+9FA5, then apply ``clean``."""
    return clean(re.sub(u'[^\u4E00-\u9FA5]', '', text))


def _choose_word(new, old, tolerance=0.01):
    """Pick between the re-read (``new``) and first-pass (``old``) word.

    Both are dicts with ``'word'`` and ``'prob'``; ``new`` may be ``None``
    when the second OCR pass returned nothing.
    """
    if new is None or new['word'] == old['word']:
        return old['word']
    if new['prob'] > old['prob'] + tolerance:
        return new['word']
    if new['prob'] < old['prob'] - tolerance:
        return old['word']
    # Confidences are within the tolerance: prefer the more common word
    # (lower rank). Words missing from common_dict rank last.
    unknown_rank = float('inf')
    new_rank = common_dict.get(new['word'], unknown_rank)
    old_rank = common_dict.get(old['word'], unknown_rank)
    return new['word'] if new_rank < old_rank else old['word']


def get_text(crop_frame):
    """Recognise the subtitle text contained in a thresholded crop.

    The inverted, dilated/eroded crop is OCR'd once to locate text boxes.
    Each non-empty box is then cut out of ``crop_frame`` by its horizontal
    extent and OCR'd again on its own. For every box the better of the two
    readings is chosen by ``_choose_word`` and the results are concatenated
    in the order the first pass returned them.

    Fixes relative to the previous implementation:

    * every detected box contributes to the sentence; the old loop ran over
      ``range(len(new_results))`` although the keys were indices into
      ``result``, so trailing boxes were dropped whenever a box was skipped;
    * the "first pass wins" test uses ``old - 0.01`` rather than
      ``old + 0.01``, which makes the frequency-rank tie-break reachable;
    * rank lookups no longer raise ``KeyError`` for unknown words.

    Parameters
    ----------
    crop_frame : numpy.ndarray
        Binarised subtitle strip as returned by ``get_frame``.

    Returns
    -------
    str
        The recognised sentence; empty when nothing was recognised.
    """
    result = reader_ch.readtext(
        dilate_erode(make_gray(cv2.bitwise_not(crop_frame))),
        link_threshold=0.99999, width_ths=0, min_size=75)

    sentence = ''
    for detection in result:
        if detection[1] == '':
            continue

        x_min, x_max = get_x(detection)
        # A negative start would be read as an offset from the right edge.
        x_min = max(x_min, 0)
        old = {'word': _normalise_ocr_text(detection[1]), 'prob': detection[-1]}

        new = None
        if x_max > x_min:  # skip zero-width crops
            word_crop = crop_frame[:, x_min:x_max]
            word = reader_ch.readtext(make_gray(cv2.bitwise_not(word_crop)))
            if len(word) != 0:
                new = {'word': _normalise_ocr_text(word[0][1]), 'prob': word[0][-1]}

        sentence += _choose_word(new, old)
    return sentence

In [None]:
# The loop below counts in "ticks"; one tick is read_rate seconds of video.
read_rate = 0.8
# 120 seconds expressed in ticks: the first and last 120 s are not scanned.
first_second = (120)/read_rate
# Write the subtitles next to the video. Only the extension is swapped, so an
# 'mp4' elsewhere in the path is left alone.
filename = os.path.splitext(filename)[0] + '.srt'
# Padding in seconds applied around the sampled time of each subtitle.
buffer = 0.2
subtract = buffer

subs = []
index = 1
current_text = ''
last_time = 0

if os.path.isfile(filename):
    # Resume from an earlier, interrupted run.
    with open(filename, "r", encoding="utf-8") as fr:
        subs = list(parse(fr.read()))
    if subs:
        # Convert the last end time to ticks before offsetting it; mixing
        # seconds and ticks restarted the scan at read_rate * end time.
        # total_seconds() also keeps the fractional part.
        last_time = max(0.0, subs[-1].end.total_seconds() / read_rate - first_second)
        # Continue numbering after the last stored subtitle.
        index = subs[-1].index + 1
        # NOTE(review): content holds the translation, while the loop compares
        # against raw OCR text, so the first line after a resume is
        # presumably always treated as new.
        current_text = subs[-1].content
else:
    open(filename, 'w').close()

# NOTE(review): the step is read_rate ticks, i.e. read_rate**2 seconds between
# samples — confirm this is intended.
for tick in tqdm(np.arange(first_second + last_time, int(second)/read_rate-first_second, read_rate)):
    i = tick * read_rate  # position in the video, in seconds
    crop_frame = get_frame(i * fps)
    text = get_text(crop_frame)
    if len(text) == 0:
        continue

    if SequenceMatcher(None, text, current_text).ratio() < 0.2 or abs(len(text) - len(current_text)) > 3:
        # The on-screen text changed: start a new subtitle.
        current_text = text
        if subs:
            # Shrink the lead-in if it would reach into the previous subtitle.
            last_diff = (timedelta(seconds=(i + read_rate)) - subs[-1].end).total_seconds()
            subtract = buffer if last_diff < buffer else last_diff - read_rate
            subtract = subtract if subtract < buffer else buffer
        subs.append(Subtitle(index = index,
                             start = timedelta(seconds = i - subtract + 0.0005),
                             end = timedelta(seconds=(i + buffer - 0.0005)),
                             content = translator.translate(text, dest='en').text.replace('\n','')))
        index += 1
    else:
        # Same line still on screen: extend the current subtitle.
        subs[-1].end = timedelta(seconds=(i + buffer - 0.0005))

    # Rewrite the file after every change so an interrupted run can resume.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(compose(subs))


  0%|          | 0/1525 [00:00<?, ?it/s]