1. Extract captions from vtt files
2. Capture frames for each caption snippet
3. Save the captured frames and caption text, plus a JSON file that indexes them
4. Generate a single text file containing all extracted captions

In [None]:
import webvtt
import glob, os
import json
import cv2
import numpy as np

In [None]:
# Minimum blur score (variance of the Laplacian) a frame must reach for
# capture_frames to accept it without trying a neighbouring frame.
THRESHOLD = 100

def str_to_time_seconds(timestr):
    """Convert a colon-separated caption timestamp into seconds.

    Accepts ``HH:MM:SS.mmm`` as well as the shorter ``MM:SS.mmm`` form
    (hours omitted); a bare ``SS.mmm`` value is accepted too.

    Args:
        timestr: Timestamp string such as ``"00:01:02.500"``.

    Returns:
        The timestamp expressed in seconds, as a float.

    Raises:
        ValueError: If there are more than three fields or a field is
            not numeric.
    """
    fields = timestr.strip().split(':')
    if len(fields) > 3:
        raise ValueError("Invalid timestamp: " + timestr)

    # Fold the leading integer fields (hours, minutes) into whole minutes,
    # then add the possibly fractional seconds field.
    whole_minutes = 0
    for field in fields[:-1]:
        whole_minutes = whole_minutes * 60 + int(field)
    return whole_minutes * 60 + float(fields[-1])
    
def prettify(obj):
    """Print *obj* to stdout as JSON with a four-space indent and unsorted keys."""
    rendered = json.dumps(
        obj,
        sort_keys=False,
        separators=(',', ': '),
        indent=4,
    )
    print(rendered)

def time_to_frame(time_in_seconds, fps):
    """Return the frame index for a time offset, truncated toward zero.

    Args:
        time_in_seconds: Offset from the start of the video, in seconds.
        fps: Frame rate in frames per second.
    """
    frame_position = time_in_seconds * fps
    return int(frame_position)

def parse_video_info(filename):
    """Open a video file with OpenCV and collect its basic properties.

    Args:
        filename: Path of the video file to open.

    Returns:
        A dict with the keys ``name``, ``capture`` (the opened
        ``cv2.VideoCapture``; this function does not release it),
        ``frame_width``, ``frame_height``, ``frame_count``,
        ``frame_per_second`` and ``total_duration`` (in seconds; 0.0 when
        the reported frame rate is not positive).

    Raises:
        IOError: If OpenCV cannot open ``filename``.
    """
    cap = cv2.VideoCapture(filename)
    if not cap.isOpened():
        raise IOError("Unable to read video file " + filename)

    # VideoCapture.get() reports every property as a float; width and
    # height are converted to integers, the other values are kept as-is.
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    frame_per_second = cap.get(cv2.CAP_PROP_FPS)

    # A reported frame rate of 0 would otherwise raise ZeroDivisionError here.
    if frame_per_second > 0:
        total_duration = frame_count / frame_per_second
    else:
        total_duration = 0.0

    return {
        'name': filename,
        'capture': cap,
        'frame_width': frame_width,
        'frame_height': frame_height,
        'frame_count': frame_count,
        'frame_per_second': frame_per_second,
        'total_duration': total_duration,
    }


def laplacian_variance(frame):
    """Return the variance of the Laplacian of *frame*, used as a focus measure.

    The frame is converted from BGR to grayscale before the Laplacian is
    computed with 64-bit float output.
    """
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    return laplacian.var()

def frame_blur_score(clip, frame_index):
    """Seek to *frame_index*, fetch that frame and score it.

    Args:
        clip: Dict whose ``'capture'`` entry is the video capture object.
        frame_index: Index of the frame to position the capture at.

    Returns:
        A ``(frame, blur_score)`` tuple. The score is the Laplacian
        variance of the frame, or 0.0 when the frame could not be retrieved.
    """
    capture = clip['capture']
    capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
    capture.grab()
    ok, frame = capture.retrieve(0)  # retrieve the grabbed frame
    if not ok:
        return frame, 0.0
    return frame, laplacian_variance(frame)

def save_frame(frame, frame_id, folder, filename_prefix):
    """Write *frame* as a JPEG and return the path it was written to.

    The file is named ``<folder>/<filename_prefix>_<frame_id>.jpg`` with the
    frame id zero-padded to eight digits.
    """
    name_parts = (folder, filename_prefix, frame_id)
    target_path = "%s/%s_%08d.jpg" % name_parts
    cv2.imwrite(target_path, frame)
    return target_path

def capture_frames(video_info, frame_start, frame_stop, frame_step, output_folder):
    """Sample frames at a fixed step and save one image per sample point.

    For every sample point the frame at that index is scored with
    ``frame_blur_score``. If the score is below ``THRESHOLD`` the following
    frames are tried until one passes, the retry offset exceeds 5, the
    offset reaches ``frame_step``, or ``frame_stop`` is hit. The sharpest
    frame that could be decoded among those candidates is saved; a sample
    point where no frame could be decoded is skipped.

    Args:
        video_info: Dict holding the capture, as expected by ``frame_blur_score``.
        frame_start: Index of the first sample point.
        frame_stop: End of the sampled range (exclusive for sample points).
        frame_step: Distance between sample points, in frames (an integer).
        output_folder: Folder the JPEG files are written to.

    Returns:
        A dict mapping each saved frame index to ``{'image_file': <basename>}``.
    """
    saved_frames = {}

    for base_id in range(frame_start, frame_stop, frame_step):
        best_frame = None
        best_id = base_id
        best_blur = -1.0  # below any real score, so the first decoded frame wins
        offset = 0
        while True:
            frame_id = base_id + offset
            frame, blur = frame_blur_score(video_info, frame_id)
            # Remember the sharpest decodable candidate in case none passes.
            if frame is not None and blur > best_blur:
                best_frame, best_id, best_blur = frame, frame_id, blur
            # check image blur score
            if blur >= THRESHOLD:
                break
            if offset > 5:
                break  # retry budget exhausted
            if frame_id == frame_stop or offset >= frame_step:
                break
            offset += 1

        # Nothing could be decoded here; passing None to cv2.imwrite would raise.
        if best_frame is None:
            continue

        filename = save_frame(best_frame, best_id, output_folder, 'Frame')
        saved_frames[int(best_id)] = {'image_file': os.path.basename(filename)}

    return saved_frames

In [None]:
def process_captions(vtt_file, output_folder):
    """Read a vtt file, collect its captions and write their text to disk.

    Each cue's text has any occurrence of the previously kept caption text
    removed (presumably to drop the repeated line in rolling captions; not
    confirmed here), and cues left empty by that are skipped. The remaining
    texts are joined with single spaces and written to ``caption.txt``
    inside ``output_folder``.

    Args:
        vtt_file: Path of the vtt file to read.
        output_folder: Existing folder that receives ``caption.txt``.

    Returns:
        A dict with ``cc_file`` (path of the written text file) and
        ``cc_info`` (a list of dicts with ``session_id``, ``start``, ``end``,
        ``start_time``, ``stop_time`` and ``text`` for every kept caption).
    """
    txt_file = os.path.join(output_folder, "caption.txt")
    cc_info = []
    texts = []
    last_text = ''

    for cue in webvtt.read(vtt_file):
        text = cue.text.replace(last_text, "").strip()
        if not text:
            continue

        start = cue.start.strip()
        end = cue.end.strip()
        cc_info.append({
            # Ids count only the captions that are kept, starting from 0.
            'session_id': len(cc_info),
            'start': start,
            'end': end,
            'start_time': str_to_time_seconds(start),
            'stop_time': str_to_time_seconds(end),
            'text': text,
        })
        texts.append(text)
        last_text = text

    # The context manager closes the file even if the write fails.
    with open(txt_file, "w") as f:
        f.write(' '.join(texts))

    return {'cc_file': txt_file, 'cc_info': cc_info}

def process_screenshots(video_file, captions, output_folder):
    """Capture frames from a video for every caption interval.

    Args:
        video_file: Path of the video file to read.
        captions: Iterable of caption dicts providing ``session_id``,
            ``start_time``, ``stop_time`` and ``text``.
        output_folder: Folder the captured frames are written to.

    Returns:
        A dict with ``fps``, ``image_folder`` and ``image_info``, the latter
        holding one dict per caption with ``session_id``, ``start``, ``stop``,
        ``caption`` and ``frames`` (the result of ``capture_frames``).
    """
    video_info = parse_video_info(video_file)
    try:
        fps = video_info['frame_per_second']
        # Sample roughly every 3 seconds. The frame rate is a float, but
        # range() in capture_frames needs a positive integer step.
        frame_step = max(1, int(round(fps * 3)))

        image_info = []
        for caption in captions:
            frame_start = time_to_frame(caption['start_time'], fps)
            frame_stop = time_to_frame(caption['stop_time'], fps)
            image_info.append({
                'session_id': caption['session_id'],
                'start': frame_start,
                'stop': frame_stop,
                'caption': caption['text'],
                'frames': capture_frames(video_info, frame_start, frame_stop,
                                         frame_step, output_folder),
            })
    finally:
        # The capture is not part of the returned data, so free it here.
        video_info['capture'].release()

    return {'fps': fps, 'image_folder': output_folder, 'image_info': image_info}

In [None]:
def get_all_texts(folder):
    """Concatenate the contents of every ``.txt`` file found under *folder*.

    Files are found recursively and read in sorted path order so the result
    is the same on every run and platform. Each file's content is followed
    by a single space.

    Args:
        folder: Root folder to search.

    Returns:
        The combined text, or an empty string when no file matches.
    """
    pattern = os.path.join(folder, '**/*.txt')
    chunks = []
    for filename in sorted(glob.glob(pattern, recursive=True)):
        with open(filename) as file:
            chunks.append(file.read())
    return ''.join(chunk + ' ' for chunk in chunks)

In [None]:
%%time
#Extract captions with screenshots
path = './data source/Life Is Strange 1/'
output_folder = 'output 1'
game = 'Life Is Strange'

vttfiles = sorted(glob.glob(path + '*.vtt'))
mp4files = sorted(glob.glob(path + '*.mp4'))
print(len(vttfiles), " vtt files in total")
print(len(mp4files), " mp4 files in total")


#vttfiles = ['./Movies/Life Is Strange 1/Life is Strange - Season 1 Episode 1 Chrysalis (Full Episode) (No Music)-AP5UBhyjMKA.en.vtt']
#mp4files = ['./Movies/Life Is Strange 1/Life is Strange - Season 1 Episode 1 Chrysalis (Full Episode) (No Music)-AP5UBhyjMKA.mp4']
all_data = []
# Initialised up front so these names exist even when no video is processed.
file_dict = {}
file_index = 0
output_dir = ''

# Create the output root now so the JSON dump below works even if no
# per-video folder ends up being created.
output_root = path + output_folder
os.makedirs(output_root, exist_ok=True)

for video_file in mp4files:
    filename = os.path.splitext(os.path.basename(video_file))[0]

    #find out the right vtt file: the first one whose name contains the video name
    vtt_file = ''
    for vtt in vttfiles:
        vtt_name = os.path.splitext(os.path.basename(vtt))[0]
        if filename in vtt_name:
            vtt_file = vtt
            print(filename)
            print(vtt_name)
            break

    # Without captions there is nothing to extract; reading an empty path would fail.
    if not vtt_file:
        print("skipping (no matching vtt file)..." + filename)
        continue

    output_dir = os.path.join(output_root, filename)
    os.makedirs(output_dir, exist_ok=True)

    #Start processing
    print("processing..." + filename)
    captions = process_captions(vtt_file, output_dir)
    screenshots = process_screenshots(video_file, captions['cc_info'], output_dir)
    file_dict = {
        'file_index': file_index,
        'filename_vtt': vtt_file,
        'filename_video': video_file,
        'captions': captions,
        'screenshots': screenshots,
    }
    all_data.append(file_dict)
    file_index += 1


#output to json file
with open(os.path.join(output_root, game + '.json'), 'w') as outfile:
    json.dump(all_data, outfile, sort_keys=False, indent=4)

In [None]:
# Gather every caption text file under the data folder into one corpus file.
all_texts = get_all_texts('./data source/Life Is Strange 1/')
print(all_texts)
# The with-block closes the file on exit; no explicit close() is needed.
with open('./data source/LifeIsStrange.txt', 'w') as outfile:
    outfile.write(all_texts)