# Step 1 - Preprocess Data

In this step, we will preprocess game videos and captions downloaded from YouTube. This step assumes that both the videos and their captions are available, so that text content such as in-game dialogue and conversations can be extracted for a text-based search task. For the sake of simplicity, this project uses auto-generated closed captions (auto-CC), which can be downloaded with tools such as youtube-dl. However, text content can be extracted in other ways as well; for example, one can use OCR (Optical Character Recognition) techniques, although OCR can be quite difficult and less accurate.

The script will do the following things:
    1. Extract captions from vtt or xml files
    2. Capture frames for each caption snippet
    3. Save data and corresponding JSON file
    4. Generate a text file including all captions extracted
    
Prerequisites:
    You can download YouTube videos together with their auto-generated captions using youtube-dl: https://rg3.github.io/youtube-dl/
    On the command line, navigate to the folder where youtube-dl.exe resides and run the following command:
    youtube-dl --write-auto-sub {youtube video link}
    
Input: 
    1. Video and caption files
    
Output:
    1. A JSON file containing all the necessary data for the next steps
    2. screenshots
    

In [1]:
import glob, os
import json
import cv2
import numpy as np
import webvtt as vttParser
import subtitlexmlparser as xmlParser

In [2]:
"""
Make sure these variables are correctly set.
path: the root path to where the videos and captions are located.
output_folder: specify the path where all the outputs should be.
game: specify the game name
video_format: the extension of video file
caption_format: the extension of subtitle file
"""

# Root folder that is searched for the video and caption files.
path = './visualization/backend/datasource/Rusty Lake/'
# Folder that receives the screenshots and the generated JSON file.
output_folder = './visualization/backend/datasource/Rusty Lake/output/'
# Game name; spaces are replaced with underscores to name the output JSON file.
game = 'Rusty Lake Roots'
video_format = "mkv" # video file extension without the dot, e.g. "mkv" or "mp4"
caption_format = "vtt" # caption file extension: "xml" selects the XML parser, anything else the WebVTT parser
capture_interval = 10 # n seconds of interval between captures

In [3]:
# Minimum blur score (variance of the Laplacian) a frame must reach to be
# accepted by capture_frames; lower-scoring frames are treated as too blurry
# and a later frame is tried instead.
THRESHOLD = 100

# parse time stamps in captions
def str_to_time_seconds(timestr):
    """Convert a caption timestamp string into seconds.

    Accepts ``HH:MM:SS(.fff)`` as well as the shorter ``MM:SS(.fff)`` and
    ``SS(.fff)`` forms, so timestamps that omit the hour field no longer
    fail with an IndexError.

    Args:
        timestr: Timestamp text, e.g. ``"00:01:02.500"``.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If the string has more than three ``:``-separated
            fields or a field is not numeric.
    """
    parts = timestr.strip().split(':')
    if len(parts) > 3:
        raise ValueError("Unsupported timestamp format: " + timestr)

    *whole_fields, seconds_field = parts
    # Fold the leading hour/minute fields into a whole number of minutes
    # first so the integer arithmetic stays exact before the float is added.
    minutes = 0
    for field in whole_fields:
        minutes = minutes * 60 + int(field)
    return minutes * 60 + float(seconds_field)
    
# debugging helper: dump any JSON-serializable object in readable form
def prettify(obj):
    """Print *obj* as JSON indented by four spaces, keeping key order."""
    formatted = json.dumps(
        obj,
        sort_keys=False,
        indent=4,
        separators=(',', ': '),
    )
    print(formatted)

# map a position in seconds onto a frame index
def time_to_frame(time_in_seconds, fps):
    """Return the frame index for a time offset, truncated to an int.

    Args:
        time_in_seconds: Offset from the start of the video, in seconds.
        fps: Frame rate of the video in frames per second.
    """
    exact_position = time_in_seconds * fps
    return int(exact_position)

#retrieve basic information from a video file
def parse_video_info(filename):
    """Open a video file and collect its basic properties.

    Args:
        filename: Path of the video file to open.

    Returns:
        A dict with the keys ``name``, ``capture`` (the open
        ``cv2.VideoCapture``; the caller is responsible for releasing it),
        ``frame_width``, ``frame_height``, ``frame_count``,
        ``frame_per_second`` and ``total_duration`` (seconds).

    Raises:
        IOError: If the video file cannot be opened.
    """
    cap = cv2.VideoCapture(filename)
    # Check that the video file was opened successfully
    if not cap.isOpened():
        cap.release()
        raise IOError("Unable to read video file " + filename)

    # Properties are reported as floats; the dimensions are converted to int.
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    frame_per_second = cap.get(cv2.CAP_PROP_FPS)

    # A frame rate of 0 would make the division fail; report a zero
    # duration in that case instead of raising ZeroDivisionError.
    if frame_per_second > 0:
        total_duration = frame_count / frame_per_second
    else:
        total_duration = 0.0

    return {
        'name': filename,
        'capture': cap,
        'frame_width': frame_width,
        'frame_height': frame_height,
        'frame_count': frame_count,
        'frame_per_second': frame_per_second,
        'total_duration': total_duration,
    }


def laplacian_variance(frame):
    """Return the focus measure of a BGR frame.

    The frame is converted to grayscale, the Laplacian is computed with
    64-bit float output, and the variance of that response is returned.
    """
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    return laplacian.var()

# fetch one frame and score its sharpness
def frame_blur_score(clip, frame_index):
    """Seek to *frame_index* and return ``(frame, blur_score)``.

    Args:
        clip: Dict whose ``'capture'`` entry is the video capture object.
        frame_index: Index of the frame to position the capture at.

    Returns:
        The retrieved frame and its Laplacian-variance score. The score is
        0.0 when the frame could not be retrieved.
    """
    capture = clip['capture']
    capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
    capture.grab()
    ok, image = capture.retrieve(0)
    score = laplacian_variance(image) if ok else 0.0
    return image, score

#save frame to folder
def save_frame(frame, frame_id, folder, filename_prefix):
    """Write a frame to *folder* as a JPEG and return the path used.

    The file is named ``<prefix>_<frame_id, 8 digits>.jpg``. If that name
    is taken, a numeric suffix (``_0``, ``_1``, ...) is appended and
    incremented until a free name is found, so an existing screenshot is
    never overwritten.

    Args:
        frame: Image to save.
        frame_id: Frame index used in the file name.
        folder: Destination directory.
        filename_prefix: Leading part of the file name.

    Returns:
        The path of the file that was written.

    Raises:
        IOError: If OpenCV reports that the image could not be written.
    """
    output_filename = "%s/%s_%08d.jpg" % (folder, filename_prefix, frame_id)
    suffix = 0
    # A counter guarantees a unique name; a random suffix could repeat.
    while os.path.isfile(output_filename):
        output_filename = "%s/%s_%08d_%d.jpg" % (folder, filename_prefix, frame_id, suffix)
        suffix += 1

    # imwrite signals failure through its return value rather than raising.
    if not cv2.imwrite(output_filename, frame):
        raise IOError("Unable to write image file " + output_filename)
    return output_filename

#capture frames from video source
def capture_frames(video_info, frame_start, frame_stop, frame_step, output_folder, moment_id):
    """Save one screenshot per sampling window between two frame indices.

    The range ``[frame_start, frame_stop)`` is split into windows of
    ``frame_step`` frames. In each window, candidate frames are scored
    starting at the window start and moving forward 5 frames at a time
    until one reaches ``THRESHOLD`` or the window/range boundary is hit;
    the last candidate examined is saved.

    Args:
        video_info: Dict produced by parse_video_info (needs ``'capture'``).
        frame_start: First frame index of the range.
        frame_stop: End frame index of the range (exclusive).
        frame_step: Window size in frames.
        output_folder: Directory that receives the JPEG files.
        moment_id: Next sequential id to assign to a saved frame.

    Returns:
        ``(moment_id, saved_frames)`` where ``moment_id`` is the next unused
        id and ``saved_frames`` maps the saved frame's index to a dict with
        ``'image_file'`` and ``'moment_id'``.
    """
    saved_frames = {}
    offset = 5  # distance in frames between two candidates in a window

    for window_start in range(frame_start, frame_stop, frame_step):
        frame_id = window_start
        next_start = window_start + frame_step

        while True:
            frame, blur = frame_blur_score(video_info, frame_id)
            # Stop BEFORE advancing frame_id so the saved index always
            # belongs to the frame that was actually scored and kept.
            if blur >= THRESHOLD:
                break
            # No further candidate inside this window / caption range.
            if frame_id >= frame_stop - offset or frame_id == next_start - 1:
                break
            next_frame = frame_id + offset  # move to the next candidate
            # never cross into the next window
            frame_id = next_frame if next_frame < next_start else next_start - 1

        # The frame could not be decoded; there is nothing to write.
        if frame is None:
            continue

        filename = save_frame(frame, frame_id, output_folder, 'Frame')
        saved_frames[frame_id] = {
            'image_file': os.path.basename(filename),
            'moment_id': moment_id,
        }
        moment_id += 1

    return moment_id, saved_frames

In [4]:
#Save captions and write the combined caption text file
def process_captions(caption_file, output_folder, caption_parser = xmlParser.parse_subtitle_xml):
    """Parse a caption file and write all caption words to ``caption.txt``.

    Args:
        caption_file: Path of the caption file to parse.
        output_folder: Directory in which ``caption.txt`` is written.
        caption_parser: Callable that takes the file path and returns an
            iterable of caption objects exposing ``text``, ``start`` and
            ``end`` string attributes.

    Returns:
        A dict with ``'cc_file'`` (path of the text file) and ``'captions'``,
        which maps the caption index (as a string) to a dict holding
        ``'timestamp_start'``, ``'timestamp_stop'`` and ``'text'``. Captions
        whose text is empty after de-duplication keep an empty ``'text'``.
    """
    txt_file = os.path.join(output_folder, "caption.txt")
    caption_data = {'cc_file': txt_file, 'captions': {}}
    all_texts = []
    last_text = ''

    for line_id, line in enumerate(caption_parser(caption_file)):
        # Remove the previous non-empty caption's text from this one
        # (presumably because auto-generated captions repeat the preceding
        # line -- verify against the caption source).
        text = line.text.replace(last_text, "").strip()
        caption_data['captions'][str(line_id)] = {
            'timestamp_start': line.start.strip(),
            'timestamp_stop': line.end.strip(),
            'text': text,
        }
        if text:
            all_texts.extend(text.split(' '))
            last_text = text

    # Explicit UTF-8 so non-ASCII caption text does not depend on the
    # platform's default encoding; `with` closes the file even on error.
    with open(txt_file, "w", encoding="utf-8") as f:
        f.write(' '.join(all_texts))
    return caption_data

def process_video(video_file, captions, output_folder, interval=5):
    """Capture screenshots for every caption of a video.

    Args:
        video_file: Path of the video file.
        captions: Dict mapping a session id to a dict with
            ``'timestamp_start'`` and ``'timestamp_stop'`` strings.
        output_folder: Directory that receives the screenshots.
        interval: Seconds between two captures within one caption.

    Returns:
        ``(fps, frame_data)`` where ``frame_data`` holds ``'image_folder'``
        and ``'image_info'``; the latter maps each session id to its
        ``'start'`` and ``'stop'`` frame indices and the ``'frames'`` saved.

    Raises:
        Exception: Propagated from parse_video_info when the video cannot
            be opened.
    """
    video_info = parse_video_info(video_file)
    try:
        fps = video_info['frame_per_second']
        frame_data = {'image_folder': output_folder, 'image_info': {}}
        # range() rejects a step of 0, which a very small interval or an
        # unknown frame rate would otherwise produce.
        frame_step = max(1, int(fps * interval))
        moment_id = 0
        for session_id, caption in captions.items():
            frame_start = time_to_frame(str_to_time_seconds(caption['timestamp_start']), fps)
            frame_stop = time_to_frame(str_to_time_seconds(caption['timestamp_stop']), fps)
            session_info = {'start': frame_start, 'stop': frame_stop}
            frame_data['image_info'][session_id] = session_info
            moment_id, session_info['frames'] = capture_frames(
                video_info, frame_start, frame_stop, frame_step, output_folder, moment_id)
    finally:
        # The capture is not handed back to the caller, so free it here.
        video_info['capture'].release()
    return fps, frame_data

In [5]:
%%time
# Extract captions with screenshots

# Compare by value: `is` tests object identity and is not a reliable way to
# compare a variable against a string literal.
if caption_format == "xml":
    parser = xmlParser.parse_subtitle_xml
else:
    parser = vttParser.read

subtFiles = sorted(glob.glob(os.path.join(path, '*.' + caption_format)))
videoFiles = sorted(glob.glob(os.path.join(path, '*.' + video_format)))

# To process specific files only, override the lists, e.g.:
#subtFiles = ['./data source/Life Is Strange 1/Episode_1_Chrysalis.xml']
#videoFiles = ['./data source/Life Is Strange 1/Episode_1_Chrysalis.mp4']

print(len(subtFiles), " subtitle files in total")
print(subtFiles)
print(len(videoFiles), " video files in total")
print(videoFiles)

file_dict = {}
output_dir = ''

for file_index, video_file in enumerate(videoFiles):
    # splitext handles extensions of any length (e.g. ".webm"), unlike [:-4]
    filename = os.path.splitext(os.path.basename(video_file))[0]

    # find the subtitle file whose name contains the video's base name
    sub_file = ''
    for f in subtFiles:
        if filename in os.path.splitext(os.path.basename(f))[0]:
            sub_file = f
            break
    if not sub_file:
        # Without captions there is nothing to extract for this video.
        print("skipping " + video_file + ": no matching subtitle file found")
        continue

    output_dir = os.path.join(output_folder, filename, 'screenshots')
    os.makedirs(output_dir, exist_ok=True)

    #Start processing
    print("processing..." + sub_file)
    captions = process_captions(sub_file, output_dir, caption_parser = parser)
    print("processing..." + video_file)
    fps, screenshots = process_video(video_file, captions['captions'], output_dir, interval=capture_interval)
    file_dict[file_index] = {
        'corpus': filename,
        'filename_subtitle': sub_file,
        'filename_video': video_file,
        'video_fps': fps,
        'captions': captions,
        'screenshots': screenshots,
    }

# validate total count
image_count = len(glob.glob(os.path.join(output_folder, '**', '*.jpg'), recursive=True))
print(f'{image_count} screenshots captured.')

#output to json file
jsonfile_name = game.replace(" ", "_")
with open(os.path.join(output_folder, jsonfile_name + '.json'), 'w') as outfile:
    json.dump(file_dict, outfile, sort_keys=False, indent=4)

1  subtitle files in total
['./data source/Rusty Lake\\Roots.vtt']
1  video files in total
['./data source/Rusty Lake\\Roots.mkv']
processing..../data source/Rusty Lake\Roots.vtt
processing..../data source/Rusty Lake\Roots.mkv
3226 screenshots captured.
Wall time: 16min 3s
