# CMU-MOSEI Preprocessing Script
This script generates the RAW_PREPROCESSED data from the raw CMU-MOSEI data. You can download the raw dataset from:


In [1]:
import os, sys
from shutil import copyfile
import glob
import pickle

import pandas as pd
import numpy as np
import h5py
import cv2
from scipy.io import wavfile
from tqdm import tqdm

from joblib import Parallel, delayed
import multiprocessing

In [2]:
# Root directories: the raw MOSEI input and the RAW_PREPROCESSED output
base_path = '../data/MOSEI_Raw'
base_out_path = './MOSEI_RAW_PROCESSED'

# Input locations of each modality and of the labels, all below base_path
base_audio_path = base_path + '/Audio/WAV_16000'
base_video_path = base_path + '/Videos/Full/Combined'
base_text_path = base_path + '/Transcript/Combined'
base_label_path = base_path + '/Labels'

# Worker count used when the extraction is run in parallel
num_jobs = 32

### Load Labels

In [3]:
# Load data: one DataFrame per label CSV found in the label directory
label_paths = glob.glob(f'{base_label_path}/*.csv')
if not label_paths:
    # Without this guard pd.concat([]) fails with a generic
    # "No objects to concatenate" that does not point at the bad path.
    raise FileNotFoundError(f'No label CSV found in {base_label_path}')
dfs = [pd.read_csv(path) for path in label_paths]

# Generate table
df = pd.concat(dfs)
# Utterance id = last '/'-separated component of the video id + '_' + clip value
df['uttr_id'] = df.apply(lambda x: f"{str(x['Input.VIDEO_ID']).split('/')[-1]}_{x['Input.CLIP']}" , axis='columns')
ldf = df[['uttr_id', 'Answer.anger', 'Answer.disgust', 'Answer.fear', 'Answer.happiness', 'Answer.sadness', 'Answer.surprise']]
# Rows sharing an utterance id are summed (presumably one row per annotator -
# TODO confirm); an emotion is set to 1 when its summed score is positive.
# The vectorized comparison replaces the deprecated DataFrame.applymap.
ldf = ldf.groupby('uttr_id').sum().gt(0).astype(int).reset_index()
ldf.columns = [column.replace('Answer.','') for column in ldf.columns]

In [4]:
# Preview the first rows of the per-utterance label table
ldf.head()

Unnamed: 0,uttr_id,anger,disgust,fear,happiness,sadness,surprise
0,--qXJuDtHPw_5,0,0,0,1,0,0
1,-3g5yACwYnA_10,0,0,1,1,1,0
2,-3g5yACwYnA_13,0,0,0,0,0,0
3,-3g5yACwYnA_2,0,0,1,1,1,0
4,-3g5yACwYnA_3,0,0,0,1,1,0


### Load Timing & Transcript

In [5]:
# Load data
def split_transcript(path):
    """Parse one transcript file into a list of field lists.

    Every non-blank line is split on the ``___`` delimiter, empty fields are
    dropped, and at most the first five fields are kept. Fields are returned
    as raw strings, so the last field of a line keeps its trailing newline.
    Anything after a fifth ``___``-separated field is discarded.

    Args:
        path: Path of the transcript file to read.

    Returns:
        A list with one list of up to five strings per non-blank line.
    """
    rows = []
    # The context manager guarantees the handle is closed; the previous
    # open(...).readlines() left that to the garbage collector.
    with open(path, 'r') as transcript:
        for line in transcript:
            # A blank line would otherwise yield a junk ['\n'] row
            if not line.strip():
                continue
            fields = [field for field in line.split('___') if field]
            rows.append(fields[:5])
    return rows
        
# Gather the parsed rows of every file in the transcript directory
time_data = []
for path in glob.glob(f'{base_text_path}/*'):
    time_data.extend(split_transcript(path))

# Generate table
df = pd.DataFrame(time_data)
df.columns = ['video_id', 'clip_id', 'start', 'end', 'text']
# Utterance id = last '/'-separated component of the video id + '_' + clip id
df['uttr_id'] = df.apply(
    lambda x: f"{str(x['video_id']).split('/')[-1]}_{x['clip_id']}",
    axis=1,
)
tdf = df.loc[:, ['uttr_id', 'video_id', 'start', 'end', 'text']]

In [6]:
# Preview the first rows of the timing and transcript table
tdf.head()

Unnamed: 0,uttr_id,video_id,start,end,text
0,SqofxdeEcjg_0,SqofxdeEcjg,0.0,28.838,As director of the National Institutes of Heal...
1,SqofxdeEcjg_1,SqofxdeEcjg,28.447,32.959,"I guess on Leap Year it would be ""EDI 366"" but..."
2,SqofxdeEcjg_2,SqofxdeEcjg,33.246,60.416,"The leadership that is in place, and about to ..."
3,SqofxdeEcjg_3,SqofxdeEcjg,59.935,63.11,And that's at all levels within NIH.\n
4,SqofxdeEcjg_4,SqofxdeEcjg,62.11,75.372,"Everything from the senior investigators, the ..."


### Merge Label, Timing, and Transcript

In [7]:
# Keep only the utterances present in both the label and the timing tables
mdf = pd.merge(ldf, tdf, how='inner', on='uttr_id')

In [8]:
# The timestamps were parsed from text, so cast both columns to float
for time_column in ('start', 'end'):
    mdf[time_column] = mdf[time_column].astype(float)

In [9]:
# Preview the first rows of the merged label, timing and transcript table
mdf.head()

Unnamed: 0,uttr_id,anger,disgust,fear,happiness,sadness,surprise,video_id,start,end,text
0,--qXJuDtHPw_5,0,0,0,1,0,0,--qXJuDtHPw,23.199,30.325,I see that a writer is somebody who has an inc...
1,-3g5yACwYnA_10,0,0,1,1,1,0,-3g5yACwYnA,82.753,100.555,Key is part of the people that we use to solve...
2,-3g5yACwYnA_13,0,0,0,0,0,0,-3g5yACwYnA,119.919,125.299,They've been able to find solutions or at leas...
3,-3g5yACwYnA_2,0,0,1,1,1,0,-3g5yACwYnA,4.84,14.052,Key Polymer brings a technical aspect to our o...
4,-3g5yACwYnA_3,0,0,0,1,1,0,-3g5yACwYnA,13.211,27.521,We're a huge user of adhesives for our operati...


### Extract MOSEI multimodal data

In [10]:
# Create the output directory if needed; exist_ok avoids the race between a
# separate existence check and the creation.
os.makedirs(base_out_path, exist_ok=True)

In [11]:
%%time
# Text and label
# Collect the transcript text and the emotion labels of every utterance whose
# audio, video and transcript source files are all present on disk.
meta_dict = {}
for row in tqdm(mdf.itertuples(), total=len(mdf)):
    # itertuples() puts the index at position 0, so the columns start at 1:
    # utterance id, the six label columns, video id, start, end, text.
    uttr_id, labels, video_id, text = row[1], list(row[2:8]), row[8], row[11]
    audio_path = f'{base_audio_path}/{video_id}.wav'
    video_path = f'{base_video_path}/{video_id}.mp4'
    text_path = f'{base_text_path}/{video_id}.txt'

    if not (os.path.exists(video_path)
            and os.path.exists(audio_path)
            and os.path.exists(text_path)):
        continue

    # Store id, text, and labels to metadata buffer
    meta_dict[uttr_id] = {
        'text': text.replace('\n', ''),
        'label': labels,
    }

# Save metadata; the context manager flushes and closes the pickle file
with open(f'{base_out_path}/meta.pkl', 'wb') as meta_file:
    pickle.dump(meta_dict, meta_file)

23259it [00:00, 54659.24it/s]

CPU times: user 356 ms, sys: 88 ms, total: 444 ms
Wall time: 458 ms





In [12]:
def read_video(file_name, sampling_time=None, limit_time=280):
    """Decode a video and keep one frame every ``sampling_time`` milliseconds.

    Args:
        file_name: Path of the video file, passed to ``cv2.VideoCapture``.
        sampling_time: Interval between kept frames, in milliseconds. It is
            required and must be positive, even though the signature keeps
            its historical ``None`` default.
        limit_time: Decoding stops once ``fps * limit_time`` frames were read,
            i.e. after this many seconds of video.

    Returns:
        A tuple ``(frames, fps)``: the kept frames stacked along a new first
        axis with ``np.stack``, and the nominal frame rate of the kept
        sequence, computed as ``int(1000 // sampling_time)``.

    Raises:
        ValueError: If ``sampling_time`` is missing or not positive, or if no
            frame could be decoded from ``file_name``.
    """
    # Fail early with a clear message instead of float(None) / modulo-by-zero
    if sampling_time is None or sampling_time <= 0:
        raise ValueError(
            f'sampling_time must be a positive number of milliseconds, got {sampling_time!r}'
        )

    vidcap = cv2.VideoCapture(file_name)
    try:
        # Read FPS; only the major version matters, and indexing is robust to
        # version strings that do not have exactly three components.
        major_ver = int(cv2.__version__.split('.')[0])
        if major_ver < 3:
            fps = vidcap.get(cv2.cv.CV_CAP_PROP_FPS)
        else:
            fps = vidcap.get(cv2.CAP_PROP_FPS)

        # Keep every `sampling_rate`-th decoded frame. Clamp to 1 so that a
        # sampling_time shorter than one frame period keeps every frame
        # instead of dividing by zero.
        sampling_rate = max(1, int(np.round(float(sampling_time) / 1000 * fps)))
        max_frames = int(fps * limit_time)

        # Read image data
        images, i = [], 0
        success, image = vidcap.read()
        while success:
            if i % sampling_rate == 0:
                images.append(image)
            success, image = vidcap.read()
            i += 1
            if i == max_frames:
                break
    finally:
        # Always free the decoder handle, even when decoding raises
        vidcap.release()

    if not images:
        raise ValueError(f'No frame could be read from {file_name}')
    return np.stack(images), int(1000 // sampling_time)

def retrieve_audio_segment(signal, sr, start_time, end_time):
    """Return the samples of ``signal`` between two timestamps.

    Both timestamps are multiplied by the sampling rate ``sr`` and truncated
    to sample indices; a negative ``start_time`` is treated as 0.
    """
    first_sample = int(sr * max(start_time, 0))
    last_sample = int(sr * end_time)
    return signal[first_sample:last_sample]

def retrieve_video_segment(frames, fps, start_time, end_time):
    """Return the frames of ``frames`` between two timestamps.

    Both timestamps are multiplied by ``fps`` and truncated to frame indices
    along the first axis. A negative ``start_time`` is treated as 0, matching
    ``retrieve_audio_segment``; without the clamp a negative index would
    slice from the end of the array and return the wrong frames.

    Args:
        frames: Array whose first axis indexes the frames.
        fps: Frame rate of ``frames``.
        start_time: Segment start, in the same time unit ``fps`` is per.
        end_time: Segment end, in the same time unit.

    Returns:
        The slice ``frames[start_idx:end_idx]``.
    """
    start_idx = int(fps * max(start_time, 0))
    end_idx = int(fps * end_time)
    return frames[start_idx:end_idx]

def dump_image(img_segment, out_path='./'):
    """Write every frame of a segment into ``out_path`` as ``image_<i>.jpg``.

    Args:
        img_segment: Array indexed as ``[frame, :, :, :]``; one file is
            written per index of the first axis.
        out_path: Directory receiving the files. It is not created here.

    Raises:
        OSError: If ``cv2.imwrite`` reports that a frame was not written.
    """
    for i in range(img_segment.shape[0]):
        # Join instead of prefixing './' so an absolute out_path stays absolute
        target = os.path.join(out_path, f'image_{i}.jpg')
        # imwrite signals failure through its return value, not an exception
        if not cv2.imwrite(target, img_segment[i, :, :, :]):
            raise OSError(f'Could not write frame to {target}')

In [13]:
def extract_parallel(video_id, vdf):
    """Extract the audio clip and sampled frames of every utterance of a video.

    For each row of ``vdf`` a directory ``<base_out_path>/<utterance id>`` is
    created and filled with ``audio.wav`` and the ``image_<i>.jpg`` frames of
    the utterance's time span.

    Args:
        video_id: Identifier used to locate ``<video_id>.wav``, ``.mp4`` and
            ``.txt`` under the base audio, video and transcript paths.
        vdf: Rows of the merged table belonging to this video. They are read
            positionally through ``itertuples()``: utterance id at position 1,
            start and end times at positions 9 and 10.

    Returns:
        ``(status, video_id)`` with status 0 on success, 1 when one of the
        three source files is missing, and 2 when the extraction raised.
    """
    try:
        audio_path = f'{base_audio_path}/{video_id}.wav'
        video_path = f'{base_video_path}/{video_id}.mp4'
        text_path = f'{base_text_path}/{video_id}.txt'
        if not (os.path.exists(video_path)
                and os.path.exists(audio_path)
                and os.path.exists(text_path)):
            return (1, video_id)

        # Load video and audio data
        sr, signal = wavfile.read(audio_path)
        # The max utterance of MOSEI ends at ~270 seconds, hence the 280 s limit
        images, fps = read_video(video_path, sampling_time=500, limit_time=280)

        # Two-dimensional audio: keep only the first column
        if len(signal.shape) == 2:
            signal = signal[:, 0]

        # Iterate over utterance
        for row in vdf.itertuples():
            uttr_id, start, end = row[1], float(row[9]), float(row[10])

            # Create directory; exist_ok avoids a check-then-create race
            out_path = f'{base_out_path}/{uttr_id}'
            os.makedirs(out_path, exist_ok=True)

            # Retrieve audio segment and dump. The path is joined rather than
            # prefixed with './' so an absolute base_out_path keeps working.
            audio_segment = retrieve_audio_segment(signal, sr, start, end)
            wavfile.write(os.path.join(out_path, 'audio.wav'), sr, audio_segment)

            # Retrieve video segment and dump
            video_segment = retrieve_video_segment(images, fps, start, end)
            dump_image(video_segment, out_path)
        return (0, video_id)
    except Exception as err:
        # Report the cause instead of discarding it; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print(f'extract_parallel failed for {video_id}: {err!r}', file=sys.stderr)
        return (2, video_id)

In [14]:
%%time
# Process multimodal data over video_id in parallel
# NOTE: This might take several hours to run; the time shown in this cell's
# output is for processing 32 video_ids with num_jobs=16.
status_codes = Parallel(n_jobs=num_jobs)(
    delayed(extract_parallel)(video_id, vdf)
    for video_id, vdf in mdf.groupby('video_id')
)

# Save and Process Log
# Passing the column names to the constructor also works when no video was
# processed; assigning .columns afterwards fails on an empty result.
log_df = pd.DataFrame(status_codes, columns=['status', 'video_id'])
log_df.to_csv('log_extract.csv', index=False)

CPU times: user 404 ms, sys: 244 ms, total: 648 ms
Wall time: 28.1 s


In [15]:
# Check the status, it should all return 0 for all the ids mentioned in split
log_df.groupby('status').size().to_frame('count')

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
0,32
