# Video Scraping from Links
- Use this notebook once the labels and video links have already been collected

- Downloading all videos takes a long time, so we will only scrape selected videos

- With this notebook, we can use `download_videos.py` to download all videos in a metadata.csv to create our raw combined dataset, after we have decided our target words

In [1]:
import pathlib
import os
import pandas as pd
import requests
import download_videos as dv

In [2]:
# change working directory to the project root directory
# Capture the notebook's starting directory only the first time this cell runs.
# Without this guard, every re-run would read the already-changed working
# directory and climb a further two levels up.
if 'current_dir' not in globals():
    current_dir = os.getcwd()
os.chdir(current_dir + '/../../')
# this should be the project root directory
os.getcwd()

'c:\\Users\\Ben Thompson\\source\\prepos\\brsl'

In [7]:
# Load the combined metadata table from the raw data folder
# (path is relative to the project root set above).
metadata_df = pd.read_csv(
    filepath_or_buffer='data/raw/combined/metadata_combined.csv',
)

## *for creating our combined raw dataset, after we have decided our target words*

Download all videos for a collection of words from all data sources

Define the target words. At present every label in `metadata_df` is kept, so `target_metadata_df` is a full copy of it.

In [23]:
# Order the rows by label, then by data source, and renumber the index from 0.
sort_keys = ['label', 'data_source']
metadata_df = metadata_df.sort_values(by=sort_keys)
metadata_df = metadata_df.reset_index(drop=True)

In [24]:
# Number of distinct labels present in the metadata.
metadata_df['label'].nunique()

25

In [25]:
# Work on an independent copy so later steps do not touch metadata_df.
# No filtering is applied here: every row of metadata_df is kept.
target_metadata_df = metadata_df.copy(deep=True)

Download the videos

In [26]:
# Flags keyed by data-source code, passed to the downloader as
# `verify_ssl_settings`. Presumably they control TLS certificate verification
# per source ('sb' is the only one switched off) - confirm in download_videos.py.
verify_ssl_settings = dict(ne=True, vl=True, sb=False, uf=True)

# Folder that receives the downloaded videos, relative to the project root.
output_path = os.path.join(*('data', 'raw', 'combined', 'videos'))

In [None]:
# Download the videos listed in target_metadata_df into output_path.
# The keyword arguments are gathered first so the call itself stays short.
download_options = {
    'metadata': target_metadata_df,
    'output_path': output_path,
    'verify_ssl_settings': verify_ssl_settings,
    'verbose': True,
}
dv.download_videos_from_metadata(**download_options)

## *for collecting metadata from the videos*

In [28]:
# Collect metadata for the videos in the download folder via the project helper.
# The result is indexable and holds one entry per video (a dict, as the next
# cell shows); the check further down treats a None entry as a failure.
video_metadata = dv.collect_metadata_from_directory('data/raw/combined/videos')

In [29]:
# Peek at the first collected entry to see which fields are available.
video_metadata[0]

{'filename': 'ajudar_ne_1.mp4',
 'frame_count': 44,
 'fps': 12.0,
 'width': 240,
 'height': 176,
 'duration_sec': 3.6666666666666665}

In [30]:
# check for None values
for i,d in enumerate(video_metadata):
    if d is None:
        print(i)
# should be no prints

In [31]:
# Turn the collected per-video dicts into a frame, one row per video.
video_metadata_df = pd.DataFrame(video_metadata)

# pd.concat(axis=1) pairs rows by index. If the two frames differ in length it
# silently pads the shorter one with NaN rows, so fail loudly instead.
if len(video_metadata_df) != len(target_metadata_df):
    raise ValueError(
        f'{len(video_metadata_df)} video metadata entries for '
        f'{len(target_metadata_df)} metadata rows; cannot pair them by position'
    )

# NOTE(review): the pairing is purely positional - it assumes
# collect_metadata_from_directory returns the videos in the same order as the
# rows of target_metadata_df. Confirm against download_videos.py.
video_metadata_df = pd.concat(
    [
        target_metadata_df.reset_index(drop=True),
        video_metadata_df.reset_index(drop=True),
    ],
    axis=1,
)

In [32]:
# Columns written to the dataset-level metadata file, in output order.
export_columns = [
    'filename',
    'label',
    'data_source',
    'sign_id',
    'signer_number',
    'frame_count',
    'fps',
    'duration_sec',
    'width',
    'height',
]
# Save the selected columns only; the row index is not written.
video_metadata_df[export_columns].to_csv(
    'data/raw/combined/target_dataset_video_metadata.csv',
    index=False,
)

## *Horizontal Flipping to unify sign orientation*

In [33]:
# Reload the saved video metadata so this section can start from the CSV on disk.
video_metadata_df = pd.read_csv(
    filepath_or_buffer='data/raw/combined/target_dataset_video_metadata.csv',
)

In [34]:
# The reviewed file carries a verbose column header; shorten it on load.
orientation_column_aliases = {
    'sign_orientation(also signs that uses both hands I gave it a right hand)': 'sign_orientation',
}
sign_orientation_df = pd.read_csv(
    'data/raw/combined/reviewed_sign_orientation.csv'
).rename(columns=orientation_column_aliases)

# Flag a video for flipping only when its reviewed orientation is 'left'.
sign_orientation_df['needs_flip'] = sign_orientation_df['sign_orientation'].apply(
    lambda orientation: orientation == 'left'
)

In [35]:
# Preview the first five rows of the orientation table.
sign_orientation_df.head(5)

Unnamed: 0,filename,sign_orientation,needs_flip
0,ajudar_ne_1.mp4,right,False
1,ajudar_sb_2.mp4,right,False
2,ajudar_uf_3.mp4,right,False
3,ajudar_vl_4.mp4,right,False
4,ajudar_vl_5.mp4,right,False


In [36]:
# How many videos were reviewed as each orientation.
sign_orientation_df['sign_orientation'].value_counts()

sign_orientation
right    145
left      11
Name: count, dtype: int64

In [37]:
# Attach the `needs_flip` flag to each video, matching rows on `filename`.
# The left join keeps every video row; a video with no matching filename in the
# orientation table ends up with a missing `needs_flip`.
#
# Any `needs_flip` column left over from a previous run of this cell is dropped
# first. Otherwise a re-run would merge against a frame that already has the
# column and produce `needs_flip_x` / `needs_flip_y`.
video_metadata_df = video_metadata_df.drop(
    columns=['needs_flip'], errors='ignore'
).merge(
    sign_orientation_df.drop(columns=['sign_orientation']),
    how='left',
    on='filename',
)

In [38]:
# Display the merged frame to check that `needs_flip` was added alongside the
# existing video metadata columns.
video_metadata_df

Unnamed: 0,filename,label,data_source,sign_id,signer_number,frame_count,fps,duration_sec,width,height,needs_flip
0,ajudar_ne_1.mp4,ajudar,ne,2,1,44,12.00000,3.666667,240,176,False
1,ajudar_sb_2.mp4,ajudar,sb,1,1,79,29.97003,2.635967,1280,720,False
2,ajudar_uf_3.mp4,ajudar,uf,0,1,115,29.97003,3.837167,480,270,False
3,ajudar_vl_4.mp4,ajudar,vl,0,1,141,29.97003,4.704700,1920,1080,False
4,ajudar_vl_5.mp4,ajudar,vl,0,2,289,29.97003,9.642967,1920,1080,False
...,...,...,...,...,...,...,...,...,...,...,...
145,vagina_sb_2.mp4,vagina,sb,1,1,64,29.97003,2.135467,1280,720,False
146,vagina_uf_3.mp4,vagina,uf,0,1,110,29.97003,3.670333,480,270,False
147,vagina_vl_4.mp4,vagina,vl,0,1,150,29.97003,5.005000,1920,1080,False
148,vagina_vl_5.mp4,vagina,vl,0,2,215,29.97003,7.173833,1920,1080,False
