In [4]:
import sys
!{sys.executable} -m pip install pytube
!{sys.executable} -m pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.5.4.58-cp38-cp38-macosx_10_15_x86_64.whl (45.5 MB)
[K     |████████████████████████████████| 45.5 MB 3.2 MB/s eta 0:00:01     |████████████████                | 22.7 MB 3.1 MB/s eta 0:00:08     |████████████████████████▍       | 34.7 MB 3.2 MB/s eta 0:00:04     |██████████████████████████████▎ | 43.1 MB 3.6 MB/s eta 0:00:01
Installing collected packages: opencv-python
Successfully installed opencv-python-4.5.4.58


# Data Collection

The data is collected by downloading youtube videos using the package ```pytube``` and sampled using a custom class ```FrameExtractor```.


In [6]:
from pytube import YouTube
import os
import shutil
import math
import datetime
import matplotlib.pyplot as plt
import cv2

### class ```FrameExtractor``` has the functions:
1. get_video_duration - returns the length of the video
2. get_n_images - returns the number of images given a particular sampling rate
3. extract_frames - extracts and stores the frames from a given downloaded video

In [7]:
class FrameExtractor():
    '''
    Class used for extracting frames from a video file.
    '''
    def __init__(self, video_path):
        self.video_path = video_path
        self.vid_cap = cv2.VideoCapture(video_path)
        self.n_frames = int(self.vid_cap.get(cv2.CAP_PROP_FRAME_COUNT))
        self.fps = int(self.vid_cap.get(cv2.CAP_PROP_FPS))
        
    def get_video_duration(self):
        duration = self.n_frames/self.fps
        print(f'Duration: {datetime.timedelta(seconds=duration)}')
        return duration
        
    def get_n_images(self, every_x_frame):
        n_images = math.floor(self.n_frames / every_x_frame) + 1
        print(f'Extracting every {every_x_frame} (nd/rd/th) frame resulted in {n_images} images.')
        return n_images
        
    def extract_frames(self, every_x_frame, img_name, dest_path=None, img_ext = '.jpg'):
        if not self.vid_cap.isOpened():
            self.vid_cap = cv2.VideoCapture(self.video_path)
        
        if dest_path is None:
            dest_path = os.getcwd()
        else:
            if not os.path.isdir(dest_path):
                os.mkdir(dest_path)
                print(f'Created the directory: {dest_path}')
        
        frame_cnt = 0
        img_cnt = 0

        while self.vid_cap.isOpened():
            
            success,image = self.vid_cap.read() 
            
            if not success:
                break
            
            if frame_cnt % every_x_frame == 0:
                img_path = os.path.join(dest_path, ''.join([img_name, '_', str(img_cnt), img_ext]))
                cv2.imwrite(img_path, image)  
                img_cnt += 1
                
            frame_cnt += 1
        
        self.vid_cap.release()
        cv2.destroyAllWindows()


### Downloading and sampling the videos

The links to the Youtube videos are obtained from a file called ```links.txt``` which is manually fed in.
We have chosen all videos from Season 5 of F.R.I.E.N.D.S.

We also extract:
1. ```last_video_index``` : index of the last downloaded video
2. ``` last_sampled_index``` :  index of the last sampled video 

This ensures no work is redone when we add more videos to increase the size of the dataset.

In [10]:
with open('links.txt') as file:
    urls = file.readlines()

path_videos = "../data/videos/"    
path_images = "../data/raw_images/"

image_dir = os.listdir(path_images)

last_video_index = 0
last_sampled_index = 0

try: last_video_index = int(os.listdir(path_videos)[-1].split('_')[0])
except: pass

try: last_sampled_index = int(image_dir[-1].split('_')[0])
except: pass
    
print("Last downloaded video index:", last_video_index)
print("Last sampled video index:", last_sampled_index)

Last downloaded video index: 0
Last sampled video index: 0


```image_count``` keeps track of the number of images sampled and ```total_duration``` keeps track of the length of the videos sampled.


1. The video is downloaded using pytube's ```Youtube``` class and the URL from ```links.txt``` 
2. The video is sampled using the ```FrameExtractor``` class from above.
3. Relevant information is extracted and displayed.

In [12]:
image_count = len(image_dir)
total_duration = 0

for index, video_url in enumerate(urls):
    video_url = video_url.strip()
    
    # Extracting code from the URL
    code = video_url[video_url.index('=')+1:]
    image_name = '{}_{}'.format(index, code)
    print(image_name)
    
    # Downloading the video 
    
    yt = YouTube(video_url)
    yt = yt.streams.filter(file_extension='mp4', res='144p').first()
    fps = yt.fps
    
    # Checking if it has already been downloaded
    if index > last_video_index or last_video_index == 0: 
        video = yt.download(path_videos, filename=image_name+".mp4")
        print("Downloaded video {}_{}".format(index, code))
    
    
    # Extracting frames from the video per second
    
    fe = FrameExtractor(video)
    duration = fe.get_video_duration()
    n_images = fe.get_n_images(every_x_frame=fps)
    
    # Checking if it has already been sampled
    if index > last_sampled_index or last_sampled_index == 0:
        fe.extract_frames(every_x_frame=fps, 
                          img_name=image_name, 
                          dest_path=path_images)
        print("Sampled video {}_{}".format(index, code))
        
    
    total_duration += duration
    image_count += n_images
    
    print()
    
print("Total duration of the videos =", total_duration)
print("Total number of images =", image_count)

0_8wewPn7TZfs
Downloaded video 0_8wewPn7TZfs
Duration: 0:05:00.826087
Extracting every 24 (nd/rd/th) frame resulted in 289 images.
Sampled video 0_8wewPn7TZfs

1_Uyq66PLCvvY
Downloaded video 1_Uyq66PLCvvY
Duration: 0:05:00.652174
Extracting every 24 (nd/rd/th) frame resulted in 289 images.
Sampled video 1_Uyq66PLCvvY

2_nzDJdZLPeGE
Downloaded video 2_nzDJdZLPeGE
Duration: 0:05:05
Extracting every 24 (nd/rd/th) frame resulted in 293 images.
Sampled video 2_nzDJdZLPeGE

3_eT_Gzi0HN4E
Downloaded video 3_eT_Gzi0HN4E
Duration: 0:04:12.956522
Extracting every 24 (nd/rd/th) frame resulted in 243 images.
Sampled video 3_eT_Gzi0HN4E

4_AtiMqeDmo1M
Downloaded video 4_AtiMqeDmo1M
Duration: 0:04:14.869565
Extracting every 24 (nd/rd/th) frame resulted in 245 images.
Sampled video 4_AtiMqeDmo1M

5_oSi_pbzIYng
Downloaded video 5_oSi_pbzIYng
Duration: 0:04:12.217391
Extracting every 24 (nd/rd/th) frame resulted in 242 images.
Sampled video 5_oSi_pbzIYng

6_aKlLgwyqsqY
Downloaded video 6_aKlLgwyqsqY
Du

### Resizing all of the images to a fixed size

In [17]:
from PIL import Image

f = r'../data/raw_images/'
width, height = 256, 144

for file in os.listdir(f):
    if file.startswith('.'): continue
    f_img = f+"/"+file
    img = Image.open(f_img)
    img = img.resize((width, height))
    img.save(f_img)

print("Reshaped all images to ({}, {})".format(width, height))

Reshaped all images to (256, 144)
