In [101]:
import pandas as pd 
import pytube as pt 
from pytube.contrib.channel import Channel as channel
from pytube import YouTube as yt 
from pytube import Stream as stream
from tqdm import tqdm
import subprocess
import os 
import time

In [100]:
!pip install tqdm

Collecting tqdm
  Using cached tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.64.0


In [13]:
def get_channel_metadata(channel_url):
    """Look up a channel and return its id, its name and its video-URL generator factory.

    Parameters
    ----------
    channel_url : str
        URL of the channel; handed unchanged to ``channel`` (pytube's ``Channel``).

    Returns
    -------
    dict
        ``"channel_id"`` and ``"name"`` hold the channel's id and name.
        ``"video_generators"`` holds the *uncalled* ``url_generator`` bound
        method, so the caller must call it to obtain an iterator.
    """
    info = channel(channel_url)

    metadata = {}
    metadata["channel_id"] = info.channel_id
    metadata["name"] = info.channel_name
    # Stored without calling it: each consumer creates its own iterator on demand.
    metadata["video_generators"] = info.url_generator
    return metadata

In [14]:
# Fetch the id, name and URL-generator factory of the channel of interest.
# NOTE(review): `channel_url` is only assigned in a later cell (In [118]); on a
# fresh kernel that cell has to run before this one.
jova = get_channel_metadata(channel_url)

In [53]:
def get_videos_metadata(video):
    """Extract the metadata fields recorded for a single video.

    Parameters
    ----------
    video : object
        Video object exposing ``title``, ``length``, ``views`` and
        ``publish_date`` attributes (the notebook passes pytube ``YouTube`` objects).

    Returns
    -------
    tuple
        ``(title, length, views, publish_date)`` where ``publish_date`` has been
        passed through ``pd.to_datetime``.
    """
    title, duration, view_count = video.title, video.length, video.views
    published = pd.to_datetime(video.publish_date)
    return title, duration, view_count, published

In [25]:
# Take only the first video URL of the channel as a sample object to experiment with.
url_generators = jova["video_generators"]()
video_url = next(url_generators)
video = yt(video_url)

In [47]:
# Check that the sample video's publish date converts to a pandas Timestamp.
pd.to_datetime(video.publish_date)

Timestamp('2022-05-25 00:00:00')

In [117]:
def interate_and_download(channel_url,dataset_folder = "./dataset",path_csv = None):
    """
    Download the audio of every video on a channel and collect per-video metadata.

    All video URLs of the channel are listed first. Each video is then opened,
    its metadata is recorded via ``get_videos_metadata`` and its first
    audio-only stream is saved as
    ``{dataset_folder}/{channel name, alphanumeric characters only}/{index}.wav``.

    Parameters
    ----------
    channel_url : str
        URL of the channel to crawl.
    dataset_folder : str, optional
        Root folder for the downloads. It is created (together with the
        per-channel sub-folder) when it does not exist yet.
    path_csv : str or None, optional
        When given, the metadata table, extended with ``channel_name`` and
        ``channel_id`` columns, is written to this path as CSV. Missing parent
        folders are created.

    Returns
    -------
    dict
        Mapping of column name to list with the keys ``video_names``,
        ``video_publish_date``, ``video_lentghs`` (spelling kept for existing
        callers), ``video_views``, ``video_paths`` and ``video_url``. The channel
        columns are only part of the CSV, not of this dict.

    Raises
    ------
    IndexError
        If a video exposes no audio-only stream.
    """

    # get channel information
    channel_info = channel(channel_url)
    cid = channel_info.channel_id
    cname = channel_info.channel_name

    # create the folder that stores this channel's audio; exist_ok makes this
    # safe on re-runs and also creates dataset_folder itself when it is missing
    name_folder = "".join(ch for ch in cname if ch.isalnum())
    path_folder = f"{dataset_folder}/{name_folder}"
    os.makedirs(path_folder, exist_ok=True)

    # list every video url up front so the progress bar knows the total
    print("Get video links")
    video_urls = list(channel_info.url_generator())

    # create lists to save data, which will be converted to a dataframe later
    video_names = []
    video_publish_date = []
    video_lengths = []
    video_views = []
    video_paths = []

    print("Down load audio")
    for i, video_url in enumerate(tqdm(video_urls)):

        # create video object
        video = yt(video_url)

        # get and save video metadata
        video_name,length,views,publish_date = get_videos_metadata(video)

        video_names.append(video_name)
        video_publish_date.append(publish_date)
        video_lengths.append(length)
        video_views.append(views)

        # get and save audio
        # NOTE(review): the stream is written under a .wav name without any
        # conversion step here - confirm the audio-only stream really is WAV data.
        video_path = f"{path_folder}/{i}.wav"
        video_paths.append(video_path)

        audio_stream = video.streams.filter(only_audio=True).first()
        if audio_stream is None:
            # keep the exception type the previous `[0]` lookup produced
            raise IndexError(f"no audio-only stream available for {video_url}")
        audio_stream.download(filename=video_path)

    # gather the metadata columns
    dict_data = {
        "video_names":video_names,
        "video_publish_date" : video_publish_date,
        "video_lentghs" : video_lengths,
        "video_views" : video_views,
        "video_paths" : video_paths,
        "video_url": video_urls
    }

    if path_csv:
        # the channel columns are only added to the table that goes to disk
        df = pd.DataFrame(data = dict_data)
        df["channel_name"] = cname
        df["channel_id"] = cid

        csv_folder = os.path.dirname(path_csv)
        if csv_folder:
            os.makedirs(csv_folder, exist_ok=True)
        df.to_csv(path_csv)

    return dict_data

    

In [118]:
# URL of the YouTube channel that the cells in this notebook crawl.
channel_url = "https://www.youtube.com/channel/UCMlZ4aOHBJnPGJAM5s-7b4w"

In [None]:
# Crawl the whole channel: download the audio of every video and write the
# metadata table to CSV. Long-running - the recorded progress bar below shows
# about 25 minutes for 779 of 790 videos.
piano_tutorials = interate_and_download(channel_url,path_csv = "./metadatacsv/pianotutorial.csv")

Get video links
Down load audio


 99%|████████████████████████████████████████▍| 779/790 [25:14<00:23,  2.12s/it]

In [None]:
# List the metadata columns returned by the crawl.
piano_tutorials.keys()

In [111]:
# Sanity check: each metadata list should hold one entry per crawled video.
len(piano_tutorials["video_names"])

790

In [112]:
# Number of recorded publish dates.
len(piano_tutorials["video_publish_date"])

790

In [113]:
# Number of recorded video lengths; the key is spelled "video_lentghs" because
# that is the key interate_and_download puts in its result dict.
len(piano_tutorials["video_lentghs"])

790

In [114]:
# Number of recorded view counts.
# NOTE(review): the recorded output of this cell is 0 while every other list has
# 790 entries. interate_and_download as defined in In [117] appends one view
# count per video, and this cell (In [114]) ran before that definition, so the
# output presumably comes from an earlier version of the function - re-run to confirm.
len(piano_tutorials["video_views"])

0

In [115]:
# Number of recorded audio file paths.
len(piano_tutorials["video_paths"])

790

In [116]:
# Number of video URLs listed for the channel (key is the singular "video_url").
len(piano_tutorials["video_url"])

790

In [104]:
import os, shutil
folder = "./dataset/PianoTutorial"
# Empty the download folder: remove every file, symlink and sub-directory inside
# it while keeping the folder itself.
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    try:
        # Links are unlinked rather than followed, even when they point at a directory.
        if os.path.islink(file_path) or os.path.isfile(file_path):
            os.unlink(file_path)
            continue
        if os.path.isdir(file_path):
            shutil.rmtree(file_path)
    except Exception as e:
        # Best effort: report the failure and carry on with the next entry.
        print('Failed to delete %s. Reason: %s' % (file_path, e))

In [93]:
# Show the contents of the working directory (e.g. to confirm that the
# `dataset` and `metadatacsv` folders are present).
os.listdir("./")

['Crawl_and_arrange.ipynb',
 'TODO.txt',
 'dataset',
 'README.md',
 'daft_crawl_data.py',
 '.gitignore',
 '.ipynb_checkpoints',
 '.git',
 'metadatacsv']