In [1]:
import os
import time

import pandas as pd
import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# URLs of the YouTube channels to scrape (uncomment the channels you want).
# Every entry, including the commented-out ones, ends with a comma: without it,
# enabling a second channel would make Python silently concatenate the two
# adjacent string literals into a single invalid URL.
urls = [
    # 'https://www.youtube.com/@_SMWX/videos',
    # 'https://www.youtube.com/@konvo_za/videos',
    # 'https://www.youtube.com/@kingdavidstudio1/videos',
    'https://www.youtube.com/@TheHustlersCornerSA/videos',
    # 'https://www.youtube.com/@MajitaMonday/videos',
    # 'https://www.youtube.com/@justifyworldwide/videos',
    # 'https://www.youtube.com/@culturespotlight/videos',
    # 'https://www.youtube.com/@podcastandchillnetwork/videos',
]

In [3]:
# Location of the Edge WebDriver binary. The EDGE_DRIVER_PATH environment
# variable overrides the original machine-specific default, so the notebook can
# run on another machine without editing this cell.
EDGE_DRIVER_PATH = os.environ.get(
    'EDGE_DRIVER_PATH',
    'C:/Users/vince/Downloads/edgedriver_win64/msedgedriver.exe',
)

# Use the explicit binary when it exists; otherwise create the Service without
# a path so Selenium performs its own driver lookup (PATH / Selenium Manager,
# depending on the installed Selenium version).
if os.path.isfile(EDGE_DRIVER_PATH):
    service = Service(executable_path=EDGE_DRIVER_PATH)
else:
    service = Service()

# set up Edge options (optional, can be customized as needed)
options = Options()
options.add_argument("start-maximized")  # open the Edge window maximized

# initialize the Edge WebDriver with the service and options
driver = webdriver.Edge(service=service, options=options)

In [4]:
# Scrape the title, view count and upload age of every video tile on each
# channel's "Videos" tab and collect the records in `video_list`.

MAX_SCROLL_ATTEMPTS = 70    # hard cap on the number of scrolls per channel
SCROLL_PAUSE_SECONDS = 10   # pause after each scroll so new tiles can load
VIDEOS_SUFFIX = '/videos'


def _first_text(parent, xpath):
    """Return the text of the first element under `parent` matching `xpath`, or None if there is none."""
    matches = parent.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else None


# initialize an empty list of videos
video_list = []

try:
    # iterate through the channels
    for url in urls:
        # Entries in `urls` may already end with '/videos'; strip it so the
        # path segment is not duplicated ('.../videos/videos?...') below.
        channel_url = url.rstrip('/')
        if channel_url.endswith(VIDEOS_SUFFIX):
            channel_url = channel_url[:-len(VIDEOS_SUFFIX)]
        driver.get('{}/videos?view=0&sort=p&flow=grid'.format(channel_url))

        # Scroll to the bottom repeatedly until the page height stops growing
        # (no new content is loading) or the attempt cap is reached.
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        for _ in range(MAX_SCROLL_ATTEMPTS + 1):
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(SCROLL_PAUSE_SECONDS)  # wait for content to load

            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        try:
            # By.CLASS_NAME expects a single class name. The original compound
            # value 'style-scope ytd-rich-grid-media' only matched because it
            # is treated as the CSS selector '.style-scope ytd-rich-grid-media',
            # i.e. <ytd-rich-grid-media> tags; select that tag explicitly.
            videos = driver.find_elements(By.CSS_SELECTOR, 'ytd-rich-grid-media')

            skipped = 0
            for video in videos:
                title = _first_text(video, './/*[@id="video-title"]')
                views = _first_text(video, './/*[@id="metadata-line"]/span[1]')
                if title is None or views is None:
                    # A malformed tile should not abort the rest of the channel.
                    skipped += 1
                    continue

                # In case of live streaming the 'posted' span is missing.
                # find_elements returns an empty list immediately in that case,
                # avoiding a 10-second wait per tile (the page is fully loaded
                # by the scroll loop above).
                when = _first_text(video, './/*[@id="metadata-line"]/span[2]')

                video_list.append({
                    'title': title,
                    'views': views.replace('views', ''),
                    'posted': when if when is not None else "N/A",
                })

            if skipped:
                print(f"Skipped {skipped} video tile(s) without a title or view count on {url}")

        except Exception as e:
            print(f"Error processing {url}: {e}")
finally:
    # Always close the browser, even if navigation or scrolling raised.
    driver.quit()


In [5]:
# Sanity check: show the scraped records, or a notice when nothing was collected.
print(video_list if video_list else 'video_list is empty')

[{'title': 'Konvo Show: Penuel In Conversation With Muzi Mthabela, Family, Black Nation, Christianity, Acting', 'views': '5.1K ', 'posted': '5 hours ago'}, {'title': '"EUROPEANS DON\'T GET OFFENDED WHEN WE SAY MLUNGU" - ZOZA SHONGWE', 'views': '1.8K ', 'posted': '2 days ago'}, {'title': '"THERE\'S PROPAGANDA BEHIND THE IMAGE OF SHAKA BEING A RUTHLESS WARRIOR" - ZOZA SHONGWE', 'views': '3.1K ', 'posted': '2 days ago'}, {'title': '"THE NAME CASSPER NYOVEST IS NOT JUST A STREET NAME, IT HAS HISTORICAL SIGNIFICANCE" - ZOZA SHONGWE', 'views': '12K ', 'posted': '3 days ago'}, {'title': '"COLONISERS TREATED ALL AFRICAN PEOPLE LIKE ANIMALS" - ZOZA SHONGWE', 'views': '1K ', 'posted': '3 days ago'}, {'title': '"THE ROMAN EMPIRE EVOLVED TO THE ROMAN CATHOLIC CHURCH TO LIVE FOREVER" - ZOZA SHONGWE', 'views': '1.2K ', 'posted': '3 days ago'}, {'title': '"TO MINIMIZE CONFLICT EUROPEANS CHOSE THE ZULU NATION AS THEIR ADMINISTRATORS" - ZOZA SHONGWE', 'views': '4.5K ', 'posted': '3 days ago'}, {'title'

In [6]:
# Create the dataframe with explicit columns, so that an empty scrape still
# yields the 'title' / 'views' / 'posted' columns instead of a column-less
# frame that raises KeyError on the first column access.
df_oc = pd.DataFrame(video_list, columns=['title', 'views', 'posted'])

# display the data frame
print(df_oc)

                                                 title  views       posted
0    Konvo Show: Penuel In Conversation With Muzi M...  5.1K   5 hours ago
1    "EUROPEANS DON'T GET OFFENDED WHEN WE SAY MLUN...  1.8K    2 days ago
2    "THERE'S PROPAGANDA BEHIND THE IMAGE OF SHAKA ...  3.1K    2 days ago
3    "THE NAME CASSPER NYOVEST IS NOT JUST A STREET...   12K    3 days ago
4    "COLONISERS TREATED ALL AFRICAN PEOPLE LIKE AN...    1K    3 days ago
..                                                 ...    ...          ...
403  Penuel The Black Pen In Conversation Nhlanhla ...   59K   2 years ago
404  Penuel The Black Pen | In Conversation Nhlanhl...  164K   2 years ago
405  Penuel The Black Pen | In Conversation with Ma...   21K   2 years ago
406  Penuel The Black Pen | In Conversation With Ma...   62K   2 years ago
407  Penuel The Black Pen | In Conversation With Pe...   79K   2 years ago

[408 rows x 3 columns]


In [7]:
# Function to help convert view counts

# Multipliers for the abbreviated magnitude suffixes.
_VIEW_MULTIPLIERS = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}


def convert_views_count(view):
    """Convert a view count such as '5.1K ', '2M' or '1,234 views' to a float.

    Parameters
    ----------
    view : str or number
        A plain number (returned as a float), or a string made of a number
        with an optional K / M / B suffix (case-insensitive), optional
        thousands separators and an optional trailing 'view' / 'views' label.
        'No views' is interpreted as 0.

    Returns
    -------
    float
        The view count, e.g. '5.1K ' -> 5100.0.

    Raises
    ------
    ValueError
        If the text cannot be interpreted as a view count.
    """
    # Fast path: already numeric, or a plain numeric string.
    try:
        return float(view)
    except ValueError:
        pass

    text = view.strip()

    # Drop a leftover 'views' / 'view' label (e.g. '1 view' keeps its label
    # when only 'views' has been removed upstream).
    for label in ('views', 'view'):
        if text.lower().endswith(label):
            text = text[:-len(label)].strip()
            break

    if text.lower() == 'no':  # 'No views'
        return 0.0

    number = text.replace(',', '')  # '1,234' -> '1234'
    suffix = number[-1:].upper()

    try:
        if suffix in _VIEW_MULTIPLIERS:
            return float(number[:-1]) * _VIEW_MULTIPLIERS[suffix]
        return float(number)
    except ValueError:
        raise ValueError(f"Cannot parse view count: {view!r}") from None
    
# Apply the convert_views_count function to the 'views' column
df_oc['views'] = df_oc['views'].apply(convert_views_count)

# Remove the word 'ago' from the 'posted' column and trim the whitespace it
# leaves behind ('5 hours ago' -> '5 hours'). regex=False makes the literal
# replacement explicit regardless of the pandas version's default.
df_oc['posted'] = df_oc['posted'].str.replace('ago', '', regex=False).str.strip()

In [8]:
# Show the frame again after the 'views' conversion and the 'ago' removal.
print(df_oc)

                                                 title     views    posted
0    Konvo Show: Penuel In Conversation With Muzi M...    5100.0  5 hours 
1    "EUROPEANS DON'T GET OFFENDED WHEN WE SAY MLUN...    1800.0   2 days 
2    "THERE'S PROPAGANDA BEHIND THE IMAGE OF SHAKA ...    3100.0   2 days 
3    "THE NAME CASSPER NYOVEST IS NOT JUST A STREET...   12000.0   3 days 
4    "COLONISERS TREATED ALL AFRICAN PEOPLE LIKE AN...    1000.0   3 days 
..                                                 ...       ...       ...
403  Penuel The Black Pen In Conversation Nhlanhla ...   59000.0  2 years 
404  Penuel The Black Pen | In Conversation Nhlanhl...  164000.0  2 years 
405  Penuel The Black Pen | In Conversation with Ma...   21000.0  2 years 
406  Penuel The Black Pen | In Conversation With Ma...   62000.0  2 years 
407  Penuel The Black Pen | In Conversation With Pe...   79000.0  2 years 

[408 rows x 3 columns]


In [9]:
# Export the cleaned data to a CSV file.
csv_path = 'Output Files/TheHustlersCornerSA_Podcast.csv'
try:
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    df_oc.to_csv(csv_path, index=False)
    print("Successfully exported to csv file")

except Exception as e:
    # Best-effort export: report the problem instead of stopping the notebook.
    print(e)

Successfully exported to csv file


In [10]:
# Export the cleaned data to an Excel workbook.
xlsx_path = 'Output Files/TheHustlersCornerSA_Podcast.xlsx'
try:
    # to_excel does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(xlsx_path), exist_ok=True)
    df_oc.to_excel(xlsx_path, index=False)
    print("Successfully exported to xlsx file")

except Exception as e:
    # Best-effort export: report the problem instead of stopping the notebook.
    print(e)

Successfully exported to xlsx file
