## BITCHUTE 

# In [2]:
# Simple Python module for retrieving data from bitchute.
# Copyright (C) 2022 Marcus Burkhardt
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import time
import markdownify
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser
from tqdm import tqdm
from datetime import datetime, timedelta
from retrying import retry
from selenium.webdriver.chrome.service import Service
from dateutil.parser import parse as dateutil_parse
import requests

class Crawler():
    def __init__(self, headless=True, verbose=False, chrome_driver=None):
        '''
        Sets up Chrome options, crawler state and the BitChute URL templates.

        No browser is started here: ``self.wd`` stays None until
        ``create_webdriver`` is run.

        Parameters:
        headless (bool): Whether to pass ``--headless`` to Chrome. Default: True
        verbose (bool): Stored as ``self.verbose``; other methods use it to decide whether to print progress. Default: False
        chrome_driver (str): Stored as ``self.chrome_driver``; when falsy, ``create_webdriver`` installs a driver itself. Default: None
        '''
        # Chrome launch arguments; the optional headless flag is added first.
        self.options = Options()
        chrome_flags = ['--disable-dev-shm-usage', '--no-sandbox']
        if headless:
            chrome_flags.insert(0, '--headless')
        for chrome_flag in chrome_flags:
            self.options.add_argument(chrome_flag)

        # Crawler state.
        self.chrome_driver = chrome_driver
        self.wd = None
        self.status = []
        self.verbose = verbose

        # URL templates; the ones containing '{}' are filled with str.format.
        self.bitchute_base = 'https://api.bitchute.com/category/news/'
        self.channel_base = 'https://api.bitchute.com/channel/{}/'
        self.video_base = 'https://api.bitchute.com/video/'
        self.hashtag_base = 'https://api.bitchute.com/hashtag/{}/'
        self.profile_base = 'https://api.bitchute.com/profile/{}/'
        self.search_base = 'https://api.bitchute.com/search/?query={}&kind=video'

    def create_webdriver(self):
        '''
        Starts a Chrome session and stores it in ``self.wd``.

        If ``self.chrome_driver`` is set it is used as the chromedriver
        executable path; otherwise webdriver_manager installs one. In both
        cases the path is handed over through a ``Service`` object: Selenium 4
        no longer accepts the driver path as the first positional argument of
        ``webdriver.Chrome``, so passing it there together with ``options=``
        fails.
        '''
        if self.chrome_driver:
            driver_path = self.chrome_driver
        else:
            driver_path = ChromeDriverManager().install()
        self.wd = webdriver.Chrome(service=Service(driver_path), options=self.options)

    def reset_webdriver(self):
        if self.wd:
            self.wd.quit()
        self.wd = None
        
    def fetch(self, url, timeout=30):
        '''
        Downloads a URL with requests and returns the response body.

        Parameters:
        url (str): Address to retrieve.
        timeout (float): Seconds to wait for the server before giving up. Default: 30

        Returns:
        text: Response body as text, or an empty string if the request failed
        (connection error, timeout or a 4xx/5xx status).
        '''
        try:
            # Without a timeout requests waits indefinitely on a stalled server.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
            return response.text
        except requests.RequestException as e:
            print(f"Failed to fetch URL {url}: {e}")
            return ""

    def parse_video_ids(self, html):
        '''
        Extracts the video ids from a results page.

        The id is the second-to-last '/'-separated segment of the link inside
        each result title. Results without a title link, without an href or
        with an href too short to contain an id are skipped instead of
        aborting the whole page.

        Parameters:
        html (str): Page source to parse.

        Returns:
        video_ids: List of video ids in page order; empty if the page has no
        'results-list' element.
        '''
        soup = BeautifulSoup(html, 'html.parser')
        if soup.find(class_='results-list') is None:
            return []

        video_ids = []
        for result in soup.find_all(class_='video-result-container'):
            title = result.find(class_='video-result-title')
            anchor = title.find('a') if title is not None else None
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue
            segments = href.split('/')
            if len(segments) < 2:
                continue
            video_ids.append(segments[-2])
        return video_ids

    @retry(stop_max_attempt_number=5, wait_random_min=1000, wait_random_max=2000)
    def call(self, url, click_link_text=None, scroll=True, top=None):
        '''
        Loads a page in the browser and returns its rendered source.

        A webdriver is created on demand. After loading, the method dismisses
        a "Dismiss" button if one is present, optionally clicks a link,
        clicks the 'Some videos are not shown' link if present, and optionally
        scrolls to the bottom repeatedly so that further content gets loaded.

        The method is wrapped in ``retry`` with stop_max_attempt_number=5 and
        a random wait between 1000 and 2000 (presumably milliseconds, as in
        the ``retrying`` package), so an exception raised here leads to
        another attempt.

        Parameters:
        url (str): Address to open.
        click_link_text (str): Partial text of a link to click after loading. Default: None
        scroll (bool): Whether to scroll to the bottom of the page. Default: True
        top (int): Limits the number of scroll rounds to ceil(top / 10),
            counting the initial scroll. When falsy, scrolling continues until
            the page height stops changing.

        Returns:
        page_source: HTML source of the page as reported by the webdriver.
        '''
        if not self.wd:
            self.create_webdriver()
        if self.verbose:
            print('Retrieving: ' + url + ' ', end='')
        # NOTE(review): set_status is not defined in the visible part of the class.
        self.set_status('Retrieving: ' + url)

        # When the browser already shows this URL, go through about:blank
        # first so that the page is actually loaded again.
        if self.wd.current_url == url:
            self.wd.get('about:blank')
            self.wd.get(url)
        else:
            self.wd.get(url)

        time.sleep(2)

        # Click away a "Dismiss" button if the page shows one.
        if len(self.wd.find_elements(By.XPATH, '//button[normalize-space()="Dismiss"]')) > 0:
            time.sleep(2)
            self.wd.find_element(By.XPATH, '//button[normalize-space()="Dismiss"]').click()

        # The requested link is not there yet: wait a bit longer before the lookup below.
        if click_link_text and not len(self.wd.find_elements(By.PARTIAL_LINK_TEXT, click_link_text)) > 0:
            time.sleep(5)

        if click_link_text:
            if len(self.wd.find_elements(By.PARTIAL_LINK_TEXT, click_link_text)) > 0:
                time.sleep(2)
                self.wd.find_element(By.PARTIAL_LINK_TEXT, click_link_text).click()
                time.sleep(2)
            else:
                print('Cannot find link to click')

        # Click the sensitivity notice link, if present.
        sensitivity = 'Some videos are not shown'
        if len(self.wd.find_elements(By.PARTIAL_LINK_TEXT, sensitivity)) > 0:
            self.wd.find_element(By.PARTIAL_LINK_TEXT, sensitivity).click()
            time.sleep(2)

        if scroll:
            if top:
                # ceil(top / 10) rounds; presumably a page load adds 10 results -- TODO confirm.
                iterations = (top // 10) + (top % 10 > 0)
                iteration = 1
                increment = 1
            else:
                # increment 0 keeps `iteration < iterations` true forever, so
                # only an unchanged page height ends the loop.
                iterations = 1
                iteration = 0
                increment = 0

            # Scrolls to the bottom and returns the current page height.
            script = (
                'window.scrollTo(0, document.body.scrollHeight);'
                'var lenOfPage=document.body.scrollHeight;'
                'return lenOfPage;'
            )

            lenOfPage = self.wd.execute_script(script)
            match = False
            while not match and iteration < iterations:
                iteration += increment
                if self.verbose:
                    print('.', end='')
                self.set_status('.')
                lastCount = lenOfPage
                time.sleep(4)
                lenOfPage = self.wd.execute_script(script)
                # Same height as before the wait: nothing new was loaded.
                if lastCount == lenOfPage:
                    match = True
        if self.verbose:
            print('')

        page_source = self.wd.page_source
        return page_source

    def process_views(self, views):
        '''
        Converts a displayed view count such as "1.2K" or "3.4M" to an int.

        Thousands separators and surrounding whitespace are ignored. The
        scaled value is rounded to the nearest integer, because truncating
        would turn float results such as 1000.9999999999999 into 1000.

        A suffixed value without a decimal point is read as if the point had
        been lost before its last digit ("12K" -> 1.2K -> 1200, "34M" ->
        3.4M). This heuristic is kept unchanged from the previous
        implementation.

        Parameters:
        views (str): View count as shown on the page.

        Returns:
        views: View count as int.

        Raises:
        ValueError: If the text is not a number in one of the forms above.
        '''
        text = views.strip().replace(',', '')
        lowered = text.lower()

        if 'k' in lowered:
            digits = lowered.replace('k', '')
            if '.' not in digits:
                digits = digits[:-1] + '.' + digits[-1:]
            return round(float(digits) * 1000)

        if 'm' in lowered:
            digits = lowered.replace('m', '')
            number = float(digits)
            if '.' not in digits:
                number = number / 10
            return round(number * 1000000)

        return int(text)

    def search(self, query, top=100):
        '''
        Queries Bitchute and retrieves top n results according to the relevance ranking.

        Parameters:
        query (str): Search string
        top (int): Number of results to be retrieved

        Returns:
        data: Dataframe of search results.
        '''
        url = self.search_base.format(query)
        if isinstance(top, str) and top.lower() == 'all':
            top = None
        src = self.call(url, top=top)
        data = self.parser(src, type='video_search')
        self.reset_webdriver()
        return data

    def get_recommended_videos(self, type='popular'):
        '''
        Scapes recommended videos on bitchute homepage.

        Parameters:
        type (str): POPULAR, TRENDING, ALL

        Returns:
        data: Dataframe of recommended videos.
        '''
        if type == 'popular':
            src = self.call(self.bitchute_base)
            data = self.parser(src, type='recommended_videos', kind='popular')
            return data
        elif type == 'trending':
            src = self.call(self.bitchute_base, click_link_text='TRENDING')
            data = self.parser(src, type='recommended_videos', kind='trending-day')
            return data
        elif type == 'trending-day':
            src = self.call(self.bitchute_base, click_link_text='TRENDING')
            data = self.parser(src, type='recommended_videos', kind='trending-day')
            return data
        elif type == 'trending-week':
            src = self.call(self.bitchute_base, click_link_text='TRENDING')
            data = self.parser(src, type='recommended_videos', kind='trending-week')
            return data
        elif type == 'trending-month':
            src = self.call(self.bitchute_base, click_link_text='TRENDING')
            data = self.parser(src, type='recommended_videos', kind='trending-month')
            return data
        elif type == 'all':
            src = self.call(self.bitchute_base, click_link_text='ALL')
            data = self.parser(src, type='recommended_videos', kind='all')
            return data
        else:
            print('Wrong type. Accepted types are popular, trending and all.')
            return None
        self.reset_webdriver()

    def get_popular_videos(self):
        videos, tags = self.get_recommended_videos(type='popular')
        return videos

    def get_trending_videos(self):
        videos, tags = self.get_recommended_videos(type='trending')
        return videos

    def get_trending_tags(self):
        videos, tags = self.get_recommended_videos(type='trending')
        return tags

    def get_trending(self):
        videos, tags = self.get_recommended_videos(type='trending')
        return videos, tags

    def get_all_videos(self):
        videos, tags = self.get_recommended_videos(type='all')
        return videos

    def get_recommended_channels(self, extended=True):
        '''
        Scapes recommended channels on bitchute homepage.

        Parameters:
        extended (bool): whether to retrieve extended channel information. Default: True

        Returns:
        data: Dataframe of recommended channels.
        '''
        src = self.call(self.bitchute_base, scroll=False)
        data = self.parser(src, type='recommended_channels', extended=extended)
        self.reset_webdriver()
        return data

    def _get_channel(self, channel_id, get_channel_about=True, get_channel_videos=True):
        '''
        Scapes channel information.

        Parameters:
        channel_id (str): ID of channel to be scraped.
        get_channel_about (bool): Get the about information by a channel. Default:True 
        get_channel_videos (bool): Get the information of videos published by a channel. Default:True

        Returns:
        about_data: Dataframe of channel about.
        videos_data: Dataframe of channel videos.
        '''

        if get_channel_about:
            channel_about_url = self.channel_base.format(channel_id)
            src = self.call(channel_about_url, click_link_text='ABOUT', scroll=False)
            about_data = self.parser(src, type='channel_about')
        else:
            about_data = pd.DataFrame()

        if get_channel_videos:
            channel_videos_url = self.channel_base.format(channel_id)
            src = self.call(channel_videos_url, click_link_text='VIDEOS')
            videos_data = self.parser(src, type='channel_videos')
        else:
            videos_data = pd.DataFrame()

        return about_data, videos_data

    def get_channels(self, channel_ids, get_channel_about=True, get_channel_videos=True):
        '''
        Scapes information for multiple channels.

        Parameters:
        channel_ids (list): List of channel ids to be scraped.
        get_channel_about (bool): Get the about information by a channel. Default:True 
        get_channel_videos (bool): Get the information of videos published by a channel. Default:True

        Returns:
        abouts: Dataframe of channel abouts.
        videos: Dataframe of channel videos.
        '''
        if type(channel_ids) == str:
            abouts, videos = self._get_channel(channel_ids, get_channel_about=get_channel_about, get_channel_videos=get_channel_videos)
            self.reset_webdriver()
            return abouts, videos
        elif type(channel_ids) == list:
            abouts = pd.DataFrame()
            videos = pd.DataFrame()
            for channel_id in (tqdm(channel_ids) if not self.verbose else channel_ids):
                about_tmp, videos_tmp = self._get_channel(channel_id, get_channel_about=get_channel_about, get_channel_videos=get_channel_videos)
                abouts = pd.concat([abouts, about_tmp])
                videos = pd.concat([videos, videos_tmp])
            self.reset_webdriver()
            return abouts, videos
        else:
            print('channel_ids must be of type list for multiple or str for single channels')
            return None

    def _get_video(self, video_id):
        '''
        Scrapes video metadata.

        Parameters:
        video_id (str): ID of video to be scraped.

        Returns:
        video_data: Dataframe of video metadata.
        '''

        video_url = self.video_base + video_id + "/"
        src = self.call(video_url, scroll=False)
        video_data = self.parser(src, type='video')
        return video_data

    def get_videos(self, video_ids):
        '''
        Scrapes metadata of multiple videos.

        Parameters:
        video_ids (list): List of video ids to be scraped.

        Returns:
        video_data: Dataframe of video metadata.
        '''

        if type(video_ids) == str:
            try:
                video_data = self._get_video(video_ids)
                self.reset_webdriver()
                return video_data
            except:
                print('Failed for video with id {}'.format(video_ids))
        elif type(video_ids) == list:
            video_data = pd.DataFrame()
            for video_id in (tqdm(video_ids) if not self.verbose else video_ids):
                try:
                    video_tmp = self._get_video(video_id)
                    if video_tmp is not None:
                        video_data = pd.concat([video_data, video_tmp])
                except:
                    print('Failed for video with id {}'.format(video_id))
                    self.reset_webdriver()
            self.reset_webdriver()
            return video_data
        else:
            print('video_ids must be of type list for multiple or str for single video')
            return None

    def _get_hashtag(self, hashtag):
        '''
        Scapes video posted with a tag.

        Parameters:
        tag (str): Hashtag to be scraped.

        Returns:
        video_data: Dataframe of video metadata.
        '''
        hashtag_url = self.hashtag_base.format(hashtag)
        src = self.call(hashtag_url)
        video_data = self.parser(src, type='hashtag_videos')
        video_data['hashtag'] = hashtag
        return video_data

    def get_hashtags(self, hashtags):
        '''
        Scapes video posted with a tag.

        Parameters:
        tag (str): Hashtag to be scraped.

        Returns:
        video_data: Dataframe of video metadata.
        '''

        if type(hashtags) == str:
            video_data = self._get_hashtag(hashtags)
            video_data['hashtag'] = hashtags
            self.reset_webdriver()
            return video_data
        elif type(hashtags) == list:
            video_data = pd.DataFrame()
            for hashtag in (tqdm(hashtags) if not self.verbose else hashtags):
                video_tmp = self._get_hashtag(hashtag)
                if video_tmp is not None:
                    video_tmp['hashtag'] = hashtag
                    video_data = pd.concat([video_data, video_tmp])
            self.reset_webdriver()
            return video_data
        else:
            print('hashtags must be of type list for multiple or str for single hashtag')
            return None

    def convert_relative_time(self, relative_time_str):
        '''
        Converts a relative age such as "3 hours ago" to a timestamp.

        Every "<number> <unit>" pair with a unit of minute, hour, day or week
        (singular or plural) is subtracted from the current UTC time, so
        combined forms such as "1 week, 2 days ago" or "1 day, 3 hours ago"
        work in any order. Text without such a pair is handed to the dateutil
        parser, which covers absolute dates.

        NOTE(review): months and years are not converted here; a text such as
        "1 year, 2 weeks ago" only has its weeks subtracted.

        Parameters:
        relative_time_str (str): Age or date text as shown on the page.

        Returns:
        Naive datetime. For relative input it is in UTC.
        '''
        import re  # local import: re is not among the module-level imports

        now = datetime.utcnow()
        relative_time_str = relative_time_str.lower()

        timedelta_keys = {'minute': 'minutes', 'hour': 'hours', 'day': 'days', 'week': 'weeks'}
        offsets = {}
        for amount, unit in re.findall(r'(\d+)\s*(minute|hour|day|week)', relative_time_str):
            key = timedelta_keys[unit]
            offsets[key] = offsets.get(key, 0) + int(amount)

        if offsets:
            return now - timedelta(**offsets)
        return dateutil_parse(relative_time_str)

    def get_recent_videos(self, url):
        recent_videos = []
        src = self.call(url)
        videos = self.parser(src, type='video_search')  # Ensure the type matches your use case
        for index, row in videos.iterrows():
            recent_videos.append(row)
        return pd.DataFrame(recent_videos)



    def get_video_details(self, video_id):
        '''
        Retrieves like, dislike and subscriber counts plus hashtags of a video.

        A count is read only if both its marker class and its id element are
        present on the page; a page that has the marker but lacks the id
        element yields None for that count instead of raising.

        Parameters:
        video_id (str): ID of the video.

        Returns:
        like_count, dislike_count, subscriber_count: int if the text is a
            plain (comma-separated) number, the raw text if it is not, None
            if the element is missing.
        hashtags: List of the texts of the 'li' items below the element with
            id 'video-hashtags'.
        '''
        url = f"{self.video_base}{video_id}/"
        print(f"Retrieving: {url}")  # Debug: Print the URL to check if it's correct
        src = self.call(url)
        soup = BeautifulSoup(src, 'html.parser')

        def read_count(marker_class, element_id):
            # Shared logic for the three counters on the page.
            if soup.find(class_=marker_class) is None:
                return None
            element = soup.find(id=element_id)
            if element is None:
                return None
            count_text = element.text.strip()
            try:
                return int(count_text.replace(',', ''))
            except ValueError:
                # Not a plain number: keep the text as shown.
                return count_text

        like_count = read_count('video-like', 'video-like-count')
        dislike_count = read_count('video-dislike', 'video-dislike-count')
        subscriber_count = read_count('subscriber-count', 'subscriber_count')

        # Find video hashtags
        hashtags = []
        video_hashtags_element = soup.find(id='video-hashtags')
        if video_hashtags_element is not None:
            hashtags = [tag.text.strip() for tag in video_hashtags_element.find_all('li')]

        return like_count, dislike_count, subscriber_count, hashtags

    def process_likes(self, likes):
        '''
        Converts a scraped like count to an int.

        Parameters:
        likes (str): Count text; commas and surrounding whitespace are ignored.

        Returns:
        The count as int, or None if the value is None or not a whole number.
        '''
        if likes is None:
            # A missing element used to end in AttributeError on .replace().
            return None
        try:
            return int(str(likes).replace(',', '').strip())
        except ValueError:
            return None

    def process_dislikes(self, dislikes):
        '''
        Converts a scraped dislike count to an int.

        Parameters:
        dislikes (str): Count text; commas and surrounding whitespace are ignored.

        Returns:
        The count as int, or None if the value is None or not a whole number.
        '''
        if dislikes is None:
            # A missing element used to end in AttributeError on .replace().
            return None
        try:
            return int(str(dislikes).replace(',', '').strip())
        except ValueError:
            return None

    def process_subscribers(self, subscribers):
        '''
        Converts a scraped subscriber count to an int.

        Parameters:
        subscribers (str): Count text; commas and surrounding whitespace are ignored.

        Returns:
        The count as int, or None if the value is None or not a whole number.
        '''
        if subscribers is None:
            # A missing element used to end in AttributeError on .replace().
            return None
        try:
            return int(str(subscribers).replace(',', '').strip())
        except ValueError:
            return None

    def parser(self, src, type=None, kind=None, extended=False):
        scrape_time = str(int(datetime.utcnow().timestamp()))

        soup = BeautifulSoup(src, 'html.parser')
        if soup.find('h1') and ("404 - Page not found" in soup.find('h1').text or "404 - PAGE NOT FOUND" in soup.find('h1').text):
            return None

        if not type:
            raise 'A parse type needs to be passed.'

        if type == 'video_search' or type == 'hashtag_videos':
            videos = []
            if soup.find(class_='results-list'):
                counter = 0
                for result in soup.find_all(class_='video-result-container'):
                    counter += 1
                    title = None
                    id_ = None
                    view_count = None
                    duration = None
                    channel = None
                    channel_id = None
                    description = None
                    description_links = []
                    created_at = None
                    like_count = None
                    dislike_count = None
                    subscriber_count = None
                    hashtags = []

                    if result.find(class_='video-result-title'):
                        title = result.find(class_='video-result-title').text.strip('\n').strip()
                        id_ = result.find(class_='video-result-title').find('a').get('href').split('/')[-2]

                    if result.find(class_='video-views'):
                        view_count = self.process_views(result.find(class_='video-views').text.strip('\n').strip())

                    if result.find(class_='video-duration'):
                        duration = result.find(class_='video-duration').text.strip('\n').strip()

                    if result.find(class_='video-result-channel'):
                        channel = result.find(class_='video-result-channel').text.strip('\n').strip()
                        channel_id = result.find(class_='video-result-channel').find('a').get('href').split('/')[-2]

                    if result.find(class_='video-result-text'):
                        description = result.find(class_='video-result-text').decode_contents()
                        description = description.strip('\n')
                        description = markdownify.markdownify(description)

                        for link in result.find(class_='video-result-text').find_all('a'):
                            description_links.append(link.get('href'))

                    if result.find(class_='video-result-details'):
                        created_at = result.find(class_='video-result-details').text.strip('\n').strip()

                    if result.find(class_='video-like'):
                        like_count_text = result.find(class_='video-like').find(id='video-like-count').text.strip()
                        like_count = like_count_text
                        print(f"Scraped like count: {like_count}")

                    if result.find(class_='video-dislike'):
                        dislike_count_text = result.find(class_='video-dislike').find(id='video-dislike-count').text.strip()
                        dislike_count = dislike_count_text
                        print(f"Scraped dislike count: {dislike_count}")

                    if result.find(class_='subscriber-count'):
                        subscriber_count_text = result.find(class_='subscriber-count').find(id='subscriber_count').text.strip()
                        subscriber_count = subscriber_count_text
                        print(f"Scraped subscriber count: {subscriber_count}")

                    if result.find(id='video-hashtags'):
                        video_hashtags_element = result.find(id='video-hashtags')
                        if video_hashtags_element:
                            for tag in video_hashtags_element.find_all('li'):
                                hashtags.append(tag.text.strip())

                    videos.append([counter, id_, title, hashtags, view_count, duration, channel, channel_id, description, description_links, created_at, scrape_time])

            videos_columns = ['rank', 'id', 'title', 'hashtags', 'view_count', 'duration', 'channel', 'channel_id', 'description', 'description_links', 'created_at', 'scrape_time']
            videos = pd.DataFrame(videos, columns=videos_columns)
            return videos

        elif type == 'recommended_channels':
            channels = []
            channel_ids = []
            soup = BeautifulSoup(src, 'html.parser')
            counter = 0
            if soup.find(id='carousel'):
                for item in soup.find(id='carousel').find_all(class_='channel-card'):
                    counter += 1
                    id_ = item.find('a').get('href').split('/')[-2]
                    name = item.find(class_='channel-card-title').text
                    channels.append([counter, id_, name, scrape_time])
                    channel_ids.append(id_)
            if extended:
                channel_ids = list(set(channel_ids))
                channels, videos = self.get_channels(channel_ids, get_channel_videos=False)
            else:
                columns = ['rank', 'id', 'name', 'scrape_time']
                channels = pd.DataFrame(channels, columns=columns)
                channels = channels.drop_duplicates(subset=['id'])
            return channels

        elif type == 'recommended_videos':
            videos = []
            tags = []
            soup = BeautifulSoup(src, 'html.parser')

            if kind == 'popular':
                if soup.find(id='listing-popular'):
                    soup = soup.find(id='listing-popular')
                else:
                    return None

            elif kind == 'trending-day':
                if soup.find(id='trending-day'):
                    soup = soup.find(id='trending-day')
                else:
                    return None

            elif kind == 'trending-week':
                if soup.find(id='trending-week'):
                    soup = soup.find(id='trending-week')
                else:
                    return None

            elif kind == 'trending-month':
                if soup.find(id='trending-month'):
                    soup = soup.find(id='trending-month')
                else:
                    return None

            elif kind == 'all':
                if soup.find(id='listing-all'):
                    soup = soup.find(id='listing-all')
                else:
                    return None
            else:
                print('kind needs to be passed for recommendations.')
                return None

            if soup.find(class_='video-result-container'):
                counter = 0
                for video in soup.find_all(class_='video-result-container'):
                    counter += 1
                    title = None
                    id_ = None
                    view_count = None
                    duration = None
                    channel = None
                    channel_url = None
                    created_at = None

                    if video.find(class_='video-result-title'):
                        title = video.find(class_='video-result-title').text.strip('\n')
                        id_ = video.find(class_='video-result-title').find('a').get('href').split('/')[-2]

                    if video.find(class_='video-views'):
                        view_count = self.process_views(video.find(class_='video-views').text.strip('\n'))
                    if video.find(class_='video-duration'):
                        duration = video.find(class_='video-duration').text.strip('\n')

                    if video.find(class_='video-result-channel'):
                        channel = video.find(class_='video-result-channel').text.strip('\n')
                        channel_id = video.find(class_='video-result-channel').find('a').get('href').split('/')[-2]
                    if video.find(class_='video-result-details'):
                        created_at = video.find(class_='video-result-details').text.strip('\n')
                    videos.append([counter, id_, title, view_count, duration, channel, channel_id, created_at, scrape_time])

            elif soup.find(class_='video-card'):
                counter = 0
                for video in soup.find_all(class_='video-card'):
                    counter += 1
                    title = None
                    id_ = None
                    view_count = None
                    duration = None
                    channel = None
                    channel_url = None
                    created_at = None

                    if video.find(class_='video-card-title'):
                        title = video.find(class_='video-card-title').text.strip('\n').strip()
                    if video.find(class_='video-card-id'):
                        id_ = video.find(class_='video-card-id').text.strip('\n').strip()
                    if video.find(class_='video-views'):
                        view_count = self.process_views(video.find(class_='video-views').text.strip('\n').strip())
                    if video.find(class_='video-duration'):
                        duration = video.find(class_='video-duration').text.strip('\n').strip()
                    if video.find(class_='video-card-channel'):
                        channel = video.find(class_='video-card-channel').text.strip('\n').strip()
                        channel_id = video.find(class_='video-card-channel').find('a').get('href').split('/')[-2]
                    if video.find(class_='video-card-published'):
                        created_at = video.find(class_='video-card-published').text.strip('\n').strip()
                    videos.append([counter, id_, title, view_count, duration, channel, channel_id, created_at, scrape_time])

            soup = BeautifulSoup(src, 'html.parser')
            if soup.find(class_='sidebar tags'):
                counter = 0
                for tag in soup.find(class_='sidebar tags').find_all('li'):
                    counter += 1
                    tag_name = tag.text.strip('\n').strip()
                    tag_url = tag.find('a').get('href')
                    tags.append([counter, tag_name, tag_url, scrape_time])

            videos_columns = ['rank', 'id', 'title', 'view_count', 'duration', 'channel', 'channel_id', 'created_at', 'scrape_time']
            videos = pd.DataFrame(videos, columns=videos_columns)

            tags_columns = ['rank', 'tag_name', 'tag_url', 'scrape_time']
            tags = pd.DataFrame(tags, columns=tags_columns)

            return videos, tags

        elif type == 'channel_about':
            uid = None
            id_ = None
            title = None
            owner = None
            owner_link = None
            description = None
            description_links = []
            social_links = []
            category = None
            video_count = None
            subscriber_count = None
            view_count = None
            created_at = None

            soup = BeautifulSoup(src, 'html.parser')
            if soup.find('link', id='canonical'):
                uid = soup.find('link', id='canonical').get('href').split('/')[-2]
            if soup.find(class_='name'):
                title = soup.find(class_='name').text.strip('\n').strip()
                if soup.find(class_='name').find('a'):
                    id_ = soup.find(class_='name').find('a').get('href').strip('/').split('/')[1]
            if soup.find(class_='owner'):
                owner = soup.find(class_='owner').text.strip('\n').strip()
                owner_link = soup.find(class_='owner').find('a').get('href')
            if soup.find(id='channel-description'):
                description = soup.find(id='channel-description').decode_contents()
                description = description.strip('\n')
                description = markdownify.markdownify(description)
                for link in soup.find(id='channel-description').find_all('a'):
                    description_links.append(link.get('href'))
            if soup.find(class_='social'):
                for link in soup.find(class_='social').find_all('a'):
                    social_links.append([link.get('data-original-title'), link.get('href')])
            if soup.find(class_='channel-about-details'):
                for elem in soup.find(class_='channel-about-details').find_all('p'):
                    if 'Category' in elem.text and elem.find('a'):
                        category = elem.find('a').text.strip('\n').strip()
                    elif elem.find(class_='fa-video'):
                        video_count = elem.text.split(' ')[1]
                    elif elem.find(class_='fa-users'):
                        subscriber_count = int(elem.text.split(' ')[1].replace(',', ''))
                        print(f"Subscriber count: {subscriber_count}")  # Debug statement
                    elif elem.find(class_='fa-eye'):
                        view_count = self.process_views(elem.text.split(' ')[1])
                    else:
                        created_at = elem.text.strip('\n').strip()
                        pass
            data = [uid, id_, title, description, description_links, video_count, subscriber_count, view_count, created_at, category, social_links, owner, owner_link, scrape_time]
            columns = ['uid', 'id', 'title', 'description', 'description_links', 'video_count', 'subscriber_count', 'view_count', 'created_at', 'category', 'social_links', 'owner', 'owner_link', 'scrape_time']
            data = pd.DataFrame([data], columns=columns)
            return data

        elif type == 'channel_videos':
            soup = BeautifulSoup(src, 'html.parser')
            data = []

            if soup.find('link', id='canonical').get('href'):
                channel_id = soup.find('link', id='canonical').get('href').split('/')[-2]
            else:
                channel_id = None
            if soup.find(class_='name'):
                channel_title = soup.find(class_='name').text.strip('\n')
            else:
                channel_title = None
            if soup.find(class_='channel-videos-list'):
                for video in soup.find(class_='channel-videos-list').find_all(class_='channel-videos-container'):
                    if video.find(class_='channel-videos-title'):
                        title = video.find(class_='channel-videos-title').text.strip('\n')
                        video_id = video.find(class_='channel-videos-title').find('a').get('href').split('/')[-2]
                    else:
                        title = None
                        video_id = None
                    if video.find(class_='channel-videos-text'):
                        description = video.find(class_='channel-videos-text').decode_contents()
                        description = description.strip('\n')
                        description = markdownify.markdownify(description)
                        description_links = [a.get('href') for a in video.find(class_='channel-videos-text').find_all('a')]
                    else:
                        description = None
                        description_links = []
                    if video.find(class_='video-duration'):
                        duration = video.find(class_='video-duration').text.strip('\n').strip()
                    else:
                        duration = None
                    if video.find(class_='channel-videos-details'):
                        created_at = str(parser.parse(video.find(class_='channel-videos-details').text.replace('\n', '')).date())
                    else:
                        created_at = None
                    if video.find(class_='video-views'):
                        view_count = self.process_views(video.find(class_='video-views').text.strip('\n').strip())
                    else:
                        view_count = None

                    data.append([channel_id, channel_title, video_id, title, created_at, duration, view_count, description, description_links, scrape_time])

            columns = ['channel_id', 'channel_title', 'video_id', 'title', 'created', 'duration', 'view_count', 'description', 'description_links', 'scrape_time']
            data = pd.DataFrame(data, columns=columns)
            return data

        elif type == 'video':
            id_ = None
            title = None
            description = None
            description_links = []
            view_count = None
            like_count = None
            dislike_count = None
            created_at = None
            hashtags = []
            category = None
            sensitivity = None
            channel_name = None
            channel_id = None
            owner_name = None
            owner_id = None
            subscribers = None
            next_id = None
            related_ids = []

            if soup.find(id='canonical'):
                id_ = soup.find(id='canonical').get('href').split('/')[-2]
            if soup.find(id='video-title'):
                title = soup.find(id='video-title').text.strip('\n').strip()
            if soup.find(id='video-view-count'):
                view_count = self.process_views(soup.find(id='video-view-count').text.strip('\n').strip())
                print(f"View count found: {view_count}")
            else:
                print("View count element not found.")
            if soup.find(id='video-like-count'):
                like_count_text = soup.find(id='video-like-count').text.strip()
                print(f"Like count text: {like_count_text}")
                try:
                    like_count = int(like_count_text.replace(',', ''))
                    print(f"Like count converted: {like_count}")
                except ValueError:
                    print("Error converting like count")
            else:
                print("Like count element not found.")
            if soup.find(id='video-dislike-count'):
                dislike_count_text = soup.find(id='video-dislike-count').text.strip()
                print(f"Dislike count text: {dislike_count_text}")
                try:
                    dislike_count = int(dislike_count_text.replace(',', ''))
                    print(f"Dislike count converted: {dislike_count}")
                except ValueError:
                    print("Error converting dislike count")
            else:
                print("Dislike count element not found.")
            if soup.find(class_='video-publish-date'):
                created_at = soup.find(class_='video-publish-date').text.strip().replace('First published at ', '')
                created_at = parser.parse(created_at)

            if soup.find(id='video-hashtags'):
                if soup.find(id='video-hashtags').find('li'):
                    for tag in soup.find(id='video-hashtags').find_all('li'):
                        hashtags.append(tag.text.strip())
            if soup.find(id='video-description'):
                description = soup.find(id='video-description').decode_contents()
                description = description.strip()
                description = markdownify.markdownify(description)
                if soup.find(id='video-description').find('a'):
                    for link in soup.find(id='video-description').find_all('a'):
                        description_links.append(link.get('href'))
            if soup.find(class_='video-detail-list'):
                if soup.find(class_='video-detail-list').find('tr'):
                    for row in soup.find(class_='video-detail-list').find_all('tr'):
                        value = row.find('a').text
                        if 'Category' in row.text:
                            category = value
                        elif 'Sensitivity' in row.text:
                            sensitivity = value
            if soup.find(class_='channel-banner'):
                channel_data = soup.find(class_='channel-banner')
                if channel_data.find(class_='name'):
                    channel_name = channel_data.find(class_='name').text.strip()
                    channel_id = channel_data.find(class_='name').find('a').get('href').split('/')[-2]
                if channel_data.find(class_='owner'):
                    owner_name = channel_data.find(class_='owner').text.strip()
                    owner_id = channel_data.find(class_='owner').find('a').get('href').split('/')[-2]
                if channel_data.find(class_='subscribers'):
                    subscribers_text = channel_data.find(class_='subscribers').text.replace('subscribers', '').strip()
                    print(f"Subscriber count text: {subscribers_text}")
                    try:
                        subscribers = int(subscribers_text.replace(',', ''))
                        print(f"Subscriber count converted: {subscribers}")
                    except ValueError:
                        print("Error converting subscriber count")

            if soup.find(class_='sidebar-next'):
                if soup.find(class_='sidebar-next').find(class_='video-card-title'):
                    next_id = soup.find(class_='sidebar-next').find(class_='video-card-title').find('a').get('href').split('/')[-2]

            if soup.find(class_='sidebar-recent'):
                if soup.find(class_='sidebar-recent').find(class_='video-card-title'):
                    for item in soup.find(class_='sidebar-recent').find_all(class_='video-card-title'):
                        related_ids.append(item.find('a').get('href').split('/')[-2])

            columns = ['id', 'title', 'description', 'description_links', 'view_count', 'like_count', 'dislike_count', 'created', 'hashtags', 'category', 'sensitivity', 'channel_name', 'channel_id', 'owner_name', 'owner_id', 'subscribers', 'next_id', 'related_ids']
            data = pd.DataFrame([[id_, title, description, description_links, view_count, like_count, dislike_count, created_at, hashtags, category, sensitivity, channel_name, channel_id, owner_name, owner_id, subscribers, next_id, related_ids]], columns=columns)
            return data

        else:
            print('A correct type needs to be passed.')

    def get_status(self, reset=True):
        """Return the status messages collected so far.

        Args:
            reset: When true (the default), the crawler starts a fresh, empty
                status list after this call, so the next call only reports
                messages recorded in the meantime. When false, the stored
                list is left in place.

        Returns:
            The list of messages that were stored in ``self.status`` at the
            time of the call.
        """
        status = self.status
        if reset:
            # Rebind instead of calling clear(): the list handed back to the
            # caller must keep its contents.
            self.status = []
        return status

    def set_status(self, message):
        """Record ``message`` at the end of this crawler's status list."""
        collected = self.status
        collected.append(message)

def main(save_directory=None, categories=None):
    """Scrape recent videos for each category and write one CSV per category.

    For every ``(name, url)`` pair the crawler fetches the recent-videos
    table, then looks up each listed video individually to add its like
    count, dislike count, subscriber count and hashtags. The enriched table
    is saved as ``recent_videos_bitchute_<name>.csv`` inside
    ``save_directory``.

    Args:
        save_directory: Folder that receives the CSV files; it is created if
            it does not exist. Defaults to the original hard-coded location.
        categories: Iterable of ``(category_name, category_url)`` pairs.
            Defaults to the seven categories scraped originally.
    """
    # Imported locally: the import block at the top of this cell does not
    # bring in ``os``, so on a fresh kernel the calls below would raise
    # NameError.
    import os

    if save_directory is None:
        save_directory = r"C:\Users\Shlok Mandloi\Desktop\Shlok\Shlok - USA\simppl\dash dashboard\data"
    if categories is None:
        categories = [
            ('News', 'https://api.bitchute.com/category/news/'),
            ('Health', 'https://api.bitchute.com/category/health/'),
            ('Entertainment', 'https://api.bitchute.com/category/entertainment/'),
            ('Business & Finance', 'https://api.bitchute.com/category/finance/'),
            ('Auto & Vehicles', 'https://api.bitchute.com/category/vehicles/'),
            ('Education', 'https://api.bitchute.com/category/education/'),
            ('Sports', 'https://api.bitchute.com/category/sport/'),
        ]

    # Create an instance of the Crawler
    crawler = Crawler(headless=True, verbose=True)

    os.makedirs(save_directory, exist_ok=True)

    for category_name, category_url in categories:
        # Get recent videos for the current category
        recent_videos = crawler.get_recent_videos(category_url)

        # For each video, get detailed information
        for index, row in recent_videos.iterrows():
            like_count, dislike_count, subscriber_count, hashtags = crawler.get_video_details(row['id'])
            recent_videos.at[index, 'like_count'] = like_count
            recent_videos.at[index, 'dislike_count'] = dislike_count
            recent_videos.at[index, 'subscriber_count'] = subscriber_count
            # ``or []`` keeps a missing hashtag list from breaking the join.
            recent_videos.at[index, 'hashtags'] = ','.join(hashtags or [])

        # Save the results to a CSV file for the current category
        file_name = f'recent_videos_bitchute_{category_name}.csv'
        file_path = os.path.join(save_directory, file_name)
        recent_videos.to_csv(file_path, index=False)
        print(f"Scraping completed for {category_name}. Results saved to '{file_path}'.")


if __name__ == "__main__":
    main()



Retrieving: https://api.bitchute.com/category/news/ ..........................................
Retrieving: https://api.bitchute.com/video/gWt8Vjotb9Zo/
Retrieving: https://api.bitchute.com/video/gWt8Vjotb9Zo/ .
Retrieving: https://api.bitchute.com/video/Ggu34XZhrWpA/
Retrieving: https://api.bitchute.com/video/Ggu34XZhrWpA/ .
Retrieving: https://api.bitchute.com/video/CcQPc3qEUlBU/
Retrieving: https://api.bitchute.com/video/CcQPc3qEUlBU/ .
Retrieving: https://api.bitchute.com/video/bEOGgVcZ5HC7/
Retrieving: https://api.bitchute.com/video/bEOGgVcZ5HC7/ .
Retrieving: https://api.bitchute.com/video/Ro5RFh1j9rrD/
Retrieving: https://api.bitchute.com/video/Ro5RFh1j9rrD/ .
Retrieving: https://api.bitchute.com/video/AXAqjwaU-AE/
Retrieving: https://api.bitchute.com/video/AXAqjwaU-AE/ .
Retrieving: https://api.bitchute.com/video/IaLZO5DgG5Y/
Retrieving: https://api.bitchute.com/video/IaLZO5DgG5Y/ .
Retrieving: https://api.bitchute.com/video/R5X5WoHRRZwz/
Retrieving: https://api.bitchute.com/vid

In [2]:
import pandas as pd
import os

# Directory where CSV files are saved
save_directory = r"C:\Users\Shlok Mandloi\Desktop\Shlok\Shlok - USA\simppl\dash dashboard\data"

# Raw per-category CSV files, keyed by category name
csv_files = {
    'Sports': 'recent_videos_bitchute_Sports.csv',
    'Health': 'recent_videos_bitchute_Health.csv',
    'Entertainment': 'recent_videos_bitchute_Entertainment.csv',
    'Education': 'recent_videos_bitchute_Education.csv',
    'Auto & Vehicles': 'recent_videos_bitchute_Auto & Vehicles.csv',
    'Business & Finance': 'recent_videos_bitchute_Business & Finance.csv',
    'News': 'recent_videos_bitchute_News.csv'
}

# Load the data: one DataFrame per category, in the order of csv_files
alldata = [
    pd.read_csv(os.path.join(save_directory, file_name))
    for file_name in csv_files.values()
]

# Copy the data for processing. Each frame is copied individually:
# list.copy() alone would only duplicate the list and leave both lists
# pointing at the very same DataFrame objects.
ad = [frame.copy() for frame in alldata]

# Print information about each DataFrame. DataFrame.info() writes its report
# itself and returns None, so it must not be wrapped in print() (that would
# emit a stray "None" after every report).
for data in ad:
    data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rank               56 non-null     int64  
 1   id                 56 non-null     object 
 2   title              56 non-null     object 
 3   hashtags           31 non-null     object 
 4   view_count         56 non-null     int64  
 5   duration           56 non-null     object 
 6   channel            56 non-null     object 
 7   channel_id         56 non-null     object 
 8   description        47 non-null     object 
 9   description_links  56 non-null     object 
 10  created_at         56 non-null     object 
 11  scrape_time        56 non-null     int64  
 12  like_count         56 non-null     float64
 13  dislike_count      56 non-null     float64
 14  subscriber_count   56 non-null     float64
dtypes: float64(3), int64(3), object(9)
memory usage: 6.7+ KB
None
<class 'pandas

In [48]:
import pandas as pd
import re
import os

# Directory where the cleaned CSV files will be saved.
# This is the same folder the raw per-category CSVs are read from earlier in
# this file, so raw and cleaned files end up side by side.
save_directory = r"C:\Users\Shlok Mandloi\Desktop\Shlok\Shlok - USA\simppl\dash dashboard\data"

# Ensure the directory exists; exist_ok=True makes re-running this cell
# harmless when the folder is already there.
os.makedirs(save_directory, exist_ok=True)

# Improved preprocess function to handle relative times in 'created_at'
def preprocess_data(df):
    """Clean one raw per-category video table for the dashboard.

    The work is done on a new frame; the ``df`` passed in is not modified.

    Steps:
      * drop the ``rank`` and ``scrape_time`` columns (when present),
      * replace missing ``hashtags`` / ``description`` values with the
        placeholders ``'No Hashtags'`` / ``'No Description'``,
      * add ``hours ago posted``, computed from relative ``created_at``
        strings such as ``"19 hours ago"`` or ``"1 day, 2 hours ago"``,
      * cast ``view_count``, ``like_count``, ``dislike_count`` and
        ``subscriber_count`` to int, counting missing values as 0.

    Args:
        df: DataFrame with the columns named above.

    Returns:
        The cleaned DataFrame.
    """
    # Compiled once per call instead of once per row.
    time_unit_pattern = re.compile(r'(\d+)\s*(hour|minute|day|week)s?', re.IGNORECASE)

    def convert_relative_time_to_hours(relative_time_str):
        """Sum every "<number> <unit>" pair in the string, expressed in hours.

        Returns None for missing or non-string values. Only hours, minutes,
        days and weeks are recognised; a string with none of them yields 0.
        NOTE(review): longer units (months, years) are ignored, so such
        timestamps are under-counted; confirm whether they can occur.
        """
        if pd.isna(relative_time_str):
            return None
        try:
            # Drop the separating commas and stray 'Â' characters
            # (presumably left over from a mis-decoded non-breaking space).
            cleaned = relative_time_str.replace(',', '').replace('Â', '').strip()
        except AttributeError as e:
            # Non-string cell: keep the original best-effort behaviour.
            print(f"Error converting time: {e}")
            return None

        total_hours = 0
        for number, unit in time_unit_pattern.findall(cleaned):
            number = int(number)
            # The pattern matches case-insensitively, so normalise the unit
            # before comparing; otherwise "2 Hours" would count as nothing.
            unit = unit.lower()
            if unit == 'hour':
                total_hours += number
            elif unit == 'minute':
                total_hours += number / 60
            elif unit == 'day':
                total_hours += number * 24
            elif unit == 'week':
                total_hours += number * 24 * 7
        return total_hours

    # Drop unnecessary columns; errors='ignore' lets a frame that has already
    # lost them (e.g. a previously cleaned CSV) pass through.
    df = df.drop(columns=['rank', 'scrape_time'], errors='ignore')

    # Fill missing values
    df['hashtags'] = df['hashtags'].fillna('No Hashtags')
    df['description'] = df['description'].fillna('No Description')

    # Convert relative 'created_at' to total hours
    df['hours ago posted'] = df['created_at'].apply(convert_relative_time_to_hours)

    # Counts come back from CSV as floats when values are missing; store them
    # as plain integers with 0 for "unknown".
    for column in ('view_count', 'like_count', 'dislike_count', 'subscriber_count'):
        df[column] = df[column].fillna(0).astype(int)

    return df

# Function to add total interactions column
def add_total_interactions(df):
    """Add a ``total_interactions`` column: likes + dislikes + views per row.

    The column is written onto the frame that was passed in, and that same
    frame is returned.
    """
    likes = df['like_count']
    dislikes = df['dislike_count']
    views = df['view_count']
    df['total_interactions'] = likes + dislikes + views
    return df

# Function to remove duplicates based on video ID
def remove_duplicates(df):
    """Return ``df`` with only the first row kept for each video ``id``.

    A frame without an ``id`` column is handed back unchanged.
    """
    if 'id' not in df.columns:
        return df
    return df.drop_duplicates(subset='id')

# Function to add a URL column based on video ID
def add_video_url(df):
    """Add a ``video_url`` column built from each row's video ``id``.

    The column is written onto the frame that was passed in, which is also
    the return value. A frame without an ``id`` column is returned untouched.
    """
    def build_url(video_id):
        return f"https://api.bitchute.com/video/{video_id}"

    if 'id' not in df.columns:
        return df
    df['video_url'] = df['id'].apply(build_url)
    return df

# Categories to process; every list below is kept in this same order.
categories = ['Sports', 'Health', 'Entertainment', 'Education', 'Auto & Vehicles', 'Business & Finance', 'News']

# Load the raw frame for each category straight from the CSV the scraper
# wrote (recent_videos_bitchute_<category>.csv in save_directory), so this
# cell does not depend on variables left over from an earlier kernel session.
# NOTE(review): this assumes the previously used data_sports, data_health, ...
# variables held exactly these raw CSVs - confirm.
ad = []
for category in categories:
    raw_file = os.path.join(save_directory, f'recent_videos_bitchute_{category}.csv')
    try:
        ad.append(pd.read_csv(raw_file))
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Keep an empty placeholder so `ad` stays aligned with `categories`.
        print(f"No raw data loaded for {category}: {raw_file}")
        ad.append(pd.DataFrame())

# Preprocess each frame, remove duplicates, add total interactions and the
# video URL; frames that could not be loaded are passed through empty.
cleaned_dataframes = []
for df in ad:
    if df.empty:
        cleaned_dataframes.append(df)
        continue
    cleaned_df = preprocess_data(df)
    cleaned_df = remove_duplicates(cleaned_df)
    cleaned_df = add_total_interactions(cleaned_df)
    cleaned_df = add_video_url(cleaned_df)
    cleaned_dataframes.append(cleaned_df)

# Save cleaned DataFrames to CSV files in the specified directory
for cleaned_df, category in zip(cleaned_dataframes, categories):
    # Remove invalid characters from category names for filenames
    valid_category = category.replace(' & ', '_and_').replace(' ', '_')
    output_file = os.path.join(save_directory, f'dash_csv_{valid_category}.csv')
    cleaned_df.to_csv(output_file, index=False)
    print(f"Saved preprocessed data to {output_file}")

# Display the first few rows of one of the cleaned dataframes
# (index 3 is 'Education' in the category order above)
if not cleaned_dataframes[3].empty:
    print(cleaned_dataframes[3].head())


Saved preprocessed data to dash_csv_sports.csv
Saved preprocessed data to dash_csv_health.csv
Saved preprocessed data to dash_csv_entertainment.csv
Saved preprocessed data to dash_csv_education.csv
Saved preprocessed data to dash_csv_automotive.csv
Saved preprocessed data to dash_csv_business.csv
Saved preprocessed data to dash_csv_news.csv


Unnamed: 0,id,title,hashtags,view_count,duration,channel,channel_id,description,description_links,created_at,like_count,dislike_count,subscriber_count,hours ago posted,total_interactions,video_url
0,Z0D9JccIh7FV,She told him he could tie her up and do anythi...,No Hashtags,6335,0:17,PIRATEPETE,piratepete,mirrored from rumble \nIMO this guy has his p...,['http://old.bitchute.com'],19 hours ago,108,21,18149,19,6464,https://api.bitchute.com/video/Z0D9JccIh7FV
1,H7XEkx0Ro2OG,CHARLIE WARD DAILY NEWS WITH PAUL BROOKER & DR...,"#charlieward,#dailynews,#breaking",4623,17:11,Dr Charlie Ward 🔵✔️,drcharlieward,Thinking of buying gold or switching your 401k...,"['http://www.goldbusters.co.uk/', 'http://www....",10 hours ago,157,14,137911,10,4794,https://api.bitchute.com/video/H7XEkx0Ro2OG
2,GAWFNng7zai1,''We are ALL in trouble'' - Edward Snowden ......,No Hashtags,2748,8:07,XANDREWX,xandrewx,No Description,[],23 hours ago,15,13,33721,23,2776,https://api.bitchute.com/video/GAWFNng7zai1
3,G0XNXJOIqrM3,RESET MAPS [2023-11-07] - CONSPIRACY-R-US (VIDEO),No Hashtags,1610,10:23,Sergeant Major,sergeant-major,Looking into interesting maps of the Land Down...,['https://www.bitchute.com/video/xGIHac7GvY2K/...,15 hours ago,31,2,60726,15,1643,https://api.bitchute.com/video/G0XNXJOIqrM3
4,rF0_ZqFvf34,The Battle Against Parasites,No Hashtags,812,0:44,Jordan Peterson,jordanpeterson,No Description,[],17 hours ago,13,16,18947,17,841,https://api.bitchute.com/video/rF0_ZqFvf34
