# Data Collection and Exploration

Here we load data from YouTube. The main goal is to collect the closed-caption (CC) text of videos returned by relevant search queries.

In [4]:
import os
import re

from dotenv import load_dotenv
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi

Change to the parent directory to get access to the `.env` file.

In [2]:
# Move the working directory one level up from the current directory.
os.chdir("../")

In [3]:
pwd

'c:\\Users\\RaviB\\GitHub\\TechKnowBot'

Load the YouTube API key from the environment; it is used below to build the YouTube connector.

In [5]:
# Load variables from the .env file into the process environment.
load_dotenv()

# Read the API key that is later passed to the YouTube client as developerKey;
# os.getenv returns None if the variable is not set.
youtube_api_key = os.getenv('GCP_YOUTUBE_API_KEY')

Reuse code from the earlier SentiRec Analytics project: https://github.com/RavinderRai/SentiRec-Analytics/blob/main/modules/YouTubeReviewScraper.py.

In [14]:
class YouTubeReviewData:
    """Search YouTube for videos and collect their closed-caption text.

    Uses a YouTube Data API v3 client (built with ``build``) for searching and
    ``YouTubeTranscriptApi`` for retrieving captions.
    """

    # Matches "vs" only as a standalone word, in any letter case, so that
    # "A vs B" / "A VS. B" count as comparisons while "Best TVs" or "Top EVs"
    # (which merely contain the letters "Vs") do not.
    _COMPARISON_PATTERN = re.compile(r"\bvs\b", re.IGNORECASE)

    def __init__(self, api_key):
        """
        Create the YouTube API client.

        Parameters:
        - api_key (str): Developer key passed to the YouTube Data API v3 client.
        """
        self.api_key = api_key
        self.youtube = build('youtube', 'v3', developerKey=api_key)

    def search_videos(self, search_query, max_results=5, individual_review=False):
        """
        Search for YouTube videos based on a given query and retrieve additional information including closed captions.

        Parameters:
        - search_query (str): The search query used to find relevant videos on YouTube.
        - max_results (int): The maximum number of videos to request from the search. Defaults to 5.
        - individual_review (bool): If the search query is for reviews of a specific product, set this to True
          to skip videos whose title contains the standalone word "vs" (any letter case), as that indicates
          the review isn't for just the individual product itself.

        Returns:
        List[dict]: A list of dictionaries, each containing information about a video, including:
            - 'video_id' (str): The unique identifier for the video.
            - 'title' (str): The title of the video.
            - 'video_link' (str): The YouTube link to the video.
            - 'channel_name' (str): The name of the channel that uploaded the video.
            - 'review_text' (str): The closed captions text for the video. This is the review text.

        Note:
        - Titles are only filtered when individual_review is True; the list may therefore hold fewer
          than max_results entries.
        - The 'review_text' field is an empty string if closed captions could not be retrieved.
        """
        search_response = self.youtube.search().list(
            q=search_query,
            type='video',
            part='id, snippet',
            maxResults=max_results
        ).execute()

        videos_info = []
        for result in search_response.get('items', []):
            title = result['snippet']['title']

            # Skip comparison videos before fetching captions, so no transcript
            # request is spent on a video that will be discarded anyway.
            if individual_review and self._COMPARISON_PATTERN.search(title):
                continue

            video_id = result['id']['videoId']
            videos_info.append({
                'video_id': video_id,
                'title': title,
                'video_link': f'https://www.youtube.com/watch?v={video_id}',
                'channel_name': result['snippet']['channelTitle'],
                'review_text': self.fetch_captions(video_id)
            })

        return videos_info

    def fetch_captions(self, video_id):
        """
        Get the closed captions.

        Parameters:
        - video_id (str): The video id which is obtained in search_videos.

        Returns:
        String: Closed caption text of a youtube video with newlines replaced by spaces
        (each caption entry is preceded by a single space), or an empty string if the
        captions could not be retrieved.
        """
        try:
            # Retrieve the transcript for the video
            transcript = YouTubeTranscriptApi.get_transcript(video_id)

            # Concatenate the transcript text; every entry is prefixed with a
            # space, so the result keeps its leading space.
            cc_text = ''.join(' ' + entry['text'] for entry in transcript)
            return cc_text.replace('\n', ' ')

        except Exception as e:
            # Best-effort: a video without usable captions must not abort a
            # whole search, so report the problem and fall back to "".
            print(f"An error occurred: {str(e)}")
            return ""

In [15]:
# Create the scraper with the API key read from the environment.
youtube = YouTubeReviewData(youtube_api_key)

In [17]:
# Request up to 5 videos for the query; each result includes its caption text.
youtube.search_videos("top ten laptops", 5)

[{'video_id': 'FK8veh-L8AE',
  'title': 'TOP 10 BEST LAPTOPS 2023',
  'video_link': 'https://www.youtube.com/watch?v=FK8veh-L8AE',
  'channel_name': 'Trend Max',
  'review_text': " top 10 best laptops 2023 number 10 HP Spectre x364 careful because you can't help but fall in love with the HP Spectre X 3614 just look at that beautiful 13.5 in touchscreen with a resolution of 3000x 2000 pixels that adjusts its image color to the environment thanks to its artificial intelligence system incredible plus its screen can be turned back becoming a kind of tablet regarding the CPU the Spectre x314 comes equipped with a core i7 1255 U and an Intel UHD GPU so it has power to spare whether you're doing everyday office tasks or if you're a content creator like me can you see why I love it it has 16 GB of RAM and up to one Terra of storage so you can save what you want last but not least it has four speakers that sound great so how much for this beauty a reasonable $1,050 number nine Samsung Galaxy Bo

In [12]:
# Fetch the caption text of a single video directly by its video id.
youtube.fetch_captions("X35QRgsHhF0")

" this year we tested over 70 different laptops by far a record for this channel we tested small ones we tested big ones we tested cheap ones and we tested expensive ones a huge variety from all kinds of Manufacturers and when you use these laptops side by side just like we do it becomes so obvious which laptops are great and which are completely mediocre well today is the day that we countd down the top 10 laptops that we tested in 2023 if you are planning to buy one of these lap tops you'll obviously want to buy them at the best possible price so check the links below the video our team scours the internet to find the best deals and we update them daily plus if new laptops are released after this video that we end up liking even better we'll include them down there too number 10 the HB Pavilion plus 14 in for a price of around $729 you get a crazy amount for the money it will be configured with an OLED panel at that price which is bright vibrant color accurate has a fast refresh rate