In [1]:
import os
import re
import time

import pandas as pd
from googleapiclient.discovery import build

In [2]:
# YouTube Data API key, read from the YOUTUBE_API_KEY environment variable so the
# secret is never stored in the notebook. Set it before starting Jupyter, e.g.
#   export YOUTUBE_API_KEY="..."
# Any key that was ever committed in this notebook should be treated as
# compromised and revoked in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "").strip() or None
if API_KEY is None:
    print("WARNING: YOUTUBE_API_KEY is not set; YouTube API requests will fail until it is.")

In [3]:
# Build the YouTube Data API v3 client, passing API_KEY as the developer key
youtube = build(
    serviceName="youtube",
    version="v3",
    developerKey=API_KEY,
)

In [4]:
#Function to Fetch Top Product Review Videos (Sorted by Views)
def search_top_videos(query, max_results=50):
    """Search YouTube for videos matching ``query``, ordered by view count.

    Uses the module-level ``youtube`` client. Results are requested page by
    page (the API serves at most 50 search results per request), so
    ``max_results`` above 50 is honoured by following ``nextPageToken``.
    Every page is a separate ``search.list`` request and counts against the
    API quota.

    Args:
        query: Free-text search term.
        max_results: Maximum number of videos to return.

    Returns:
        pandas.DataFrame with the columns ``video_id``, ``video_title``,
        ``channel_name`` and ``upload_date``. The columns are present even
        when the search returns nothing, so callers can index them safely.
    """
    columns = ["video_id", "video_title", "channel_name", "upload_date"]
    page_limit = 50  # largest maxResults value search.list accepts
    videos = []
    next_page_token = None

    while len(videos) < max_results:
        request = youtube.search().list(
            part="snippet",
            q=query,
            type="video",
            maxResults=min(page_limit, max_results - len(videos)),
            order="viewCount",  # Sort by highest views
            pageToken=next_page_token,  # None on the first page
        )
        response = request.execute()
        items = response.get("items", [])

        for item in items:
            video_id = item.get("id", {}).get("videoId")
            if not video_id:
                # Defensive: skip any result that does not identify a video
                continue
            snippet = item["snippet"]
            videos.append({
                "video_id": video_id,
                "video_title": snippet["title"],
                "channel_name": snippet["channelTitle"],
                "upload_date": snippet["publishedAt"],
            })

        next_page_token = response.get("nextPageToken")
        if not items or not next_page_token:
            break

    # Passing columns keeps the schema stable for an empty result
    return pd.DataFrame(videos[:max_results], columns=columns)

In [5]:
# Run the view-count-ordered search, persist the result to CSV, and report its size
df_videos = search_top_videos(query="best product reviews", max_results=50)
df_videos.to_csv(path_or_buf="youtube_product_reviews_videos.csv", index=False)
video_count = len(df_videos)
print(f"Total product review videos collected: {video_count}")

Total product review videos collected: 50


In [6]:
# Reuse df_videos from the previous cell instead of repeating the identical search:
# a second search.list call spends API quota and may return a different result set,
# overwriting the CSV that was just written.
print(f"Total product review videos collected: {len(df_videos)}")

Total product review videos collected: 50


In [7]:
# Function to Fetch Comments from Each Video
def get_video_comments(video_id, max_comments=1000, sleep_time=1, video_title=None):
    """Fetch top-level comments and their embedded replies for one video.

    Pages through ``commentThreads.list`` on the module-level ``youtube``
    client (100 threads per page, ordered by relevance, plain-text bodies)
    and flattens each top-level comment and each reply found under the
    thread's ``replies`` key into one row.
    NOTE(review): the API may embed only some of a thread's replies in the
    thread payload - confirm against the commentThreads documentation if
    complete reply coverage matters.

    Args:
        video_id: ID of the video whose comment threads are requested.
        max_comments: Soft cap on the number of rows. It is checked after
            each page, so the result can exceed it by up to one page of
            threads plus their replies.
        sleep_time: Seconds to pause between page requests.
        video_title: Title written to each row's ``video_title`` column.
            When omitted, the title is looked up in the module-level
            ``video_title_dict`` if that name exists, else ``"Unknown"``.

    Returns:
        pandas.DataFrame with one row per comment/reply and the columns
        ``video_id``, ``video_title``, ``comment_id``, ``comment_text``,
        ``comment_author``, ``comment_date``, ``likes_on_comment`` and
        ``replies_count``. If a request fails, the error is printed (with the
        API key redacted) and the rows collected so far are returned; the
        frame is empty when nothing was collected.
    """
    if video_title is None:
        # Backward-compatible fallback for callers that rely on the notebook-level
        # lookup table; tolerate it not being defined yet.
        video_title = globals().get("video_title_dict", {}).get(video_id, "Unknown")

    def _row(comment_id, snippet, replies_count):
        """Build one output row from a comment or reply snippet."""
        return {
            "video_id": video_id,
            "video_title": video_title,
            "comment_id": comment_id,
            "comment_text": snippet["textDisplay"],
            "comment_author": snippet["authorDisplayName"],
            "comment_date": snippet["publishedAt"],
            "likes_on_comment": snippet["likeCount"],
            "replies_count": replies_count,
        }

    comments = []
    next_page_token = None

    while True:
        try:
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token,
                order="relevance",  # Prioritize most engaged comments
                textFormat="plainText"
            )
            response = request.execute()

            for item in response.get("items", []):
                thread = item["snippet"]
                comments.append(
                    _row(item["id"], thread["topLevelComment"]["snippet"], thread["totalReplyCount"])
                )

                # Replies embedded in the thread payload (if any)
                for reply in item.get("replies", {}).get("comments", []):
                    comments.append(_row(reply["id"], reply["snippet"], 0))

            next_page_token = response.get("nextPageToken")

        except Exception as e:
            # The error text can contain the request URL, which carries the API key
            # as a query parameter - redact it before it reaches the cell output.
            safe_message = re.sub(r"([?&]key=)[^&\s]+", r"\1REDACTED", str(e))
            print(f"Error fetching comments for {video_id}: {safe_message}")
            # Keep the pages already fetched instead of discarding them
            break

        if not next_page_token or len(comments) >= max_comments:
            break

        time.sleep(sleep_time)

    return pd.DataFrame(comments)

In [8]:
# Gather one comment table per video returned by the search
all_comments = []
# video_id -> video_title lookup (read by get_video_comments when labelling rows)
video_title_dict = {
    vid: title
    for vid, title in zip(df_videos["video_id"], df_videos["video_title"])
}

for index, video_id in enumerate(df_videos["video_id"]):
    print(f"Fetching comments for video {index + 1}/{len(df_videos)}: {video_id}...")
    df_video_comments = get_video_comments(video_id, max_comments=2000)
    if df_video_comments.empty:
        # Nothing came back for this video; leave it out of the combined set
        continue
    all_comments.append(df_video_comments)

Fetching comments for video 1/50: 7KHt0TshT2Q...
Fetching comments for video 2/50: UHkq0-XKdZY...
Fetching comments for video 3/50: ZZ_MBbmbgCY...
Fetching comments for video 4/50: U68Zlppcr30...
Fetching comments for video 5/50: XeML16BwHnw...
Fetching comments for video 6/50: h359LRmMZMU...
Fetching comments for video 7/50: DaJeg3f8jFk...
Fetching comments for video 8/50: _3hQnHi5OEk...
Fetching comments for video 9/50: WXggcRHXYAU...
Fetching comments for video 10/50: VmrOxpNjJoA...
Fetching comments for video 11/50: Z5k54Dd4OpY...
Error fetching comments for Z5k54Dd4OpY: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Creplies&videoId=Z5k54Dd4OpY&maxResults=100&order=relevance&textFormat=plainText&key=REDACTED&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The v

In [9]:
# Save All Comments to CSV
if all_comments:
    df_all_comments = pd.concat(all_comments, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list (e.g. every video had comments
    # disabled or every request failed); fall back to an empty table with the
    # same columns so the CSV is still written with a header.
    df_all_comments = pd.DataFrame(columns=[
        "video_id",
        "video_title",
        "comment_id",
        "comment_text",
        "comment_author",
        "comment_date",
        "likes_on_comment",
        "replies_count",
    ])
df_all_comments.to_csv("youtube_product_review_comments.csv", index=False)

print(f"Total comments collected: {len(df_all_comments)}")

Total comments collected: 64558
