# YouTube comments

In [1]:
import os

import googleapiclient.discovery
import pandas as pd

In [3]:
# API setup
api_service_name = "youtube"
api_version = "v3"
# The API key is read from the environment so it never ends up in the
# notebook source or its saved outputs.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY")
if not DEVELOPER_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key."
    )
# Client object used by every YouTube request further down in the notebook.
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=DEVELOPER_KEY)

In [4]:
# code snippets from
# https://www.youtube.com/watch?v=SIm2W9TtzR0 
# https://github.com/analyticswithadam/Python/blob/main/YouTubeComments.ipynb
# https://www.geeksforgeeks.org/how-to-extract-youtube-comments-using-youtube-api-python/

# function to extract comments under video by video id
def get_comments(video_id):
    """Collect the comment threads of one YouTube video into a DataFrame.

    Pages through ``commentThreads().list`` of the module-level ``youtube``
    client until the response no longer carries a ``nextPageToken``. Every
    top-level comment becomes one row, followed by one row for each reply
    that the API embedded in that thread.

    NOTE(review): the thread resource is documented to embed only a subset
    of a thread's replies; retrieving all of them would need
    ``comments().list`` -- confirm whether that matters for the analysis.

    Parameters
    ----------
    video_id : str
        ID of the video whose comments are downloaded.

    Returns
    -------
    pandas.DataFrame
        Columns ``videoID``, ``topLevelID``, ``author``, ``published_at``,
        ``updated_at``, ``like_count``, ``text`` and ``commentType``
        (``"topLevelComment"`` or ``"reply"``). ``topLevelID`` is the running
        index of the thread within this video (0, 1, 2, ... across all
        pages); replies carry the index of the thread they belong to.
    """
    columns = ['videoID', 'topLevelID', 'author', 'published_at',
               'updated_at', 'like_count', 'text', 'commentType']

    def as_row(thread_index, snippet, comment_type):
        # Flatten one comment snippet into the column order declared above.
        return [
            video_id,
            thread_index,
            snippet['authorDisplayName'],
            snippet['publishedAt'],
            snippet['updatedAt'],
            snippet['likeCount'],
            snippet['textDisplay'],
            comment_type,
        ]

    rows = []
    # Counted across pages: restarting at 0 on every page would make threads
    # from different pages share the same topLevelID.
    thread_index = 0
    page_token = None
    while True:
        # Ask for the largest page the API allows to keep the number of
        # requests (and the quota they cost) low.
        query = {
            'part': 'snippet,replies',
            'videoId': video_id,
            'maxResults': 100,
        }
        if page_token is not None:
            query['pageToken'] = page_token
        response = youtube.commentThreads().list(**query).execute()

        for item in response.get('items', []):
            top_level = item['snippet']['topLevelComment']['snippet']
            rows.append(as_row(thread_index, top_level, "topLevelComment"))
            # 'replies' can be absent from a thread, so never index it
            # unconditionally.
            for reply in item.get('replies', {}).get('comments', []):
                rows.append(as_row(thread_index, reply['snippet'], "reply"))
            thread_index += 1

        # Continue only while the API announces another page.
        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return pd.DataFrame(rows, columns=columns)

In [5]:
# The 10 most-viewed videos the YouTube API returns for the query "cyberpunk".
# Music and Cyberpunk: Edgerunners content were excluded, leaving only video
# game trailers and gameplay.
top10_cyberpunk = [
    "BO8lX3hDU30", "P99qJGrPNLs",
    "8X2kIfS6fb8", "vjF9GgrY9c0",
    "U8qJc6znzZc", "qIcTM8WXFjk",
    "LembwKDo1Dk", "G51GkSmQAmQ",
    "DvVjkqB3LH0", "uRYsVL2RNNY",
]

In [8]:
# Download the comments of every selected video. The per-video frames are
# collected in a list and concatenated once at the end: calling pd.concat
# inside the loop would copy all previously gathered rows on every iteration.
comment_frames = []
for video in top10_cyberpunk:
    print(f"extracting from {video}.")
    df = get_comments(video)
    print(f"extracted {len(df)} comments")
    comment_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
result_df = (
    pd.concat(comment_frames, ignore_index=True)
    if comment_frames
    else pd.DataFrame({})
)

print(f"extracted {len(result_df)} comments in total.")


extracting from uRYsVL2RNNY.
extracted 3444 comments
extracted 3444 comments in total.


In [9]:
# Persist all downloaded comments; the row index is written as the first,
# unnamed CSV column.
result_df.to_csv(path_or_buf="comments.csv")

# Steam reviews

Adapted from https://andrew-muller.medium.com/scraping-steam-user-reviews-9a43f9e38c92

In [7]:
import requests
from tqdm import tqdm

def get_reviews(appid, params=None):
    """Fetch one page of user reviews for a Steam app.

    Parameters
    ----------
    appid : str or int
        Steam application ID; it is appended to the ``appreviews`` URL.
    params : dict, optional
        Query parameters sent with the request. Defaults to ``{'json': 1}``.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if params is None:
        params = {'json': 1}
    url = 'https://store.steampowered.com/appreviews/'
    response = requests.get(
        url=url + str(appid),
        params=params,
        headers={'User-Agent': 'Mozilla/5.0'},
        # Without a timeout a stalled connection blocks the scrape forever.
        timeout=30,
    )
    # Surface HTTP errors here rather than as a confusing JSON decode error.
    response.raise_for_status()
    return response.json()

def get_n_reviews(appid, n=100, max_empty_pages=10):
    """Download up to ``n`` reviews of a Steam app by following the cursor.

    Requests pages of at most 100 reviews through ``get_reviews`` and passes
    the ``cursor`` of each response on to the next request. The loop ends
    when ``n`` reviews have been collected or when more than
    ``max_empty_pages`` consecutive responses contained no reviews.

    Parameters
    ----------
    appid : str or int
        Steam application ID, forwarded to ``get_reviews``.
    n : int, optional
        Number of reviews to collect (default 100).
    max_empty_pages : int, optional
        Number of consecutive empty pages tolerated before giving up
        (default 10).

    Returns
    -------
    list of dict
        The review objects in the order they were returned.
    """
    params = {
        'json': 1,
        'filter': 'updated',
        'language': 'all',
        # 2**63 - 1, the largest signed 64-bit integer.
        'day_range': 9223372036854775807,
        'review_type': 'all',
        'purchase_type': 'all',
    }

    reviews = []
    cursor = '*'
    remaining = n
    empty_pages = 0

    # The context manager closes the progress bar even if a request raises.
    with tqdm(desc="loading reviews", total=n) as pbar:
        while remaining > 0:
            params['cursor'] = cursor
            params['num_per_page'] = min(100, remaining)

            response = get_reviews(appid, params)
            page = response.get('reviews', [])

            if not page:
                # An empty page is not necessarily the end, so retry a few
                # times before giving up.
                print(response)
                empty_pages += 1
                if empty_pages > max_empty_pages:
                    break
            else:
                # Count only consecutive failures, so isolated empty pages
                # cannot add up and abort a long run.
                empty_pages = 0
                reviews += page
                # Budget by reviews actually received: pages may be shorter
                # than requested.
                remaining -= len(page)
                pbar.update(len(page))

            cursor = response.get('cursor', cursor)

    return reviews

In [8]:
# "1091500" is the app ID taken from the URL of the Cyberpunk Steam page.
reviews_json = get_n_reviews(appid="1091500", n=1_000_000)
print(f"extracted {len(reviews_json)} reviews")

loading reviews:  69%|██████▉   | 694695/1000000 [1:52:16<2:49:42, 29.98it/s] 

{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd

loading reviews:  69%|██████▉   | 694695/1000000 [1:52:24<49:23, 103.01it/s] 

{'success': 1, 'query_summary': {'num_reviews': 0}, 'reviews': [], 'cursor': 'AoJwsY7/pPYCd7u/uAI='}
extracted 694695 reviews





In [9]:
# Turn the list of review dicts into a DataFrame (one row per review) and
# preview the first five rows.
reviews = pd.DataFrame(data=reviews_json)
reviews.head(5)

Unnamed: 0,recommendationid,author,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,hidden_in_steam_china,steam_china_location,timestamp_dev_responded,developer_response
0,155462372,"{'steamid': '76561198169337147', 'num_games_ow...",brazilian,Tendo terminado esse jogo em seu lançamento e ...,1704618897,1704643454,True,1,0,0.52173912525177,0,False,False,False,True,,,
1,155487141,"{'steamid': '76561198031762568', 'num_games_ow...",polish,Wspamiała gra ! <3,1704643418,1704643418,True,0,0,0.0,0,True,False,False,True,,,
2,155487124,"{'steamid': '76561198159468753', 'num_games_ow...",russian,"Бла-бла-бла, ебать «Арасаку» бла-бла, блядь, в...",1704643410,1704643410,True,0,0,0.0,0,True,False,False,True,,,
3,103843118,"{'steamid': '76561198013234046', 'num_games_ow...",russian,❤️,1637854516,1704643392,True,0,0,0.0,0,True,False,False,False,,,
4,155487037,"{'steamid': '76561198027298550', 'num_games_ow...",english,A nice game,1704643372,1704643372,True,0,0,0.0,0,False,False,False,True,,,


In [11]:
from datetime import datetime

# Index the reviews by the time of their last update.
# datetime.fromtimestamp interprets the epoch seconds in the machine's local
# time zone, so the resulting index holds naive local timestamps.
# Iterating the column directly avoids materialising a Series for every row,
# which is what iterrows() does.
datetimes = [datetime.fromtimestamp(ts) for ts in reviews["timestamp_updated"]]
reviews.index = datetimes
reviews.head(10)

Unnamed: 0,recommendationid,author,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,hidden_in_steam_china,steam_china_location,timestamp_dev_responded,developer_response
2024-01-07 17:04:14,155462372,"{'steamid': '76561198169337147', 'num_games_ow...",brazilian,Tendo terminado esse jogo em seu lançamento e ...,1704618897,1704643454,True,1,0,0.52173912525177,0,False,False,False,True,,,
2024-01-07 17:03:38,155487141,"{'steamid': '76561198031762568', 'num_games_ow...",polish,Wspamiała gra ! <3,1704643418,1704643418,True,0,0,0.0,0,True,False,False,True,,,
2024-01-07 17:03:30,155487124,"{'steamid': '76561198159468753', 'num_games_ow...",russian,"Бла-бла-бла, ебать «Арасаку» бла-бла, блядь, в...",1704643410,1704643410,True,0,0,0.0,0,True,False,False,True,,,
2024-01-07 17:03:12,103843118,"{'steamid': '76561198013234046', 'num_games_ow...",russian,❤️,1637854516,1704643392,True,0,0,0.0,0,True,False,False,False,,,
2024-01-07 17:02:52,155487037,"{'steamid': '76561198027298550', 'num_games_ow...",english,A nice game,1704643372,1704643372,True,0,0,0.0,0,False,False,False,True,,,
2024-01-07 16:58:14,155486625,"{'steamid': '76561199331846737', 'num_games_ow...",schinese,CDPR剧情方面没得说，尤其是往日之影的人物塑造很强，氛围感拉满，2.1的优化也不错，两年的...,1704643094,1704643094,True,0,0,0.0,0,True,False,False,True,,,
2024-01-07 16:56:57,155486533,"{'steamid': '76561198111976431', 'num_games_ow...",english,Has been greatly improved since the last 2 meg...,1704643017,1704643017,True,0,0,0.0,0,True,False,False,True,,,
2024-01-07 16:56:20,155486489,"{'steamid': '76561199437438329', 'num_games_ow...",english,Yes,1704642980,1704642980,True,0,0,0.0,0,True,False,False,True,,,
2024-01-07 16:53:25,155486283,"{'steamid': '76561198129862622', 'num_games_ow...",schinese,1,1704642805,1704642805,True,0,0,0.0,0,True,False,False,True,,,
2024-01-07 16:51:55,155486167,"{'steamid': '76561198079985019', 'num_games_ow...",french,Incroyable,1704642715,1704642715,True,0,0,0.0,0,True,False,False,True,,,


In [12]:
# Serialize the reviews DataFrame to disk as a pickle file.
reviews.to_pickle(path="steam_reviews.pkl")