In [1]:
import os
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import xlsxwriter
from datetime import datetime

api_key = '' #Please insert your key
youtube = build('youtube', 'v3', developerKey=api_key)

**Due to the daily data extraction limitations of the YouTube API, multiple extraction runs are required. Subsequent data pulls will be merged with previously accumulated data, up to the cut-off time of the prior runs.**

Data Scrapping

Rerunning the code because of API limitations and also further developments on Abarth. Change the keywords according to the needs.

In [None]:
#Extracting the specific video IDs from the videos appearing from the searches using the keywords below

keywords = ['Abarth EV', 'Abarth Combustion', 'Fiat combustion engine', 'Fiat 500e', 'Tesla EV', 
'Mini Cooper combustion cars', 'Mini Cooper EV', 'Peugeot Combustion','Peugeot EV', 
'Volkswagen combustion engine', 'Volkswagen EV'] 
video_ids = []

# This date changes to when the previous run date was cut off. To scrap the latest videos in the next run
after_date = '2023-06-02T00:00:00Z'

for keyword in keywords:
    search_response = youtube.search().list(
        q=keyword,
        part='id,snippet',
        maxResults=25,
        type='video',
        publishedAfter=after_date
    ).execute()
    
    # Extract video IDs
    for item in search_response['items']:
        video_id = item['id']['videoId']
        video_ids.append(video_id)


In [None]:
#Checking the number of video_ids
len(video_ids)

100

In [None]:
# Empty variables for data collection
df1_data = []
df2_data = []

DF1 is the general information about the videos to extract the comments from

In [None]:
#Section for df1

#Filling it into the "df1_data"
for i in range(0, len(video_ids), 100):
    ids_chunk = video_ids[i:i+10]
    video_response = youtube.videos().list(
        id=','.join(ids_chunk),
        part='snippet,statistics'
    ).execute()
    
    for video in video_response['items']:
        video_id = video['id']

        # Extract data for DataFrame 1
        post_id = video_id
        post_title = video['snippet']['title']
        author = video['snippet']['channelTitle']
        date = video['snippet']['publishedAt']
        post_content = video['snippet']['description']
        comment_number = int(video['statistics'].get('commentCount', '0'))
        net_like = int(video['statistics'].get('likeCount', '0')) - int(video['statistics'].get('dislikeCount', '0'))
        views = video['statistics']['viewCount']
        df1_data.append([post_id, post_title, author, date, post_content, comment_number, net_like, views])

DF2 is the comment and specific details of the comments from the videos in DF1

In [None]:
# Fetch comments from videos
#Section for df2
next_page_token = None
while True:
    try:
        comments_response = youtube.commentThreads().list(
            videoId=video_id,
            part='id,snippet',
            maxResults=50,
            textFormat='plainText',
            pageToken=next_page_token
        ).execute()

        # Extract data for DataFrame 2
        for item in comments_response['items']:
            unique_id = item['id']
            post_id = video_id
            author = item['snippet']['topLevelComment']['snippet'].get('authorDisplayName', 'Unknown')
            date = item['snippet']['topLevelComment']['snippet'].get('publishedAt', 'Unknown')
            comment_content = item['snippet']['topLevelComment']['snippet'].get('textDisplay', '')
            reply_id = item['snippet'].get('totalReplyCount', '0')

            df2_data.append([unique_id, post_id, author, date, comment_content, reply_id])

            # Fetch replies for the current top level comment
            if reply_id > 0:
                reply_next_page_token = None
                while True:
                    try:
                        reply_response = youtube.comments().list(
                            part='id,snippet',
                            parentId=unique_id,
                            maxResults=10,
                            textFormat='plainText',
                            pageToken=reply_next_page_token
                        ).execute()

                        #Extract data for replies
                        for reply_item in reply_response['items']:
                            reply_unique_id = reply_item['id']
                            reply_author = reply_item['snippet'].get('authorDisplay', 'Unknown')
                            reply_date = reply_item['snippet'].get('publishedAt', 'Unknown')
                            reply_comment_content = reply_item['snippet'].get('textDisplay', '')
                            parent_id = unique_id #mapping to the parent comment ID

                            df2_data.append([reply_unique_id, post_id, reply_author, reply_date, reply_comment_content,
                            0, parent_id])

                            reply_next_page_token = reply_response.get('nextPageToken')
                            if not reply_next_page_token:
                                break
                    except HttpError as e:
                        print(f"Skipping replies for comment {unique_id} due to: print {e}")
                        break

        # Check if there are more comments to fetch
        next_page_token = comments_response.get('nextPageToken')
        if not next_page_token:
            break

    except HttpError as e:
        print(f"Skipping video {video_id} due to error: {e}")
        break

In [9]:
# Create dataframes
df1_columns = ['Post_id', 'Post_Title', 'Author', 'Date', 'Post_Content', 'Comment_Number', 'Net_Like', 'Views']
df1 = pd.DataFrame(df1_data, columns=df1_columns)

df2_columns = ['Unique_id', 'Post_id', 'Author', 'Date', 'Comment_Content', 'Reply_Count', 'Parent_id']
df2 = pd.DataFrame(df2_data, columns=df2_columns)

In [10]:
len(df2)

9843

In [12]:
df1.head()

Unnamed: 0,Post_id,Post_Title,Author,Date,Post_Content,Comment_Number,Net_Like,Views
0,iak-pj6sdFA,IS THIS A REAL ABARTH?! Driving the 500e Elect...,Auto Social UK,2023-07-19T04:45:01Z,Mt expectations were actually not all that hig...,107,428,7314
1,-Ya6yWohTW4,Abarth 500 Electric! - I Might Buy One! Eventu...,Electric Vehicle Man,2023-07-21T15:30:00Z,We test the new (and arguably 1st) electric ho...,203,995,23161
2,6fDMXjYOsGM,2023 Abarth 500E Turismo First Drive Review: A...,Stef ABtv,2023-07-19T05:30:11Z,Here is the NEW Abarth 500e Turismo Electric F...,136,391,14768
3,OqQo6VLmnTc,Abarth’s NOISY EV Doesn't Care About Speed!,Fully Charged Show,2023-07-20T14:00:24Z,Jack and Bobby go for a test drive in the firs...,495,5013,144627
4,bigX6_seQr4,New Abarth 500e: Mission Possible. The Mission...,Abarth,2023-06-30T14:13:54Z,It’s no easy feat when you're presented with a...,21,179,565906


In [20]:
df2.sort_values(by='Date', ascending=True)

Unnamed: 0,Unique_id,Post_id,Author,Date,Comment_Content,Reply_Count,Parent_id
9842,UgwqfvUX__-4ZCwegCp4AaABAg,l-5lVw42VIs,tecdessus,2023-07-19T22:59:24Z,An electric bath sounds like a bad idea. Ev's ...,0,
9841,UgypwZid2Eb8wW8oZdZ4AaABAg,l-5lVw42VIs,tecdessus,2023-07-19T23:01:12Z,why are my comment's being deleted ?,2,
9840,UgxSxybJ6V1Cdu2QQyx4AaABAg,l-5lVw42VIs,tecdessus,2023-07-19T23:04:17Z,That electric bath should just play Barbie Gir...,1,
9839,UgzDnIGDPHtMvqeovH54AaABAg,l-5lVw42VIs,its1me1cal,2023-07-19T23:12:07Z,Sounds like a UFO landing lol,0,
9838,UgzTM3QmmmfSmWvCTLZ4AaABAg,l-5lVw42VIs,PoltergeistWorks,2023-07-20T00:27:46Z,More like a slap in the face to ABARTH fans :(...,0,
...,...,...,...,...,...,...,...
4,Ugxh1xbdgnJjybhTBHp4AaABAg,l-5lVw42VIs,Nomad624,2023-08-08T15:57:07Z,"34k for this thing is insane, given that despi...",0,
3,Ugz6kUrZ4HcQfUyXXJF4AaABAg,l-5lVw42VIs,Rod Thorpe,2023-08-09T15:33:37Z,"£38k!?!? 🤯 Yeah ""electric cars are soooo expen...",0,
2,UgyfaHcnq8D3j-Kl_Ft4AaABAg,l-5lVw42VIs,Mias Greyling,2023-08-12T04:32:24Z,To expensive!!!,0,
1,UgyQ0dyzYBjdYwZbr1Z4AaABAg,l-5lVw42VIs,Actual Facts,2023-08-13T15:23:11Z,They forgot to put on a boot.,0,


Combining with old and removing duplicates before exporting it to an Excel file

In [46]:
df1_old = pd.read_csv('df1_combined.csv')
df2_old = pd.read_csv('df2_combined.csv')

  df2_old = pd.read_csv('df2_combined.csv')


In [47]:
df1_combined = pd.concat([df1_old, df1], ignore_index=True)
df2_combined = pd.concat([df2_old, df2], ignore_index=True)

In [48]:
len(df2_combined)

157242

In [49]:
df1_combined.drop_duplicates(inplace=True)
df2_combined.drop_duplicates(subset=['Comment_Content'], inplace=True)

In [50]:
len(df2_combined)

147397

In [51]:
# Save df1_combined as CSV
df1_combined.to_csv('df1_combined.csv', index=False)

# Save df2_combined as CSV
df2_combined.to_csv('df2_combined.csv', index=False)