**Carga de datos usados para la descarga**

In [None]:
# Manually point this cell at the CSV produced by the video search whose
# comments we want to download.
import pandas as pd

videos_csv = "busquedas/terms_pseudo/camuflated_pseudoscientific.csv"
data = pd.read_csv(videos_csv)

# Keep only the rows labelled as pseudoscientific.
# NOTE(review): the label literal ends with a newline; presumably that is how
# the labels were stored in the CSV -- confirm before changing it.
is_pseudoscientific = data['Content_Classification'].eq('Pseudocientífico\n')
data = data.loc[is_pseudoscientific]
data

In [None]:
# The first CSV column is what the download loop iterates over as video IDs.
# Both names refer to the same Series; `lista_videos_inicial` keeps a handle
# on the full selection when `lista_videos` is rebound later in the notebook.
lista_videos_inicial = data.iloc[:, 0]
lista_videos = lista_videos_inicial

In [None]:
# Number of entries in the list of videos the download loop will iterate over.
len(lista_videos)

**Configuración API YouTube**

In [None]:
import os
import time
from googleapiclient.discovery import build
import googleapiclient.discovery

# Name and version of the Google API service used by this notebook.
api_service_name = "youtube"
api_version = "v3"

# Read the key from the environment so it never has to be pasted into (and
# saved or committed with) the notebook source.
api_key = os.environ.get('YOUTUBE_API_KEY', '')
if not api_key:
    print("YOUTUBE_API_KEY is not set; YouTube API requests will most likely fail.")

# Client object used by every request in the notebook.
youtube = build(api_service_name, api_version, developerKey=api_key)

**Función para descargar todos los comentarios de cada vídeo**

In [None]:
## Function to extract the comments of a video

import time
from googleapiclient.discovery import build


# Function to retrieve comments from a YouTube video
def retrieve_comments(video_id, max_comments=500, pause_seconds=2):
    """Download the top-level comments of a video together with their replies.

    Uses the module-level ``youtube`` API client. Comment threads are read
    page by page (100 per page) until there are no more pages or at least
    ``max_comments`` top-level comments have been collected; the limit is
    checked after each full page, so the result can exceed it by up to one page.
    For each comment only the first page of replies (up to 100) is fetched.

    Args:
        video_id: ID of the YouTube video.
        max_comments: Stop paginating once this many top-level comments
            have been collected.
        pause_seconds: Delay between consecutive comment-thread pages.

    Returns:
        A list of dicts with keys ``comment_id``, ``comment``, ``timestamp``
        and ``replies``; ``replies`` is a list of dicts with keys
        ``reply_id``, ``reply`` and ``timestamp``. An empty list is returned
        if any request or response fails, so the caller sees the video as
        "not downloaded" and can retry it as a whole.
    """
    comments_data = []  # Collected comments and their replies
    next_page_token = None  # Token for paginated API responses

    while True:
        try:
            # Fetch one page of comment threads for the video
            comments_response = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                textFormat='plainText',
                maxResults=100,  # Maximum number of comments per page
                pageToken=next_page_token,  # Use token for pagination
            ).execute()

            for item in comments_response['items']:
                thread_snippet = item['snippet']
                comment_snippet = thread_snippet['topLevelComment']['snippet']
                comment_id = item['id']

                replies_data = []  # Reply texts and their timestamps
                # Skip the replies request when the API already reports that
                # the thread has no replies. If the field is absent, fall back
                # to asking, as before.
                if thread_snippet.get('totalReplyCount', 1) > 0:
                    reply_response = youtube.comments().list(
                        part='snippet',
                        parentId=comment_id,  # ID of the current comment
                        # Request plain text so replies use the same format
                        # as the top-level comments requested above.
                        textFormat='plainText',
                        maxResults=100,  # Maximum number of replies per page
                    ).execute()

                    for reply_item in reply_response['items']:
                        reply_snippet = reply_item['snippet']
                        replies_data.append({
                            'reply_id': reply_item['id'],
                            'reply': reply_snippet['textDisplay'],
                            'timestamp': reply_snippet['publishedAt'],
                        })

                comments_data.append({
                    'comment_id': comment_id,
                    'comment': comment_snippet['textDisplay'],
                    'timestamp': comment_snippet['publishedAt'],
                    'replies': replies_data,
                })

            next_page_token = comments_response.get('nextPageToken')
            # Stop when there are no more pages or enough comments were collected
            if not next_page_token or len(comments_data) >= max_comments:
                break

            time.sleep(pause_seconds)  # Delay between API requests
        except Exception as exc:
            # Only ordinary errors are handled here: KeyboardInterrupt and
            # SystemExit propagate, so a long download can still be interrupted.
            print(f"retrieve_comments: failed for video {video_id!r}: {exc}")
            return []

    return comments_data

**Descargar los comentarios de los vídeos de una lista dada**

In [None]:
import pandas as pd
# Run the downloader over every selected video and flatten each thread
# (top-level comment followed by its replies) into one table per video.
# (A commented-out alternative in the original iterated over
# dict_respuestas_llm_filtrado.keys() instead of lista_videos.)
dict_video_comments = {}
i = 0
for i, video_id in enumerate(lista_videos, start=1):
    rows = []
    for thread in retrieve_comments(video_id):
        parent_id = thread['comment_id']
        # Top-level comment: flagged as not being a response
        rows.append({
            'comment_id': parent_id,
            'comment': thread['comment'].strip(),
            'timestamp': thread['timestamp'],
            'is_response': 'False',
            'response_of': "NA"
        })
        # Its replies, each pointing back at the parent comment
        rows.extend(
            {
                'comment_id': reply['reply_id'],
                'comment': reply['reply'].strip(),
                'timestamp': reply['timestamp'],
                'is_response': 'True',
                'response_of': parent_id
            }
            for reply in thread['replies']
        )
    dict_video_comments[video_id] = pd.DataFrame(rows)
    print(i)  # progress: number of videos processed so far

dict_video_comments

**Combinamos todos los dataframes en uno solo y mantenemos solo los hilos de conversaciones**

In [None]:
# Drop the videos for which no comments were downloaded
dict_video_comments = {
    video_id: df for video_id, df in dict_video_comments.items() if not df.empty
}

# Tag every row with the video it came from
for video_id, df in dict_video_comments.items():
    df['case_id'] = video_id

# Stack all per-video tables into a single DataFrame
combined_df = pd.concat(dict_video_comments.values(), ignore_index=True)

# Parse the 'timestamp' column into datetimes
combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp'])

# Top-level comments carry the string "NA" (not a missing value) in
# 'response_of', so dropna() alone would keep it; exclude it explicitly so
# that only real parent IDs count as thread roots.
NO_PARENT = "NA"
parent_ids = combined_df['response_of']
has_parent = parent_ids.notna() & (parent_ids != NO_PARENT)
roots_ids = parent_ids[has_parent].unique()
root_id_set = set(roots_ids)  # set membership instead of scanning the array per row


def assign_conversation_id(row):
    """Return the root comment ID of the thread a row belongs to, or None.

    A row belongs to a thread if it is itself a root (some reply points at
    it) or if it is a reply to a root. Any other row is an isolated comment.
    """
    if row['comment_id'] in root_id_set:
        return row['comment_id']  # Root of a thread
    if row['response_of'] in root_id_set:
        return row['response_of']  # Reply inside a thread
    return None  # Isolated comment


# Label every row with its conversation
combined_df['conversation_id'] = combined_df.apply(assign_conversation_id, axis=1)

# Remove the isolated comments (no conversation_id)
combined_df = combined_df[combined_df['conversation_id'].notna()]

# Count comments per conversation
conversation_sizes = combined_df['conversation_id'].value_counts()
valid_conversations = conversation_sizes[conversation_sizes > 1].index

# Keep only conversations with more than one comment
combined_df = combined_df[combined_df['conversation_id'].isin(valid_conversations)]

In [None]:
# Display the combined DataFrame of conversation threads.
combined_df

**Detección de vídeos en los que no se han descargado los comentarios**

In [None]:
# Video IDs currently present as keys of dict_video_comments
lista_descargada = set(dict_video_comments)
# Rebind the requested videos as a set so the comparison is a set difference
lista_videos = set(lista_videos)

# Requested videos that have no entry among the downloaded ones
lista_faltante = list(lista_videos - lista_descargada)
lista_faltante

In [None]:
# Make the missing videos the input of the download loop, so re-running the
# download cell requests only those.
# NOTE(review): the download cell starts by resetting dict_video_comments to
# an empty dict, so results from the previous run should be saved first.
lista_videos=lista_faltante
len(lista_videos)

**Exportamos a CSV el resultado**

In [None]:
# Write the combined DataFrame to CSV. No `index` argument is passed, so the
# DataFrame index is written as the first column (pandas default).
combined_df.to_csv("Comentarios_camuflated_pseudoscientific.csv")