
Step 1: A list of 15 YouTube news channels is manually created

In [1]:
import datetime
import pandas as pd
import re

In [2]:
# Hand-curated news channels to search, as (display name, YouTube channel ID) pairs.
# Each pair is expanded into a {'channel_name': ..., 'channel_id': ...} record.
channels = [
    {'channel_name': name, 'channel_id': channel_id}
    for name, channel_id in (
        ('CNBC Television', 'UCrp_UI8XtuYfpiqluWLD7Lw'),
        ('Yahoo Finance', 'UCEAZeUIeJs0IjQiqTCdVSIg'),
        ('Bloomberg Television', 'UCIALMKvObZNtJ6AmdCLP7Lg'),
        ('Financial Times', 'UCoUxsWakJucWg46KW5RsvPw'),
        ('The Wall Street Journal', 'UCK7tptUDHh-RYDsdxO1-5QQ'),
        ('Reuters', 'UChqUTb7kYRX8-EiaN3XFrSQ'),
        ('The New York Times', 'UCqnbDFdCpuN8CMEg0VuEBqA'),
        ('The Guardian', 'UCHpw8xwDNhU9gdohEcJu4aA'),
        ('Fox News', 'UCXIJgqnII2ZOINSWNOGFThA'),
        ('CNN', 'UCupvZG-5ko_eiXAupbDfxWw'),
        ('NBC News', 'UCeY0bbntWzzVIaj2z3QigXg'),
        ('ABC News', 'UCBi2mrWuNuyYy4gbM6fU18Q'),
        ('CBS News', 'UC8p1vwvWtl6T73JiExfWs1g'),
        ('BBC News', 'UC16niRr50-MSBwiO3YDb3RA'),
        ('The Economist', 'UC0p5jTq6Xx_DosDFxVXnWaQ'),
    )
]

Step 2: Within each channel, search for videos that are stock-related

In [3]:
# YouTube Data API
import getpass
import os

from googleapiclient.discovery import build

# The API key is a credential and must not be committed with the notebook.
# It is read from the YOUTUBE_API_KEY environment variable; when that is unset
# (or empty) the user is prompted for it without echoing the input.
# NOTE(review): the key that was previously hardcoded here has been exposed and
# should be revoked in the Google Cloud console.
api_key = os.environ.get('YOUTUBE_API_KEY') or getpass.getpass('YouTube Data API key: ')
youtube = build('youtube', 'v3', developerKey=api_key)

In [None]:
def get_video_ids(query, channel_id, published_after, published_before):
  """Return the IDs of a channel's videos that match a search query.

  Calls the YouTube Data API `search.list` endpoint through the module-level
  `youtube` client and follows `nextPageToken` until the last page.

  Args:
    query: Search term passed as `q`.
    channel_id: Channel to search, passed as `channelId`.
    published_after: Lower bound passed as `publishedAfter` (the notebook
      passes RFC 3339 strings such as '2024-02-25T00:00:00Z').
    published_before: Upper bound passed as `publishedBefore`.

  Returns:
    A list of video ID strings, in the order the API returned them. Result
    items that carry no `videoId` are skipped so callers never receive None.
  """
  video_ids = []
  next_page_token = None
  while True:
    video_response = youtube.search().list(
        q=query,
        part="id",
        channelId=channel_id,
        maxResults=50,
        pageToken=next_page_token,
        publishedAfter=published_after,
        publishedBefore=published_before,
        type="video",
        videoDefinition="high",
    ).execute()

    # A page may legitimately contain no items; tolerate a missing key too.
    for item in video_response.get("items", []):
      video_id = item.get("id", {}).get("videoId")
      if video_id:
        video_ids.append(video_id)

    next_page_token = video_response.get("nextPageToken")
    if not next_page_token:
      break
  return video_ids

# Keyword and publication window applied to every channel listed in `channels`.
query = "stock"
published_after = '2024-02-25T00:00:00Z'
published_before = '2024-03-11T00:00:00Z'
channel_ids = [entry['channel_id'] for entry in channels]

# Accumulate the matching video IDs channel by channel, in list order.
video_ids = []
for cid in channel_ids:
  video_ids.extend(get_video_ids(query, cid, published_after, published_before))

In [None]:
# discard live videos
def remove_live_videos(video_ids):
  """Return the subset of `video_ids` that are regular (non-live) videos.

  For each ID the `liveStreamingDetails` part is requested from the
  `videos.list` endpoint via the module-level `youtube` client. A video is
  kept only when the API returns a resource for it and that resource has no
  `liveStreamingDetails` entry.

  IDs for which the API returns no resource at all (empty `items`) are
  dropped: nothing can be fetched for them later, and indexing
  `response['items'][0]` for such an ID raises IndexError.

  Args:
    video_ids: Iterable of YouTube video ID strings.

  Returns:
    A new list with the retained IDs, in their original order.
  """
  updated_video_ids = []
  for videoId in video_ids:
    response = youtube.videos().list(
        part='liveStreamingDetails',
        id=videoId
        ).execute()
    items = response.get('items') or []
    if not items:
      # No resource returned for this ID; skip it.
      continue
    if 'liveStreamingDetails' not in items[0]:
      updated_video_ids.append(videoId)

  return updated_video_ids

# Filter the search results through remove_live_videos; the later steps read updated_video_ids.
updated_video_ids = remove_live_videos(video_ids)

Step 3: Fetch video information and transcript

In [6]:
# YouTubeTranscriptApi
!pip install youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi

Collecting youtube_transcript_api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube_transcript_api
Successfully installed youtube_transcript_api-0.6.2


In [None]:
def get_video_transcript(videoId):
  """Return a video's transcript as one space-separated string.

  The transcript is fetched with `YouTubeTranscriptApi.get_transcript`. Any
  ordinary failure while fetching is treated as "no transcript" and yields
  ''. KeyboardInterrupt and SystemExit are deliberately NOT swallowed, so a
  long collection run can still be interrupted.

  Transcripts with 10 or fewer caption segments are also discarded and yield
  '' (presumably to filter out near-empty transcripts - TODO confirm).

  Args:
    videoId: YouTube video ID string.

  Returns:
    The text of every caption segment, each followed by a single space, or ''
    when no usable transcript is available.
  """
  try:
    captions = YouTubeTranscriptApi.get_transcript(videoId)
  except Exception:
    # Best-effort: a video without an accessible transcript is simply skipped.
    return ''

  if len(captions) <= 10:
    return ''

  # Each segment keeps its trailing space, so the result ends with a space.
  return ''.join(caption['text'] + ' ' for caption in captions)


def get_video_data(videoId):
  """Fetch the metadata and transcript text for a single video.

  The `snippet` part is requested from `videos.list` through the module-level
  `youtube` client, and the first returned item is used. The transcript comes
  from `get_video_transcript`.

  Args:
    videoId: YouTube video ID string.

  Returns:
    A `(title, channel, upload_date, transcript)` tuple, where `upload_date`
    is the `publishedAt` value parsed into a `datetime.datetime` and
    `transcript` is the string returned by `get_video_transcript`.
  """
  snippet = youtube.videos().list(part='snippet', id=videoId).execute()['items'][0]['snippet']

  video_title = snippet['title']
  published_at = datetime.datetime.strptime(snippet['publishedAt'], '%Y-%m-%dT%H:%M:%S%z')
  channel_title = snippet['channelTitle']

  return video_title, channel_title, published_at, get_video_transcript(videoId)

# Build one record per video, keeping only videos that came back with a
# non-empty transcript string.
video_list = []
for videoId in updated_video_ids:
    title, channel, upload_date, transcript = get_video_data(videoId)
    if transcript != '':
      video_list.append({
          'Video ID': videoId,
          'Title': title,
          'Channel': channel,
          'Upload date': upload_date,
          'Transcipt': transcript,
      })

Step 4: Identify S&P 500 stocks from video titles

In [5]:
# Read the S&P 500 constituents table and drop rows that are exact duplicates.
# The surviving rows keep their original index labels (the index is not reset).
sp500_df = pd.read_excel('sp500.xlsx').drop_duplicates()
sp500_df

Unnamed: 0,Company,Symbol
0,Microsoft,MSFT
1,Apple,AAPL
2,NVIDIA,NVDA
3,Alphabet,GOOGL
4,Amazon,AMZN
...,...,...
485,Hasbro,HAS
486,Generac,GNRC
487,Comerica,CMA
488,DENTSPLY SIRONA,XRAY


In [None]:
def identify_stocks_from_titles(title):
  """Find which rows of `sp500_df` are mentioned in a video title.

  A row counts as mentioned when its `Company` value occurs in the title
  between regex word boundaries (case-sensitive); if the company name does not
  occur, the row's `Symbol` value is tried the same way.

  Args:
    title: Video title string to scan.

  Returns:
    A list of row positions (0-based, in `sp500_df` row order) of the matched
    rows. Each position appears at most once.
  """
  matched_positions = []
  name_ticker_pairs = zip(sp500_df['Company'], sp500_df['Symbol'])
  for position, (company, symbol) in enumerate(name_ticker_pairs):
    # Company name first; the ticker is only consulted when the name is absent.
    for term in (company, symbol):
      if re.search(r'\b{}\b'.format(re.escape(term)), title):
        matched_positions.append(position)
        break

  return matched_positions

In [None]:
# Annotate each collected record with the sp500_df row positions found in its title.
for video in video_list:
  video.update(Stock_indices=identify_stocks_from_titles(video['Title']))

In [None]:
# Preview the collected records as a table: one row per entry in video_list.
pd.DataFrame(video_list)

Unnamed: 0,Video ID,Title,Channel,Upload date,Transcipt,Stock_indices
0,YEoJq_PcOgc,What can reignite Apple shares?,CNBC Television,2024-02-28 17:55:38+00:00,LATELY. WE ALSO HAVE NEW COMMITTEE MOVES TO DI...,[1]
1,BOm0zNiaNjg,Alphabet's AI problems: Stock falls 4%,CNBC Television,2024-02-26 19:01:28+00:00,♪ >>> WELCOME BACK. LET'S TALK ALPHABET. IT I...,[3]
2,XTCiSQhcIvk,Two market experts debate the push and pull be...,CNBC Television,2024-03-04 12:29:27+00:00,"GENTLEMEN, THANK YOU FOR BEIN \nR HERE. >> THA...",[]
3,GQK4bCZHudM,Top tech stocks don't look 'wildly expensive' ...,CNBC Television,2024-03-06 14:24:00+00:00,"WE GOT ADT TODAY MIGHT BE INTERESTING, TOO. TH...",[]
4,Hp6IRjNJuI4,Jim Cramer on what could be the next $1 trilli...,CNBC Television,2024-02-27 00:23:35+00:00,"""MAD MONEY"" WITH JIM CRAMER STARTS RIGHT NOW....",[]
...,...,...,...,...,...,...
1250,RXmMUEGLHEc,Donald Trump can stay on presidential ballot r...,BBC News,2024-03-04 15:33:59+00:00,[Music] now we're going to move away from the ...,[]
1251,S6hF2ffyz2c,Farmers set fires in Brussels ahead of Agricul...,BBC News,2024-02-26 11:42:30+00:00,agricultural ministers from across the Europea...,[]
1252,pyGyrz06ljg,Israeli protesters block food convoys for star...,BBC News,2024-03-05 23:46:00+00:00,President Biden has warned Israel that there a...,[]
1253,jR_LNMjJa2k,Sweden formally joins Nato military alliance |...,BBC News,2024-03-07 16:57:10+00:00,this is BBC News just want to take you straigh...,[]


Step 5: Save collected data

In [None]:
# Write the collected records to 'youtube_collected_2024.csv' (relative path).
# to_csv is called with its defaults, so the DataFrame index is written as an
# unnamed leading column.
pd.DataFrame(video_list).to_csv('youtube_collected_2024.csv')