In [162]:
import os
import re

import googleapiclient
import googleapiclient.discovery
import pandas as pd
import requests
# Read the key from the environment so it is never saved inside the notebook;
# falls back to '' (the previous literal value) when YOUTUBE_API_KEY is unset.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '')
# Search endpoint of the YouTube Data API v3, queried directly with `requests`.
URL = 'https://www.googleapis.com/youtube/v3/search'

In [174]:
# check whether we've run out of daily quota amount from the API
def check_daily_limit():
    """Send a one-result search request and print whether it succeeded.

    Prints "Quota not exceeded" when the endpoint answers with HTTP 200.
    For any other status it prints the status code followed by the raw
    response body, so failures other than an exhausted quota (for example
    an invalid key) show up here as well.

    NOTE(review): the probe is itself a search request, so it presumably
    counts against the quota it is checking - confirm before calling often.

    Returns:
        None. The result is only printed.
    """
    params = {
        'part': 'snippet',
        'type': 'video',
        'q': 'Israel Palestine Conflict',
        'key': API_KEY,
        'maxResults': 1,
    }
    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(URL, params=params, timeout=30)
    if response.status_code == 200:
        print("Quota not exceeded")
    else:
        print(f"Error: {response.status_code}")
        print(response.text)


check_daily_limit()

Quota not exceeded


In [183]:
# duration_conversion to seconds by William
def duration_conversion(duration):
    """Convert an ISO 8601 duration string such as 'PT1H2M3S' into seconds.

    Accepted form: 'P', an optional day count ('1D'), then an optional time
    section introduced by 'T' holding optional hours, minutes and seconds
    ('P1DT2H3M4S'). The whole string must match, so trailing characters or
    fractional values are rejected rather than silently truncated.

    Args:
        duration: the duration string; any non-string value is treated as
            unmatched.

    Returns:
        The total number of seconds as an int, or None (after printing
        "duration pattern not matched") when the value cannot be converted.
        A value with neither a time section nor a non-zero day count
        ('P', 'P0D') is also reported as unmatched, so callers that skip
        None keep skipping those entries.
    """
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    pattern = r'P(?:(\d+)D)?(T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
    match = re.fullmatch(pattern, duration) if isinstance(duration, str) else None

    if match is not None:
        days, time_section, hours, minutes, seconds = match.groups()
        total = (
            int(days or 0) * 86400
            + int(hours or 0) * 3600
            + int(minutes or 0) * 60
            + int(seconds or 0)
        )
        # 'PT...' (even with every component omitted) keeps yielding a number;
        # a bare 'P' / 'P0D' carries no usable length and falls through.
        if time_section is not None or total > 0:
            return total

    print("duration pattern not matched")
    return None

# Retrieve Video & Shorts based on the topic and a selected list of channels

In [191]:
def data_retrieval(topic, API_KEY, channels, rows, search_type):
    """Collect up to `rows` videos or Shorts about `topic` from selected channels.

    Pages through the search endpoint (`URL`), keeps results whose
    `channelTitle` is in `channels`, looks up each kept video's duration
    through `videos().list` and stores the video when its duration agrees
    with `search_type`.

    Args:
        topic: search query string.
        API_KEY: API key used for both the search and the duration lookup.
        channels: channel titles to keep (exact match on `channelTitle`).
        rows: maximum number of rows to collect.
        search_type: 'video' keeps durations above 60 seconds, 'short' keeps
            durations of 60 seconds or less; any other value keeps nothing.

    Returns:
        A DataFrame with columns Channel, Title and Video ID. The same frame
        is written to 'Israel_Palestine_Conflict_{search_type}.csv'.
    """
    params = {
        'part': 'snippet',
        'q': topic,
        'type': 'video',
        'key': API_KEY,
        'maxResults': 50,
    }
    data = []
    # Search pages can repeat a video; remember the ids already looked up so
    # they are neither stored twice nor fetched twice.
    seen_ids = set()
    # Build the client once instead of once per video.
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

    # keep paging until enough rows have been stored
    while len(data) < rows:
        search_page = requests.get(URL, params=params, timeout=30).json()

        if 'items' not in search_page:
            # e.g. an error payload; report it instead of ending silently
            print(f"Search stopped, response had no 'items': {search_page}")
            break

        for video in search_page['items']:
            # stop mid-page so the result never exceeds the requested rows
            if len(data) >= rows:
                break

            video_id = video['id']['videoId']
            video_title = video['snippet']['title']
            channel = video['snippet']['channelTitle']

            # only look up videos published by one of the targeted channels
            if channel not in channels or video_id in seen_ids:
                continue
            seen_ids.add(video_id)

            details = youtube.videos().list(
                part="contentDetails,statistics",
                id=video_id
            ).execute()
            detail_items = details.get("items") or []
            if not detail_items:
                # the lookup returned nothing for this id; skip it
                continue

            duration = duration_conversion(
                detail_items[0]["contentDetails"].get("duration", "")
            )
            if duration is None:
                continue

            # double-check the duration so the right kind of video is stored
            is_short = duration <= 60
            if (search_type == 'video' and not is_short) or (search_type == 'short' and is_short):
                data.append([channel, video_title, video_id])
                # report progress only when a row was actually added
                if len(data) % 12 == 0:
                    print(f"Data retrieved: {len(data)} rows")

        # handle paginated API responses
        if 'nextPageToken' not in search_page:
            # no more pages of data available, end of search
            break
        params['pageToken'] = search_page['nextPageToken']

    # write to csv
    df = pd.DataFrame(data, columns=['Channel', 'Title', 'Video ID'])
    df.to_csv(f'Israel_Palestine_Conflict_{search_type}.csv', index=False, encoding='utf-8')
    return df

In [None]:
# Channel titles to keep; data_retrieval compares them with each result's channelTitle.
channels = [
    "Voice Of America",
    "NBC News",
    "CNN",
    "Fox News",
    "CBC News",
    "Guardian News",
]

# Videos longer than 60 seconds from the selected channels.
df_videos = data_retrieval(
    topic='Israel Palestine Conflict',
    API_KEY=API_KEY,
    channels=channels,
    rows=100,
    search_type='video',
)
print(df_videos.head())

# Shorts (60 seconds or less) from the same channels.
df_shorts = data_retrieval(
    topic='Israel Palestine Conflict',
    API_KEY=API_KEY,
    channels=channels,
    rows=10,
    search_type='short',
)
print(df_shorts.head())

In [189]:
# Display the Shorts frame built above (10 rows were requested).
# NOTE(review): the saved output lists channels that are not in `channels`,
# so it appears to come from an earlier version of data_retrieval - re-run to refresh.
df_shorts

Unnamed: 0,Channel,Title,Video ID
0,CNN,See how man survived massacre in Israel,L_P1nbNPH3M
1,WFAA,Ultra-orthodox Jews spit towards Christian pil...,sUJrXNCfUrk
2,AJ+,How was Israel created over Palestine?,Z8IR7IhrszQ
3,CNN-News18,Israel-Palestine Conflict | Israel Vs Palestin...,yWbvm15sO9Q
4,CNN-News18,Israel-Palestine Conflict | Israel Vs Palestin...,yWbvm15sO9Q
5,The Telegraph,Israel&#39;s Iron Dome intercepts Hamas rocket...,D9fynbWJUOc
6,Al Jazeera English,Shocked child in viral video from Gaza makes a...,aJQ66srsgzo
7,Zee News,Putin on Israel Hamas Conflict: इजरायल-हमास यु...,ifDIr4KPv3Q
8,MSNBC,"Israel strikes Gaza, says it will &#39;only in...",aobV2A3O-XQ
9,Aaj Tak,Israel Palestine Conflict : हमास चीफ की कुल सं...,jhe_1qGzxBE


# Retrieve all Videos & Shorts related to the topic, from all channels

In [196]:
def retrieve_all_data(topic, API_KEY, channels, rows, search_type):
    """Collect up to `rows` search results about `topic` with their durations.

    Pages through the search endpoint (`URL`) and, for every result, looks up
    the duration through `videos().list`. No channel or duration filtering
    happens here; the duration is stored so the caller can filter afterwards.

    Args:
        topic: search query string.
        API_KEY: API key used for both the search and the duration lookup.
        channels: unused; kept so the signature matches data_retrieval.
        rows: maximum number of rows to collect.
        search_type: only used to name the CSV file. The search itself always
            asks for videos, because each result is read as a video id.

    Returns:
        A DataFrame with columns Channel, Title, Video ID and Duration
        (seconds, or missing when the duration could not be determined).
        The same frame is written to
        'Israel_Palestine_Conflict_{search_type}.csv'.

    NOTE(review): data_retrieval writes to the same file name pattern, so a
    run with the same search_type overwrites that file - confirm this is
    intended.
    """
    params = {
        'part': 'snippet',
        'q': topic,
        # Always 'video': the loop below reads video['id']['videoId'], and
        # data_retrieval queries the same way.
        'type': 'video',
        'key': API_KEY,
        'maxResults': 50,
    }
    data = []
    # Search pages can repeat a video; store each id only once.
    seen_ids = set()
    # Build the client once instead of once per video.
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

    # keep paging until enough rows have been stored
    while len(data) < rows:
        search_page = requests.get(URL, params=params, timeout=30).json()

        if 'items' not in search_page:
            # e.g. an error payload; report it instead of ending silently
            print(f"Search stopped, response had no 'items': {search_page}")
            break

        for video in search_page['items']:
            # stop mid-page so the result never exceeds the requested rows
            if len(data) >= rows:
                break

            video_id = video['id']['videoId']
            if video_id in seen_ids:
                continue
            seen_ids.add(video_id)

            video_title = video['snippet']['title']
            channel = video['snippet']['channelTitle']

            # look up the duration of this video
            details = youtube.videos().list(
                part="contentDetails,statistics",
                id=video_id
            ).execute()
            detail_items = details.get("items") or []
            if detail_items:
                duration = duration_conversion(
                    detail_items[0]["contentDetails"].get("duration", "")
                )
            else:
                # the lookup returned nothing; keep the row with no duration
                duration = None

            data.append([channel, video_title, video_id, duration])

            # report how many rows have been stored so far
            if len(data) % 12 == 0:
                print(f"Data retrieved: {len(data)} rows")

        # handle paginated API responses
        if 'nextPageToken' not in search_page:
            # no more pages of data available, end of search
            break
        params['pageToken'] = search_page['nextPageToken']

    # write to csv
    df = pd.DataFrame(data, columns=['Channel', 'Title', 'Video ID', 'Duration'])
    df.to_csv(f'Israel_Palestine_Conflict_{search_type}.csv', index=False, encoding='utf-8')
    return df

In [197]:
# Fetch results from every channel together with their durations.
df_videos = retrieve_all_data(
    topic='Israel Palestine Conflict',
    API_KEY=API_KEY,
    channels=channels,
    rows=100,
    search_type='video',
)
print(df_videos.head())

duration pattern not matched
duration pattern not matched
duration pattern not matched
Data retrieved: 12 rows
duration pattern not matched
duration pattern not matched
duration pattern not matched
duration pattern not matched
Data retrieved: 24 rows
duration pattern not matched
duration pattern not matched
Data retrieved: 36 rows
Data retrieved: 48 rows
Data retrieved: 60 rows
duration pattern not matched
Data retrieved: 72 rows
Data retrieved: 84 rows
Data retrieved: 96 rows
       Channel                                              Title  \
0          Vox  The Israel-Palestine conflict: a brief, simple...   
1         WION  Israel-Palestine War LIVE: US strikes on Syria...   
2  War Stories  The Entire Israeli-Palestine Conflict Explaine...   
3         WION  Israel-Palestine war: A simple history of how ...   
4          Vox                                    Gaza, explained   

      Video ID  Duration  
0  iRYZjOuUnlU     619.0  
1  sE5Qp40GwJA       NaN  
2  9cU8B7FXX6g    2656

## Filter dataset before saving

In [204]:
# Split on the 60-second threshold and save each part to its own file.
# Rows with a missing Duration satisfy neither comparison, so they end up in neither file.
df_videos.loc[lambda frame: frame['Duration'] > 60].to_csv(
    'All_channels_video.csv', index=False, encoding='utf-8'
)
df_videos.loc[lambda frame: frame['Duration'] <= 60].to_csv(
    'All_channels_short.csv', index=False, encoding='utf-8'
)

In [203]:
# Rows of 60 seconds or less that were published by one of the selected channels.
df_videos.loc[lambda frame: frame['Duration'].le(60) & frame['Channel'].isin(channels)]

Unnamed: 0,Channel,Title,Video ID,Duration
34,CNN,See how man survived massacre in Israel,L_P1nbNPH3M,49.0
