#  Collect data from youtube for videos using video ID

In [20]:
import pandas as pd
import json
import requests
from io import StringIO
from csv import reader
import os
import glob

In [21]:
class YTstats:
    '''
    this class takes api_key and video id
    then obtain data for the video
    '''
  
    def __init__(self, api_key, video_id):
        self.api_key = api_key
        self.video_id = video_id
        self.video_statistics = None
  
    def get_video_statistics(self):
        '''
        recieve data from youtube with URL have the video_id and API key
        '''        
        url = f'https://www.googleapis.com/youtube/v3/videos?id={self.video_id}&key={self.api_key}&part=snippet,contentDetails,statistics,status'
        json_url = requests.get(url)
        data = json.loads(json_url.text)
  
        try:
            data = data["items"]#[0]["statistics"]
        except:
            data = None
  
        self.video_statistics = data
        return data
  
    def dump(self):
        '''
        save the file with the data or print "nothing happend" it we have reply with no data
        '''          
        if self.video_statistics is None:
            print('nothing happend')
            return 
  
        video_title = self.video_id
        #video_title = video_title.replace(" ", "_")
  
        # generate a json file with all the statistics data of the youtube video
        file_name = video_title + '.json'
        with open(file_name, 'w') as f:
            json.dump(self.video_statistics, f, indent=4)
        print('file dumped')

### Start from here

# cleaning the ids - read them then delete any id we obtained before

In [383]:
# file url and folder url:
file_url = "./34- scraped ids by david  - amber heard playing victim.csv"
folder_url = "./34-amber heard playing victim/"

In [311]:
# read videos IDs we get by search:

new_ids = pd.read_csv(file_url)
new_ids.shape

(122, 1)

In [312]:
# remvove duplicates from its
new_ids.drop_duplicates(inplace=True, keep='first')
print(new_ids.shape)

(122, 1)


In [313]:
# read totals ids we have data for it:
total_ids = pd.read_csv(r'C:\Users\David\Amber Heard Case\Youtube\SNA-AH-Case-YouTube\scraping/IDs_for_all_videos_we_have_data_for.csv')

In [314]:
#make lists for new and old IDs to comparing
new_ids_list = list(new_ids['id'])
total_ids_list = list(total_ids['id'])
print("total IDs size = {}\nnew IDs size = {}".format(len(total_ids_list) , len(new_ids_list)))

total IDs size = 9979
new IDs size = 122


In [315]:
# see if we collected a video id we have it and save it to a list with name: not_duplicated_ids:
not_duplicated_ids = []
for new_id in new_ids_list:
    if new_id not in total_ids_list:
        not_duplicated_ids.append(new_id)
len(not_duplicated_ids)

0

In [259]:
# update the csv file to have no duplicated with any ID all the time
not_duplicated_ids_df = pd.DataFrame(not_duplicated_ids, columns=['id'])
not_duplicated_ids_df.to_csv(file_url, index=None)

In [260]:
# adding new IDs to the IDs_for_all_videos_we_have_data_for file:
new_total_ids = total_ids.append(not_duplicated_ids_df)

new_total_ids_df = pd.DataFrame(new_total_ids, columns=['id'])
new_total_ids_df.to_csv(r'C:\Users\David\Amber Heard Case\Youtube\SNA-AH-Case-YouTube\scraping/IDs_for_all_videos_we_have_data_for.csv', index=None)

### end of cleaning

# Obtaining

In [384]:
#read data from youtube file and extratc video code for each and make a list of the videos code:

# read dat for the same file we just dropped duplicate from:
address = file_url
Amber_Heard = pd.read_csv(address)
Amber_Heard = Amber_Heard.melt().drop('variable',axis=1).rename({'value':'id'},axis=1)
Amber_Heard = list(Amber_Heard['id']) # make a list with IDs we will obtain data from
len(Amber_Heard)

76

In [385]:
Amber_Heard[:3] #first 3 items

['QUQqXOMFtfk', 'jocf6k2MXRw', 'fWEn3lfmCKc']

In [386]:
# recive data using API and save it on PC

API_KEY = "AIzaSyA-0KfpLK04NpQN1XghxhSlz********"


# read IDs form list to obtaining data:
while len(Amber_Heard) != 0:
    
    for video in Amber_Heard:
        print(video)
        yt = YTstats(API_KEY, video)
        yt.get_video_statistics()
        Amber_Heard.remove(video)
        print('ids remain: ', len(Amber_Heard))
        yt.dump()

QUQqXOMFtfk
ids remain:  75
file dumped
fWEn3lfmCKc
ids remain:  74
file dumped
LGpLOUjKgDs
ids remain:  73
file dumped
ELZYfdcAmac
ids remain:  72
file dumped
UBApuBdAYFY
ids remain:  71
file dumped
INbpxn4by14
ids remain:  70
file dumped
r8rcLTH0Fd0
ids remain:  69
file dumped
DAsoO0HI3u0
ids remain:  68
file dumped
VFU0GTifYiU
ids remain:  67
file dumped
vPwD_VgP_mg
ids remain:  66
file dumped
mehMlUJLgKo
ids remain:  65
file dumped
wMcd1ywyiWI
ids remain:  64
file dumped
JGr8QBEGM38
ids remain:  63
file dumped
diEOWXOM65c
ids remain:  62
file dumped
GzMYx1-LhsM
ids remain:  61
file dumped
DhbaaA0Lu_4
ids remain:  60
file dumped
JGlEsLDPdyM
ids remain:  59
file dumped
AiEvwzKSgxE
ids remain:  58
file dumped
cro6uqs-1XE
ids remain:  57
file dumped
UAkAq00jXUI
ids remain:  56
file dumped
COLVU_rIL48
ids remain:  55
file dumped
g_agJcStq7g
ids remain:  54
file dumped
m7O87lUtiPQ
ids remain:  53
file dumped
XrmV-rKAhh8
ids remain:  52
file dumped
eVGB0aqsRro
ids remain:  51
file dumped


#  read and save JSON files collected

In [387]:
# read josn files and make a list with json files
# (make 1 change)

path_to_json = folder_url

json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
json_files;  
len(json_files)

76

In [388]:
# create a data frame with all json files:

df = pd.DataFrame()
for file in json_files:
    file_path = path_to_json + file
    with open(file_path) as data_file:    
        data = json.load(data_file)
        df = df.append(pd.json_normalize(data))
df.shape

(76, 47)

In [389]:
df.head(2)

Unnamed: 0,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,...,status.license,status.embeddable,status.publicStatsViewable,status.madeForKids,statistics.viewCount,statistics.likeCount,statistics.dislikeCount,statistics.favoriteCount,statistics.commentCount,snippet.defaultLanguage
0,youtube#video,uQ2VTz2RRiIu-KJYapCe1O5vsCM,-c5Nunmd-xs,2021-09-24T10:27:44Z,UCYJA2v0ZsG5yMM3fUIkip2A,"One Sentence: Johnny Depp, Amber Heard, and an...",Jonny Depp increasingly looks to have been the...,https://i.ytimg.com/vi/-c5Nunmd-xs/default.jpg,120,90,...,youtube,True,True,False,597,104,1,0,14,
0,youtube#video,49EnyzitSjAUnJjLxEKKAlBJdAw,1dQExSLqQXo,2020-11-12T03:55:41Z,UC8AGq7bm-x4teGcvQonvnPw,Amber Heard & TMZ Target My Channel | Conserva...,Let me know what you think!\n\n💕 Don't Forget ...,https://i.ytimg.com/vi/1dQExSLqQXo/default.jpg,120,90,...,youtube,True,True,False,4714,698,17,0,328,


In [390]:
# save this dataframe with the search name as csv file:

save_csv_url = r"C:\Users\David\Amber Heard Case\Youtube\SNA-AH-Case-YouTube\scraping\Videos_details/{} --scraped.csv".format(path_to_json.split('/')[1])
df.to_csv(save_csv_url, index=None)

# save channels IDs to collect cannels info

In [391]:
# make new df with channels IDs:
new_channels_id_df = pd.DataFrame(df['snippet.channelId'])

# for the same batch read channels we have then add the news:
channel_id_file_url = r'C:\Users\David\Amber Heard Case\Youtube\SNA-AH-Case-YouTube\scraping\channels_ids_batch-4.csv'
channel_id_df = pd.read_csv(channel_id_file_url)
print("channels count before adding: ", channel_id_df.shape[0])
# append the two dfs together and update the file to have the new IDs
total_channels_ids = new_channels_id_df['snippet.channelId'].append(channel_id_df['id'], ignore_index=True)
total_channels_ids = pd.DataFrame(total_channels_ids, columns=['id'])
total_channels_ids.to_csv(channel_id_file_url, index=None )
print("channels count after adding: ", total_channels_ids.shape[0])

channels count before adding:  2415
channels count after adding:  2491
