# TEDscraper Notebook

In [1]:
import random
import re
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
from fake_useragent import UserAgent

import requests
from requests import request
from requests.compat import urljoin, urlparse
from requests.exceptions import HTTPError
from urllib.robotparser import RobotFileParser
from requests import Session
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import simplejson

In [2]:
driver=webdriver.Chrome() 

In [3]:

headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

In [4]:
url='https://www.ted.com/talks?language=en&page=1&sort=newest'
params={'language':'en','page':1,'sort':'newest'}


In [5]:
driver.get(url)
video=list()
for _ in driver.find_elements_by_class_name('talk-link'):
    video.append(_.find_element_by_css_selector('a').get_attribute('href'))


In [6]:
driver.close()

In [7]:
video

['https://www.ted.com/talks/gareth_gaskell_how_do_our_brains_process_speech?language=en',
 'https://www.ted.com/talks/ethan_hawke_give_yourself_permission_to_be_creative?language=en',
 'https://www.ted.com/talks/matt_walker_how_caffeine_and_alcohol_affect_your_sleep?language=en',
 'https://www.ted.com/talks/iseult_gillespie_the_myth_of_jason_medea_and_the_golden_fleece?language=en',
 'https://www.ted.com/talks/kwame_owusu_kesse_a_comprehensive_neighborhood_based_response_to_covid_19?language=en',
 'https://www.ted.com/talks/philip_freeman_the_rise_and_fall_of_the_celtic_warriors?language=en',
 'https://www.ted.com/talks/tom_teves_a_call_to_end_the_media_coverage_mass_shooters_want?language=en',
 'https://www.ted.com/talks/alex_gendler_the_egyptian_myth_of_the_death_of_osiris?language=en',
 'https://www.ted.com/talks/shari_davis_what_if_you_could_help_decide_how_the_government_spends_public_funds?language=en',
 'https://www.ted.com/talks/nita_mosby_tyler_want_a_more_just_world_be_an_unl

## Soup Maker

In [8]:
class SoupMaker:
    """Make soup objects and put your machine to sleep."""
    

    def sleep_short(self):
        """Suspends execution time between 0 - .2 seconds."""
        return time.sleep(random.uniform(0, .2))

    def sleep_two(self):
        """Suspends execution time between .5 - 2 seconds."""
        return time.sleep(random.uniform(.5, 2))
    
    def sleep_five(self):
        """Suspends execution time between 3 - 5 seconds."""
        return time.sleep(random.uniform(3, 5))

    def make_soup(self, url):
        """Returns soup object from a URL."""
        # generate random user-agent
        user_agent = {'User-agent': UserAgent().random}
        # request page and make soup
        page = requests.get(url, headers=user_agent)
        soup = BeautifulSoup(page.content, 'lxml')
        return soup

    def taste_soup(self, soup):
        """Taste test soup object."""
        try:
            taster = soup.title.text
            bad_soup = re.search(r'404: Not Found', taster)
        except AttributeError:
            bad_soup = None
        return bad_soup


## CreateCSV

In [9]:
class CreateCSV(SoupMaker):
    """Create CSVs of TED topics and languages."""

    def create_topics_csv(self):
        """Creates CSV of all topics available from TED."""
        soup = self.make_soup('https://www.ted.com/topics')
        topic_list = []
        topic_tag = soup.find_all(class_='d:b', style='line-height:3;')
        for tag in topic_tag:
            topic = re.sub(r'\s+', '', tag.text)
            topic_list.append(topic)
        topics_series = pd.Series(topic_list, name='Topic')
        topics_series.to_csv('../data/topics.csv', index=False)

    def create_languages_csv(self):
        """Creates CSV of all language codes supported by TED."""
        lang_url = 'https://www.ted.com/participate/translate/our-languages'
        soup = self.make_soup(lang_url)
        lang_list = []
        lang_tags = soup.find_all('div', class_='h9')
        for tag in lang_tags:
            if tag.a == None:
                continue
            else:
                lang_code = re.search(r'(?<=\=)[\w-]+', tag.a['href']).group(0)
                lang_name = tag.text
                lang_list.append([lang_code] + [lang_name])
        lang_df = pd.DataFrame(data=lang_list, columns=['lang_code', 'language'])
        lang_df.to_csv('../data/languages.csv', index=False)


## Talk Features

In [10]:
class TalkFeatures(SoupMaker):
    """Class to get TED talk features."""


    def get_talk_id(self, soup):
        """Returns the talk_id provided by TED."""
        talk_id = re.search(r"(?<=\"current_talk\":)\"(\d+)\"", str(soup)).group(1)
        return talk_id

    def get_title(self, soup):
        """Returns the title of the talk."""
        title_tag = soup.find(attrs={'name': 'title'}).attrs['content']
        tag_list = title_tag.split(':')
        title = ":".join(tag_list[1:]).lstrip()
        return title

    def get_speaker_1(self, soup):
        """Returns the first speaker in TED's speaker list."""
        try:
            speaker_tag = re.findall(r"(?<=\"speakers\":).*?\"}]", str(soup))[0]
            # convert to DataFrame
            speakers_df = pd.read_json(speaker_tag)
            full_name_raw = (speakers_df.loc[:, 'firstname'] + ' '
                         + speakers_df.loc[:, 'middleinitial'] + ' '
                         + speakers_df.loc[:, 'lastname'])
            full_name_clean = full_name_raw.str.replace('\s+', ' ')
            # transform series to a dict
            speaker = full_name_clean.iloc[0]
        except:
            speaker = re.search(r"(?<=\"speaker_name\":)\"(.*?)\"", str(soup)).group(1)
        return speaker

    def get_all_speakers(self, soup):
        """Returns dict of all speakers per talk."""
        try:
            speaker_tag = re.findall(r"(?<=\"speakers\":).*?\"}]", str(soup))[0]
            # convert to DataFrame
            speakers_df = pd.read_json(speaker_tag)
            full_name_raw = (speakers_df.loc[:, 'firstname'] + ' '
                         + speakers_df.loc[:, 'middleinitial'] + ' '
                         + speakers_df.loc[:, 'lastname'])
            full_name_clean = full_name_raw.str.replace('\s+', ' ')
            # transform series to a dict
            speakers = full_name_clean.to_dict()
        except:
            speakers = None
        return speakers

    def get_occupations(self, soup):
        """Returns list of the occupation(s) of the speaker(s) per talk."""
        try:
            occupations_tag = re.findall(r"(?<=\"speakers\":).*?\"}]", str(soup))[0]
            # convert json to DataFrame
            occupations_series = pd.read_json(occupations_tag)['description']
            if occupations_series.all():
                # clean and create dict
                occupations = occupations_series.str.lower().str.split(', ')
                occupations = occupations.to_dict()
            else:
                occupations = None
        except:
            occupations = None
        return occupations

    def get_about_speakers(self, soup):
        """Returns dict with each 'About the Speaker' blurb per talk."""
        try:
            speaker_tag = re.findall(r"(?<=\"speakers\":).*?\"}]", str(soup))[0]
            # convert to DataFrame
            about_series = pd.read_json(speaker_tag)['whotheyare']
            if about_series.all():
                # transform series to a dict
                about_speakers = about_series.to_dict()
            else:
                about_speakers = None
        except:
            about_speakers = None
        return about_speakers

    def get_views(self, soup):
        """Returns viewed count per talk."""
        view_count = re.search(r"(?<=\"viewed_count\":)\d+", str(soup)).group(0)
        return view_count

    def get_recorded_date(self, soup):
        """Returns date a talk was recorded."""
        try:
            tag = re.search(r"(?<=\"recorded_at\":\")[\d-]+", str(soup))
            recorded_at = tag.group(0)
        except:
            recorded_at = None
        return recorded_at

    def get_published_date(self, soup):
        """Returns date a talk was published in TED.com."""
        published_raw = soup.find(attrs={'itemprop': 'uploadDate'}).attrs['content']
        published_date = re.search(r"[\d-]+", published_raw).group(0)
        return published_date

    def get_event(self, soup):
        """Returns name of the event in which the talk was given."""
        event = re.search(r"(?<=\"event\":)\"(.*?)\"", str(soup)).group(1)
        return event
    
    def get_native_lang(self, soup):
        """Returns native language code for each talk as a string."""
        native_lang = re.search(r'(?<=nativeLanguage\":)\"(.*?)\"', str(soup)).group(1)
        return native_lang
    
    def get_available_lang(self, soup):
        """Returns list of all available languages (lang codes) for a talk."""
        languages = re.findall(r'(?<=languageCode\":)\"(.*?)\"', str(soup))
        clean_lang = sorted(list(set(languages)))
        return clean_lang

    def get_comments_count(self, soup):
        """Return the count of comments per talk."""
        try:
            comments_count = re.search(r"(?<=\"count\":)(\d+)", str(soup)).group(1)
        except AttributeError:
            comments_count = None
        return comments_count

    def get_duration(self, soup):
        """Returns duration of a talk in seconds."""
        duration =  re.search(r"(?<=\"duration\":)(\d+)", str(soup)).group(1)
        return duration

    def get_topics(self, soup):
        """Returns list of tags (topics) per talk."""
        match_obj = re.search(r"\"tag\":\"(.*?)\"", str(soup))
        topics = match_obj.group(1).split(',')
        return topics

    def get_related_talks(self, soup):
        """Returns dict (keys: id & title) of related talks."""
        related_tag = re.search(r"(?<=\"related_talks\":).*?]", str(soup)).group(0)
        related_sr = pd.read_json(related_tag)
        related_talks = dict(zip(related_sr['id'], related_sr['title']))
        return related_talks

    def get_talk_url(self, soup):
        """Returns url for each talk as a string."""
        talk_tag = soup.find(attrs={'property': 'og:url'}).attrs['content']
        talk_url = talk_tag.split('transcript')[0]
        return talk_url

    def get_description(self, soup):
        """Returns description of the talk."""
        desc_tag = soup.find(attrs={'property': 'og:description'}).attrs['content']
        talk_desc = desc_tag.split(': ', 1)[1]
        return talk_desc

    def get_transcript(self, soup):
        """Returns talk's transcript as a single string.""" 
        transcript = ''
        transcript_strings = []
        for div in soup.find_all('div', class_="Grid__cell flx-s:1 p-r:4"):
            for p in div.find_all('p'):
                # add every string in the transcript to a list
                transcript_strings.append(" ".join(p.text.split()))
            else:
                # after all strings have been added, create a single transcript string
                transcript = " ".join(transcript_strings)
        return transcript


## URLs

In [11]:
class URLs(SoupMaker):
    """Get and process urls to scrape."""

    
    def topics_url_param(self):
        """Returns string of the url query from topics parameter."""
        topics_param = ''
        if self.topics != 'all':
            if isinstance(self.topics, list):
                for topic in self.topics:
                    topics_param += ('&topics[]=' + topic)
            else:
                raise ValueError("'topics' param needs to be a list")
        return topics_param

    def get_max_page(self):
        """Returns max pagination number from www.ted.com/talks."""
        page_num = [1]
        # make soup from ted.com/talks with specified language
        soup = self.make_soup(self.base_url + '&page=1&sort=newest')
        # iterate through each pagination element and get the max
        page_elem = soup.find_all('a', class_='pagination__item pagination__link')
        for element in page_elem:
            page_num.append(int(element.text))
        return max(page_num)
    
    def get_all_url_paths(self):
        """Returns list of all the talk url paths available in www.ted.com/talks"""
        url_path_list = []
        # construct url with lang code specified by the user
        talks_url = (self.base_url + '&page=')
        # set range from 1 to the max page in the pagination element
        page_range = range(1, self.get_max_page()+1)
        # iterate through each page and get the url for each talk
        for i in page_range:
            # try a second attempt if first attempt fails
            for attempt in range(2):
                try:
                    talks_page_url = talks_url + str(i) + '&sort=newest'
                    soup = self.make_soup(talks_page_url)
                    # delay between searches
                    self.sleep_short()
                    for div in soup.find_all('div', attrs={'class': 'media__image'}):
                        for a in div.find_all('a'):
                            url_path_list.append(a.get('href'))
                except:
                    # delay before continuing to second attempt
                    self.sleep_two()
                # break from attempts loop if no exceptions are raised
                else:
                    break
        return url_path_list

    def get_all_urls(self):
        """Returns list of complete urls for each talk's transcript page."""
        # '/talks/jen_gunter_why_can_t_we_talk_about_periods?language=fa'
        url_list = []
        for url in self.get_all_url_paths():
            url_list.append(('https://www.ted.com'
                             + url.replace(
                                 # to replace
                                 '?language=' + self.lang_code,
                                 # replace with
                                 '/transcript' + '?language=' + self.lang_code)
                            ))
        return url_list
    
    def clean_urls(self, urls):
        """Returns list of clean urls from urls the user inputs."""
        clean_urls = []
        for idx, url in enumerate(urls):
            if url.startswith('https://www.ted.com/talks'):
                parts = url.split('/')
                joined = '/'.join(parts[:5])
                clean = joined.split('?')
                lang = clean[0] + '/transcript?language=' + self.lang_code
                topic = lang + self.topics_url_param()
                clean_urls.append(lang)
            else:
                print(f'bad url @ {idx} >> {url}')
                continue
        return clean_urls
    
    def url_issues(self):
        """Returns DataFrame of urls with known issues."""
        issues_df = pd.read_csv('../data/known_issues.csv')
        return issues_df
    
    def remove_urls_with_issues(self):
        """Remove urls with known issues to prevent unnecessary scraping."""
        urls = self.all_urls()
        final_urls = []
        removed_urls = []
        removed_counter = 0
        issues_df = pd.read_csv('../data/known_issues.csv')
        for url in urls:
            try:
                base_url = url.replace('transcript?language=' + self.lang_code, '')
                # is base url in the issues df?
                url_in_issues = (issues_df['url'] == base_url).any()
                # get the lang_codes of the base_url
                langs = issues_df.loc[issues_df['url'] == base_url, 'lang_code']
                # check if the url in issues_df
                if not url_in_issues:
                    final_urls.append(url)
                # if the url is in issues_df, check if it's for the same lang_code
                elif self.lang_code in langs.any():
                    removed_urls.append(url)
                    removed_counter += 1
                    continue
                else:
                    final_urls.append(url)
            except:
                removed_urls.append(url)
                removed_counter += 1
                continue
        if removed_urls:
            print(f"Removed the following {removed_counter} urls as they have "
                  "known issues:\n", removed_urls, end='\n\n')
        return final_urls

    def all_urls(self):
        """Return all urls based on parameter 'urls' without removing."""
        # define url attribute
        if self.urls == 'all':
            urls = self.get_all_urls()
        else:
            if isinstance(self.urls, list):
                urls = self.clean_urls(self.urls)
            else:
                raise ValueError("'urls' param needs to be a list")
        return urls

    def final_urls(self):
        """Return final urls to fetch."""
        # define url attribute
        if self.force_fetch:
            urls = self.all_urls()
        else:
            urls = self.remove_urls_with_issues()  
        return urls

    def seen_urls(self, url, attempt):
        """Returns attempt depending on seen urls for urls that fail."""
        if url not in self.seen:
            yield url
            seen.add(url)
        # if the url was appended earlier after 2 failed attempts
        # it means this is the last attempt (3)
        elif url in self.seen and attempt == 1:
            attempt = 3
        return attempt


## TEDscraper

In [12]:
class TEDscraper(TalkFeatures, URLs):
    """Gets urls and scrapes TED talk data in the specified language.

    Attributes:
        lang_code (str): Language code. Defaults to 'en'.
        language (str): Language name derived from lang_code.
        urls (list): URLs of talks. Defaults to 'all'.
        topics (list): Talk topics. Defaults to 'all'.
        exclude (bool): Exclude transcript. Defaults to False.
        ted_dict (dict): Dict to store ted talk features after scraping.
        dict_id (int): Index of nested dict in 'ted_dict'.
        failed_counter: Counts urls that failed to get scraped.
    """
 

    def __init__(self, lang_code='en', urls='all', topics='all',
                 force_fetch = False, exclude_transcript=False):
        self.lang_code = lang_code
        self.language = self.convert_lang_code()
        self.urls = urls
        self.topics = topics
        self.exclude = exclude_transcript
        self.ted_dict = {}
        self.dict_id = 0
        self.failed_counter = 0
        self.failed_urls = []
        self.force_fetch = force_fetch
        self.seen = set()
        self.base_url = ('https://www.ted.com/talks'
                         + '?language=' + self.lang_code
                         + self.topics_url_param())

    def scrape_all_features(self, soup):
        """Scrapes all features to a nested dict."""
        # create nested dict
        self.ted_dict[self.dict_id] = {}
        nested_dict = self.ted_dict[self.dict_id]
        # add the features to the nested dict
        nested_dict['talk_id'] = self.get_talk_id(soup)
        nested_dict['title'] = self.get_title(soup)
        nested_dict['speaker_1'] = self.get_speaker_1(soup)
        nested_dict['all_speakers'] = self.get_all_speakers(soup)
        nested_dict['occupations'] = self.get_occupations(soup)
        nested_dict['about_speakers'] = self.get_about_speakers(soup)
        nested_dict['views'] = self.get_views(soup)
        nested_dict['recorded_date'] = self.get_recorded_date(soup)
        nested_dict['published_date'] = self.get_published_date(soup)
        nested_dict['event'] = self.get_event(soup)
        nested_dict['native_lang'] = self.get_native_lang(soup)
        nested_dict['available_lang'] = self.get_available_lang(soup)
        nested_dict['comments'] = self.get_comments_count(soup)
        nested_dict['duration'] = self.get_duration(soup)
        nested_dict['topics'] = self.get_topics(soup)
        nested_dict['related_talks'] = self.get_related_talks(soup)
        nested_dict['url'] = self.get_talk_url(soup)
        nested_dict['description'] = self.get_description(soup)
        # add transcript if param is set to False (default)
        if not self.exclude:
            nested_dict['transcript'] = self.get_transcript(soup)
        return nested_dict

    def get_data(self):
        """Returns nested dictionary of features from each talk's transcript page."""
        print("Fetching urls...\n")
        urls = self.final_urls()
        print(f"Scraping {len(urls)} TED talks in '{self.language}'...")
        print(f"Estimated time to complete is {round((.9*len(urls)/60), 1)} minutes\n")
        # iterate through each TED talk transcript url
        for url in urls:
            # delay between each scrape
            self.sleep_short()
            # try up to three attempts
            for i in range(1, 4):
                # check if url has been seen, if true:
                # it means it previously failed twice so make it the final attempt
                attempt = self.seen_urls(url, i)
                try:
                    # make soup
                    soup = self.make_soup(url)                                        
                    # create nested dict
                    self.ted_dict[self.dict_id] = {}
                    # scrape features and add to a nested dict
                    self.scrape_all_features(soup)
                except Exception as e:
                    # taste if it's a bad soup
                    if self.taste_soup(soup):
                        print(f"[BAD_SOUP] {url}")
                        self.failed_urls.append(url)
                        self.failed_counter += 1
                        break
                    elif attempt == 1:
                        # 3-5 second delay before another attempt
                        self.sleep_five()
                        continue
                    elif attempt == 2:
                        # append the url to 'urls' to try again later
                        urls.append(url)
                        break
                    elif attempt == 3:
                        print(f"[EXCEPTION] {e} {url}")
                        self.failed_counter += 1
                        self.failed_urls.append(url)
                        break
                else:
                    # indicate successful scrape
                    print(f"[OK] {self.dict_id} {url}")
                    # add 1 to create a new nested dict
                    self.dict_id += 1
                    # exit attempts loop
                    break
        # print results
        print(f"""\nTed.com scraping results:
            \n\t• Successful: {self.dict_id}
            \n\t• Failed: {self.failed_counter}\n""")
        if self.failed_counter:
            print(f"Failed to scrape:\n{self.failed_urls}\n")
        return self.ted_dict

    def convert_lang_code(self):
        """Reads languages.csv and returns language.
        Parameters:
            lang_code (str): Language code
        """
        df = pd.read_csv('../data/languages.csv')
        lang_series = df.loc[(df['lang_code'] == self.lang_code), 'language']
        language = lang_series.values[0]
        return language

    def to_dataframe(self, ted_dict):
        """Returns sorted DataFrame object from dict."""
        df = pd.DataFrame.from_dict(ted_dict, orient='index')
        df = df.sort_values(by='published_date')
        sorted_df = df.reset_index(drop=True)
        return sorted_df


## Get Data


In [13]:
# instantiate the scraper & pass in optional arguments
scraper = TEDscraper(lang_code='en', urls=video, topics='all')

In [14]:
# scrape the data and save it to a dictionary
ted_dict = scraper.get_data()

Fetching urls...

Scraping 36 TED talks in 'English'...
Estimated time to complete is 0.5 minutes

[OK] 0 https://www.ted.com/talks/gareth_gaskell_how_do_our_brains_process_speech/transcript?language=en
[OK] 1 https://www.ted.com/talks/ethan_hawke_give_yourself_permission_to_be_creative/transcript?language=en
[OK] 2 https://www.ted.com/talks/matt_walker_how_caffeine_and_alcohol_affect_your_sleep/transcript?language=en
[OK] 3 https://www.ted.com/talks/iseult_gillespie_the_myth_of_jason_medea_and_the_golden_fleece/transcript?language=en
[OK] 4 https://www.ted.com/talks/kwame_owusu_kesse_a_comprehensive_neighborhood_based_response_to_covid_19/transcript?language=en
[OK] 5 https://www.ted.com/talks/philip_freeman_the_rise_and_fall_of_the_celtic_warriors/transcript?language=en
[OK] 6 https://www.ted.com/talks/tom_teves_a_call_to_end_the_media_coverage_mass_shooters_want/transcript?language=en
[OK] 7 https://www.ted.com/talks/alex_gendler_the_egyptian_myth_of_the_death_of_osiris/transcript?l

In [15]:
# transform the dictionary to a sorted pandas DataFrame
df2 = scraper.to_dataframe(ted_dict)


In [16]:
df2

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
0,64693,The new urgency of climate change,Al Gore,"{0: 'Al Gore', 1: 'Chris Anderson'}","{0: ['climate advocate'], 1: ['head of ted']}",{0: 'Nobel Laureate Al Gore focused the world'...,968368,2020-06-23,2020-06-25,TED2020,en,"[en, nl]",52.0,3405,"[environment, climate change, global issues, e...",{2441: 'The case for optimism on climate chang...,https://www.ted.com/talks/al_gore_the_new_urge...,The coronavirus brought much of the world to a...,"Chris Anderson: Al, welcome. So look, just six..."
1,64534,How Dolly Parton led me to an epiphany,Jad Abumrad,{0: 'Jad Abumrad'},"{0: ['radio host', 'producer']}","{0: 'As a producer, the creator and host of th...",863968,2020-06-04,2020-06-25,TED2020,en,"[en, pt-br, tr, zh-cn]",26.0,791,"[storytelling, media, arts, relationships, jou...","{2665: 'If a story moves you, act on it', 2813...",https://www.ted.com/talks/jad_abumrad_how_doll...,"How do you end a story? Host of ""Radiolab"" Jad...",I want to tell you about my search for purpose...
2,63727,"How women will lead us to freedom, justice and...",Ellen Johnson Sirleaf,{0: 'Ellen Johnson Sirleaf'},"{0: ['global leader', 'nobel laureate']}","{0: 'The former President of Liberia, Nobel la...",816748,2019-12-04,2020-06-26,TEDWomen 2019,en,"[ckb, en, es, hi, hr, ku, pt, ro, tr]",16.0,851,"[leadership, government, women, gender equalit...",{59155: 'How women are revolutionizing Rwanda'...,https://www.ted.com/talks/h_e_ellen_johnson_si...,"""I was the first woman president of an African...",I was the first woman president of an African ...
3,60319,The beauty and complexity of finding common gr...,Matt Trombley,{0: 'Matt Trombley'},"{0: ['maven', 'teacher', 'team builder']}","{0: 'Matt Trombley is an active, passionate an...",231413,2020-02-05,2020-06-26,TED@WellsFargo,en,"[ar, el, en, es, fr, pt-br, ro, zh-tw]",16.0,840,"[humanity, community, communication, personal ...","{55061: 'The language of being human', 1533: '...",https://www.ted.com/talks/matt_trombley_the_be...,"How can we disagree with one another, respectf...","So our story started several years ago, when m..."
4,64185,Stop being a bystander in your own life,Tracy Edwards,{0: 'Tracy Edwards'},{0: ['trailblazer']},{0: 'Tracy Edwards MBE battled against the odd...,991138,2020-05-18,2020-06-30,TED2020,en,"[en, hr, my, pt, pt-br, zh-cn]",11.0,674,"[life, adventure, personal growth, identity, s...",{60183: '2 questions to uncover your passion -...,https://www.ted.com/talks/tracy_edwards_stop_b...,"""Life doesn't go from A to B -- it's messy,"" s...",Being able to navigate is an extraordinary gif...
5,62845,What foods did your ancestors love?,Aparna Pallavi,{0: 'Aparna Pallavi'},{0: ['journalist']},{0: 'Aparna Pallavi researches and writes abou...,871204,2018-12-08,2020-06-30,TEDxCapeTownWomen,en,"[en, es, fa, mr, pt, pt-br, tr, zh-cn]",6.0,880,"[history, food, indigenous peoples, culture, a...","{24455: 'The history of chocolate', 424: 'The ...",https://www.ted.com/talks/aparna_pallavi_what_...,"Around the world, Indigenous food cultures van...","Last year, I was living with this indigenous f..."
6,63853,The courage to live with radical uncertainty,Shekinah Elmore,{0: 'Shekinah Elmore'},{0: ['oncologist']},{0: 'Shekinah Elmore is dedicated to pursuing ...,96332,2020-03-04,2020-07-01,TEDMED 2020,en,"[ar, bg, en, fr, ko, ro, zh-tw]",8.0,953,"[cancer, illness, life, health care, mental he...",{55027: 'The beautiful balance between courage...,https://www.ted.com/talks/shekinah_elmore_the_...,"When your future is uncertain, how do you keep...",What's the worst that could happen? Almost exa...
7,64835,How the pandemic will shape the near future,Bill Gates,"{0: 'Bill Gates', 1: 'Chris Anderson'}","{0: ['technologist', 'philanthropist'], 1: ['h...","{0: 'A passionate techie, Bill Gates changed t...",1170738,2020-06-28,2020-07-01,TED2020,en,"[en, nl, tr]",117.0,2587,"[coronavirus, pandemic, science, public health...",{61301: 'How we must respond to the coronaviru...,https://www.ted.com/talks/bill_gates_how_the_p...,Bill Gates talks best (and worst) case scenari...,"Chris Anderson: Welcome, Bill Gates. Bill Gate..."
8,64776,Can beauty open our hearts to difficult conver...,Titus Kaphar,{0: 'Titus Kaphar'},{0: ['artist']},{0: 'Titus Kaphar's artworks interact with the...,110993,2020-06-25,2020-07-01,TED2020,en,"[en, pt-br, zh-tw]",22.0,815,"[art, history, race, painting, social change, ...","{2826: 'Can art amend history?', 64511: 'How r...",https://www.ted.com/talks/titus_kaphar_can_bea...,An artwork's color or composition can pull you...,I believe there is beauty in hearing the voice...
9,64824,Scenes from a Black trans life,D-L Stewart,{0: 'D-L Stewart'},{0: ['scholar and activist']},{0: 'D-L Stewart empowers and imagines futures...,726797,2019-03-09,2020-07-02,TEDxCSU,en,"[ar, en, fr, pt-br]",21.0,919,"[gender, race, Gender spectrum, Transgender, s...",{36659: 'A short history of trans people's lon...,https://www.ted.com/talks/d_l_stewart_scenes_f...,"At the crossroads of life and livelihood, scho...","Hello. Hey. (Laughter) As you just heard, my n..."


In [17]:
# fill NoneType data into int(0)
for i in range(0,len(df2)):
    
    if str(type(df2['comments'][i])) =="<class 'NoneType'>":
        df2['comments'][i]=0
    print(i,type(df2['comments'][i]))
    if str(type(df2['views'][i])) =="<class 'NoneType'>":
        df2['views'][i]=0


0 <class 'str'>
1 <class 'str'>
2 <class 'str'>
3 <class 'str'>
4 <class 'str'>
5 <class 'str'>
6 <class 'str'>
7 <class 'str'>
8 <class 'str'>
9 <class 'str'>
10 <class 'int'>
11 <class 'int'>
12 <class 'int'>
13 <class 'str'>
14 <class 'int'>
15 <class 'int'>
16 <class 'str'>
17 <class 'str'>
18 <class 'int'>
19 <class 'str'>
20 <class 'int'>
21 <class 'str'>
22 <class 'int'>
23 <class 'str'>
24 <class 'int'>
25 <class 'str'>
26 <class 'str'>
27 <class 'int'>
28 <class 'str'>
29 <class 'str'>
30 <class 'int'>
31 <class 'str'>
32 <class 'int'>
33 <class 'str'>
34 <class 'str'>
35 <class 'int'>


In [18]:
# change data type

for i in range(0,len(df2)):
    df2['talk_id'][i]=int(df2['talk_id'][i])
    df2['views'][i]=int(df2['views'][i])
    df2['comments'][i]=int(df2['comments'][i])
    df2['duration'][i]=int(df2['duration'][i])
    

In [19]:
df2=df2.sort_values(['talk_id'])

df2

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
3,60319,The beauty and complexity of finding common gr...,Matt Trombley,{0: 'Matt Trombley'},"{0: ['maven', 'teacher', 'team builder']}","{0: 'Matt Trombley is an active, passionate an...",231413,2020-02-05,2020-06-26,TED@WellsFargo,en,"[ar, el, en, es, fr, pt-br, ro, zh-tw]",16,840,"[humanity, community, communication, personal ...","{55061: 'The language of being human', 1533: '...",https://www.ted.com/talks/matt_trombley_the_be...,"How can we disagree with one another, respectf...","So our story started several years ago, when m..."
5,62845,What foods did your ancestors love?,Aparna Pallavi,{0: 'Aparna Pallavi'},{0: ['journalist']},{0: 'Aparna Pallavi researches and writes abou...,871204,2018-12-08,2020-06-30,TEDxCapeTownWomen,en,"[en, es, fa, mr, pt, pt-br, tr, zh-cn]",6,880,"[history, food, indigenous peoples, culture, a...","{24455: 'The history of chocolate', 424: 'The ...",https://www.ted.com/talks/aparna_pallavi_what_...,"Around the world, Indigenous food cultures van...","Last year, I was living with this indigenous f..."
10,63654,What do all languages have in common?,Cameron Morin,{0: 'Cameron Morin'},,,470636,2020-06-29,2020-07-06,TED-Ed,en,"[en, fa, fr, ja, my, pl, pt, pt-br, ro, tr, zh...",0,302,"[education, TED-Ed, language, animation, speec...","{6857: 'The benefits of a bilingual brain', 24...",https://www.ted.com/talks/cameron_morin_what_d...,Language is endlessly variable. Each of us can...,Language is endlessly variable. Each of us can...
2,63727,"How women will lead us to freedom, justice and...",Ellen Johnson Sirleaf,{0: 'Ellen Johnson Sirleaf'},"{0: ['global leader', 'nobel laureate']}","{0: 'The former President of Liberia, Nobel la...",816748,2019-12-04,2020-06-26,TEDWomen 2019,en,"[ckb, en, es, hi, hr, ku, pt, ro, tr]",16,851,"[leadership, government, women, gender equalit...",{59155: 'How women are revolutionizing Rwanda'...,https://www.ted.com/talks/h_e_ellen_johnson_si...,"""I was the first woman president of an African...",I was the first woman president of an African ...
23,63749,The invisible life hidden beneath Antarctica's...,Ariel Waldman,{0: 'Ariel Waldman'},"{0: ['antarctic explorer', 'nasa advisor']}","{0: 'An artist who's pivoted to science, Ariel...",439226,2020-05-18,2020-07-14,TED2020,en,"[ar, en, es, nl, ro, zh-tw]",3,356,"[science, animals, exploration, Antarctica, na...",{2511: 'Drawings that show the beauty and frag...,https://www.ted.com/talks/ariel_waldman_the_in...,"In this tour of the microscopic world, explore...",Can you guess what this is? What if I told you...
17,63815,"Every day you live, you impact the planet",Jane Goodall,"{0: 'Jane Goodall', 1: 'Chris Anderson'}","{0: ['primatologist', 'conservationist'], 1: [...","{0: 'Dubbed ""the woman who redefined man"" by h...",703875,2020-05-18,2020-07-09,TED2020,en,[en],18,1514,"[nature, animals, humanity, climate change, co...",{340: 'How humans and animals can live togethe...,https://www.ted.com/talks/jane_goodall_every_d...,Legendary primatologist Jane Goodall says that...,"Chris Anderson: Dr. Jane Goodall, welcome. Jan..."
6,63853,The courage to live with radical uncertainty,Shekinah Elmore,{0: 'Shekinah Elmore'},{0: ['oncologist']},{0: 'Shekinah Elmore is dedicated to pursuing ...,96332,2020-03-04,2020-07-01,TEDMED 2020,en,"[ar, bg, en, fr, ko, ro, zh-tw]",8,953,"[cancer, illness, life, health care, mental he...",{55027: 'The beautiful balance between courage...,https://www.ted.com/talks/shekinah_elmore_the_...,"When your future is uncertain, how do you keep...",What's the worst that could happen? Almost exa...
4,64185,Stop being a bystander in your own life,Tracy Edwards,{0: 'Tracy Edwards'},{0: ['trailblazer']},{0: 'Tracy Edwards MBE battled against the odd...,991138,2020-05-18,2020-06-30,TED2020,en,"[en, hr, my, pt, pt-br, zh-cn]",11,674,"[life, adventure, personal growth, identity, s...",{60183: '2 questions to uncover your passion -...,https://www.ted.com/talks/tracy_edwards_stop_b...,"""Life doesn't go from A to B -- it's messy,"" s...",Being able to navigate is an extraordinary gif...
12,64208,The last chief of the Comanches and the fall o...,Dustin Tahmahkera,{0: 'Dustin Tahmahkera'},,,152063,2020-06-16,2020-07-06,TED-Ed,en,"[en, hu, pt, pt-br, tr, zh-tw]",0,363,"[TED-Ed, animation, education, indigenous peop...","{18514: 'Did the Amazons really exist?', 46519...",https://www.ted.com/talks/dustin_tahmahkera_th...,"Late one night in 1871, a group of riders desc...","Late one night in 1871, a group of riders desc..."
11,64418,Can you solve the Ragnarok riddle?,Dan Finkel,{0: 'Dan Finkel'},,,1014394,2020-06-30,2020-07-06,TED-Ed,en,"[en, fr, pt, pt-br, tr, zh-cn, zh-tw]",0,274,"[TED-Ed, education, math, animation]",{17849: 'Can you solve the giant cat army ridd...,https://www.ted.com/talks/dan_finkel_can_you_s...,"Ragnarok: The fabled end of the world, when gi...","Ragnarok. The fabled end of the world, when gi..."


In [20]:
# output DataFrame as CSV
df2.to_csv('../data/ted_talksVideo.csv', index=False)

In [21]:
df2

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
3,60319,The beauty and complexity of finding common gr...,Matt Trombley,{0: 'Matt Trombley'},"{0: ['maven', 'teacher', 'team builder']}","{0: 'Matt Trombley is an active, passionate an...",231413,2020-02-05,2020-06-26,TED@WellsFargo,en,"[ar, el, en, es, fr, pt-br, ro, zh-tw]",16,840,"[humanity, community, communication, personal ...","{55061: 'The language of being human', 1533: '...",https://www.ted.com/talks/matt_trombley_the_be...,"How can we disagree with one another, respectf...","So our story started several years ago, when m..."
5,62845,What foods did your ancestors love?,Aparna Pallavi,{0: 'Aparna Pallavi'},{0: ['journalist']},{0: 'Aparna Pallavi researches and writes abou...,871204,2018-12-08,2020-06-30,TEDxCapeTownWomen,en,"[en, es, fa, mr, pt, pt-br, tr, zh-cn]",6,880,"[history, food, indigenous peoples, culture, a...","{24455: 'The history of chocolate', 424: 'The ...",https://www.ted.com/talks/aparna_pallavi_what_...,"Around the world, Indigenous food cultures van...","Last year, I was living with this indigenous f..."
10,63654,What do all languages have in common?,Cameron Morin,{0: 'Cameron Morin'},,,470636,2020-06-29,2020-07-06,TED-Ed,en,"[en, fa, fr, ja, my, pl, pt, pt-br, ro, tr, zh...",0,302,"[education, TED-Ed, language, animation, speec...","{6857: 'The benefits of a bilingual brain', 24...",https://www.ted.com/talks/cameron_morin_what_d...,Language is endlessly variable. Each of us can...,Language is endlessly variable. Each of us can...
2,63727,"How women will lead us to freedom, justice and...",Ellen Johnson Sirleaf,{0: 'Ellen Johnson Sirleaf'},"{0: ['global leader', 'nobel laureate']}","{0: 'The former President of Liberia, Nobel la...",816748,2019-12-04,2020-06-26,TEDWomen 2019,en,"[ckb, en, es, hi, hr, ku, pt, ro, tr]",16,851,"[leadership, government, women, gender equalit...",{59155: 'How women are revolutionizing Rwanda'...,https://www.ted.com/talks/h_e_ellen_johnson_si...,"""I was the first woman president of an African...",I was the first woman president of an African ...
23,63749,The invisible life hidden beneath Antarctica's...,Ariel Waldman,{0: 'Ariel Waldman'},"{0: ['antarctic explorer', 'nasa advisor']}","{0: 'An artist who's pivoted to science, Ariel...",439226,2020-05-18,2020-07-14,TED2020,en,"[ar, en, es, nl, ro, zh-tw]",3,356,"[science, animals, exploration, Antarctica, na...",{2511: 'Drawings that show the beauty and frag...,https://www.ted.com/talks/ariel_waldman_the_in...,"In this tour of the microscopic world, explore...",Can you guess what this is? What if I told you...
17,63815,"Every day you live, you impact the planet",Jane Goodall,"{0: 'Jane Goodall', 1: 'Chris Anderson'}","{0: ['primatologist', 'conservationist'], 1: [...","{0: 'Dubbed ""the woman who redefined man"" by h...",703875,2020-05-18,2020-07-09,TED2020,en,[en],18,1514,"[nature, animals, humanity, climate change, co...",{340: 'How humans and animals can live togethe...,https://www.ted.com/talks/jane_goodall_every_d...,Legendary primatologist Jane Goodall says that...,"Chris Anderson: Dr. Jane Goodall, welcome. Jan..."
6,63853,The courage to live with radical uncertainty,Shekinah Elmore,{0: 'Shekinah Elmore'},{0: ['oncologist']},{0: 'Shekinah Elmore is dedicated to pursuing ...,96332,2020-03-04,2020-07-01,TEDMED 2020,en,"[ar, bg, en, fr, ko, ro, zh-tw]",8,953,"[cancer, illness, life, health care, mental he...",{55027: 'The beautiful balance between courage...,https://www.ted.com/talks/shekinah_elmore_the_...,"When your future is uncertain, how do you keep...",What's the worst that could happen? Almost exa...
4,64185,Stop being a bystander in your own life,Tracy Edwards,{0: 'Tracy Edwards'},{0: ['trailblazer']},{0: 'Tracy Edwards MBE battled against the odd...,991138,2020-05-18,2020-06-30,TED2020,en,"[en, hr, my, pt, pt-br, zh-cn]",11,674,"[life, adventure, personal growth, identity, s...",{60183: '2 questions to uncover your passion -...,https://www.ted.com/talks/tracy_edwards_stop_b...,"""Life doesn't go from A to B -- it's messy,"" s...",Being able to navigate is an extraordinary gif...
12,64208,The last chief of the Comanches and the fall o...,Dustin Tahmahkera,{0: 'Dustin Tahmahkera'},,,152063,2020-06-16,2020-07-06,TED-Ed,en,"[en, hu, pt, pt-br, tr, zh-tw]",0,363,"[TED-Ed, animation, education, indigenous peop...","{18514: 'Did the Amazons really exist?', 46519...",https://www.ted.com/talks/dustin_tahmahkera_th...,"Late one night in 1871, a group of riders desc...","Late one night in 1871, a group of riders desc..."
11,64418,Can you solve the Ragnarok riddle?,Dan Finkel,{0: 'Dan Finkel'},,,1014394,2020-06-30,2020-07-06,TED-Ed,en,"[en, fr, pt, pt-br, tr, zh-cn, zh-tw]",0,274,"[TED-Ed, education, math, animation]",{17849: 'Can you solve the giant cat army ridd...,https://www.ted.com/talks/dan_finkel_can_you_s...,"Ragnarok: The fabled end of the world, when gi...","Ragnarok. The fabled end of the world, when gi..."


In [22]:
driver=webdriver.Chrome()

In [23]:
driver.set_window_size(1024,1024)

In [24]:
url=df2['url'][0]
driver.get(str(url))

In [25]:
# //*[@id="content"]/div/div[2]/div[1]/div[2]/div/div/div[2]/ul/li[4]/button/div/span/svg

In [26]:
driver.find_element_by_xpath('//button[@class="sb"]').click()

In [27]:
types = driver.find_elements_by_xpath('//ul/span/div/div/button')
types[1].click()

In [28]:
driver.find_element_by_xpath("//a[contains(.//text(),' audio')]").click() 
# audio dowload, video download시 video로 변경

In [29]:
df[df['talk_id']==1]['url'][0]

NameError: name 'df' is not defined

In [None]:
driver.close()