# TEDscraper Notebook

In [4]:
import random
import re
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
from fake_useragent import UserAgent

import requests
from requests import request
from requests.compat import urljoin, urlparse
from requests.exceptions import HTTPError
from urllib.robotparser import RobotFileParser
from requests import Session
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import simplejson

driver=webdriver.Chrome() 


headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

url='https://www.ted.com/talks?language=en&page=1&sort=newest'
params={'language':'en','page':1,'sort':'newest'}


driver.get(url)
video=list()
for _ in driver.find_elements_by_class_name('talk-link'):
    video.append(_.find_element_by_css_selector('a').get_attribute('href'))
driver.close()

video

## Soup Maker

In [5]:
class SoupMaker:
    """Make soup objects and put your machine to sleep."""
    

    def sleep_short(self):
        """Suspends execution time between 0 - .2 seconds."""
        return time.sleep(random.uniform(0, .2))

    def sleep_two(self):
        """Suspends execution time between .5 - 2 seconds."""
        return time.sleep(random.uniform(.5, 2))
    
    def sleep_five(self):
        """Suspends execution time between 3 - 5 seconds."""
        return time.sleep(random.uniform(3, 5))

    def make_soup(self, url):
        """Returns soup object from a URL."""
        # generate random user-agent
        user_agent = {'User-agent': UserAgent().random}
        # request page and make soup
        page = requests.get(url, headers=user_agent)
        soup = BeautifulSoup(page.content, 'lxml')
        return soup

    def taste_soup(self, soup):
        """Taste test soup object."""
        try:
            taster = soup.title.text
            bad_soup = re.search(r'404: Not Found', taster)
        except AttributeError:
            bad_soup = None
        return bad_soup


## CreateCSV

In [6]:
class CreateCSV(SoupMaker):
    """Create CSVs of TED topics and languages."""


    def create_topics_csv(self):
        """Creates CSV of all topics available from TED."""
        soup = self.make_soup('https://www.ted.com/topics')
        topic_list = []
        topic_tag = soup.find_all(class_='d:b', style='line-height:3;')
        for tag in topic_tag:
            topic = re.sub(r'\s+', '', tag.text)
            topic_list.append(topic)
        topics_series = pd.Series(topic_list, name='Topic')
        topics_series.to_csv('../data/topics.csv', index=False)

    def create_languages_csv(self):
        """Creates CSV of all language codes supported by TED."""
        lang_url = 'https://www.ted.com/participate/translate/our-languages'
        soup = self.make_soup(lang_url)
        lang_list = []
        lang_tags = soup.find_all('div', class_='h9')
        for tag in lang_tags:
            if tag.a == None:
                continue
            else:
                lang_code = re.search(r'(?<=\=)[\w-]+', tag.a['href']).group(0)
                lang_name = tag.text
                lang_list.append([lang_code] + [lang_name])
        lang_df = pd.DataFrame(data=lang_list, columns=['lang_code', 'language'])
        lang_df.to_csv('../data/languages.csv', index=False)


## Talk Features

In [16]:
class TalkFeatures(SoupMaker):
    """Class to get TED talk features."""


    def get_talk_id(self, soup):
        """Returns the talk_id provided by TED."""
        talk_id = re.search('meta content="ted://talks/(\d+)',str(soup))

        return talk_id.group(1)

    def get_title(self, soup):
        """Returns the title of the talk."""
        title_tag = soup.find(attrs={'name': 'title'}).attrs['content']
        tag_list = title_tag.split(':')
        title = ":".join(tag_list[1:]).lstrip()
        return title

    def get_speaker_1(self, soup):
        """Returns the first speaker in TED's speaker list."""
        try:
            speaker_tag = re.findall(r"(?<=\"speakers\":).*?\"}]", str(soup))[0]
            # convert to DataFrame
            speakers_df = pd.read_json(speaker_tag)
            full_name_raw = (speakers_df.loc[:, 'firstname'] + ' '
                         + speakers_df.loc[:, 'middleinitial'] + ' '
                         + speakers_df.loc[:, 'lastname'])
            full_name_clean = full_name_raw.str.replace('\s+', ' ')
            # transform series to a dict
            speaker = full_name_clean.iloc[0]
        except:
            speaker = re.search(r"(?<=\"speaker_name\":)\"(.*?)\"", str(soup)).group(1)
        return speaker

    def get_all_speakers(self, soup):
        """Returns dict of all speakers per talk."""
        try:
            speaker_tag = re.findall(r"(?<=\"speakers\":).*?\"}]", str(soup))[0]
            # convert to DataFrame
            speakers_df = pd.read_json(speaker_tag)
            full_name_raw = (speakers_df.loc[:, 'firstname'] + ' '
                         + speakers_df.loc[:, 'middleinitial'] + ' '
                         + speakers_df.loc[:, 'lastname'])
            full_name_clean = full_name_raw.str.replace('\s+', ' ')
            # transform series to a dict
            speakers = full_name_clean.to_dict()
        except:
            speakers = None
        return speakers

    def get_occupations(self, soup):
        """Returns list of the occupation(s) of the speaker(s) per talk."""
        try:
            occupations_tag = re.findall(r"(?<=\"speakers\":).*?\"}]", str(soup))[0]
            # convert json to DataFrame
            occupations_series = pd.read_json(occupations_tag)['description']
            if occupations_series.all():
                # clean and create dict
                occupations = occupations_series.str.lower().str.split(', ')
                occupations = occupations.to_dict()
            else:
                occupations = None
        except:
            occupations = None
        return occupations

    def get_about_speakers(self, soup):
        """Returns dict with each 'About the Speaker' blurb per talk."""
        try:
            speaker_tag = re.findall(r"(?<=\"speakers\":).*?\"}]", str(soup))[0]
            # convert to DataFrame
            about_series = pd.read_json(speaker_tag)['whotheyare']
            if about_series.all():
                # transform series to a dict
                about_speakers = about_series.to_dict()
            else:
                about_speakers = None
        except:
            about_speakers = None
        return about_speakers

    def get_views(self, soup):
        """Returns viewed count per talk."""
        view_count = re.search(r"(?<=\"viewed_count\":)\d+", str(soup)).group(0)
        return view_count

    def get_recorded_date(self, soup):
        """Returns date a talk was recorded."""
        try:
            tag = re.search(r"(?<=\"recorded_at\":\")[\d-]+", str(soup))
            recorded_at = tag.group(0)
        except:
            recorded_at = None
        return recorded_at

    def get_published_date(self, soup):
        """Returns date a talk was published in TED.com."""
        published_raw = soup.find(attrs={'itemprop': 'uploadDate'}).attrs['content']
        published_date = re.search(r"[\d-]+", published_raw).group(0)
        return published_date

    def get_event(self, soup):
        """Returns name of the event in which the talk was given."""
        event = re.search(r"(?<=\"event\":)\"(.*?)\"", str(soup)).group(1)
        return event
    
    def get_native_lang(self, soup):
        """Returns native language code for each talk as a string."""
        native_lang = re.search(r'(?<=nativeLanguage\":)\"(.*?)\"', str(soup)).group(1)
        return native_lang
    
    def get_available_lang(self, soup):
        """Returns list of all available languages (lang codes) for a talk."""
        languages = re.findall(r'(?<=languageCode\":)\"(.*?)\"', str(soup))
        clean_lang = sorted(list(set(languages)))
        return clean_lang

    def get_comments_count(self, soup):
        """Return the count of comments per talk."""
        try:
            comments_count = re.search(r"(?<=\"count\":)(\d+)", str(soup)).group(1)
        except AttributeError:
            comments_count = None
        return comments_count

    def get_duration(self, soup):
        """Returns duration of a talk in seconds."""
        duration =  re.search(r"(?<=\"duration\":)(\d+)", str(soup)).group(1)
        return duration

    def get_topics(self, soup):
        """Returns list of tags (topics) per talk."""
        match_obj = re.search(r"\"tag\":\"(.*?)\"", str(soup))
        topics = match_obj.group(1).split(',')
        return topics

    def get_related_talks(self, soup):
        """Returns dict (keys: id & title) of related talks."""
        related_tag = re.search(r"(?<=\"related_talks\":).*?]", str(soup)).group(0)
        related_sr = pd.read_json(related_tag)
        related_talks = dict(zip(related_sr['id'], related_sr['title']))
        return related_talks

    def get_talk_url(self, soup):
        """Returns url for each talk as a string."""
        talk_tag = soup.find(attrs={'property': 'og:url'}).attrs['content']
        talk_url = talk_tag.split('transcript')[0]
        return talk_url

    def get_description(self, soup):
        """Returns description of the talk."""
        desc_tag = soup.find(attrs={'property': 'og:description'}).attrs['content']
        talk_desc = desc_tag.split(': ', 1)[1]
        return talk_desc

    def get_transcript(self, soup):
        """Returns talk's transcript as a single string.""" 
        transcript = ''
        transcript_strings = []
        for div in soup.find_all('div', class_="Grid__cell flx-s:1 p-r:4"):
            for p in div.find_all('p'):
                # add every string in the transcript to a list
                transcript_strings.append(" ".join(p.text.split()))
            else:
                # after all strings have been added, create a single transcript string
                transcript = " ".join(transcript_strings)
        return transcript
    
    def download_transcript_with_time(self, talk_id):
        soup = SoupMaker()
        script_soup = soup.make_soup("https://hls.ted.com/talks/"+talk_id+"/subtitles/en/full.vtt")
        scr = str(script_soup)[70:-18]

        script_dict={}
        index = 0;
        tmp = {}
    
        for part in scr.split('\n'):
            if part.find('--&gt;')!=-1:

                script_dict[index] = tmp.copy()
                tmp={}
                index += 1
       
                times = part.split('--&gt;')
                tmp['start'] = times[0][:-1]
                tmp['end'] = times[1][1:]

            elif part!='':
                try:
                    tmp['sentence1']
                except:
                    tmp['sentence1'] = part
                    continue
                try:
                    tmp['sentence2']
                except:
                    tmp['sentence2'] = part
                    continue
                try:
                    tmp['sentence3']
                except:
                    tmp['sentence3'] = part
        df = pd.DataFrame.from_dict(script_dict,orient='index')
        df.to_csv('../data/transcript_'+talk_id+'.csv', index=False)
        print("downloded at '../data/transcript_"+talk_id+".csv'")
        return

## URLs

In [8]:

class URLs(SoupMaker):
    """Get and process urls to scrape."""

    
    def topics_url_param(self):
        """Returns string of the url query from topics parameter."""
        topics_param = ''
        if self.topics != 'all':
            if isinstance(self.topics, list):
                for topic in self.topics:
                    topics_param += ('&topics[]=' + topic)
            else:
                raise ValueError("'topics' param needs to be a list")
        return topics_param

    def get_max_page(self):
        """Returns max pagination number from www.ted.com/talks."""
        page_num = [1]
        # make soup from ted.com/talks with specified language
        soup = self.make_soup(self.base_url + '&page=1&sort=newest')
        # iterate through each pagination element and get the max
        page_elem = soup.find_all('a', class_='pagination__item pagination__link')
        for element in page_elem:
            page_num.append(int(element.text))
        return max(page_num)
    
    def get_all_url_paths(self):
        """Returns list of all the talk url paths available in www.ted.com/talks"""
        url_path_list = []
        # construct url with lang code specified by the user
        talks_url = (self.base_url + '&page=')
        # set range from 1 to the max page in the pagination element
        page_range = range(1, self.get_max_page()+1)
        # iterate through each page and get the url for each talk
        for i in page_range:
            # try a second attempt if first attempt fails
            for attempt in range(2):
                try:
                    talks_page_url = talks_url + str(i) + '&sort=newest'
                    soup = self.make_soup(talks_page_url)
                    # delay between searches
                    self.sleep_short()
                    for div in soup.find_all('div', attrs={'class': 'media__image'}):
                        for a in div.find_all('a'):
                            url_path_list.append(a.get('href'))
                except:
                    # delay before continuing to second attempt
                    self.sleep_two()
                # break from attempts loop if no exceptions are raised
                else:
                    break
        return url_path_list

    def get_all_urls(self):
        """Returns list of complete urls for each talk's transcript page."""
        # '/talks/jen_gunter_why_can_t_we_talk_about_periods?language=fa'
        url_list = []
        for url in self.get_all_url_paths():
            url_list.append(('https://www.ted.com'
                             + url.replace(
                                 # to replace
                                 '?language=' + self.lang_code,
                                 # replace with
                                 '/transcript' + '?language=' + self.lang_code)
                            ))
        return url_list
    
    def clean_urls(self, urls):
        """Returns list of clean urls from urls the user inputs."""
        clean_urls = []
        for idx, url in enumerate(urls):
            if url.startswith('https://www.ted.com/talks'):
                parts = url.split('/')
                joined = '/'.join(parts[:5])
                clean = joined.split('?')
                lang = clean[0] + '/transcript?language=' + self.lang_code
                topic = lang + self.topics_url_param()
                clean_urls.append(lang)
            else:
                print(f'bad url @ {idx} >> {url}')
                continue
        return clean_urls
    
    def url_issues(self):
        """Returns DataFrame of urls with known issues."""
        issues_df = pd.read_csv('../data/known_issues.csv')
        return issues_df
    
    def remove_urls_with_issues(self):
        """Remove urls with known issues to prevent unnecessary scraping."""
        urls = self.all_urls()
        final_urls = []
        removed_urls = []
        removed_counter = 0
        issues_df = pd.read_csv('../data/known_issues.csv')
        for url in urls:
            try:
                base_url = url.replace('transcript?language=' + self.lang_code, '')
                # is base url in the issues df?
                url_in_issues = (issues_df['url'] == base_url).any()
                # get the lang_codes of the base_url
                langs = issues_df.loc[issues_df['url'] == base_url, 'lang_code']
                # check if the url in issues_df
                if not url_in_issues:
                    final_urls.append(url)
                # if the url is in issues_df, check if it's for the same lang_code
                elif self.lang_code in langs.any():
                    removed_urls.append(url)
                    removed_counter += 1
                    continue
                else:
                    final_urls.append(url)
            except:
                removed_urls.append(url)
                removed_counter += 1
                continue
        if removed_urls:
            print(f"Removed the following {removed_counter} urls as they have "
                  "known issues:\n", removed_urls, end='\n\n')
        return final_urls

    def all_urls(self):
        """Return all urls based on parameter 'urls' without removing."""
        # define url attribute
        if self.urls == 'all':
            urls = self.get_all_urls()
        else:
            if isinstance(self.urls, list):
                urls = self.clean_urls(self.urls)
            else:
                raise ValueError("'urls' param needs to be a list")
        return urls

    def final_urls(self):
        """Return final urls to fetch."""
        # define url attribute
        if self.force_fetch:
            urls = self.all_urls()
        else:
            urls = self.remove_urls_with_issues()  
        return urls

    def seen_urls(self, url, attempt):
        """Returns attempt depending on seen urls for urls that fail."""
        if url not in self.seen:
            yield url
            seen.add(url)
        # if the url was appended earlier after 2 failed attempts
        # it means this is the last attempt (3)
        elif url in self.seen and attempt == 1:
            attempt = 3
        return attempt


## TEDscraper

In [9]:
class TEDscraper(TalkFeatures, URLs):
    """Gets urls and scrapes TED talk data in the specified language.

    Attributes:
        lang_code (str): Language code. Defaults to 'en'.
        language (str): Language name derived from lang_code.
        urls (list): URLs of talks. Defaults to 'all'.
        topics (list): Talk topics. Defaults to 'all'.
        exclude (bool): Exclude transcript. Defaults to False.
        ted_dict (dict): Dict to store ted talk features after scraping.
        dict_id (int): Index of nested dict in 'ted_dict'.
        failed_counter: Counts urls that failed to get scraped.
    """
 

    def __init__(self, lang_code='en', urls='all', topics='all',
                 force_fetch = False, exclude_transcript=False):
        self.lang_code = lang_code
        self.language = self.convert_lang_code()
        self.urls = urls
        self.topics = topics
        self.exclude = exclude_transcript
        self.ted_dict = {}
        self.dict_id = 0
        self.failed_counter = 0
        self.failed_urls = []
        self.force_fetch = force_fetch
        self.seen = set()
        self.base_url = ('https://www.ted.com/talks'
                         + '?language=' + self.lang_code
                         + self.topics_url_param())

    def scrape_all_features(self, soup):
        """Scrapes all features to a nested dict."""
        # create nested dict
        self.ted_dict[self.dict_id] = {}
        nested_dict = self.ted_dict[self.dict_id]
        # add the features to the nested dict
        nested_dict['talk_id'] = self.get_talk_id(soup)
        nested_dict['title'] = self.get_title(soup)
        nested_dict['speaker_1'] = self.get_speaker_1(soup)
        nested_dict['all_speakers'] = self.get_all_speakers(soup)
        nested_dict['occupations'] = self.get_occupations(soup)
        nested_dict['about_speakers'] = self.get_about_speakers(soup)
        nested_dict['views'] = self.get_views(soup)
        nested_dict['recorded_date'] = self.get_recorded_date(soup)
        nested_dict['published_date'] = self.get_published_date(soup)
        nested_dict['event'] = self.get_event(soup)
        nested_dict['native_lang'] = self.get_native_lang(soup)
        nested_dict['available_lang'] = self.get_available_lang(soup)
        nested_dict['comments'] = self.get_comments_count(soup)
        nested_dict['duration'] = self.get_duration(soup)
        nested_dict['topics'] = self.get_topics(soup)
        nested_dict['related_talks'] = self.get_related_talks(soup)
        nested_dict['url'] = self.get_talk_url(soup)
        nested_dict['description'] = self.get_description(soup)
        # add transcript if param is set to False (default)
        if not self.exclude:
            nested_dict['transcript'] = self.get_transcript(soup)
        return nested_dict

    def get_data(self):
        """Returns nested dictionary of features from each talk's transcript page."""
        print("Fetching urls...\n")
        urls = self.final_urls()
        print(f"Scraping {len(urls)} TED talks in '{self.language}'...")
        print(f"Estimated time to complete is {round((.9*len(urls)/60), 1)} minutes\n")
        # iterate through each TED talk transcript url
        for url in urls:
            # delay between each scrape
            self.sleep_short()
            # try up to three attempts
            for i in range(1, 4):
                # check if url has been seen, if true:
                # it means it previously failed twice so make it the final attempt
                attempt = self.seen_urls(url, i)
                try:
                    # make soup
                    soup = self.make_soup(url)                                        
                    # create nested dict
                    self.ted_dict[self.dict_id] = {}
                    # scrape features and add to a nested dict
                    self.scrape_all_features(soup)
                except Exception as e:
                    # taste if it's a bad soup
                    if self.taste_soup(soup):
                        print(f"[BAD_SOUP] {url}")
                        self.failed_urls.append(url)
                        self.failed_counter += 1
                        break
                    elif attempt == 1:
                        # 3-5 second delay before another attempt
                        self.sleep_five()
                        continue
                    elif attempt == 2:
                        # append the url to 'urls' to try again later
                        urls.append(url)
                        break
                    elif attempt == 3:
                        print(f"[EXCEPTION] {e} {url}")
                        self.failed_counter += 1
                        self.failed_urls.append(url)
                        break
                else:
                    # indicate successful scrape
                    print(f"[OK] {self.dict_id} {url}")
                    # add 1 to create a new nested dict
                    self.dict_id += 1
                    # exit attempts loop
                    break
        # print results
        print(f"""\nTed.com scraping results:
            \n\t• Successful: {self.dict_id}
            \n\t• Failed: {self.failed_counter}\n""")
        if self.failed_counter:
            print(f"Failed to scrape:\n{self.failed_urls}\n")
        return self.ted_dict

    def convert_lang_code(self):
        """Reads languages.csv and returns language.
        Parameters:
            lang_code (str): Language code
        """
        df = pd.read_csv('../data/languages.csv')
        lang_series = df.loc[(df['lang_code'] == self.lang_code), 'language']
        language = lang_series.values[0]
        return language

    def to_dataframe(self, ted_dict):
        """Returns sorted DataFrame object from dict."""
        df = pd.DataFrame.from_dict(ted_dict, orient='index')
        df = df.sort_values(by='published_date')
        sorted_df = df.reset_index(drop=True)
        return sorted_df


## Get Data


In [18]:


soup = SoupMaker()
talk = TalkFeatures()
ted_url = "https://www.ted.com/talks/blaise_aguera_y_arcas_how_computers_are_learning_to_be_creative"
ms = soup.make_soup(ted_url)
talk_id = talk.get_talk_id(ms)
talk.download_transcript_with_time(talk_id)

downloded at '../data/transcript_2533.csv'


In [140]:
# instantiate the scraper & pass in optional arguments
scraper = TEDscraper(lang_code='en', urls='all', topics='all')

In [None]:
# scrape the data and save it to a dictionary
ted_dict = scraper.get_data()

Fetching urls...



In [68]:
# transform the dictionary to a sorted pandas DataFrame
df = scraper.to_dataframe(ted_dict)


In [69]:
df

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
0,1,Averting the climate crisis,Al Gore,{0: 'Al Gore'},{0: ['climate advocate']},{0: 'Nobel Laureate Al Gore focused the world'...,3540904,2006-02-25,2006-06-27,TED2006,en,"[ar, bg, cs, de, el, en, es, fa, fr, fr-ca, gl...",273,977,"[alternative energy, cars, climate change, cul...","{243: 'New thinking on the climate crisis', 54...",https://www.ted.com/talks/al_gore_averting_the...,With the same humor and humanity he exuded in ...,"Thank you so much, Chris. And it's truly a gre..."
1,92,The best stats you've ever seen,Hans Rosling,{0: 'Hans Rosling'},{0: ['global health expert; data visionary']},"{0: 'In Hans Rosling’s hands, data sings. Glob...",14645889,2006-02-22,2006-06-27,TED2006,en,"[ar, az, bg, bn, bs, cs, da, de, el, en, es, e...",634,1190,"[Africa, Asia, Google, demo, economics, global...","{2056: 'Own your body's data', 2296: 'A visual...",https://www.ted.com/talks/hans_rosling_the_bes...,You've never seen data presented like this. Wi...,"About 10 years ago, I took on the task to teac..."
2,7,Simplicity sells,David Pogue,{0: 'David Pogue'},{0: ['technology columnist']},{0: 'David Pogue is the personal technology co...,1933628,2006-02-24,2006-06-27,TED2006,en,"[ar, bg, de, el, en, es, fa, fr, he, hr, hu, i...",125,1286,"[computers, entertainment, interface design, m...","{1725: '10 top time-saving tech tips', 2274: '...",https://www.ted.com/talks/david_pogue_simplici...,New York Times columnist David Pogue takes aim...,"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,53,Greening the ghetto,Majora Carter,{0: 'Majora Carter'},{0: ['activist for environmental justice']},{0: 'Majora Carter redefined the field of envi...,2714748,2006-02-26,2006-06-27,TED2006,en,"[ar, bg, bn, ca, cs, de, en, es, fa, fi, fr, h...",219,1116,"[MacArthur grant, activism, business, cities, ...",{1041: '3 stories of local eco-entrepreneurshi...,https://www.ted.com/talks/majora_carter_greeni...,"In an emotionally charged talk, MacArthur-winn...",If you're here today — and I'm very happy that...
4,66,Do schools kill creativity?,Sir Ken Robinson,{0: 'Sir Ken Robinson'},"{0: ['author', 'educator']}",{0: 'Creativity expert Sir Ken Robinson challe...,66006508,2006-02-25,2006-06-27,TED2006,en,"[af, ar, az, be, bg, bn, ca, cs, da, de, el, e...",4959,1164,"[children, creativity, culture, dance, educati...","{865: 'Bring on the learning revolution!', 173...",https://www.ted.com/talks/sir_ken_robinson_do_...,Sir Ken Robinson makes an entertaining and pro...,Good morning. How are you? (Audience) Good. It...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4122,65074,"A comprehensive, neighborhood-based response t...",Kwame Owusu-Kesse,{0: 'Kwame Owusu-Kesse'},{0: ['community leader']},{0: 'Kwame Owusu-Kesse ensures the programming...,511464,2020-06-18,2020-07-21,TED2020,en,"[en, tr]",8,396,"[coronavirus, education, community, Audacious ...",{63813: 'How to quickly scale up contact traci...,https://www.ted.com/talks/kwame_owusu_kesse_a_...,Crisis interventions often focus on a single a...,Good evening. It is such a blessing to work at...
4123,65280,"The myth of Jason, Medea, and the Golden Fleece",Iseult Gillespie,{0: 'Iseult Gillespie'},,,183671,2020-07-21,2020-07-21,TED-Ed,en,"[en, es, fr, zh-tw]",,269,"[TED-Ed, education, animation, history, war, v...",{46592: 'The myth of Jason and the Argonauts '...,https://www.ted.com/talks/iseult_gillespie_the...,"In Colchis, the hide of a mystical flying ram ...",In the center of Colchis in an enchanted garde...
4124,65061,How caffeine and alcohol affect your sleep,Matt Walker,{0: 'Matt Walker'},"{0: ['sleep scientist', 'professor', 'author']}",{0: 'Matt Walker is a brain scientist trying t...,53038,2020-07-22,2020-07-22,Sleeping with Science,en,"[en, fr, hi]",5,294,"[sleep, science, health, human body]","{65060: 'A walk through the stages of sleep', ...",https://www.ted.com/talks/matt_walker_how_caff...,"Caffeine wakes you up, and alcohol makes you n...",Many of us like to start the day with a cup of...
4125,65073,Give yourself permission to be creative,Ethan Hawke,{0: 'Ethan Hawke'},"{0: ['actor', 'writer', 'director']}","{0: '""Ethan Hawke has quietly emerged as perha...",543518,2020-06-25,2020-07-23,TED2020,en,"[en, ro]",22,556,"[creativity, arts, life, humanity, vulnerabili...","{64173: 'How drawing can set you free', 60752:...",https://www.ted.com/talks/ethan_hawke_give_you...,"Reflecting on moments that shaped his life, ac...",I was hoping today to talk a little bit about ...


In [73]:
# fill NoneType data into int(0)
for i in range(0,len(df)):
    
    if str(type(df['comments'][i])) =="<class 'NoneType'>":
        df['comments'][i]=0
    print(i,type(df['comments'][i]))
    if str(type(df['views'][i])) =="<class 'NoneType'>":
        df['views'][i]=0

0 <class 'int'>
1 <class 'int'>
2 <class 'int'>
3 <class 'int'>
4 <class 'int'>
5 <class 'int'>
6 <class 'int'>
7 <class 'int'>
8 <class 'int'>
9 <class 'int'>
10 <class 'int'>
11 <class 'int'>
12 <class 'int'>
13 <class 'int'>
14 <class 'int'>
15 <class 'int'>
16 <class 'int'>
17 <class 'int'>
18 <class 'int'>
19 <class 'int'>
20 <class 'int'>
21 <class 'int'>
22 <class 'int'>
23 <class 'int'>
24 <class 'int'>
25 <class 'int'>
26 <class 'int'>
27 <class 'int'>
28 <class 'int'>
29 <class 'int'>
30 <class 'int'>
31 <class 'int'>
32 <class 'int'>
33 <class 'int'>
34 <class 'int'>
35 <class 'int'>
36 <class 'int'>
37 <class 'int'>
38 <class 'int'>
39 <class 'int'>
40 <class 'int'>
41 <class 'int'>
42 <class 'int'>
43 <class 'int'>
44 <class 'int'>
45 <class 'int'>
46 <class 'int'>
47 <class 'int'>
48 <class 'int'>
49 <class 'int'>
50 <class 'int'>
51 <class 'int'>
52 <class 'int'>
53 <class 'int'>
54 <class 'int'>
55 <class 'int'>
56 <class 'int'>
57 <class 'int'>
58 <class 'int'>
59 <cla

804 <class 'int'>
805 <class 'int'>
806 <class 'int'>
807 <class 'int'>
808 <class 'int'>
809 <class 'int'>
810 <class 'int'>
811 <class 'int'>
812 <class 'int'>
813 <class 'int'>
814 <class 'int'>
815 <class 'int'>
816 <class 'int'>
817 <class 'int'>
818 <class 'int'>
819 <class 'int'>
820 <class 'int'>
821 <class 'int'>
822 <class 'int'>
823 <class 'int'>
824 <class 'int'>
825 <class 'int'>
826 <class 'int'>
827 <class 'int'>
828 <class 'int'>
829 <class 'int'>
830 <class 'int'>
831 <class 'int'>
832 <class 'int'>
833 <class 'int'>
834 <class 'int'>
835 <class 'int'>
836 <class 'int'>
837 <class 'int'>
838 <class 'int'>
839 <class 'int'>
840 <class 'int'>
841 <class 'int'>
842 <class 'int'>
843 <class 'int'>
844 <class 'int'>
845 <class 'int'>
846 <class 'int'>
847 <class 'int'>
848 <class 'int'>
849 <class 'int'>
850 <class 'int'>
851 <class 'int'>
852 <class 'int'>
853 <class 'int'>
854 <class 'int'>
855 <class 'int'>
856 <class 'int'>
857 <class 'int'>
858 <class 'int'>
859 <class

1647 <class 'int'>
1648 <class 'int'>
1649 <class 'int'>
1650 <class 'int'>
1651 <class 'int'>
1652 <class 'int'>
1653 <class 'int'>
1654 <class 'int'>
1655 <class 'int'>
1656 <class 'int'>
1657 <class 'int'>
1658 <class 'int'>
1659 <class 'int'>
1660 <class 'int'>
1661 <class 'int'>
1662 <class 'int'>
1663 <class 'int'>
1664 <class 'int'>
1665 <class 'int'>
1666 <class 'int'>
1667 <class 'int'>
1668 <class 'int'>
1669 <class 'int'>
1670 <class 'int'>
1671 <class 'int'>
1672 <class 'int'>
1673 <class 'int'>
1674 <class 'int'>
1675 <class 'int'>
1676 <class 'int'>
1677 <class 'int'>
1678 <class 'int'>
1679 <class 'int'>
1680 <class 'int'>
1681 <class 'int'>
1682 <class 'int'>
1683 <class 'int'>
1684 <class 'int'>
1685 <class 'int'>
1686 <class 'int'>
1687 <class 'int'>
1688 <class 'int'>
1689 <class 'int'>
1690 <class 'int'>
1691 <class 'int'>
1692 <class 'int'>
1693 <class 'int'>
1694 <class 'int'>
1695 <class 'int'>
1696 <class 'int'>
1697 <class 'int'>
1698 <class 'int'>
1699 <class 

2445 <class 'int'>
2446 <class 'int'>
2447 <class 'int'>
2448 <class 'int'>
2449 <class 'int'>
2450 <class 'int'>
2451 <class 'int'>
2452 <class 'int'>
2453 <class 'int'>
2454 <class 'int'>
2455 <class 'int'>
2456 <class 'int'>
2457 <class 'int'>
2458 <class 'int'>
2459 <class 'int'>
2460 <class 'int'>
2461 <class 'int'>
2462 <class 'int'>
2463 <class 'int'>
2464 <class 'int'>
2465 <class 'int'>
2466 <class 'int'>
2467 <class 'int'>
2468 <class 'int'>
2469 <class 'int'>
2470 <class 'int'>
2471 <class 'int'>
2472 <class 'int'>
2473 <class 'int'>
2474 <class 'int'>
2475 <class 'int'>
2476 <class 'int'>
2477 <class 'int'>
2478 <class 'int'>
2479 <class 'int'>
2480 <class 'int'>
2481 <class 'int'>
2482 <class 'int'>
2483 <class 'int'>
2484 <class 'int'>
2485 <class 'int'>
2486 <class 'int'>
2487 <class 'int'>
2488 <class 'int'>
2489 <class 'int'>
2490 <class 'int'>
2491 <class 'int'>
2492 <class 'int'>
2493 <class 'int'>
2494 <class 'int'>
2495 <class 'int'>
2496 <class 'int'>
2497 <class 

3286 <class 'int'>
3287 <class 'int'>
3288 <class 'int'>
3289 <class 'int'>
3290 <class 'int'>
3291 <class 'int'>
3292 <class 'int'>
3293 <class 'int'>
3294 <class 'int'>
3295 <class 'int'>
3296 <class 'int'>
3297 <class 'int'>
3298 <class 'int'>
3299 <class 'int'>
3300 <class 'int'>
3301 <class 'int'>
3302 <class 'int'>
3303 <class 'int'>
3304 <class 'int'>
3305 <class 'int'>
3306 <class 'int'>
3307 <class 'int'>
3308 <class 'int'>
3309 <class 'int'>
3310 <class 'int'>
3311 <class 'int'>
3312 <class 'int'>
3313 <class 'int'>
3314 <class 'int'>
3315 <class 'int'>
3316 <class 'int'>
3317 <class 'int'>
3318 <class 'int'>
3319 <class 'int'>
3320 <class 'int'>
3321 <class 'int'>
3322 <class 'int'>
3323 <class 'int'>
3324 <class 'int'>
3325 <class 'int'>
3326 <class 'int'>
3327 <class 'int'>
3328 <class 'int'>
3329 <class 'int'>
3330 <class 'int'>
3331 <class 'int'>
3332 <class 'int'>
3333 <class 'int'>
3334 <class 'int'>
3335 <class 'int'>
3336 <class 'int'>
3337 <class 'int'>
3338 <class 

3999 <class 'int'>
4000 <class 'int'>
4001 <class 'int'>
4002 <class 'int'>
4003 <class 'int'>
4004 <class 'int'>
4005 <class 'int'>
4006 <class 'int'>
4007 <class 'int'>
4008 <class 'int'>
4009 <class 'int'>
4010 <class 'int'>
4011 <class 'int'>
4012 <class 'int'>
4013 <class 'int'>
4014 <class 'int'>
4015 <class 'int'>
4016 <class 'int'>
4017 <class 'int'>
4018 <class 'int'>
4019 <class 'int'>
4020 <class 'int'>
4021 <class 'int'>
4022 <class 'int'>
4023 <class 'int'>
4024 <class 'int'>
4025 <class 'int'>
4026 <class 'int'>
4027 <class 'int'>
4028 <class 'int'>
4029 <class 'int'>
4030 <class 'int'>
4031 <class 'int'>
4032 <class 'int'>
4033 <class 'int'>
4034 <class 'int'>
4035 <class 'int'>
4036 <class 'int'>
4037 <class 'int'>
4038 <class 'int'>
4039 <class 'int'>
4040 <class 'int'>
4041 <class 'int'>
4042 <class 'int'>
4043 <class 'int'>
4044 <class 'int'>
4045 <class 'int'>
4046 <class 'int'>
4047 <class 'int'>
4048 <class 'int'>
4049 <class 'int'>
4050 <class 'int'>
4051 <class 

In [74]:
# change data type

for i in range(0,len(df)):
    df['talk_id'][i]=int(df['talk_id'][i])
    df['views'][i]=int(df['views'][i])
    df['comments'][i]=int(df['comments'][i])
    df['duration'][i]=int(df['duration'][i])
    

In [75]:
df=df.sort_values(['talk_id'])

df

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
0,1,Averting the climate crisis,Al Gore,{0: 'Al Gore'},{0: ['climate advocate']},{0: 'Nobel Laureate Al Gore focused the world'...,3540904,2006-02-25,2006-06-27,TED2006,en,"[ar, bg, cs, de, el, en, es, fa, fr, fr-ca, gl...",273,977,"[alternative energy, cars, climate change, cul...","{243: 'New thinking on the climate crisis', 54...",https://www.ted.com/talks/al_gore_averting_the...,With the same humor and humanity he exuded in ...,"Thank you so much, Chris. And it's truly a gre..."
16,2,Simple designs to save a life,Amy Smith,{0: 'Amy Smith'},"{0: ['inventor', 'engineer']}","{0: 'Amy Smith designs cheap, practical fixes ...",1741302,2006-02-24,2006-08-15,TED2006,en,"[ar, bg, ca, de, el, en, es, fa, fr, fr-ca, he...",101,906,"[MacArthur grant, alternative energy, design, ...","{1561: 'Energy from floating algae pods', 1072...",https://www.ted.com/talks/amy_smith_simple_des...,Fumes from indoor cooking fires kill more than...,"In terms of invention, I'd like to tell you th..."
35,3,How to rebuild a broken state,Ashraf Ghani,{0: 'Ashraf Ghani'},{0: ['president of afghanistan']},{0: 'Afghanistan's president Ashraf Ghani has ...,994796,2005-07-12,2006-10-18,TEDGlobal 2005,en,"[ar, bg, cs, de, el, en, es, fa, fr, he, hu, i...",75,1125,"[business, corruption, culture, economics, ent...","{127: 'Want to help Africa? Do business here',...",https://www.ted.com/talks/ashraf_ghani_how_to_...,Ashraf Ghani's passionate and powerful 10-minu...,"A public, Dewey long ago observed, is constitu..."
36,4,The real future of space exploration,Burt Rutan,{0: 'Burt Rutan'},{0: ['aircraft engineer']},"{0: 'In 2004, legendary spacecraft designer Bu...",2445177,2006-02-24,2006-10-25,TED2006,en,"[ar, az, bg, cs, de, el, en, es, fa, fi, fr, h...",196,1177,"[NASA, aircraft, business, design, engineering...","{141: 'Inside the world's deepest caves', 264:...",https://www.ted.com/talks/burt_rutan_the_real_...,"In this passionate talk, legendary spacecraft ...","I want to start off by saying, Houston, we hav..."
74,5,Great cars are great art,Chris Bangle,{0: 'Chris Bangle'},{0: ['car designer']},{0: 'Car design is a ubiquitous but often over...,983639,2002-02-02,2007-04-05,TED2002,en,"[ar, bg, de, en, es, fr, he, hr, hu, it, ja, k...",82,1204,"[business, cars, design, industrial design, in...","{4: 'The real future of space exploration', 26...",https://www.ted.com/talks/chris_bangle_great_c...,American designer Chris Bangle explains his ph...,"What I want to talk about is, as background, i..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4116,65060,A walk through the stages of sleep,Matt Walker,{0: 'Matt Walker'},"{0: ['sleep scientist', 'professor', 'author']}",{0: 'Matt Walker is a brain scientist trying t...,56561,2020-07-15,2020-07-15,Sleeping with Science,en,"[en, fr, pt-br, tr]",7,287,"[sleep, health, human body, science]","{41105: 'Sleep is your superpower', 61900: 'Wh...",https://www.ted.com/talks/matt_walker_a_walk_t...,Did you know you go on a journey every night a...,Sleep is perhaps the single most effective thi...
4124,65061,How caffeine and alcohol affect your sleep,Matt Walker,{0: 'Matt Walker'},"{0: ['sleep scientist', 'professor', 'author']}",{0: 'Matt Walker is a brain scientist trying t...,53038,2020-07-22,2020-07-22,Sleeping with Science,en,"[en, fr, hi]",5,294,"[sleep, science, health, human body]","{65060: 'A walk through the stages of sleep', ...",https://www.ted.com/talks/matt_walker_how_caff...,"Caffeine wakes you up, and alcohol makes you n...",Many of us like to start the day with a cup of...
4125,65073,Give yourself permission to be creative,Ethan Hawke,{0: 'Ethan Hawke'},"{0: ['actor', 'writer', 'director']}","{0: '""Ethan Hawke has quietly emerged as perha...",543518,2020-06-25,2020-07-23,TED2020,en,"[en, ro]",22,556,"[creativity, arts, life, humanity, vulnerabili...","{64173: 'How drawing can set you free', 60752:...",https://www.ted.com/talks/ethan_hawke_give_you...,"Reflecting on moments that shaped his life, ac...",I was hoping today to talk a little bit about ...
4122,65074,"A comprehensive, neighborhood-based response t...",Kwame Owusu-Kesse,{0: 'Kwame Owusu-Kesse'},{0: ['community leader']},{0: 'Kwame Owusu-Kesse ensures the programming...,511464,2020-06-18,2020-07-21,TED2020,en,"[en, tr]",8,396,"[coronavirus, education, community, Audacious ...",{63813: 'How to quickly scale up contact traci...,https://www.ted.com/talks/kwame_owusu_kesse_a_...,Crisis interventions often focus on a single a...,Good evening. It is such a blessing to work at...


In [77]:
# output DataFrame as CSV
df.to_csv('../data/ted_talks.csv', index=False)

In [116]:
df

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
0,1,Averting the climate crisis,Al Gore,{0: 'Al Gore'},{0: ['climate advocate']},{0: 'Nobel Laureate Al Gore focused the world'...,3540904,2006-02-25,2006-06-27,TED2006,en,"[ar, bg, cs, de, el, en, es, fa, fr, fr-ca, gl...",273,977,"[alternative energy, cars, climate change, cul...","{243: 'New thinking on the climate crisis', 54...",https://www.ted.com/talks/al_gore_averting_the...,With the same humor and humanity he exuded in ...,"Thank you so much, Chris. And it's truly a gre..."
16,2,Simple designs to save a life,Amy Smith,{0: 'Amy Smith'},"{0: ['inventor', 'engineer']}","{0: 'Amy Smith designs cheap, practical fixes ...",1741302,2006-02-24,2006-08-15,TED2006,en,"[ar, bg, ca, de, el, en, es, fa, fr, fr-ca, he...",101,906,"[MacArthur grant, alternative energy, design, ...","{1561: 'Energy from floating algae pods', 1072...",https://www.ted.com/talks/amy_smith_simple_des...,Fumes from indoor cooking fires kill more than...,"In terms of invention, I'd like to tell you th..."
35,3,How to rebuild a broken state,Ashraf Ghani,{0: 'Ashraf Ghani'},{0: ['president of afghanistan']},{0: 'Afghanistan's president Ashraf Ghani has ...,994796,2005-07-12,2006-10-18,TEDGlobal 2005,en,"[ar, bg, cs, de, el, en, es, fa, fr, he, hu, i...",75,1125,"[business, corruption, culture, economics, ent...","{127: 'Want to help Africa? Do business here',...",https://www.ted.com/talks/ashraf_ghani_how_to_...,Ashraf Ghani's passionate and powerful 10-minu...,"A public, Dewey long ago observed, is constitu..."
36,4,The real future of space exploration,Burt Rutan,{0: 'Burt Rutan'},{0: ['aircraft engineer']},"{0: 'In 2004, legendary spacecraft designer Bu...",2445177,2006-02-24,2006-10-25,TED2006,en,"[ar, az, bg, cs, de, el, en, es, fa, fi, fr, h...",196,1177,"[NASA, aircraft, business, design, engineering...","{141: 'Inside the world's deepest caves', 264:...",https://www.ted.com/talks/burt_rutan_the_real_...,"In this passionate talk, legendary spacecraft ...","I want to start off by saying, Houston, we hav..."
74,5,Great cars are great art,Chris Bangle,{0: 'Chris Bangle'},{0: ['car designer']},{0: 'Car design is a ubiquitous but often over...,983639,2002-02-02,2007-04-05,TED2002,en,"[ar, bg, de, en, es, fr, he, hr, hu, it, ja, k...",82,1204,"[business, cars, design, industrial design, in...","{4: 'The real future of space exploration', 26...",https://www.ted.com/talks/chris_bangle_great_c...,American designer Chris Bangle explains his ph...,"What I want to talk about is, as background, i..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4116,65060,A walk through the stages of sleep,Matt Walker,{0: 'Matt Walker'},"{0: ['sleep scientist', 'professor', 'author']}",{0: 'Matt Walker is a brain scientist trying t...,56561,2020-07-15,2020-07-15,Sleeping with Science,en,"[en, fr, pt-br, tr]",7,287,"[sleep, health, human body, science]","{41105: 'Sleep is your superpower', 61900: 'Wh...",https://www.ted.com/talks/matt_walker_a_walk_t...,Did you know you go on a journey every night a...,Sleep is perhaps the single most effective thi...
4124,65061,How caffeine and alcohol affect your sleep,Matt Walker,{0: 'Matt Walker'},"{0: ['sleep scientist', 'professor', 'author']}",{0: 'Matt Walker is a brain scientist trying t...,53038,2020-07-22,2020-07-22,Sleeping with Science,en,"[en, fr, hi]",5,294,"[sleep, science, health, human body]","{65060: 'A walk through the stages of sleep', ...",https://www.ted.com/talks/matt_walker_how_caff...,"Caffeine wakes you up, and alcohol makes you n...",Many of us like to start the day with a cup of...
4125,65073,Give yourself permission to be creative,Ethan Hawke,{0: 'Ethan Hawke'},"{0: ['actor', 'writer', 'director']}","{0: '""Ethan Hawke has quietly emerged as perha...",543518,2020-06-25,2020-07-23,TED2020,en,"[en, ro]",22,556,"[creativity, arts, life, humanity, vulnerabili...","{64173: 'How drawing can set you free', 60752:...",https://www.ted.com/talks/ethan_hawke_give_you...,"Reflecting on moments that shaped his life, ac...",I was hoping today to talk a little bit about ...
4122,65074,"A comprehensive, neighborhood-based response t...",Kwame Owusu-Kesse,{0: 'Kwame Owusu-Kesse'},{0: ['community leader']},{0: 'Kwame Owusu-Kesse ensures the programming...,511464,2020-06-18,2020-07-21,TED2020,en,"[en, tr]",8,396,"[coronavirus, education, community, Audacious ...",{63813: 'How to quickly scale up contact traci...,https://www.ted.com/talks/kwame_owusu_kesse_a_...,Crisis interventions often focus on a single a...,Good evening. It is such a blessing to work at...


In [107]:
driver=webdriver.Chrome()

In [None]:
driver.set_window_size(1024,1024)

In [117]:
#url=df[df['talk_id']==1]['url'][0]
url=df['url'][0]

print(url)


https://www.ted.com/talks/al_gore_averting_the_climate_crisis/


In [None]:
driver.get(url)

In [None]:
//*[@id="content"]/div/div[2]/div[1]/div[2]/div/div/div[2]/ul/li[4]/button/div/span/svg

In [None]:
driver.find_element_by_xpath('//*[@id="content"]')

In [None]:
driver.find_element_by_css_selector('input[type=submit][id]').click()

In [None]:
driver.close()