In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import numpy as np
import pandas as pd 
import difflib 
import prof_data


In [2]:
def get_data(year):
    
    """
    Get episode details from wikipedia page of The Late Show.
    
    Parameters: Year for which the episodes details needs to be pulled out. 
    
    Returns: Two data frames details_df and guest_df with episode details and guest details.

    """
    site= "https://en.wikipedia.org/wiki/List_of_The_Late_Show_with_Stephen_Colbert_episodes_(%s)" %(year)
    page = urlopen(site)
    soup = BeautifulSoup(page.read(),"lxml")
    table = soup.find_all('tr', class_= 'vevent')
    data = []
    wiki_page = []
    date_list=[]
    episode_no=[]
    guests=[]
    links_list = []
    link_final = []
    episode_details = {}
    wiki_page = []
    guest_flat = []
    link_flat = []
    guest_details = {}

    for row in table:
        cols = row.find_all('td')
        links = [ele.find_all('a',href=True) for ele in cols]
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])
        
        str_list = list(filter(None, links))
  
        for item in str_list:
            for i in item:
                link_final.append(i['href'])
            
    data_final = [item[0:3] for item in data]
    #print(len(data_final))
    #print(link_final)
    

    for item in data_final:
        episode_no.append(item[0])
        date_list.append(item[1])
        guest_list=re.sub(r"( to | & )", ",", item[2]).split(",")
        guest_list_n=[]
        for each_guest in guest_list:
            guest_list_n.append(each_guest.strip())
    
        guests.append(guest_list_n)
        #print(guests)
        
    for guest_item in guests:
        for each_guest in guest_item:
            get_link =difflib.get_close_matches(each_guest, link_final,1,0.4)
            guest_flat.append(each_guest.strip())
            if len(get_link)>0:
                link_flat.append(get_link[0])
            else:
                link_flat.append(None)
                
    print(guest_flat)
        
    guest_details['guest_name'] = guest_flat
    guest_details['wiki_link'] = link_flat
    guest_df=pd.DataFrame(guest_details)
    guest_df['wiki_link'] = 'https://en.wikipedia.org' + guest_df['wiki_link'].astype(str)


    episode_details['episode_no'] = episode_no
    episode_details['episode_date']= date_list
    episode_details['guest_list']= guests

    details_df=pd.DataFrame(episode_details)
    
    return details_df,guest_df
    

In [3]:
years=['2015','2016','2017','2018']

episode_details_df = pd.DataFrame(columns=['episode_no','episode_date','guest_list'])
guest_details_df = pd.DataFrame(columns=['guest_name','wiki_link'])

for year in years:
    print(year)
    year_episode_details,year_guests_details=get_data(year) 
    episode_details_df = pd.concat([episode_details_df,year_episode_details],ignore_index=True)
    guest_details_df = pd.concat([guest_details_df,year_guests_details],ignore_index=True)
    
    
print(len(episode_details_df))
print(len(guest_details_df))

episode_details_df.to_csv('episode_details.csv',encoding='utf-8')
guest_details_df.to_csv('guest_details.csv',encoding='utf-8')

2015
['George Clooney', 'Governor Jeb Bush', 'Scarlett Johansson', 'Elon Musk', 'Vice President Joe Biden', 'Travis Kalanick', 'Amy Schumer', 'Stephen King', 'Emily Blunt', 'Justice Stephen Breyer', 'Jake Gyllenhaal', 'Tim Cook', 'Kevin Spacey', 'Carol Burnett', 'Trevor Noah', 'Secretary-General Ban Ki-moon', "Lupita Nyong'o", 'Senator Bernie Sanders', 'Christopher Wheeldon', 'Stephen Curry', 'Senator Ted Cruz', 'Donald Trump', 'Dr. Ernest Moniz', 'Hugh Jackman with Hugh Evans', 'Senator Elizabeth Warren', 'Jim Gaffigan', 'Maria Shriver', 'Andrew Sullivan', 'Archbishop Thomas Wenski', 'Malala Yousafzai', 'Kerry Washington', 'First Lady Michelle Obama', 'John Legend', 'Ellen Page', 'Jesse Eisenberg', 'John Oliver', 'Evan Spiegel', 'Bill Withers', 'Ed Sheeran', 'Secretary of State John Kerry', 'Claire Danes', 'PewDiePie', 'Morgan Freeman', 'Ruth Wilson', 'Senator John McCain', 'Misty Copeland', 'Bill Clinton', 'Billy Eichner', 'Gina Rodriguez', 'Ben Bernanke', 'Cate Blanchett', 'Brian Ch

['Oprah Winfrey', 'Kate Beckinsale', 'Chris Messina', 'Jen Kirkman', 'Adam Driver', 'Alexa Davalos', 'Charlie Rose', 'Hayden Panettiere', 'Jack Maxwell', 'Billy Joel', 'Josh Holloway', 'Andrew Garfield', 'Erin Andrews', 'Jude Law', 'Gabrielle Union', 'Thomas L. Friedman', 'Tom Selleck', 'Craig Robinson', 'Cuba Gooding Jr.', 'Rupert Friend', 'Sarah Paulson', 'Corey Stoll', 'Billy Eichner', 'Mel B', 'Gilbert Gottfried', 'Idina Menzel', 'Rachel Bloom', 'Louie Anderson', 'Johnny Galecki', 'Chris Matthews', 'Jim Gaffigan', 'Cristela Alonzo', 'Leslie Mann', 'Lewis Black', 'Dan Levy', 'Josh Groban', 'Rachael Ray', 'Ricky Gervais', 'Christina Ricci', 'Dr. Phil McGraw', 'Michael Bolton', 'Priyanka Chopra', 'Thomas Sadoski', 'Paul Giamatti', 'Wendy Williams', 'John Oliver', 'Isabelle Huppert', 'Robert De Niro', 'Jake Tapper', 'Ezra Edelman', 'David Oyelowo', 'Taran Killam', 'Will Arnett', 'Pete Holmes', 'Shailene Woodley', 'Laverne Cox', 'Rick', 'Marty Lagina', 'Christine Baranski', 'Bob Odenkir