In [4]:
import re

import pandas as pd

# TV Guide
TV Guide has the best (nearly complete) family name data for the early seasons of the show. In some cases, the hometowns are also provided in the description. The date appears to be the date of the most recent airing on TV (according to TV Guide), not the original air date.

In [5]:
# Already downloaded lists of episodes by season from TV Guide
# Each season is in a separate text file
# Starts at Season 14 (Steve's first season), TV Guide does not have season 12 or 13

# Default folder holding one "S<season> TV Guide.txt" file per season
_TV_GUIDE_DIR = '../data/tv_guide'

# Fields parsed out of each episode chunk, in the order they appear in the text
_TV_GUIDE_FIELDS = ['Episode', 'EpisodeTitle', 'TVGuideTime', 'TVGuideDescription']

# Column order of the dataframe returned by tv_guide_data
_TV_GUIDE_COLUMNS = ['SeasonNum', 'EpisodeNum', 'Season', 'Episode', 'EpisodeTitle',
                     'FamilyA', 'FamilyB', 'TVGuideTime', 'TVGuideDescription', 'Notes']

# Repeated description text to blank out so only family names and hometowns remain.
# The phrases are applied one after another in exactly this order, so the list is
# kept verbatim: a shorter phrase that comes first is replaced before a longer
# phrase that contains it gets a chance to match.
_TV_GUIDE_BOILERPLATE = ["""Comedian Steve Harvey hosts as the """,
    """Families compete against each by guessing answers to various surveys""",
    """Families compete to guess the most popular answers to various survey questions""",
    """in this durable game show""",
    """Steve Harvey hosts the long-running classic, in which two families compete to guess the answers to various surveys""",
    """Steve Harvey hosts the network\'s new season of the long-running classic, in which two families compete to guess the answers to various surveys""",
    """Steve Harvey hosts the the long-running classic, in which two families compete to guess the answers to various surveys""",
    """Steve Harvey hosts this hilarious version of the long-running classic, in which two families compete to guess the answers to various surveys""",
    """Steve Harvey plays host to two teams, each comprised of five family members, who try to match the answers given to survey questions asked to groups of people""",
    """Steve Harvey presents two families who battle it out by answering survey questions for a chance to win cash and prizes""",
    """The durable game show in which two teams of five relatives compete for cash and prizes by guessing the most popular answers to questions based on what the "survey said" in polls conducted with 100 people""",
    """Two families battle against each other by guessing the answers to survey questions""",
    """Two families battle each other by trying to match the answers to the survey questions""",
    """Two families compete by trying to match the answers to survey questions given to a group of people""",
    """Two families compete to guess answers to various surveys""",
    """Two families compete to guess the answers to surveys""",
    """Two families compete to guess the answers to various surveys""",
    """Two families compete to guess the answers to various surveys. Steve Harvey hosts""",
    """Two families of five face off to guess the answers with the results of a survey given to a group of people""",
    """Two families of five face off to guess the answers with the results of a survey of one hundred people""",
    """Two families of five face off to name the top responses to questions posed to 100 people""",
    """Two families of five try to guess what the "survey said" in polls conducted with 100 people""",
    """Two families of five try to guess what the "survey said" in polls conducted with 100 people in this durable game show""",
    """Two families try to guess what the "survey said" in polls conducted with 100 people""",
    """Two new families compete for the right to play the returning champions in this classic game show""",
    """Two teams of families try to guess the answers to different survey questions""",
    """Two teams play against each other to win the grand prize by answering the most popular responses to surveys""",
    """Where To Watch""",
    """.Where To Watch"""
]

# Space-delimited stop words removed from descriptions.
# Kept verbatim (including the repeated entries): replacements run sequentially,
# so a repeated entry gets a second pass over text that earlier passes changed.
_TV_GUIDE_STOP_WORDS = ['"',' to ',' the ',' various ',' family ',' Family ', " a ", " against ", " and ", " answer ", " answers ", " are ", " based ", " battle ", " battles ", " between ", " board ", " by ", " chance ", " compete ", " competes ", " each ", " face ", " families ", " Fast ", " five ", " for ", " from ", " game ", " go ", " guess ", " guessing ", " Harvey ", " hosted ", " hosts ", " in ", " it ", " more ", " off ", " on ", " one ", " other ", " out ", " people ", " play ", " player ", " questions ", " quiz ", " responses ", " round ", " rounds ", " several ", " Show ", " Steve ", " survey ", " survey-driven ", " surveys ", " teams ", " the ", " The ", " to ", " top ", " try ", " Two ", " up ", " various ", " with ",]


def _has_alphanumeric(text):
    """ True if the string contains at least one letter or digit """
    return re.search('[a-zA-Z0-9]', text) is not None


def _description_stop_words(text):
    """ Remove words that add no info to descriptions (keep only family names and hometowns/state) """
    if not text:
        return text

    # Turn every non-letter into a space and pad both ends so that each word,
    # including the first and last, is surrounded by spaces
    text = ' ' + re.sub('[^a-zA-Z]', ' ', text) + ' '

    # Replace each stop word with a single space, in list order
    for stop_word in _TV_GUIDE_STOP_WORDS:
        text = text.replace(stop_word, ' ')

    return text.strip()


def _episode_number(text):
    """ Episode number as an int; falls back to the leftover text if it is not numeric """
    remainder = text.replace("Episode", "")
    try:
        return int(remainder)
    except ValueError:
        return remainder


def _family(title, letter):
    """ Family name on side 'A' (before ' vs ') or 'B' (after ' vs ') of a title """
    sides = title.split(' vs ')
    if letter == 'A':
        return sides[0].strip()
    if letter == 'B':
        # Titles without a ' vs ' separator have no second family
        if len(sides) < 2:
            return ""
        return sides[1].split(' Big Money')[0].strip()


def _notes(title):
    """ Flag episodes whose title mentions the Big Money Tournament """
    return "Big Money Tournament" if "Big Money Tournament" in title else ""


def tv_guide_data(season_number, data_dir=_TV_GUIDE_DIR):
    """ Get df of TV Guide data for one season based on argument

    Parameters
    ----------
    season_number : int
        Season to load; selects the file "S<season_number> TV Guide.txt".
    data_dir : str, optional
        Folder containing the season text files. Defaults to '../data/tv_guide'.

    Returns
    -------
    pandas.DataFrame
        One row per parsed episode with the columns SeasonNum, EpisodeNum, Season,
        Episode, EpisodeTitle, FamilyA, FamilyB, TVGuideTime, TVGuideDescription, Notes.

    Raises
    ------
    FileNotFoundError
        If the season file does not exist in data_dir.
    """
    # Read file by season number
    with open(f'{data_dir}/S{season_number} TV Guide.txt') as f:
        lines = f.readlines()

    # Remove new line characters which are sporadic throughout.
    # Each line break ends up as a double space, which is the field separator below.
    text = ' '.join(lines).replace("\n", " ")

    # Split on word "Episode" and drop the first two pieces (unneeded header)
    chunks = text.replace(" Episode ", "###Episode ").split("###")[2:]

    rows = []
    for chunk in chunks:
        # Replace repeated phrases with a space
        for phrase in _TV_GUIDE_BOILERPLATE:
            chunk = chunk.replace(phrase, " ")

        # Split on double space and only keep values with data
        fields = [field for field in chunk.split("  ") if _has_alphanumeric(field)]

        # Normalise every row to exactly four fields: missing trailing fields
        # (e.g. episodes or whole seasons without a description) become "", and
        # any extra pieces are folded back into the description
        leading = fields[:3]
        leading += [""] * (3 - len(leading))
        rows.append(leading + [" ".join(fields[3:])])

    df = pd.DataFrame(rows, columns=_TV_GUIDE_FIELDS)

    # Remove stop words from descriptions (empty descriptions stay empty)
    df["TVGuideDescription"] = df["TVGuideDescription"].apply(_description_stop_words)

    # Keep only letters and spaces in the title, then drop the words " Family" and "Episode".
    # .str.replace is used for both because Series.replace only matches whole cell values.
    titles = df["EpisodeTitle"].apply(lambda title: re.sub("[^a-zA-Z ]", "", title))
    titles = titles.str.replace(" Family", "", regex=False)
    df["EpisodeTitle"] = titles.str.replace("Episode", "", regex=False)

    # Create joinable columns
    df["EpisodeNum"] = df["Episode"].apply(_episode_number)
    df["Season"] = f"Season {season_number}"
    df["SeasonNum"] = season_number

    # Get family names and notes from episode title (before the title is trimmed below)
    df["FamilyA"] = df["EpisodeTitle"].apply(lambda title: _family(title, 'A'))
    df["FamilyB"] = df["EpisodeTitle"].apply(lambda title: _family(title, 'B'))
    df["Notes"] = df["EpisodeTitle"].apply(_notes)

    # Trim the tournament prefix and the standalone word "the" from the title.
    # The word boundary keeps surnames that merely contain "the" intact.
    trimmed = df["EpisodeTitle"].str.replace("Big Money Tournament Featuring", "", regex=False)
    df["EpisodeTitle"] = trimmed.str.replace(r"\bthe\b", "", regex=True)

    # Reorder Columns
    return df[_TV_GUIDE_COLUMNS]

In [6]:
# Load TV Guide dataframe from pickle if it has already been built
try:
    df_guide = pd.read_pickle("../pickles/df_guide.pkl")

# Cache miss only: run 'tv_guide_data' for all seasons (14-24), combine them with
# a single concat, and save the result for the next run. Any other error (e.g. an
# unreadable pickle) is left to surface instead of being silently swallowed.
except FileNotFoundError:
    df_guide = pd.concat(
        [tv_guide_data(season) for season in range(14, 25)],
        ignore_index=True,
    )
    df_guide.to_pickle("../pickles/df_guide.pkl")

In [10]:
# Total Episodes (row count of df_guide)
print("Total Episodes:", len(df_guide))

Total Episodes: 2013


In [11]:
# Episodes by Season: count of non-null EpisodeNum values per SeasonNum
df_guide.groupby('SeasonNum')['EpisodeNum'].count()

SeasonNum
14    168
15    179
16    180
17    200
18    200
19    200
20    200
21    200
22    153
23    245
24     88
Name: EpisodeNum, dtype: int64

In [12]:
# Sample of TV Guide data (seeded so re-running the notebook shows the same rows)
df_guide.sample(n=10, random_state=42)

Unnamed: 0,SeasonNum,EpisodeNum,Season,Episode,EpisodeTitle,FamilyA,FamilyB,TVGuideTime,TVGuideDescription,Notes
756,18,30,Season 18,Episode 30,Fields vs de Clairville,Fields,de Clairville,"Fri, Oct 28, 2016 30 mins",,
547,17,21,Season 17,Episode 21,September,September,,"Wed, Sep 30, 2015 30 mins",,
597,17,71,Season 17,Episode 71,October,October,,"Tue, Oct 27, 2015 30 mins",,
1221,20,95,Season 20,Episode 95,,,,"Mon, Feb 11, 2019 30 mins",,
1696,23,9,Season 23,Episode 9,,,,,,
1190,20,64,Season 20,Episode 64,,,,"Tue, Oct 9, 2018 30 mins",,
1615,22,89,Season 22,Episode 89,March,March,,"Tue, Mar 2, 2021 30 mins",,
571,17,45,Season 17,Episode 45,November,November,,"Fri, Nov 13, 2015 30 mins",,
671,17,145,Season 17,Episode 145,,,,"Mon, Apr 18, 2016 30 mins",,
1616,22,90,Season 22,Episode 90,March,March,,"Wed, Mar 3, 2021 30 mins",,
