In [1]:
import numpy as np
import pandas as pd
import re
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 200)

In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
import datefinder
# Example
string_with_dates = "“Family Feud” 11/23/22"
matches = datefinder.find_dates(string_with_dates)
next(matches);

<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# Philo

In [None]:
# Page source of full Philo library
# !!! Make sure the library is fully expanded before extracting source, or else episodes will not be loaded into HTML code
with open('philo.txt') as f:
    lines = f.readlines()

In [None]:
# Some new lines exist already but remove these by joining
text = ' '.join(lines)

In [None]:
# Split on episode title
eps = text.split('Family Feud, ')

# Get only the first x characters which are the actual episode title
# Exclude first element since this is the "pre-split"
titles = [ep[:28] for ep in eps[1:]]

# Split on new element
titles = [tit.split(">")[0] for tit in titles]

# Some titles have new lines in them
titles = [tit.replace("\n  ","") for tit in titles]

# Some titles are missing a space between "Episode" and number
titles = [tit.replace("Episode","Episode ") for tit in titles]
titles = [tit.replace("Episode  ","Episode ") for tit in titles]

In [None]:
regex = re.compile('[^a-zA-Z0-9, ]')
titles = [regex.sub('', tit) for tit in titles]

In [None]:
len(titles)

In [None]:
regex = re.compile('[^0-9,]')
title_nums = [regex.sub('', tit) for tit in titles]

In [None]:
title_num_list = [tit.split(",") for tit in title_nums]

In [None]:
df = pd.DataFrame(title_num_list,columns=['Season','Episode']).drop_duplicates()
df = df.astype(int)
df = df.sort_values(by=['Season','Episode']).reset_index(drop=True)

In [None]:
df[0:5]

<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# TV Guide
TV Guide has mostly full family lists for the early seasons of the show. In some cases, the hometowns are also provided in the description. The date appears to be the date of the last airing on TV (according to Guide), not the original air date.

In [None]:
# Already downloaded lists of episodes by season from TV Guide
# Each season in a sepearte text file

def tv_guide_data(season_number):
    # Read file by season number (14-24)
    with open(f'S{season_number} TV Guide.txt') as f:
        lines = f.readlines()

    # Remove new line charcaters which are sporadic throughout
    x = ' '.join(lines).replace("\n"," ")

    # Split on word "Episode" and drop unneeded header
    x = x.replace(" Episode ","###Episode ").split("###")[2:]

    # Remove repeated description text from episode descriptions
    # Aim to only keep family names and hometowns
    remove_text = ["""Comedian Steve Harvey hosts as the """,
        """Families compete against each by guessing answers to various surveys""",
        """Families compete to guess the most popular answers to various survey questions""",
        """in this durable game show""",
        """Steve Harvey hosts the long-running classic, in which two families compete to guess the answers to various surveys""",
        """Steve Harvey hosts the network\'s new season of the long-running classic, in which two families compete to guess the answers to various surveys""",
        """Steve Harvey hosts the the long-running classic, in which two families compete to guess the answers to various surveys""",
        """Steve Harvey hosts this hilarious version of the long-running classic, in which two families compete to guess the answers to various surveys""",
        """Steve Harvey plays host to two teams, each comprised of five family members, who try to match the answers given to survey questions asked to groups of people""",
        """Steve Harvey presents two families who battle it out by answering survey questions for a chance to win cash and prizes""",
        """The durable game show in which two teams of five relatives compete for cash and prizes by guessing the most popular answers to questions based on what the "survey said" in polls conducted with 100 people""",
        """Two families battle against each other by guessing the answers to survey questions""",
        """Two families battle each other by trying to match the answers to the survey questions""",
        """Two families compete by trying to match the answers to survey questions given to a group of people""",
        """Two families compete to guess answers to various surveys""",
        """Two families compete to guess the answers to surveys""",
        """Two families compete to guess the answers to various surveys""",
        """Two families compete to guess the answers to various surveys. Steve Harvey hosts""",
        """Two families of five face off to guess the answers with the results of a survey given to a group of people""",
        """Two families of five face off to guess the answers with the results of a survey of one hundred people""",
        """Two families of five face off to name the top responses to questions posed to 100 people""",
        """Two families of five try to guess what the "survey said" in polls conducted with 100 people""",
        """Two families of five try to guess what the "survey said" in polls conducted with 100 people in this durable game show""",
        """Two families try to guess what the "survey said" in polls conducted with 100 people""",
        """Two new families compete for the right to play the returning champions in this classic game show""",
        """Two teams of families try to guess the answers to different survey questions""",
        """Two teams play against each other to win the grand prize by answering the most popular responses to surveys""",
        """Where To Watch""",
        """.Where To Watch"""
    ]
        
    # Replace repeated phrases from list with space
    for i in range(len(remove_text)):
        x = [a.replace(remove_text[i]," ") for a in x]

    # Check if string contains any letters or numbers
    def non_empty(words):
        return [word for word in words if len(re.compile('[^a-zA-Z0-9]').sub('',word)) > 0]

    # Split on double space
    split_text = ["  "]
    for i in range(len(split_text)):
        x = [a.split(split_text[i]) for a in x]

    # Only keeps values with data
    x = [non_empty(a) for a in x]
    
    def description_stop_words(text):
        """ Remove words that add no info to descriptions (keep only family names and hometowns/state)"""
        if not text: return text
        
        # Remove non-letters and add leading/traing spaces
        text = re.compile('[^a-zA-Z]').sub(' ', text)
        text = ' ' + text + ' '
        
        # Replace stop words with space
        # Terrible technique lol
        to_replace = ['"',' to ',' the ',' various ',' family ',' Family ', " a ", " against ", " and ", " answer ", " answers ", " are ", " based ", " battle ", " battles ", " between ", " board ", " by ", " chance ", " compete ", " competes ", " each ", " face ", " families ", " Fast ", " five ", " for ", " from ", " game ", " go ", " guess ", " guessing ", " Harvey ", " hosted ", " hosts ", " in ", " it ", " more ", " off ", " on ", " one ", " other ", " out ", " people ", " play ", " player ", " questions ", " quiz ", " responses ", " round ", " rounds ", " several ", " Show ", " Steve ", " survey ", " survey-driven ", " surveys ", " teams ", " the ", " The ", " to ", " top ", " try ", " Two ", " up ", " various ", " with ",]
        for t in to_replace:
            text = text.replace(t,' ')
            
        return text.strip()
    
    # Some seasons have zero descriptions so the column may not exist yet in nested list
    try:
        df = pd.DataFrame(x, columns=['Episode','EpisodeTitle','TVGuideTime','TVGuideDescription'])
    except:
        df = pd.DataFrame(x, columns=['Episode','EpisodeTitle','TVGuideTime'])
        
    df = df.fillna("")
    
    
    # Remove stop words if descriptions exist, else create empty column
    if "TVGuideDescription" in df.columns:
        df["TVGuideDescription"] = df["TVGuideDescription"].apply(description_stop_words)
    else:
        df["TVGuideDescription"] = ""
        
    # Extract season/episode number from text
    def get_episode_number(text):
        t = text.replace("Episode","")
        try:
            return int(t)
        except:
            return t
    
    df["EpisodeTitle"] = df.EpisodeTitle.apply(lambda x : re.sub("[^a-zA-Z ]", "", x))
    df["EpisodeTitle"] = df.EpisodeTitle.str.replace(" Family","").replace("Episode","")
    df["EpisodeNum"] = df["Episode"].apply(get_episode_number)
    df["Season"] = f"Season {season_number}"
    df["SeasonNum"] = season_number
    
    def get_family(title, letter):
        if letter == 'A': return title.split(' vs ')[0].strip()
        if letter == 'B':
            try: return title.split(' vs ')[1].split(' Big Money')[0].strip()
            except: return ""
    
    def get_notes(title):
        if "Big Money Tournament" in title:
            return "Big Money Tournament"
        else:
            return ""
    
    df["FamilyA"] = df.EpisodeTitle.apply(lambda x : get_family(x, 'A'))
    df["FamilyB"] = df.EpisodeTitle.apply(lambda x : get_family(x, 'B'))
    
    df["Notes"] = df.EpisodeTitle.apply(lambda x : get_notes(x))
    df["EpisodeTitle"] = df.EpisodeTitle.str.replace("Big Money Tournament Featuring","").replace("thr","")
    
    # Reorder Columns
    df = df[['SeasonNum', 'EpisodeNum', 'Season', 'Episode', 'EpisodeTitle','FamilyA','FamilyB','TVGuideTime', 'TVGuideDescription','Notes']]

    return df

In [None]:
# Run 'tv_guide_data' for all seasons and concat the dataframes
df = pd.DataFrame()
for season in range(14,25):
    df_ = tv_guide_data(season)
    df = pd.concat([df,df_]).reset_index(drop=True)

df_guide = df

## Final

In [None]:
df_guide;

In [None]:
# Total Episodes
print("Total Episodes:",df_guide.shape[0],'\n')

# Episodes by Season
df_guide.groupby('SeasonNum').count()['EpisodeNum']

## Testing

In [None]:
# Testing, but keep
# Word counts of aggregate descriptions of all episodes
descrips = [a for a in list(df_guide.TVGuideDescription) if isinstance(a, str)]
words = ' '.join(descrips).split(' ')
df_word = pd.DataFrame(words)
df_word[:200].value_counts();

<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# bobbymgsk

In [4]:
# Read in massive 126 page html file, we love you Bobby McBride <3
# File is the fully (?) expanded version of the "Family Feud" category from blog
with open(f"bobbymgsk.html", encoding="UTF-8") as f:
    r = f.read()

# Replace space symbol that decoder missed
r = r.replace(u'\xa0', u' ')

In [5]:
# SLOW # RUNS VERY SLOW #
# Parse to soup and collect list of posts
soup = BeautifulSoup(r, 'html.parser')

In [6]:
# Get list of posts (most posts contain two episodes)
posts = soup.find_all("div", {"class": "post"})
len(posts)

1260

In [7]:
# Testing get all words in 'p' tags and find most common
spans = []
for i in range(len(posts)):
    post_title = posts[i].find('a', href=True).text
    if "“Family Feud”" in post_title:
        spans += [s.text for s in posts[i].find_all('span')]

df = pd.DataFrame(spans)
df[0].value_counts()[0:100];

In [8]:
# Games can start with random titles ...yay... the following all designate new games
game_titles = ['G1 M-U','G1','G2','CG','SF','M-U',
               '$160K CHAMPIONSHIP',
               'Regular Game',
               'Bonus Wk. Game',
               'PCH Wk. Game',
               'Sole Game Tonight',
               'SOLE GAME TONIGHT',
               '$10K FEUD OF THE DAY',
               'Last Game of Season',
               'Final Car Feud of Season',
               'Wounded Warrior Projects Game',
               'SPECIAL “ALMOST CHRISTMAS” GAME',
               'FINAL GAME THIS WINTER',
               'FINAL CAR GAME OF WINTER ’16',
               'CAR FEUD OF THE EVENING',
               'STEVE’S 1,000TH SYNDICATED GAME',
               '3RD CAR FEUD OF ’16',
               '2ND CAR GAME THIS YR',
               'LAST GAME OF S2',
               'CAR GAME',
               'Professional Boxers Showdown',
               'G2 M-U',
               '“PROPERTY BROTHERS” v. “AMERICA’S MOST DESPERATE KITCHENS” GAME']

In [9]:
def get_post_date(div, post_title):
    post_date = ''
    S = div.find_all('span')
    for s in S:
        if bool([str(ele) for ele in list(range(2010,2024)) if(str(ele) in s.text)]):
            post_date = s.text
            return post_date
        else:
            try:
                matches = datefinder.find_dates(post_title)
                post_date = next(matches)
                return post_date
            except:
                pass
    return post_date

In [10]:
game_count = 0
for i in range(len(posts)):
    display = []
    FM_count = 0
    post_title = posts[i].find('a', href=True).text
   
    display.append(post_title)
#     post_date = get_post_date(posts[i], post_title)
#     display.append(post_date)
    P = posts[i].find_all('p')
    for p in P:
        if 'FM:' in p.text:
            FM_count += 1
        if bool([ele for ele in game_titles if(ele in p.text)]):
            display.append(p.text)
    game_count += len(display)-1
    if len(display) < FM_count+1:
        for d in display: print(d)


“Family Feud” 11/5/10
“Family Feud” 11/4/10
“Family Feud” 11/3/10
“Family Feud” 11/2/10
“Family Feud” 2/11/13- FIRST-RUN REMATCH
“Family Feud” 10/19/10
“Family Feud” 10/16/12


In [11]:
def get_family_info(title,num):
    try:
        family = re.split(' v. | v | vs. | vs ', game_title)[num]
    except:
        family=""
    for g in game_titles:
        family = family.replace(g,'').replace(':','').strip()
    family = family.replace('(','$$@(').split('$$@')
    name = family[0]
    info = family[1:]
    return name, info

In [18]:
game_count = 0
games = []
for post in posts:
    post_title = post.find('a', href=True).text
    P = post.find_all('p')
    for p in P:
        if bool([ele for ele in game_titles if(ele in p.text)]):
            game_title = p.text
            [nameFamilyA, infoFamilyA] = get_family_info(game_title,0)
            [nameFamilyB, infoFamilyB] = get_family_info(game_title,1)
             
            game = [post_title, post_version, post_date, game_title, nameFamilyA, nameFamilyB, infoFamilyA, infoFamilyB]
            games.append(game)
            
            
            
            

In [22]:
games = pd.DataFrame(games)

In [23]:
games[0:100]

Unnamed: 0,0,1,2,3,4
0,"G1: Edwardses (East St. Louis, IL) v. Crowders",Edwardses,Crowders,"[(East St. Louis, IL)]",[]
1,G2: Buccholzes v. Herberts,Buccholzes,Herberts,[],[]
2,"G1: Hills (Dianna, Susanna, Jack, Pat & Bob) v...",Hills,Crowders,"[(Dianna, Susanna, Jack, Pat & Bob)]",[]
3,"G2: Buccholzes (Vegas)(Krystal, Daniel, Kimber...",Buccholzes,Traverses,"[(Vegas), (Krystal, Daniel, Kimberly, Joseph &...",[]
4,"G1: Shaddixes v. Crowders (Decatur)(Jarred, Ka...",Shaddixes,Crowders,[],"[(Decatur), (Jarred, Karen, Katina, Kevin & Da..."
5,"G2: Drelicks v. Traverses (Temple Hills, MD)(T...",Drelicks,Traverses,[],"[(Temple Hills, MD), (Tonja, Brennan, Donovan ..."
6,"G1: Shaddixes (BIR)(Lisa, Katie, Meredith, Joa...",Shaddixes,O’Gormans,"[(BIR), (Lisa, Katie, Meredith, Joan & Rachel)]",[]
7,"G2: Drelicks v. Hoggses (Columbus, NJ)(Seantia...",Drelicks,Hoggses,[],"[(Columbus, NJ), (Seantia, Tyneshia, Tizana, B..."
8,"G1: Joneses (Long Beach)(Tami, Jacques, Rainey...",Joneses,O’Gormans,"[(Long Beach), (Tami, Jacques, Rainey, Raven &...",[]
9,"G2: Drelicks v. Herberts (SAC)(Shunise, Asia, ...",Drelicks,Herberts,[],"[(SAC), (Shunise, Asia, Crystal, Diamond & Van..."


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# trakt.tv (Powered by DirecTV?)

In [None]:
def get_episode_data(season):
    
    # Get the webpage
    url = f"https://trakt.tv/shows/family-feud-2010/seasons/{season}"
    r = requests.get(url)

    # Create a BeautifulSoup object
    soup = BeautifulSoup(r.text, "html.parser")

    # Get the episode titles
    episodes = soup.find_all("div", class_="row fanarts sortable")

    episode_dict = [] # list of all episode data
    episode_data = [] # single episode data

    for ep in episodes:
        episode_data = []
        # Seaon/Episode number e.g. 16x27 
        episode_data.append(ep.find("span", class_="main-title-sxe").text)
        # Episode title, families
        episode_data.append(ep.find("span", class_="main-title").text)
        # Date
        episode_data.append(ep.find("span", class_="convert-date").text)
        # Add episode to list
        episode_dict.append(episode_data)

    df = pd.DataFrame(episode_dict, columns = ['num','EpisodeTitle','AirDate'])
    
    # Seems like accurate air dates
    df.AirDate = pd.to_datetime(df.AirDate).dt.date
    
    # Reformat/create columns 
    def get_family(title, letter):
        if letter == 'A': return title.split(' vs ')[0].strip()
        if letter == 'B':
            try: return title.split(' vs ')[1].split(' Big Money')[0].strip()
            except: return ""
    
    def get_notes(title):
        if "Big Money Tournament" in title:
            return "Big Money Tournament"
        else:
            return ""
        
    df["SeasonNum"] = df.num.apply(lambda x : int(x.split('x')[0]))
    df["EpisodeNum"] = df.num.apply(lambda x : int(x.split('x')[1]))
    
    df["EpisodeTitle"] = df.EpisodeTitle.apply(lambda x : re.sub("[^a-zA-Z ]", "", x))
    df["EpisodeTitle"] = df.EpisodeTitle.str.replace(" Family","")
    df["EpisodeTitle"] = df.EpisodeTitle.str.replace("Episode","")
    
    df["FamilyA"] = df.EpisodeTitle.apply(lambda x : get_family(x, 'A'))
    df["FamilyB"] = df.EpisodeTitle.apply(lambda x : get_family(x, 'B'))
    
    df["Notes"] = df.EpisodeTitle.apply(lambda x : get_notes(x))
    
    df = df[['SeasonNum', 'EpisodeNum', 'EpisodeTitle', 'AirDate','FamilyA','FamilyB','Notes','num']]

    return df

In [None]:
df = pd.DataFrame()
for season in range(14,25):
    df_ = get_episode_data(season)
    df = pd.concat([df,df_])
    
df_trakt = df

## Final

In [None]:
df_trakt;

In [None]:
df_trakt.groupby('SeasonNum').count()['EpisodeNum']

# Combo

In [None]:
df = pd.merge(df_trakt, df_guide,  how='left', left_on=['SeasonNum','EpisodeNum'], 
              right_on = ['SeasonNum','EpisodeNum'], suffixes=['_trakt','_guide'])

In [None]:
df[df.SeasonNum == 18]