In [16]:
import numpy as np
import pandas as pd
import re
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings("ignore")

In [54]:
from bs4 import BeautifulSoup
import requests

import missingno as ms
import time
import random

In [18]:
import datefinder
# Example: find_dates() scans free text and yields the dates it recognises,
# here the air date embedded in a blog post title.
string_with_dates = "“Family Feud” 11/23/22"
matches = datefinder.find_dates(string_with_dates)
# Take the first match; the trailing ';' suppresses the cell output.
next(matches);

<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# Philo
Saved to DVR on Philo

In [19]:
# Page source of the full Philo library.
# NOTE: expand the library completely before saving the page source, otherwise
# the episodes are not loaded into the HTML.
with open('data/philo.txt') as philo_source:
    lines = list(philo_source)

In [20]:
# Join the raw lines into one string. Each element from readlines() still ends
# with its own "\n", so newlines are NOT removed here; the ones that land inside
# a title are stripped when the titles are cleaned in the next cell.
text = ' '.join(lines)

In [21]:
# Every episode entry in the page source starts with 'Family Feud, '
eps = text.split('Family Feud, ')

titles = []
# eps[0] is everything before the first episode, so skip it
for chunk in eps[1:]:
    # The title lives in the first 28 characters, up to the end of the element
    title = chunk[:28].split(">")[0]
    # Some titles are broken across lines
    title = title.replace("\n  ", "")
    # Some titles are missing the space between "Episode" and the number;
    # add one everywhere, then collapse the doubled space where it already existed
    title = title.replace("Episode", "Episode ").replace("Episode  ", "Episode ")
    titles.append(title)

In [22]:
# Keep only letters, digits, commas and spaces in each title
titles = [re.sub('[^a-zA-Z0-9, ]', '', tit) for tit in titles]

In [23]:
# Reduce each title to "season,episode" digits and split into the two numbers
title_nums = [re.sub('[^0-9,]', '', tit).split(",") for tit in titles]

# One row per distinct (season, episode) pair, as integers, in broadcast order
df = (
    pd.DataFrame(title_nums, columns=['SeasonNum', 'EpisodeNum'])
    .drop_duplicates()
    .astype(int)
    .sort_values(by=['SeasonNum', 'EpisodeNum'])
    .reset_index(drop=True)
)

In [24]:
# Total number of distinct (season, episode) pairs found in the Philo library
len(df)

864

In [25]:
# Sample of Philo episodes.
# Sorting before a random draw has no effect on what is drawn, so it is dropped;
# the fixed random_state makes the displayed sample reproducible on re-run.
df.sample(n=10, random_state=42)

Unnamed: 0,SeasonNum,EpisodeNum
244,17,161
631,21,18
772,22,45
31,15,30
530,19,188
143,16,113
689,21,96
53,15,66
257,17,174
296,18,17


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# TV Guide
TV Guide has the best family name data (which is nearly complete) for the early seasons of the show. In some cases, the hometowns are also provided in the description. The date appears to be the date of the last airing on TV (according to Guide), not the original air date.

In [31]:
# Already downloaded lists of episodes by season from TV Guide
# Each season is in a separate text file
# Starts at Season 14 (Steve's first season), no season 12 or 13

def tv_guide_data(season_number):
    """Build a DataFrame of TV Guide episode listings for one season.

    Reads ``data/tv_guide/S{season_number} TV Guide.txt``, splits the text into
    one record per episode, strips TV Guide's boilerplate description phrases
    and derives episode numbers, family names and notes from the episode title.

    Parameters
    ----------
    season_number : int
        Season whose text file should be parsed.

    Returns
    -------
    pandas.DataFrame
        One row per episode with the columns ``SeasonNum``, ``EpisodeNum``,
        ``Season``, ``Episode``, ``EpisodeTitle``, ``FamilyA``, ``FamilyB``,
        ``TVGuideTime``, ``TVGuideDescription`` and ``Notes``.

    Raises
    ------
    FileNotFoundError
        If the text file for the season does not exist.
    ValueError
        If an episode record splits into more than four fields.
    """
    # Read file by season number (14-24)
    with open(f'data/tv_guide/S{season_number} TV Guide.txt') as f:
        lines = f.readlines()

    # Flatten to one string. Joining with ' ' and turning each "\n" into ' '
    # leaves a double space wherever the file had a line break; that double
    # space is the field separator used further down.
    x = ' '.join(lines).replace("\n", " ")

    # Split on the word "Episode"; the first two pieces are page header, not episodes
    x = x.replace(" Episode ", "###Episode ").split("###")[2:]

    # Remove repeated description text from episode descriptions
    # Aim to only keep family names and hometowns
    remove_text = ["""Comedian Steve Harvey hosts as the """,
        """Families compete against each by guessing answers to various surveys""",
        """Families compete to guess the most popular answers to various survey questions""",
        """in this durable game show""",
        """Steve Harvey hosts the long-running classic, in which two families compete to guess the answers to various surveys""",
        """Steve Harvey hosts the network\'s new season of the long-running classic, in which two families compete to guess the answers to various surveys""",
        """Steve Harvey hosts the the long-running classic, in which two families compete to guess the answers to various surveys""",
        """Steve Harvey hosts this hilarious version of the long-running classic, in which two families compete to guess the answers to various surveys""",
        """Steve Harvey plays host to two teams, each comprised of five family members, who try to match the answers given to survey questions asked to groups of people""",
        """Steve Harvey presents two families who battle it out by answering survey questions for a chance to win cash and prizes""",
        """The durable game show in which two teams of five relatives compete for cash and prizes by guessing the most popular answers to questions based on what the "survey said" in polls conducted with 100 people""",
        """Two families battle against each other by guessing the answers to survey questions""",
        """Two families battle each other by trying to match the answers to the survey questions""",
        """Two families compete by trying to match the answers to survey questions given to a group of people""",
        """Two families compete to guess answers to various surveys""",
        """Two families compete to guess the answers to surveys""",
        """Two families compete to guess the answers to various surveys""",
        """Two families compete to guess the answers to various surveys. Steve Harvey hosts""",
        """Two families of five face off to guess the answers with the results of a survey given to a group of people""",
        """Two families of five face off to guess the answers with the results of a survey of one hundred people""",
        """Two families of five face off to name the top responses to questions posed to 100 people""",
        """Two families of five try to guess what the "survey said" in polls conducted with 100 people""",
        """Two families of five try to guess what the "survey said" in polls conducted with 100 people in this durable game show""",
        """Two families try to guess what the "survey said" in polls conducted with 100 people""",
        """Two new families compete for the right to play the returning champions in this classic game show""",
        """Two teams of families try to guess the answers to different survey questions""",
        """Two teams play against each other to win the grand prize by answering the most popular responses to surveys""",
        """Where To Watch""",
        """.Where To Watch"""
    ]

    # Replace each boilerplate phrase with a space (list order is preserved on purpose)
    for phrase in remove_text:
        x = [a.replace(phrase, " ") for a in x]

    has_alnum = re.compile('[a-zA-Z0-9]')

    def non_empty(words):
        """Keep only the fields that contain at least one letter or digit."""
        return [word for word in words if has_alnum.search(word)]

    # Split each record into fields on the double space and drop empty fields
    x = [non_empty(a.split("  ")) for a in x]

    non_letters = re.compile('[^a-zA-Z]')

    def description_stop_words(text):
        """ Remove words that add no info to descriptions (keep only family names and hometowns/state)"""
        if not text:
            return text

        # Remove non-letters and add leading/trailing spaces so every word is space-delimited
        text = non_letters.sub(' ', text)
        text = ' ' + text + ' '

        # Replace stop words with a space. Matching is sequential and space-delimited,
        # so the order and the repeated entries in this list matter.
        to_replace = ['"',' to ',' the ',' various ',' family ',' Family ', " a ", " against ", " and ", " answer ", " answers ", " are ", " based ", " battle ", " battles ", " between ", " board ", " by ", " chance ", " compete ", " competes ", " each ", " face ", " families ", " Fast ", " five ", " for ", " from ", " game ", " go ", " guess ", " guessing ", " Harvey ", " hosted ", " hosts ", " in ", " it ", " more ", " off ", " on ", " one ", " other ", " out ", " people ", " play ", " player ", " questions ", " quiz ", " responses ", " round ", " rounds ", " several ", " Show ", " Steve ", " survey ", " survey-driven ", " surveys ", " teams ", " the ", " The ", " to ", " top ", " try ", " Two ", " up ", " various ", " with ",]
        for t in to_replace:
            text = text.replace(t, ' ')

        return text.strip()

    # Some seasons have zero descriptions, so no record has a fourth field;
    # pandas then rejects the four-column layout with a ValueError.
    try:
        df = pd.DataFrame(x, columns=['Episode','EpisodeTitle','TVGuideTime','TVGuideDescription'])
    except ValueError:
        df = pd.DataFrame(x, columns=['Episode','EpisodeTitle','TVGuideTime'])

    df = df.fillna("")

    # Remove stop words if descriptions exist, else create empty column
    if "TVGuideDescription" in df.columns:
        df["TVGuideDescription"] = df["TVGuideDescription"].apply(description_stop_words)
    else:
        df["TVGuideDescription"] = ""

    def get_episode_number(text):
        """Return the episode number as int, or the leftover text if it is not numeric."""
        t = text.replace("Episode", "")
        try:
            return int(t)
        except ValueError:
            return t

    def get_family(title, letter):
        """Return the family left ('A') or right ('B') of ' vs ' in a title."""
        if letter == 'A':
            return title.split(' vs ')[0].strip()
        if letter == 'B':
            try:
                return title.split(' vs ')[1].split(' Big Money')[0].strip()
            except IndexError:
                # Title has no ' vs ' separator
                return ""

    def get_notes(title):
        """Flag Big Money Tournament episodes."""
        return "Big Money Tournament" if "Big Money Tournament" in title else ""

    letters_and_spaces = re.compile("[^a-zA-Z ]")
    df["EpisodeTitle"] = df["EpisodeTitle"].map(lambda title: letters_and_spaces.sub("", title))
    # Both removals are substring removals. The original chained Series.replace,
    # which only matches whole cells and therefore never removed "Episode".
    df["EpisodeTitle"] = (
        df["EpisodeTitle"]
        .str.replace(" Family", "", regex=False)
        .str.replace("Episode", "", regex=False)
    )

    # Create joinable columns
    df["EpisodeNum"] = df["Episode"].apply(get_episode_number)
    df["Season"] = f"Season {season_number}"
    df["SeasonNum"] = season_number

    # Get family names from episode title
    df["FamilyA"] = df["EpisodeTitle"].apply(lambda title: get_family(title, 'A'))
    df["FamilyB"] = df["EpisodeTitle"].apply(lambda title: get_family(title, 'B'))

    df["Notes"] = df["EpisodeTitle"].apply(get_notes)
    # Drop the tournament prefix and the standalone word "the". The word boundary
    # keeps names that merely contain those letters (e.g. "Matthews") intact.
    df["EpisodeTitle"] = (
        df["EpisodeTitle"]
        .str.replace("Big Money Tournament Featuring", "", regex=False)
        .str.replace(r"\bthe\b", "", regex=True)
    )

    # Reorder Columns
    df = df[['SeasonNum', 'EpisodeNum', 'Season', 'Episode', 'EpisodeTitle','FamilyA','FamilyB','TVGuideTime', 'TVGuideDescription','Notes']]

    return df

In [32]:
# Run 'tv_guide_data' for all seasons (14-24) and combine into a single dataframe.
# The per-season frames are collected first and concatenated once, instead of
# re-copying the growing frame on every iteration.
df_guide = pd.concat(
    [tv_guide_data(season) for season in range(14, 25)],
    ignore_index=True,
)

# Keep the notebook's generic `df` name pointing at the same frame, as before
df = df_guide

In [33]:
# Preview the first rows of the combined TV Guide table
df_guide.head()

Unnamed: 0,SeasonNum,EpisodeNum,Season,Episode,EpisodeTitle,FamilyA,FamilyB,TVGuideTime,TVGuideDescription,Notes
0,14,1,Season 14,Episode 1,Pawleck vs Millsap,Pawleck,Millsap,"Sat, Jul 13, 2019 30 mins",Pawleck Millsap,
1,14,2,Season 14,Episode 2,Holcomb vs McKenzie,Holcomb,McKenzie,"Sat, May 19, 2018 30 mins",Holcomb vs McKenzie,
2,14,3,Season 14,Episode 3,McKenzie vs Carlyle,McKenzie,Carlyle,"Sat, May 19, 2018 30 mins",McKenzie vs Carlyle,
3,14,4,Season 14,Episode 4,Thomas vs Hartman,Thomas,Hartman,"Mon, May 14, 2018 30 mins",Thomas vs Hartman,
4,14,5,Season 14,Episode 5,Moreland vs Hartman,Moreland,Hartman,"Mon, May 14, 2018 30 mins",Moreland vs Hartman,


In [41]:
# Evaluate the frame without rendering it (the trailing ';' suppresses output)
df_guide;

In [42]:
# Overall number of TV Guide rows
print(f"Total Episodes: {len(df_guide)} \n")

# Number of rows per season
df_guide.groupby('SeasonNum')['EpisodeNum'].count()

Total Episodes: 2013 



SeasonNum
14    168
15    179
16    180
17    200
18    200
19    200
20    200
21    200
22    153
23    245
24     88
Name: EpisodeNum, dtype: int64

<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# trakt.tv (Powered by DirecTV?)

In [89]:
def get_episode_data(season):
    """Scrape one season of Family Feud (2010) episode listings from trakt.tv.

    Parameters
    ----------
    season : int
        Season number used in the trakt.tv season URL.

    Returns
    -------
    pandas.DataFrame
        One row per episode with the columns ``SeasonNum``, ``EpisodeNum``,
        ``EpisodeTitle``, ``AirDate``, ``FamilyA``, ``FamilyB``, ``Notes`` and
        ``num`` (the raw ``<season>x<episode>`` label).

    Raises
    ------
    requests.HTTPError
        If trakt.tv answers with an error status, so a failed request cannot
        silently turn into a season with zero episodes.
    requests.Timeout
        If the server does not answer within 30 seconds.
    """
    # Get the webpage
    url = f"https://trakt.tv/shows/family-feud-2010/seasons/{season}"
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    # Create a BeautifulSoup object
    soup = BeautifulSoup(r.text, "html.parser")

    # One "row fanarts sortable" div per episode
    episodes = soup.find_all("div", class_="row fanarts sortable")

    rows = [
        [
            # Season/Episode number e.g. 16x27
            ep.find("span", class_="main-title-sxe").text,
            # Episode title, families
            ep.find("span", class_="main-title").text,
            # Date
            ep.find("span", class_="convert-date").text,
        ]
        for ep in episodes
    ]

    df = pd.DataFrame(rows, columns=['num', 'EpisodeTitle', 'AirDate'])

    # Seems like accurate air dates
    df.AirDate = pd.to_datetime(df.AirDate).dt.date

    def get_family(title, letter):
        """Return the family left ('A') or right ('B') of ' v ' / ' vs ' in a title."""
        if letter == 'A':
            return re.split(' v | vs ', title)[0].strip()
        if letter == 'B':
            try:
                return re.split(' v | vs ', title)[1].split(' Big Money')[0].strip()
            except IndexError:
                # Title has no ' v ' / ' vs ' separator
                return ""

    def get_notes(title):
        """Flag Big Money Tournament episodes."""
        return "Big Money Tournament" if "Big Money Tournament" in title else ""

    df["SeasonNum"] = df.num.apply(lambda label: int(label.split('x')[0]))
    df["EpisodeNum"] = df.num.apply(lambda label: int(label.split('x')[1]))

    # Raw string: '\-' in a normal string literal is an invalid escape sequence
    allowed_title_chars = re.compile(r"[^a-zA-Z'‘’\- ]")
    df["EpisodeTitle"] = df.EpisodeTitle.apply(lambda title: allowed_title_chars.sub("", title))
    df["EpisodeTitle"] = df.EpisodeTitle.str.replace(" Family", "", regex=False)
    df["EpisodeTitle"] = df.EpisodeTitle.str.replace("Episode", "", regex=False)
    # Known typo in the Trakt listing
    df["EpisodeTitle"] = df.EpisodeTitle.str.replace("Summer vs Grace", "Summers vs Grace", regex=False)

    df["FamilyA"] = df.EpisodeTitle.apply(lambda title: get_family(title, 'A'))
    df["FamilyB"] = df.EpisodeTitle.apply(lambda title: get_family(title, 'B'))

    df["Notes"] = df.EpisodeTitle.apply(get_notes)

    df = df[['SeasonNum', 'EpisodeNum', 'EpisodeTitle', 'AirDate','FamilyA','FamilyB','Notes','num']]

    return df

In [90]:
# Scrape seasons 14-24 and combine them with a single concat.
# ignore_index gives the combined frame unique row labels; previously every
# season restarted at 0, leaving duplicate index labels.
df_trakt = pd.concat(
    [get_episode_data(season) for season in range(14, 25)],
    ignore_index=True,
)

# Keep the notebook's generic `df` name pointing at the same frame, as before
df = df_trakt

In [91]:
# (misspelling on Trakt, corrected spelling) pairs, applied in order as
# substring replacements. The duplicated ("Pagn", "Pagan") entry was removed.
replacements = (("Monts","Montas"),
    ("Hamshudin","Shamshudin"),
    ("Alamand","Almand"),
    ("Midthum","Midthun"),
    ("Dehart","DeHart"),
    ("KcKeon","McKeon"),
    ("De La Rose","De La Rosa"),
    ("St Fleur","St. Fleur"),
    ("Loeher","Loehler"),
    ("Callagher","Gallagher"),
    ("Lahtam","Latham"),
    ("Rettmann","Rettman"),
    ("Chulner","Schulner"),
    ("Venkataram","Venkatram"),
    ("Marriett","Merriett"),
    ("Shellnutt","Shelnutt"),
    ("Hlases","Hlas"),
    ("Newbert","Neubert"),
    ("Leclair","LeClair"),
    ("Pagn","Pagan"),
    ("Bellefant","Bellenfant"),
    ("Benson Jaja","Benson-Jaja"),
    ("McCary","McCrary"),
    ("Buchholz","Buccholz"),
    ("Suddth","Sudduth"))

In [92]:
# Apply each spelling correction as a substring (regex) replacement across the frame
for misspelled, corrected in replacements:
    df_trakt = df_trakt.replace(misspelled, corrected, regex=True)

In [93]:
# Evaluate the frame without rendering it (the trailing ';' suppresses output)
df_trakt;

In [94]:
# Number of Trakt episodes per season
df_trakt.groupby('SeasonNum')['EpisodeNum'].count()

SeasonNum
14    180
15    180
16    180
17    200
18    200
19    200
20    200
21    181
22    166
23     89
24     18
Name: EpisodeNum, dtype: int64

<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# bobbymgsk

In [64]:
# Download every page of the blog's "Family Feud" category (126 pages) and
# parse them as one document.
page_html = []
for page in range(1,127):
    print(page)
    url = f'https://bobbymgsk.wordpress.com/category/family-feud/page/{page}/'
    # Separate name for the response so the loop counter is not overwritten
    response = requests.get(url, timeout=30)
    # Stop on an error page rather than mixing its HTML into the corpus
    response.raise_for_status()
    page_html.append(response.text)
    # Random pause of up to 5 seconds between requests
    time.sleep(5*random.random())

pages_str = "".join(page_html)
soup = BeautifulSoup(pages_str, 'html.parser')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126


In [65]:
# # Read in massive 126 page html file, we love you Bobby McBride <3
# # File is the fully (?) expanded version of the "Family Feud" category from blog
# with open(f"bobbymgsk.txt", encoding="UTF-8") as f:
#     r = f.read()

# # Replace space symbol that decoder missed
# r = r.replace('\xa0', ' ')
# r = r.replace('\n', ' ')

# # SLOW # RUNS VERY SLOW #
# # Parse to soup and collect list of posts
# soup = BeautifulSoup(r, 'html.parser')

### Posts

In [69]:
# Collect every blog post container (most posts contain two episodes).
# Number of posts is 1260 as of 12/29/2022
posts = soup.find_all("div", class_="post")
len(posts)

1260

In [73]:
# Games can start with random titles ...yay...
# the following list of titles each designate a new game
with open("game_titles.txt", "r", encoding="UTF-8") as f:
    game_titles = [x.strip().replace('"', '') for x in f]

# Drop blank entries: an empty string is "in" every paragraph, so a single
# blank line in the file would make every <p> tag look like a game heading.
game_titles = [title for title in game_titles if title]

game_titles[0:10];

In [75]:
def get_post_title(post):
    """Return the text of the first link (``<a href=...>``) in a post.

    Returns "No Title Found" when the post has no such link. Only the
    missing-element case is handled; other errors are no longer swallowed.
    """
    try:
        return post.find('a', href=True).text
    except AttributeError:
        # post.find() returned None (no link), or post itself is None
        return "No Title Found"

def get_family_info(title, num, titles=None):
    """Extract one family's name and parenthesised details from a game heading.

    The text between the first and second ':' of ``title`` is split on the
    versus separator (" v. ", " v ", " vs. " or " vs "); ``num`` selects which
    side to return.

    Parameters
    ----------
    title : str
        Full game heading, e.g. "GAME 1: Smiths (Atlanta, GA) v. Joneses".
    num : int
        0 for the first family, 1 for the second.
    titles : iterable of str, optional
        Game-title phrases to remove from the family text. Defaults to the
        notebook-level ``game_titles`` list.

    Returns
    -------
    tuple
        ``(name, info)`` where ``name`` is the family name and ``info`` is a
        list of the "(...)" groups that follow it. ``("", [])`` when the
        heading has no ':' or the requested side is missing.
    """
    if titles is None:
        titles = game_titles

    parts = title.split(':')
    if len(parts) < 2:
        # No ':' in the heading, so there is nothing after the game label
        return "", []

    # The dots are escaped so " v. " only matches a literal period
    # (unescaped, it also split on text such as " va ").
    sides = re.split(r' v\. | v | vs\. | vs ', parts[1])
    try:
        family = sides[num]
    except IndexError:
        family = ""

    for g in titles:
        family = family.replace(g, '').replace(':', '').strip()
    family = family.strip()

    # Cut in front of every '(' so each parenthesised group is its own element
    pieces = family.replace('(', '$$@(').split('$$@')
    name = pieces[0].strip()
    info = pieces[1:]
    return name, info

def get_post_version(title):
    """Classify a post as "Celebrity" when its title mentions Celebrity, else "Standard"."""
    return "Celebrity" if "Celebrity" in title else "Standard"

def has_multiple_backslashes(string):
    """Return True when ``string`` contains more than one slash of the same kind.

    get_post_date() uses this to decide whether a post title carries its own
    date (e.g. "11/23/22"). Dates in titles are written with forward slashes,
    so counting only backslashes never matched; both kinds are counted now.
    The function name is kept so existing callers continue to work.
    """
    return string.count('/') > 1 or string.count('\\') > 1

def get_post_date(post, post_title):
    """Return the first date found in a post, or "" when none is found.

    Candidate texts are the post's <span> elements and the post title. When
    has_multiple_backslashes() reports that the title looks like it carries a
    date, the title is searched first; otherwise it is searched last.
    """
    span_texts = [span.text for span in post.find_all('span')]
    if has_multiple_backslashes(post_title):
        candidates = [post_title] + span_texts
    else:
        candidates = span_texts + [post_title]

    for candidate in candidates:
        try:
            found = next(datefinder.find_dates(candidate), None)
        except Exception:
            # Parsing problems on one candidate should not stop the search,
            # but KeyboardInterrupt / SystemExit are no longer swallowed.
            continue
        if found is not None:
            return found
    return ""

def get_games_from_post(post):
    """List the games announced in a post.

    A <p> element counts as a game heading when its text contains any entry of
    ``game_titles``. Each result is ``[full heading text, header, info]``, where
    header/info come from the part before the first ':' split in front of
    every '('.
    """
    games = []
    for paragraph in post.find_all('p'):
        if not any(known_title in paragraph.text for known_title in game_titles):
            continue
        game_title = paragraph.text
        label = game_title.split(':')[0]
        game_header, *game_info = label.replace('(', '$$@(').split('$$@')
        games.append([game_title, game_header, game_info])
    return games

In [79]:
# Every game ends with a Fast Money round (always written "FM:"), so a post with
# more "FM:" paragraphs than recognised game headings is missing crucial
# information such as a title or family list. Print those posts for inspection.
for post in posts:
    paragraph_texts = [p_tag.text for p_tag in post.find_all('p')]

    # Number of fast money rounds in this post
    fastmoney_count = sum('FM:' in paragraph for paragraph in paragraph_texts)

    # Post title first, then every paragraph recognised as a game heading
    games = [get_post_title(post)]
    games += [
        paragraph for paragraph in paragraph_texts
        if any(ele in paragraph for ele in game_titles)
    ]

    if len(games) < fastmoney_count+1:
        for d in games: print(d)

“Family Feud” 11/5/10
“Family Feud” 11/4/10
“Family Feud” 11/3/10
“Family Feud” 11/2/10
“Family Feud” 2/11/13- FIRST-RUN REMATCH
“Family Feud” 10/19/10
“Family Feud” 10/16/12


In [81]:
# Build one record per game: post-level fields followed by game-level fields
game_count = 0
all_games = []
for post in posts:
    # Same helper as the diagnostic cell above, so a post without a link gets
    # "No Title Found" instead of raising AttributeError
    post_title = get_post_title(post)
    post_version = get_post_version(post_title)
    post_date = get_post_date(post, post_title)

    for game_title, game_header, game_info in get_games_from_post(post):
        nameFamilyA, infoFamilyA = get_family_info(game_title, 0)
        nameFamilyB, infoFamilyB = get_family_info(game_title, 1)

        all_games.append([post_title, post_version, post_date,
                          game_title, game_header, game_info,
                          nameFamilyA, nameFamilyB, infoFamilyA, infoFamilyB])

In [82]:
games = pd.DataFrame(all_games, columns=['PostTitle', 'Version', 'PostDate',
                                         'GameTitle', 'GameHeader', 'GameInfo',
                                         'FamilyA', 'FamilyB', 'FamilyAInfo', 'FamilyBInfo'])

# Posts without a detectable date become NaT and then ""
games['PostDate'] = pd.to_datetime(games['PostDate'], errors='coerce').dt.date
games = games.fillna("")

# Strip whitespace from string cells only. The previous `.str.strip()` returned
# NaN for every non-string cell, and the following fillna("") then blanked
# PostDate (date objects) and the list-valued *Info columns.
for column in games.columns:
    games[column] = games[column].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

### Replace typos from Bobby
Determined by comparing against the Trakt titles; mostly confirmed via video clips. Why does he pluralize last names? Agh!

In [85]:
# (typo in the blog, corrected spelling) pairs, applied in order as substring
# replacements. The duplicated ('McLards', 'McClards') entry was removed.
replacements = (('Crovetttos','Crovettos'),
    ('Collingsworth','Hollingsworth'),
    ('Aladenoyers','Aladenoyes'),
    ('McLards','McClards'),
    ('Olszweski','Olszewski'),
    ('Hlaseses','Hlases'),
    ('Cartotenutos','Carotenutos'),
    ('Boulhac','Roulhac'),
    ('Coanses','Coans'),
    ('Chesnut','Chestnut'),
    ('Keningsbergs','Kenigsberg'),
    ('Mokiaoses','Mokiaos'))

In [86]:
# Apply each typo correction as a substring (regex) replacement across the frame
for typo, corrected in replacements:
    games = games.replace(typo, corrected, regex=True)

### De-pluralize family names

In [88]:
# All non-blank family names known to Trakt (both sides of every game)
trakt_names = [
    t.strip()
    for t in list(df_trakt.FamilyA) + list(df_trakt.FamilyB)
    if len(t.strip()) > 0
]
# Distinct Trakt names ending in 'e' (e.g. a plural "...es" may only need its 's' removed)
trakt_names_ending_e = list({t for t in trakt_names if t.endswith('e')})


def get_non_plural_family(name, df_trakt=None, names_ending_e=None):
    """Return the singular form of a pluralised family name.

    Rules, applied to the stripped name:
      * 'Wis' is returned unchanged.
      * "...es": drop only the 's' when the result is a known Trakt name ending
        in 'e'; otherwise drop 'es'.
      * "...s": drop the 's'.
      * anything else is returned unchanged.

    Parameters
    ----------
    name : str
        Family name as written on the blog. Non-string values (e.g. NaN) are
        returned unchanged.
    df_trakt : optional
        Unused; kept for backward compatibility. It now has a default so the
        function can be passed directly to ``Series.apply`` (it was required
        before, which made ``.apply(get_non_plural_family)`` raise TypeError).
    names_ending_e : collection of str, optional
        Known singular names ending in 'e'. Defaults to the notebook-level
        ``trakt_names_ending_e``.
    """
    if not isinstance(name, str):
        return name

    name = name.strip()
    if name in ('Wis',):
        return name

    if name.endswith('es'):
        if names_ending_e is None:
            names_ending_e = trakt_names_ending_e
        without_s = name[:-1]
        return without_s if without_s in names_ending_e else name[:-2]

    if name.endswith('s'):
        return name[:-1]

    return name

# De-pluralised copies of both family columns. df_trakt is passed explicitly
# because get_non_plural_family(name, df_trakt) takes it as a second argument;
# calling .apply() without it raises TypeError.
games['FamilyA_'] = games['FamilyA'].apply(get_non_plural_family, args=(df_trakt,))
games['FamilyB_'] = games['FamilyB'].apply(get_non_plural_family, args=(df_trakt,))

# Normalise curly apostrophes to straight ones across the frame
games = games.replace("’", "'", regex=True)

NameError: name 'df_trakt' is not defined

### Save game list to .csv

In [None]:
# Persist the cleaned game list; the DataFrame index is written as the first, unnamed column
games.to_csv('games.csv')

### Testing: Count of Game Headers

In [None]:
# Frequency of each game header (the text before the first ':')
game_headers = [header.split(":")[0] for header in games.GameHeader]
game_titles_df = pd.DataFrame(game_headers).value_counts()
game_titles_df;

<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# All

### Create empty dataframe for all episodes (season, episode dyad) based on IMDB counts

In [66]:
# From IMDB
# Maps season number -> number of episodes; used below to generate every
# (season, episode) pair. These counts do not match the Trakt per-season
# counts shown earlier for every season (e.g. season 21: 175 here vs 181 on Trakt).
episode_count = {
    14: 180,
    15: 180,
    16: 180,
    17: 200,
    18: 200,
    19: 200,
    20: 200,
    21: 175,
    22: 166,
    23: 180,
    24: 81
}

In [67]:
# One [season, episode] pair for every episode of seasons 14-24
all_eps = [
    [season, episode]
    for season in range(14, 25)
    for episode in range(1, episode_count[season] + 1)
]
df_all = pd.DataFrame(all_eps, columns=['SeasonNum','EpisodeNum'])

In [None]:
# Overall size of the episode grid, then the number of rows per season
print(f"Total Episodes: {len(df_all)} \n")
df_all.groupby('SeasonNum').count()

### Join Trakt (best family name data)

In [None]:
# Left join keeps every (season, episode) pair, with Trakt columns where available
df_all = df_all.merge(df_trakt, on=['SeasonNum', 'EpisodeNum'], how='left')

### Join IMDB Season 23 Air Dates (Trakt has poor season 23/24 coverage)

In [None]:
# NOTE(review): df_imdb23 is only created by the final cell of this notebook
# (read from imdb_season_23_airdates.txt). On a fresh kernel that cell has to
# run before this one, otherwise this merge raises NameError.
# Left join adds the IMDb air date column (AirDate1) where a pair matches.
df_all = pd.merge(df_all, df_imdb23, 
              left_on = ['SeasonNum', 'EpisodeNum'], 
              right_on = ['SeasonNum', 'EpisodeNum'],
              how='left')

In [None]:
# Prefer the Trakt air date; where it is missing, fall back to AirDate1 from the IMDb merge
df_all['AirDate'] = df_all['AirDate'].combine_first(df_all['AirDate1'])
# Normalise the combined column to plain date values
df_all['AirDate'] = pd.to_datetime(df_all['AirDate']).dt.date

In [None]:
# Latest 500 episodes, newest season/episode first
df_all.sort_values(by=['SeasonNum','EpisodeNum'], ascending=False).head(500)

<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


<span>-------------------------------------------------------------------------------------------------------------------------------------------------------


# Combo

In [None]:
# Attach the matching de-pluralised blog names to each Trakt episode (left join on both families)
bobby_names = games[['FamilyB_', 'FamilyA_']]
df = df_trakt.merge(
    bobby_names,
    left_on=['FamilyA', 'FamilyB'],
    right_on=['FamilyA_', 'FamilyB_'],
    how='left',
)

### Search by family name in bobbymgsk (both plural and de-pluralized)

In [None]:
# Rows where the name appears as either family, in plural or de-pluralised form
name = 'Summers'
name_columns = ['FamilyA', 'FamilyB', 'FamilyA_', 'FamilyB_']
name_search = games[games[name_columns].eq(name).any(axis=1)]

name_search

### Search by family name in Trakt

In [None]:
# Rows where the name appears as either family on Trakt
name = 'Montas'
name_search = df_trakt[df_trakt[['FamilyA', 'FamilyB']].eq(name).any(axis=1)]

name_search

### Join Trakt and bobbymgsk

In [None]:
# Look up season/episode numbers for each blog game via the de-pluralised family names
df = games[['FamilyB_', 'FamilyA_']].merge(
    df_trakt[['FamilyB', 'FamilyA','EpisodeNum','SeasonNum']],
    left_on=['FamilyA_', 'FamilyB_'],
    right_on=['FamilyA', 'FamilyB'],
    how='left',
)

# Unmatched games get 0 for both numbers and are then dropped
episode_keys = ['SeasonNum','EpisodeNum']
df[episode_keys] = df[episode_keys].fillna(0).astype(int)
df = df[df.SeasonNum != 0]

In [None]:
# Order the matched games by season, then episode
df = df.sort_values(['SeasonNum', 'EpisodeNum'])

In [None]:
# Each line holds "season,episode,air date[,...]"; keep the first three fields
with open("imdb_season_23_airdates.txt", "r") as airdates_file:
    lines = [raw.replace(',\n', '').split(',')[:3] for raw in airdates_file]

df_imdb23 = pd.DataFrame(lines, columns=['SeasonNum','EpisodeNum','AirDate1'])
df_imdb23 = df_imdb23.astype({'SeasonNum': int, 'EpisodeNum': int})
df_imdb23['AirDate1'] = pd.to_datetime(df_imdb23['AirDate1'])