In [8]:
import pandas as pd

# trakt.tv (Powered by DirecTV?)

In [9]:
# Characters that are stripped from a raw title: anything that is not an ASCII
# letter, a straight or curly apostrophe, a hyphen, or a space.
_TRAKT_TITLE_JUNK = r"[^a-zA-Z'‘’\- ]"

# Column order of the frame returned by get_trakt_data.
_TRAKT_COLUMNS = ['SeasonNum', 'EpisodeNum', 'EpisodeTitle', 'AirDate', 'FamilyA', 'FamilyB']


def _clean_trakt_title(raw_title):
    """Reduce a raw Trakt episode title to the bare "<A> vs <B>" text."""
    title = re.sub(_TRAKT_TITLE_JUNK, "", raw_title)

    # Drop the repeated word " Family" and the placeholder word "Episode"
    # (untitled episodes are listed as "Episode <n>"; the digits are already gone).
    title = title.replace(" Family", "").replace("Episode", "")

    # Replace misspelled name. The original code marks this step "do not move":
    # it has to run after " Family" has been removed so the literal can match.
    title = title.replace("Summer vs Grace", "Summers vs Grace")

    # Placeholder titles collapse to "" rather than a lone space.
    return title.strip()


def _split_trakt_families(title):
    """Return (FamilyA, FamilyB) parsed from a cleaned "<A> v|vs <B>" title."""
    sides = re.split(' v | vs ', title)
    family_a = sides[0].strip()
    # No separator means there is no second family; report it as "".
    family_b = sides[1].split(' Big Money')[0].strip() if len(sides) > 1 else ""
    return family_a, family_b


def get_trakt_data(season):
    """Scrape the episode list of one Family Feud season from trakt.tv.

    Relies on `re`, `requests` and `BeautifulSoup` being available in the
    notebook namespace.
    NOTE(review): none of the three is imported in the visible import cell --
    confirm they are imported before this cell runs on a fresh kernel.

    Parameters
    ----------
    season : int or str
        Season number; interpolated into the season URL.

    Returns
    -------
    pandas.DataFrame
        One row per episode div found on the page, with columns
        SeasonNum (int), EpisodeNum (int), EpisodeTitle (cleaned title,
        "" for untitled placeholder episodes), AirDate (datetime.date),
        FamilyA and FamilyB (family names parsed from the title, "" when
        they cannot be parsed).

    Raises
    ------
    requests.HTTPError
        If trakt.tv answers with an error status, instead of silently
        parsing the error page into an empty frame.
    """
    # Parse html content to soup
    url = f"https://trakt.tv/shows/family-feud-2010/seasons/{season}"
    response = requests.get(url, timeout=30)  # never hang forever on a stalled connection
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Keep SeasonNum/EpisodeNum, Title (families), and AirDate (which appear mostly accurate)
    records = []
    for ep in soup.find_all("div", class_="row fanarts sortable"):
        ep_id = ep.find("span", class_="main-title-sxe").text     # Season/Episode number e.g. 16x27
        raw_title = ep.find("span", class_="main-title").text     # Episode title, families
        air_date = ep.find("span", class_="convert-date").text    # Date

        id_parts = ep_id.split('x')
        title = _clean_trakt_title(raw_title)
        family_a, family_b = _split_trakt_families(title)

        records.append({
            'SeasonNum': int(id_parts[0]),
            'EpisodeNum': int(id_parts[1]),
            'EpisodeTitle': title,
            'AirDate': air_date,
            'FamilyA': family_a,
            'FamilyB': family_b,
        })

    # Build the frame once, already in its final column order
    df = pd.DataFrame(records, columns=_TRAKT_COLUMNS)

    # Convert dates to datetime.date objects
    df['AirDate'] = pd.to_datetime(df['AirDate']).dt.date
    return df

In [10]:
# Misspelling -> corrected spelling for family names found in the Trakt data.
# A dict keeps every misspelling unique (the original tuple listed
# "Pagn" -> "Pagan" twice). Insertion order is the order of application.
_TRAKT_TYPO_FIXES = {
    "Monts": "Montas",
    "Hamshudin": "Shamshudin",
    "Alamand": "Almand",
    "Midthum": "Midthun",
    "Dehart": "DeHart",
    "KcKeon": "McKeon",
    "De La Rose": "De La Rosa",
    "St Fleur": "St. Fleur",
    "Loeher": "Loehler",
    "Callagher": "Gallagher",
    "Lahtam": "Latham",
    "Rettmann": "Rettman",
    "Chulner": "Schulner",
    "Venkataram": "Venkatram",
    "Marriett": "Merriett",
    "Shellnutt": "Shelnutt",
    "Hlases": "Hlas",
    "Newbert": "Neubert",
    "Leclair": "LeClair",
    "Pagn": "Pagan",
    "Bellefant": "Bellenfant",
    "Benson Jaja": "Benson-Jaja",
    "McCary": "McCrary",
    "Buchholz": "Buccholz",
    "Suddth": "Sudduth",
}


def fix_typos_trakt(df):
    """Return a copy of `df` with known family-name misspellings corrected.

    Each misspelling is treated as a regular expression and replaced wherever
    it occurs inside a string cell (substring match), in every column of the
    frame. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose string cells may contain the misspellings.

    Returns
    -------
    pandas.DataFrame
        New frame with all replacements applied in table order.
    """
    for wrong, right in _TRAKT_TYPO_FIXES.items():
        df = df.replace(wrong, right, regex=True)

    return df

In [11]:
# Seasons to scrape and the on-disk cache of the combined, typo-fixed result
TRAKT_SEASONS = range(14, 25)
TRAKT_PICKLE = "../pickles/df_trakt.pkl"

# Load Trakt dataframe from pickle
try:
    df_trakt = pd.read_pickle(TRAKT_PICKLE)

# Only a missing cache file triggers a re-scrape; any other failure (corrupt
# pickle, interrupt, ...) now surfaces instead of silently re-downloading.
except FileNotFoundError:
    # Run 'get_trakt_data' for all seasons and concat once into a single dataframe.
    # Each season keeps its own 0-based index (no ignore_index), as before.
    season_frames = [get_trakt_data(season) for season in TRAKT_SEASONS]
    df_trakt = fix_typos_trakt(pd.concat(season_frames))
    df_trakt.to_pickle(TRAKT_PICKLE)

In [16]:
# Number of rows in the Trakt frame
print("Total Episodes:", len(df_trakt))

Total Episodes: 1794


In [17]:
# Count of EpisodeNum values for each SeasonNum
df_trakt.groupby('SeasonNum')['EpisodeNum'].count()

SeasonNum
14    180
15    180
16    180
17    200
18    200
19    200
20    200
21    181
22    166
23     89
24     18
Name: EpisodeNum, dtype: int64

In [18]:
# Ten randomly drawn rows as a quick look at the Trakt data
df_trakt.sample(10)

Unnamed: 0,SeasonNum,EpisodeNum,EpisodeTitle,AirDate,FamilyA,FamilyB
165,20,166,Schieve vs Woods,2019-04-29,Schieve,Woods
196,18,197,Washington vs Taylor,2017-05-23,Washington,Taylor
30,18,31,Hutchison vs Lemus,2016-10-05,Hutchison,Lemus
105,21,106,Ha vs Murphy,2020-01-29,Ha,Murphy
168,21,169,,2020-04-29,,
102,20,103,Gross vs Klein,2019-01-28,Gross,Klein
66,18,67,Turner vs Archer,2016-11-04,Turner,Archer
95,19,96,Wilkins vs Barboza,2018-01-22,Wilkins,Barboza
119,21,120,Witkowski vs Murphy,2020-02-07,Witkowski,Murphy
96,20,97,Keith vs Wade,2019-01-22,Keith,Wade
