# Get Transcripts

In [1]:
import pandas as pd
import re

def make_clickable(val):
    return '<a target="_blank" href="{}">{}</a>'.format(val, val)

## Update `hasTranscript` of `df_episodes`, inspecting `RickAndMortyScripts.csv`

First, let's indicate in a column `hasTranscript` in RaM_episodes.csv the transcripts we already have through Kaggle database (https://www.kaggle.com/andradaolteanu/rickmorty-scripts/version/1).

In [2]:
df_transcripts = pd.read_csv("transcripts/RickAndMortyScripts.csv")

# create an initial column in RaM_episodes
df_episodes = pd.read_csv('RaM_episodes.csv')
df_episodes['hasTranscript']=[False for _ in range (len(df_episodes))]

# assign True for all transcripts we have in RickAndMortyScripts.csv from Kaggle database >> 11 episodes are concerned
def registerHasTranscript(aRow):
    rowIndex = df_episodes[(df_episodes.Episode_nb == aRow['episode no.']) & (df_episodes.Season_nb==aRow['season no.'])].index[0]
    df_episodes.loc[rowIndex, ['hasTranscript']]=True

_ = df_transcripts[['season no.', 'episode no.']].drop_duplicates().apply(registerHasTranscript, axis=1)

assert len(df_episodes[df_episodes.hasTranscript]) == len(df_transcripts[['season no.', 'episode no.']].drop_duplicates())

In [3]:
df_transcripts[df_transcripts['season no.']==1 and ]

Unnamed: 0,season no.,episode no.,episode name,name,line
0,1,1,Pilot,Rick,Morty! You gotta come on. Jus'... you gotta co...
1,1,1,Pilot,Morty,"What, Rick? What’s going on?"
2,1,1,Pilot,Rick,"I got a surprise for you, Morty."
3,1,1,Pilot,Morty,It's the middle of the night. What are you tal...
4,1,1,Pilot,Rick,"Come on, I got a surprise for you. Come on, h..."
...,...,...,...,...,...
6012,5,2,Mortyplicity,Woodpecker,What is it?
6013,5,2,Mortyplicity,Dolphin,A relic...from a kinder past.
6014,5,2,Mortyplicity,Glockenspiel Jerry,"Wait, cowboys?"
6015,5,2,Mortyplicity,Glockenspiel Jerry,Christianity again? After cowboys? You went al...


In [202]:
# save as csv
df_episodes.to_csv('RaM_episodes.csv', index=False)

## What to extent?

[OK] Let's extend our transcripts database with the transcripts from the top-10 best episodes based on IMDb ranking (https://www.imdb.com/search/title/?series=tt2861424&sort=user_rating,desc&count=250&view=advanced).

In [198]:
df_imdb = pd.read_csv('RaM_imdb_episodes_ranking.csv')
df_merged = df_episodes.merge(df_imdb, on='Title', how='left')
# get the episodes whose transcript is missing
top_ep = df_merged[(df_merged.Season_nb_x == 5)].sort_values('rank').head(10)
df_missing = top_ep[['rank','Title','Hyperlink','Season_nb_x','Episode_nb_x','hasTranscript']]
df_missing.style.format(make_clickable, 'Hyperlink')

Unnamed: 0,rank,Title,Hyperlink,Season_nb_x,Episode_nb_x,hasTranscript
50,4,Rickmurai Jack,https://rickandmorty.fandom.com/wiki/Rickmurai_Jack,5,10,True
41,12,Mort Dinner Rick Andre,https://rickandmorty.fandom.com/wiki/Mort_Dinner_Rick_Andre,5,1,False
42,21,Mortyplicity,https://rickandmorty.fandom.com/wiki/Mortyplicity,5,2,False
49,30,Forgetting Sarick Mortshall,https://rickandmorty.fandom.com/wiki/Forgetting_Sarick_Mortshall,5,9,False
48,32,Rickternal Friendshine of the Spotless Mort,https://rickandmorty.fandom.com/wiki/Rickternal_Friendshine_of_the_Spotless_Mort,5,8,False
43,44,A Rickconvenient Mort,https://rickandmorty.fandom.com/wiki/A_Rickconvenient_Mort,5,3,False
46,48,Rick & Morty's Thanksploitation Spectacular,https://rickandmorty.fandom.com/wiki/Rick_%26_Morty%27s_Thanksploitation_Spectacular,5,6,False
45,49,Amortycan Grickfitti,https://rickandmorty.fandom.com/wiki/Amortycan_Grickfitti,5,5,False
47,50,Gotron Jerrysis Rickvangelion,https://rickandmorty.fandom.com/wiki/Gotron_Jerrysis_Rickvangelion,5,7,False
44,51,Rickdependence Spray,https://rickandmorty.fandom.com/wiki/Rickdependence_Spray,5,4,False


## Get additional transcripts

Let's gather the transcript of other episodes so that we get the top 10 of best rated episodes covered + 50% of each season to analyse the evolution of relationships.
- Doesn't exist or not parsable : S01E07 (new lines in one character's words), S02E09 (no speaker name), SO2E07 (no speaker name), S03E08 (not complete), S03E05 (no speaker name), S03E10 (no speaker name), S03E09 (not found), ALL S04

In [165]:
def collect_lines(pattern_line, lines, seasonNb, episodeNb):
    """ Input : list of string for each line / output : dataframe of lines """
    list_lines = []

    for aLine in lines:
        aLine = aLine.replace('\n','')
        match = re.findall(pattern_line, aLine)
        for aMatch in match:
            speaker, words = aMatch
            list_lines.append([speaker, words])
    # to pandas
    df_lines = pd.DataFrame(list_lines, columns=['name','line'])
    df_lines['season no.'],df_lines['episode no.']=[seasonNb for _ in range (len(df_lines))],[episodeNb for _ in range (len(df_lines))]
    # add episode name
    df_episodes = pd.read_csv('RaM_episodes.csv')
    episode_name = df_episodes.loc[df_episodes[(df_episodes.Season_nb == seasonNb) & (df_episodes.Episode_nb == episodeNb)].index[0],'Title']
    df_lines['episode name'] = [episode_name for _ in range (len(df_lines))]
    return df_lines

def remove_elements(aPattern, aString):
    for element in re.findall(aPattern,aString):
        aString = aString.replace(element,'')
    return aString

def remove_brackets_elements(aString):
    pattern_brackets = "\[\[.*?\]\]"
    return remove_elements(pattern_brackets,aString)

def remove_simple_brackets_elements(aString):
    pattern_brackets = "\[.*?\]"
    return remove_elements(pattern_brackets,aString)

def remove_parenthesis_elements(aString):
    pattern_parenthesis = "\(.*?\)"
    return remove_elements(pattern_parenthesis,aString)

def getSeasonAndEpisodeNumbers(aString):
    return int(aString[1:3]), int(aString[4:])

def get_df_lines(anEpisodeString, pattern_line="(.*?) ?\: ?(.*)", line_separator = '\n\n', remove_additional = None):
    script = open('transcripts/'+anEpisodeString+'.txt', encoding="utf-8").read()
    script = remove_brackets_elements(script)
    script = remove_simple_brackets_elements(script)
    script = remove_parenthesis_elements(script)
    if remove_additional != None:
        script = remove_elements(remove_additional, script)
    lines = script.split(line_separator)
    season_nb, episode_nb = getSeasonAndEpisodeNumbers(anEpisodeString)
    df_lines = collect_lines(pattern_line,lines,season_nb, episode_nb)
    print(list(df_lines.name.unique()))
    return df_lines

In [111]:
episode = 'S02E04'
df_lines = get_df_lines(episode)

['Jerry', 'Uncle Steve', 'Beth', 'Morty', 'Summer', 'Rick', 'Mr. Poopybutthole', 'Poopybutthole', 'Nicky', 'All', 'Nazi', 'Mr. Beauregard', 'Frankenstein', 'Sleepy Gary', 'Mr. Beauregard ', 'Pencilvester', 'Tinkles', 'Gary', 'Hamurai', 'Amish Cyborg', 'Gerry', 'Reverse Giraffe', 'Ghost in a Jar', 'Mrs. Refrigerator', 'Reverse', ' All', 'Duck with Muscles', 'Mrs Refrigerator', 'Cross Teddy Bear', 'Doctor']


In [113]:
episode = 'S05E10'
df_lines = get_df_lines(episode)

['Alien woman', 'Owl 1', 'Short alien man', 'Owl 2', 'Owl 3', 'Rick', 'Morty', 'Crow-horse', 'Two crows', 'Pussifer', 'Waiter', 'Crowscare', 'Jerry', 'Beth', 'Summer', 'Andy', 'Employee Morty', 'Toddler Morty 1', 'Toddler Morty 2', 'Twenty-Six Year Old Morty', 'Deformed Morty', 'Police Morty', 'Secret Service Rick', 'President Morty', 'Rick Stan Lee', 'Loudspeaker system', 'Rick D716', "'Rick D716-B", 'Orange Afro Rick', 'Orange Afro Morty', 'Female computer voice', 'Mr. Poopybutthole']


In [116]:
episode = 'S01E10'
df_lines = get_df_lines(episode)

['Beth', 'Rick', 'Jerry', 'Rick 1', 'Summer', 'Morty', 'Rick Salesman 1', 'Rick Salesman 2', 'Morty Doll', 'Rick Salseman 3', 'Rick Council 1', 'Rick crowd', 'Crowd rick', 'Armed Rick 1', 'Armed Rick 2', 'Armed Rick 3', 'Armed Ricks 4 and 5', 'Pizza 1', 'Pizza 2', 'Phone 1', 'Phone 2', 'Armed Rick 6', 'Armed Ricks', 'Sofa 1', 'Sofa 2', 'Rick 2', 'Rick 3', 'All Ricks', 'Rick 4', 'Chair waiter', 'Phone waiter', 'All ricks', 'Doofus Rick', 'Rick 1 and 2', 'Evil Rick', 'Evil Morty', 'Lobster alien', 'Religious Morty', 'All religious Mortys', 'Morty 1', 'Morty 2', 'Hammer Morty', 'All Mortys', 'Council Rick 1']


In [118]:
episode = 'S02E06'
df_lines = get_df_lines(episode)

['Morty', 'Summer', 'Rick', 'Ship', 'Man', 'Thinner Man', 'Anchor', 'Mr.President', 'Zeep', 'Swat Officer 1', 'Swat Officer 2', 'Kid', 'Teenyverse President', 'Kyle', 'Rick, Zeep', 'Army General', 'Mr. President', 'Secretary', 'Female Visitor', 'Taxi Driver', 'Soldier', 'Human President', 'Ice cream clerk']


In [120]:
episode = 'S01E05'
df_lines = get_df_lines(episode)

['Rick', 'Jerry, Beth, and Summer', 'Morty', 'Beth', 'Summer', 'Jerry', 'Meeseeks', 'Meeseeks 2', 'Meeseeks 3', 'Villager', 'Meeseeks 1', 'Principal Gene V.', 'Waiter', 'Meeseeks 4', 'Giant', 'Giant woman', 'Detective', 'Judge', 'Giant Lawyer', 'NOTE', 'All the Meeseeks', 'Meeseeks *', 'Waitress', 'Slippery', 'Mr. Booby Buyer', 'Waitess', 'New Meeseeks', 'Mr. Jelly Bean', 'All other Meeseeks', 'Card Player', 'Woman', 'All Meeseeks', 'Samantha', 'Owner', 'The First Villager', 'Villager 1', 'Villager 2']


In [134]:
episode = 'S01E08'
df_lines = get_df_lines(episode)

['Bachelor', 'Jerry', 'Summer', 'Beth', 'Rick', 'Morty', 'Corn Man 1', 'Corn Man 2', 'Glenn', 'Letterman', 'Woman', 'Announcer', 'Shmlony', 'Host in Trenchcoat', 'Cop', 'Killer 1', 'Judge', 'Man', 'Killer 2', 'Killer 3', 'Beth, Summer, and Jerry', 'Beth and Summer', 'Ants-In-My-Eyes Johnson', 'Johnny Depp', 'Mr. Sneezy 3D', 'Fake Doors Salesman', 'Gazorpazorpfield', 'Jon', 'Spokesman', 'Suspender Guy', 'Garbageman', 'Spokeman', 'Trunkperson', 'New Yorker', 'Mr. Tophat Jones', 'Unmuscular Michael 1', 'Muscular Mannies', 'Police Chief', 'Regular Legs', 'Baby Legs', 'Criminal ', 'Business Man', 'News Anchor', 'Hamster News Anchor', 'Female Hamster', 'Male Hamster']


In [141]:
episode = 'S01E04'
df_lines = get_df_lines(episode)

['Rick', 'Morty', 'Beth', 'Mr. Goldenfold', 'Jessica', 'Student', 'Zigerion 1', 'Prince Nebulon', 'Kevin', 'Stu', 'Jerry', 'Zigerion 2', 'Zigerion 3', 'Radio', 'Paramedic', 'Mr. Marklevitz', 'All', 'Old man', 'Woman', 'Mailman', 'Simulation Beth', 'Zigerion', 'Cynthia', 'Mailmen']


In [146]:
episode = 'S01E11'
df_lines = get_df_lines(episode)

['Summer', 'Morty', 'Rick', 'Beth', 'Jerry', 'Beth and Jerry in unison', 'Tour Guide', 'Lucy', 'Tammy', 'Brad', 'Birdperson', 'Jelly-like creature with items stuck inside of it', 'Blue alien', 'Gearhead', 'Squanchy', 'Slow Mobius', 'Voice through speaker', 'Captain #1', 'Captain #2', 'Passengers', 'Guy', 'Nancy', 'Girl', 'Abradolph Lincoler', 'Jessica', 'Everyone except Morty and Jessica', 'Jesica', 'Stereo', 'Song', 'Everyone', 'Morty and Summer in unison', 'Rick and the song', 'Shirtless guy', 'Shirtless Guy']


In [157]:
episode = 'S01E09'
df_lines = get_df_lines(episode, line_separator='\n')

['Morty', 'Rick', 'Jerry', 'Beth', 'Butter robot', 'Summer', 'Goldenfold', 'Needful', 'Rick and Needful', 'Jerry and Morty', 'Flippynips', 'Vagina', 'Tate', 'Baby', 'Hostess', 'Host', 'Scroopy Noopers', 'Customer', 'Employee', 'Plutonian woman', 'Plutonian man', 'Plutonians', 'Policeman', 'Crowd']


In [168]:
episode = 'S01E03'
df_lines = get_df_lines(episode, remove_additional="\*.*?\*")

['Jerry', 'Beth', 'Morty', 'Summer', 'Rick', 'Ruben', 'Leonard', 'Joyce', 'Beth, Summer, and Morty', 'Jacob', 'Automated voice', 'Poncho', 'Dr. Bloom', 'Roger', 'Alexander', 'Annie', 'Animatronics', 'Ethan', 'Animatronic Ruben', 'Reporter', 'Tom', 'Eric', 'Alejandro', 'All']


In [174]:
episode = 'S02E02'
df_lines = get_df_lines(episode, remove_additional="\*.*?\*")

['Rick', 'Morty', 'Jerry', 'Jerry-Sitter', 'Pajama Jerry', 'Underpants Jerry', 'Tuxedo Jerry', 'Receptionist', 'Krombopulos Michael', 'Roy', 'Roy’s mother', 'Teacher', 'Announcer', 'Roy’s wife', 'Doctor', 'Customer', 'Voice', 'Alien', 'Pink-shirt Jerry', 'Other Jerry', 'Daycare Beth', 'Jerrys', 'Another Jerry', 'Ship', 'Gromflomite', 'Fart', 'Gearhead', 'Random Jerry', 'Gear Anchor', 'Gear Policeman #1', 'Gear Policeman #2', 'Bearded Jerry', 'Tank-top Jerry', 'Buzzcut Jerry', 'Officer', 'Garblovian', 'Other Jerrys', 'Paul', 'Various Jerrys', 'Other Rick', 'Other Morty', 'Another Rick', 'Ricks']


In [180]:
episode = 'S02E03'
df_lines = get_df_lines(episode)

['All', 'Rick', 'Morty', 'Morty and Summer', 'Summer', 'Female alien', 'Male alien 1', 'Male alien with glasses', 'All six aliens together', 'Unity', 'Beth', 'Jerry', 'Beta-7', 'Blim Blam', 'Steven Phillips', 'Ron Benson', 'Daryl Jefferson', 'Alien man', 'Rioter', 'Both', 'Beth and Jerry']


In [186]:
episode = 'S03E06'
df_lines = get_df_lines(episode)

['Girl 1', 'Girl 2', 'Jessica', 'Morty', 'Rick', 'Rick and Morty', 'Alien', 'Toxic Rick', 'Toxic Morty', 'Radio', 'Mr. Goldenfold', 'Off screen student', 'Student with glasses', 'Off screen student 2', 'Female student', 'Female student 2', 'Mitch', 'Female student looking into her mirror', 'Female Student', 'Bully', 'Waiter', 'Stacy', 'Stacey', 'Groin System 6000', 'Mrs. Pancakes', 'Summer', 'Beth', 'Priest', 'Man', 'Crying boy', 'Worker', 'Worker 2', 'Worker in a red shirt', 'Jessica on the phone', 'Jacquelyn', 'Credits song']


In [191]:
episode = 'S03E02'
df_lines = get_df_lines(episode, line_separator='\n')

['Morty', 'Summer', 'Rick', 'Jerry', 'Beth', 'Wind', 'Mohawk Guy', 'Death Stalker #1', 'Colossus', 'Hemorrhage', 'Lunch server', 'Death Stalker #2', 'Death Stalker #3', 'Death Stalker #4', 'Eli', 'Death Stalker #5', 'Death Stalker #6', 'Blue Footprint Guy', 'Soldier', 'Death Stalker #7', 'Hemmorhage', 'Leash Man', 'Mechanical Summer', 'Mechanical Morty', 'MS, MR, and Rick, in unison', 'Slaveowner', "Eli's Girlfriend", 'Mechanical Rick']


In [199]:
episode = 'S05E02'
df_lines = get_df_lines(episode)

['Beth', 'Rick', 'Morty', 'Jerry', 'Summer', 'Mr. Always Wants To Be Hunted', 'TV', 'Dracula', 'When Wolf', 'Decoy Rick', 'Decoy Jerry', 'Second Rick', 'Second Morty', 'Second Summer', 'Second Beth', 'Scarecrow Rick', 'Scarecrow Beth', 'Glockenspiel Rick', 'Glockenspiel Jerry', 'Glockenspiel Summer', 'Glockenspiel Morty', 'Glockenspiel Beth', 'Decoy Summer', 'Decoy Beth', 'Other Decoy Summer', 'Other Decoy Jerry', 'Decoy Morty', 'House', 'Steve', 'The President', 'Both Mortys', 'Space Beth', 'Woodpecker', 'Dolphin']


In [200]:
# to execute after obtaining df_lines (to concat with RickAndMortyScripts.csv)
df_lines.to_csv('transcripts/script_'+episode+'.csv', index=False)
df_all_lines = pd.read_csv("transcripts/RickAndMortyScripts.csv")
df_all_lines = pd.concat([df_all_lines, df_lines], axis=0)
df_all_lines.to_csv("transcripts/RickAndMortyScripts.csv", index=False)