# The Fairy Network!

In [41]:
import pandas as pd
import numpy as np
import itertools
from pyvis import network as net
from pyvis.network import Network
import networkx as nx
import igraph
import numpy as np

Fortunately, because of the way the transcripts are written, every dialogue is written in a single line. Therefore, we can read the file line by line to get the dialogues. However, it is important to note that a single line may not be a dialogue but a scene description, or in other cases dialogues may have scene descrptions inside of them, meaning that just because a name is present in a line doesn't mean that somebody is referring to that character. (maybe strip those away?)

In [42]:
Transcripts = {}
for i in range(1,15):
    with open(f"Transcripts/Episode {i}.txt", "r", encoding= "ISO-8859-1") as file:
        try:
            Transcripts["Episode " + f"{i}"] = file.readlines()
        except UnicodeDecodeError:
            print(f"Decoding error in Episode {i}")


# Note: Episodes 13 and 14 couldn't be read due to decoding issues 
# under the standard encoding UTF-8. Encoding ISO-8859-1 which is also
# common seemed to work properly.
 

In [43]:
Transcripts["Episode 1"][0:5]

['(Bugle playing Reveille is heard; Timmy is asleep, snoring; camera points to Cosmo and Wanda in fish form)\n',
 'Wanda: Ready, Cosmo?\n',
 'Cosmo: Ready, Wanda.\n',
 'Cosmo and Wanda: 1, (turn to their fairy form) 2, 3!\n',
 'Cosmo: (flies near Timmy) Wakey-wakey, Timmy!\n']

## Cleaning Transcripts
We will now strip all those lines that describe a scene from the transcripts. A quick inspection suggests that such lines begin either with a parenthesis or a square bracket. Thus if a line satisfies this condition it will be discarded from the transcript.

In [44]:
for i in range(1,15):
    for line in Transcripts[f"Episode {i}"]:
        if line[0] == "(":
            Transcripts[f"Episode {i}"].remove(line)
        elif line[0] == "[":
            Transcripts[f"Episode {i}"].remove(line)

Transcripts["Episode 1"][0:5]

['Wanda: Ready, Cosmo?\n',
 'Cosmo: Ready, Wanda.\n',
 'Cosmo and Wanda: 1, (turn to their fairy form) 2, 3!\n',
 'Cosmo: (flies near Timmy) Wakey-wakey, Timmy!\n',
 "Wanda: Oh, come on, little fella, even though we're your (lifts Timmy up in the air with her wand) fairy godparents...\n"]

## Creating the Network

### Nodes
First we need to find all the potential nodes for the network, and to do so we have to keep the following in mind:

 - There will be dialogues of unimportant characters which we don't want to take into account.
 - Some characters may have more than one way to refer to them (e.g. Mr. Turner is called "Dad" by Timmy)
 - Coupled with the previous one, there may be inconsistencies in the way characters are referred to 
by the writers between episodes.

In order to get all the possible candidates for nodes, notice that every line that corresponds to a dialogue starts with the name of the character that speaks, followed by a colon and then the actual dialogue. We will use this to find all characters that speak.

In [45]:
# Find the first instance of a colon (":") and everything that comes before that may be considered a potential node.

def Retrieve_speaker(dialogue):
    colon = dialogue.find(":")
    if colon != -1:
        character = dialogue[:colon]
    else:
        character = "No character"
    return character

def Characters_in_episode(episode_num):
    episode = Transcripts["Episode " + str(episode_num)]
    characters = []
    for line in episode:
        speaker = Retrieve_speaker(line)
        if speaker not in characters:
            characters.append(speaker)
    return characters

Characters_in_episode(11)
# We may be missing Chompy in this episode as well as Phillip in other ones.



['Timmy',
 'Wanda',
 'Cosmo',
 'Vicky',
 'Mayor',
 'No character',
 'Male police officer',
 'Miss Dimmsdale',
 'Girl #1',
 'Girl #2',
 'Mrs. Turner',
 'Mr. Turner',
 'Journalist #1',
 "Timmy's subconscious",
 'Chet Ubetcha',
 'Crowd']

In [46]:
def Possible_Characters():
    characters = []
    for i in range(1,15):
        characters += Characters_in_episode(i)
    characters = list(set(characters)) # drop duplicate instances of characters
    return characters
characters_clean = Possible_Characters()
len(characters_clean)

162

In [47]:
characters_clean[0:10]

['Tour guide',
 'Future Timmy',
 'Captives',
 'Announcer',
 'Jorgen',
 'Trixie',
 'Santa',
 'Commentator',
 'French Kid #1',
 'Yugopotamian Guard']

As it can be seen, we may encounter issues like: unison dialogues (e.g "Tad and Chad"), irrelevant characters (e.g. "Kids") or even nonsensical characters (e.g. "Everyone"). Because of this, it is important to handle these cases, and to try to do so in an automated way as much as possible (161 characters is still quite large to do it by hand). We will do this in two stages, a first general stage which will be done now and a second "episode dependent" stage later on. 

For the first stage we will remove every character whose name has either the string "Kid" or "#" inside, since either one of these probably makes reference to an irrelevant character. Coupled with this, we will also remove strings that have conjunctions such as "and" and "&" to mitigate redundance.

In [48]:
unwanted_strings = ["#", "Kid", " and ", " & "]
characters_clean = [character for character in characters_clean if not
                     any(string in character for string in unwanted_strings)]


In [49]:
len(characters_clean)

123

### Edges

The graph will be directed, and an edge will be considered from node A to node B when character A names character B in a dialogue.

In [50]:
#edges = []
edges_clean = pd.DataFrame({"Character A": [], "Character B": []})
for i in range(1, 15):
    edges_clean[f"Weight Ep. {i}"] = []



def Edges_dialogue(dialogue):
    # Variables chars_A and chars_B below are lists of characters involved
    # in the dialogue. Notice that there may be more than one speaker
    # (unison dialogues) and more than one character referenced to 
    # in any dialogue. 
    chars_A = Retrieve_speaker(dialogue).replace(" & ", " and ") 
    chars_A = chars_A.split(" and ") # Split characters in unison dialogues
    chars_A = [character for character in chars_A if character in characters_clean]


    chars_B = [] 
    for character in characters_clean:
        if dialogue.find(character) != -1: # Check whether a particular character is present in the dialogue
            chars_B.append(character)

    if len(chars_B) > 0:
        edges = list(itertools.product(chars_A, chars_B)) # Construct all directed pairs
    else:
        edges = []
    return edges


def Edges_episode(episode_num):
    edges_ep = []
    for dialogue in Transcripts[f"Episode {episode_num}"]:
        edges_ep += Edges_dialogue(dialogue)
    return edges_ep

for i in range(1,15):
    for edge in Edges_episode(i):
        # First, check using a boolean mask whether a particular connection 
        # is already on the edges dataframe
        mask = (edges_clean["Character A"] == edge[0]) & (edges_clean["Character B"] == edge[1])
        if not edges_clean[mask].any(axis = None):
            # If it isn't, then add it to the df
            new_row = pd.DataFrame([{"Character A": edge[0], "Character B": edge[1],
                                      **{f"Weight Ep. {j}": 1 if j == i else 0 for j in range(1, 15)}}])
            edges_clean = pd.concat([edges_clean, new_row], ignore_index=True)
        else:
            # If it is, add to the weight of the particular episode
            edges_clean.loc[mask, f"Weight Ep. {i}"] += 1    


Self-references are usually uninteresing since they come from scene descriptions not actual monologues, thus we rule those out. Moreover, we discard all connections involving the placeholder "No character".

In [51]:
edges_clean = edges_clean.loc[edges_clean["Character A"] != edges_clean["Character B"]]
edges_clean = edges_clean.loc[(edges_clean["Character A"] != "No character")  & (edges_clean["Character B"] != "No character")]
edges_clean.reset_index(drop = True)

Unnamed: 0,Character A,Character B,Weight Ep. 1,Weight Ep. 2,Weight Ep. 3,Weight Ep. 4,Weight Ep. 5,Weight Ep. 6,Weight Ep. 7,Weight Ep. 8,Weight Ep. 9,Weight Ep. 10,Weight Ep. 11,Weight Ep. 12,Weight Ep. 13,Weight Ep. 14
0,Wanda,Cosmo,7.0,6.0,2.0,1.0,1.0,2.0,3.0,4.0,2.0,11.0,2.0,2.0,2.0,1.0
1,Cosmo,Wanda,8.0,2.0,1.0,0.0,4.0,5.0,4.0,5.0,4.0,9.0,3.0,2.0,2.0,1.0
2,Cosmo,Timmy,6.0,1.0,4.0,2.0,2.0,11.0,5.0,4.0,1.0,2.0,5.0,6.0,3.0,1.0
3,Wanda,Timmy,10.0,6.0,1.0,2.0,0.0,11.0,6.0,4.0,2.0,1.0,4.0,9.0,4.0,4.0
4,Timmy (1/4),Timmy,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,Maria,Timmy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
269,Santa,Wanda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
270,Santa,Cosmo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
271,All Holiday mascots,All,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [52]:
edges_clean["Weight Season"] = edges_clean.sum(numeric_only=True, axis = 1)

In [53]:
edges_clean.reset_index(drop= True, inplace= True)
edges_clean.head(10)

Unnamed: 0,Character A,Character B,Weight Ep. 1,Weight Ep. 2,Weight Ep. 3,Weight Ep. 4,Weight Ep. 5,Weight Ep. 6,Weight Ep. 7,Weight Ep. 8,Weight Ep. 9,Weight Ep. 10,Weight Ep. 11,Weight Ep. 12,Weight Ep. 13,Weight Ep. 14,Weight Season
0,Wanda,Cosmo,7.0,6.0,2.0,1.0,1.0,2.0,3.0,4.0,2.0,11.0,2.0,2.0,2.0,1.0,46.0
1,Cosmo,Wanda,8.0,2.0,1.0,0.0,4.0,5.0,4.0,5.0,4.0,9.0,3.0,2.0,2.0,1.0,50.0
2,Cosmo,Timmy,6.0,1.0,4.0,2.0,2.0,11.0,5.0,4.0,1.0,2.0,5.0,6.0,3.0,1.0,53.0
3,Wanda,Timmy,10.0,6.0,1.0,2.0,0.0,11.0,6.0,4.0,2.0,1.0,4.0,9.0,4.0,4.0,64.0
4,Timmy (1/4),Timmy,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,Timmy,All,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,5.0
6,Mr. Turner,Mrs. Turner,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
7,Vicky,Mrs. Turner,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,Wanda,Vicky,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
9,Dad,Timmy,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0


With this edge dataframe, we will carry on the second stage of handling cases of possible irrelevant characters through their connections. The heuristic behind the following procedure is that characters that have few and small-weighted connections are probably irrelevant, or at least not relevant enough. Because important characters may appear briefly in some episodes, we will use the "Weight Season" column to identify irrelevant characters in the abovementioned sense. 

Let us define a character as "weakly connected" if the sum of the "Weight Season" column, over all the edges the character is present, is less than 4 (so that the character had to be present in at least 4 dialogues throughout the season). Thus, this stage will consist in removing all edges that connect weakly connected characters.

In [54]:
def Weakly_connected(character):
    relevance = edges_clean.loc[(edges_clean["Character A"] == character) 
                                | (edges_clean["Character B"] == character), "Weight Season"].sum()
    if relevance < 4: 
        return True
    else:
        return False
    
characters_clean = [character for character in characters_clean if not Weakly_connected(character)]
len(characters_clean)

51

In [55]:
edges_clean = edges_clean.loc[(edges_clean["Character A"].isin(characters_clean)) & (edges_clean["Character B"].isin(characters_clean))]
edges_clean.reset_index(drop = True)

Unnamed: 0,Character A,Character B,Weight Ep. 1,Weight Ep. 2,Weight Ep. 3,Weight Ep. 4,Weight Ep. 5,Weight Ep. 6,Weight Ep. 7,Weight Ep. 8,Weight Ep. 9,Weight Ep. 10,Weight Ep. 11,Weight Ep. 12,Weight Ep. 13,Weight Ep. 14,Weight Season
0,Wanda,Cosmo,7.0,6.0,2.0,1.0,1.0,2.0,3.0,4.0,2.0,11.0,2.0,2.0,2.0,1.0,46.0
1,Cosmo,Wanda,8.0,2.0,1.0,0.0,4.0,5.0,4.0,5.0,4.0,9.0,3.0,2.0,2.0,1.0,50.0
2,Cosmo,Timmy,6.0,1.0,4.0,2.0,2.0,11.0,5.0,4.0,1.0,2.0,5.0,6.0,3.0,1.0,53.0
3,Wanda,Timmy,10.0,6.0,1.0,2.0,0.0,11.0,6.0,4.0,2.0,1.0,4.0,9.0,4.0,4.0,64.0
4,Timmy,All,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,The Easter Bunny,Wanda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
200,The Easter Bunny,Cosmo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
201,Timmy,Star,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
202,Santa,Wanda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [56]:
G = nx.from_pandas_edgelist(edges_clean, "Character A", "Character B", "Weight Season")
nt = net.Network(notebook= True, cdn_resources='remote')
nt.from_nx(G)

nt.show("Fairy Network season 1.html")

Fairy Network season 1.html


As we can see, this reduced the number of possible relevant characters as well as the size of the edge dataframe. So much so that the number of "outliers" is minimal and can now be handled case by case. Among these we note:

1) The characters "Man", "All", "TV" and "TV Narrator" may be removed.
2) The names "Chet", "Chet Ubetcha", "Chet Ubetcha (offscreen)" refer to the same character.
3) The names "A.J." and "AJ" refer to the same character.
4) The names "Mom" and "Mrs. Turner" refer to the same character.
5) The names "Dad" and "Mr. Turner" refer to the same character.

In [57]:
# 1) Remove irrelevant/nonsensical characters
for character in ["Man", "All", "TV", "TV Narrator"]:
    characters_clean.remove(character)

edges_clean = edges_clean.loc[(edges_clean["Character A"].isin(characters_clean)) & (edges_clean["Character B"].isin(characters_clean))]

# 2), 3), 4) and 5) Merge instances in the edge dataframe accordingly

# First define a mapping to replace the names in the dataframe
renaming = {character : character for character in characters_clean}
renaming["Chet"] = "Chet Ubetcha"
renaming["Chet Ubetcha (offscreen)"] = "Chet Ubetcha"
renaming["AJ"] = "A.J."
renaming["Mom"] = "Mrs. Turner"
renaming["Dad"] = "Mr. Turner"

# Next, use such mapping
edges_clean["Character A"] = edges_clean["Character A"].map(renaming)
edges_clean["Character B"] = edges_clean["Character B"].map(renaming)

# Because you may now have repeated connections, group by the pair of columns
# "Character A" and "Character B" , use the agg method to sum column by column
# and take into account the weights of both registries, and set the edges_dataframe
# as the output of this. 
edges_clean = edges_clean.groupby(["Character A", "Character B"], as_index= False).agg("sum")

# Remove possible references that could come up from this
edges_clean = edges_clean.loc[edges_clean["Character A"] != edges_clean["Character B"]]

The visual representation of the network suggests that Chet Ubetcha may have become a weakly connected character. This is confirmed through a quick check.

In [58]:
edges_clean[(edges_clean["Character A"] == "Chet Ubetcha") | (edges_clean["Character B"] == "Chet Ubetcha")]


Unnamed: 0,Character A,Character B,Weight Ep. 1,Weight Ep. 2,Weight Ep. 3,Weight Ep. 4,Weight Ep. 5,Weight Ep. 6,Weight Ep. 7,Weight Ep. 8,Weight Ep. 9,Weight Ep. 10,Weight Ep. 11,Weight Ep. 12,Weight Ep. 13,Weight Ep. 14,Weight Season
27,Chet Ubetcha,Santa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
28,Chet Ubetcha,Timmy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


 In principle, this could in turn make characters connected to Chet Ubetcha weakly connected and this could go on and on with characters connected to those characters. Luckily, from the visual representation of the network it is easy to see that neither Santa nor Timmy (obviously) become weakly connected. Furthermore, the same visual also tells that this does not happen with any of the changes made through the mapping above. So for the sake of consistency, we indeed remove the character.

In [59]:
characters_clean.remove("Chet Ubetcha")
edges_clean = edges_clean.loc[(edges_clean["Character A"].isin(characters_clean)) & (edges_clean["Character B"].isin(characters_clean))]

In [99]:
# We add color to the edges
edges_clean["color"] = "#2ECCFA"

# We consider attributes of size, shape and color for the nodes
nodes_size = {character: np.max([edges_clean.loc[(edges_clean["Character A"] == character) |
                                   (edges_clean["Character B"] == character), "Weight Season"].sum() / 5, 5])
                                   for character in characters_clean}

nodes_shape = {character: "star" for character in characters_clean}
nodes_color = {character: "#F7FE2E" for character in characters_clean}
df_nodes = pd.DataFrame({character : {"shape" : nodes_shape[character], "size": nodes_size[character], 
                                      "color" : nodes_color[character]} for character in characters_clean})

# We review the attributes of Timmy, Wanda and Cosmo 
# because they are the most important characters
df_nodes["Timmy"].pop("shape") # keep a circular shape
df_nodes["Cosmo"].pop("shape") # keep a circular shape
df_nodes["Wanda"].pop("shape") # keep a circular shape
df_nodes["Timmy"]["color"] = "#FF0080"
df_nodes["Wanda"]["color"] = "#Fe2EC8"
df_nodes["Cosmo"]["color"] = "#04B404"

# Construct the graph object,add the node attributes to the graph and
# visualize it using pyvis.
G = nx.from_pandas_edgelist(edges_clean, source="Character A", target="Character B", edge_attr= ["Weight Season", "color"] )
nx.set_node_attributes(G, df_nodes)
nt = Network(height = "600px", width = "1200px" , bgcolor = "#9b56c0" , font_color = "white", notebook=True, cdn_resources= 'remote') 
nt.barnes_hut( gravity = -700)
nt.from_nx(G)
nt.show("Fairy Network season 1 cleaner.html")

Fairy Network season 1 cleaner.html
