## Extract the synopsis and plot of each episode

In [39]:
import re
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import codecs

In [94]:
def clean_text(aText):
    """Strip wiki link markup from a piece of page text.

    The following substitutions are applied, in order:

    1. ``[[file:<aFile>]]`` embeds are removed entirely (the ``file`` prefix
       is matched case-insensitively, so ``[[File:...]]`` is removed too).
    2. ``[http<link> <label>]`` is replaced by ``<label>``.
    3. ``[http<link>]`` without a label is removed.
    4. ``[[<target>|<infos>]]`` and ``[[<target>]]`` are replaced by ``<target>``.
    5. Any square bracket still present is dropped.

    Parameters
    ----------
    aText : str
        Raw wiki text.

    Returns
    -------
    str
        The text with the markup above removed.
    """
    # Raw strings keep the regex backslashes away from Python's own
    # string-escape processing ("\[" is an invalid escape in a normal literal).
    # remove [[file:<aFile>]]
    pattern_file = r"\[\[file:.*?\]\]"
    cleanText = re.sub(pattern_file, '', aText, flags=re.IGNORECASE)
    # replace [http<link> <label>] by <label>, then remove bare [http<link>]
    pattern_url = r"\[http.*? (.*?)\]"
    cleanText = re.sub(pattern_url, r'\1', cleanText)
    cleanText = re.sub(r"\[http.*?\]", '', cleanText)
    # replace [[<aCharacter>|<infos>]] by <aCharacter>
    pattern = r"\[\[(.*?)(?:\|.*?)?\]\]"
    cleanText = re.sub(pattern, r"\1", cleanText)
    # remove any remaining [ or ]
    return cleanText.replace('[', '').replace(']', '')

In [None]:
pages_path = 'episode_pages'
all_files = [f for f in listdir(pages_path) if isfile(join(pages_path, f))]

# Merged synopsis+plot text of every episode, keyed by file name
all_descriptions = {}
synopsis_header, plot_header = "==Synopsis==", "==Plot=="

for aFile in all_files:
    # Read the file containing the episode's page; the context manager
    # closes the handle instead of leaving one open per episode.
    with open(join(pages_path, aFile), encoding="utf-8") as page_file:
        episode_page = page_file.read()
    print('Processing... ', aFile)
    # Normalise the headers so "== Synopsis ==" and "==Synopsis==" are treated alike
    episode_page = re.sub(r"== ?Synopsis ?==", synopsis_header, episode_page)
    episode_page = re.sub(r"== ?Plot ?==", plot_header, episode_page)
    # Initialize
    synopsis, plot = '', ''
    # Get synopsis and plot if found in page: keep the text between the
    # header and the next "==" marker
    if synopsis_header in episode_page:
        synopsis = episode_page.split(synopsis_header)[1].split('==')[0]
        print("   Synopsis found")
    if plot_header in episode_page:
        plot = episode_page.split(plot_header)[1].split('==')[0]
        print("   Plot found")
    # Clean synopsis and plot
    synopsis, plot = clean_text(synopsis), clean_text(plot)
    # Merge both texts
    story = synopsis+plot
    # Keep the result in memory so it is not overwritten by the next iteration
    all_descriptions[aFile] = story
    # Save it locally (disabled; uncomment to write the story files)
    # f = codecs.open('episode_stories/'+aFile, "w+", "utf-8")
    # f.write(story)
    # f.close()

## Tokenization of episodes

In [98]:
import nltk
from nltk.tokenize import WordPunctTokenizer

In [99]:
# Folder holding one story file per episode
pages_path = 'episode_stories'
# Keep only the directory entries that are regular files
all_files = list(filter(lambda name: isfile(join(pages_path, name)), listdir(pages_path)))

In [100]:
# get the stopwords list in given language
stopwords = nltk.corpus.stopwords.words('english')
# tokenization factory
tk = WordPunctTokenizer()
# lemmatization factory
lm = nltk.WordNetLemmatizer()

# for each episode story
for aFile in all_files:
    # Read the file containing the character's page description.
    episode_page = open(pages_path+'/'+aFile, encoding="utf-8").read() 
    # Set everything to lower case.
    episode_page = episode_page.lower()
    # Tokenize your text
    episode_page = tk.tokenize(episode_page)
    # Exclude punctuation and stop words
    episode_page = [aToken for aToken in episode_page if aToken.isalnum() and aToken not in stopwords]
    # Lemmatize words
    episode_page = [ lm.lemmatize(w) for w in episode_page ]
    # Remove words with less than 2 letters
    episode_page = [ w for w in episode_page if len(w)>2]
    # Transform list into list separated by spaces
    episode_page = ''.join([str(elem)+' ' for elem in episode_page])
    # Save your output for future use
    # f = codecs.open('episode_tokens/'+aFile, "w+", "utf-8")
    # f.write(episode_page)
    # f.close()

In [126]:
from gensim.parsing.preprocessing import STOPWORDS

In [128]:
# Size of gensim's built-in stop-word set. Uses the STOPWORDS name imported in
# the previous cell; the bare `gensim` module is not imported at this point of
# the notebook, so the fully-qualified form fails on a fresh top-to-bottom run.
len(STOPWORDS)

337

In [125]:
# Display the NLTK English stop-word list assigned to `stopwords` earlier in the notebook
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Topic Detection

In [101]:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

In [102]:
pages_path = 'episode_tokens'
all_files = [f for f in listdir(pages_path) if isfile(join(pages_path, f))]

# get the dictionary of tokens for each episode (file name -> list of tokens)
all_tokens = {}

# for each episode tokens list
for aFile in all_files:
    # Read the file containing the tokens of the episode; the context manager
    # closes the handle instead of leaving one open per episode.
    with open(join(pages_path, aFile), encoding="utf-8") as token_file:
        episode_tokens = token_file.read()
    # split() with no argument splits on any whitespace run and never returns
    # empty strings, unlike split(' ') which yields '' for a trailing space,
    # a doubled space or an empty file.
    all_tokens[aFile] = episode_tokens.split()

In [106]:
# Build the id2word dictionary from the per-episode token lists
# (corpora.Dictionary takes an iterable of token lists)
dictionary = corpora.Dictionary(all_tokens.values())

In [107]:
# Prune the vocabulary in place:
#   - drop tokens present in fewer than 15 documents (absolute count)
#   - drop tokens present in more than half of the documents (fraction of the corpus)
rare_cutoff, common_cutoff = 15, 0.5
dictionary.filter_extremes(no_above=common_cutoff, no_below=rare_cutoff)

In [112]:
# Bag of words: for every episode, the (token id, occurrence count) pairs
# of the tokens kept in the dictionary
bow_corpus = list(map(dictionary.doc2bow, all_tokens.values()))

In [124]:
# get a preview: the most frequent tokens of the episode at index 20

bow_doc_ep = sorted(bow_corpus[20], reverse=True, key=lambda x: x[1])
# Slice instead of indexing with range(10): an episode with fewer than 10
# distinct tokens would otherwise raise IndexError.
for token_id, count in bow_doc_ep[:10]:
    print("Word {} (\"{}\") appears {} time.".format(
        token_id,
        dictionary[token_id],
        count)
    )

Word 59 ("planet") appears 8 time.
Word 1 ("also") appears 3 time.
Word 19 ("life") appears 3 time.
Word 74 ("rest") appears 3 time.
Word 83 ("despite") appears 3 time.
Word 119 ("even") appears 3 time.
Word 9 ("eventually") appears 2 time.
Word 14 ("gun") appears 2 time.
Word 21 ("much") appears 2 time.
Word 37 ("saying") appears 2 time.
