## Extract the synopsis and plot of each episode

In [1]:
import re
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import codecs

In [94]:
def clean_text(aText):
    """Strip wiki markup from a piece of page text and return plain text.

    Steps, in order:
      1. remove embedded file links ``[[file:...]]`` (matched case-insensitively,
         so ``[[File:...]]`` is removed as well);
      2. replace labelled external links ``[http://url label]`` by ``label``,
         then remove any remaining bare ``[http://url]``;
      3. replace internal links ``[[target|label]]`` / ``[[target]]`` by ``target``;
      4. delete any square brackets that are left.

    Parameters
    ----------
    aText : str
        Raw wiki text.

    Returns
    -------
    str
        The text with the markup listed above removed.
    """
    # Raw strings keep the regex backslashes away from Python's own string
    # escapes (a non-raw "\[" is an invalid escape sequence and triggers a warning).
    pattern_file = r"\[\[file:.*?\]\]"
    cleanText = re.sub(pattern_file, '', aText, flags=re.IGNORECASE)
    # Labelled external link: the URL part may contain neither whitespace nor
    # ']' so that a bare "[http://url]" can never swallow the text (and links)
    # that follow it up to some later ']'.
    pattern_url = r"\[http[^\s\]]* ([^\]\n]*)\]"
    cleanText = re.sub(pattern_url, r'\1', cleanText)
    # Bare external link without a label: drop it entirely.
    cleanText = re.sub(r"\[http[^\]\n]*\]", '', cleanText)
    # replace [[<aCharacter>|<infos>]] by <aCharacter>
    pattern = r"\[\[(.*?)(?:\|.*?)?\]\]"
    cleanText = re.sub(pattern, r"\1", cleanText)
    # remove any remaining '[' and ']'
    cleanText = cleanText.replace('[', '')
    cleanText = cleanText.replace(']', '')
    return cleanText

In [None]:
pages_path = 'episode_pages'
all_files = [f for f in listdir(pages_path) if isfile(join(pages_path, f))]

# Cleaned synopsis + plot text of every episode, keyed by the page's file name.
all_descriptions = {}
synopsis_header, plot_header = "==Synopsis==", "==Plot=="

for aFile in all_files:
    # Read the file containing the episode's page; the context manager closes
    # the handle even if reading fails.
    with open(join(pages_path, aFile), encoding="utf-8") as page_file:
        episode_page = page_file.read()
    print('Processing... ', aFile)
    # Normalise the header spelling ("== Synopsis ==" -> "==Synopsis==") so
    # the plain substring tests and splits below can find them.
    episode_page = re.sub(r"== ?Synopsis ?==", synopsis_header, episode_page)
    episode_page = re.sub(r"== ?Plot ?==", plot_header, episode_page)
    # Default to empty text when a section is missing from the page
    synopsis, plot = '', ''
    # A section runs from its header up to the next '==' (the next header)
    if synopsis_header in episode_page:
        synopsis = episode_page.split(synopsis_header)[1].split('==')[0]
        print("   Synopsis found")
    if plot_header in episode_page:
        plot = episode_page.split(plot_header)[1].split('==')[0]
        print("   Plot found")
    # Clean synopsis and plot
    synopsis, plot = clean_text(synopsis), clean_text(plot)
    # Merge both texts
    story = synopsis + plot
    # Keep the result in memory; previously it was computed and then discarded
    all_descriptions[aFile] = story
    # Save it locally
    # f = codecs.open('episode_stories/'+aFile, "w+", "utf-8")
    # f.write(story)
    # f.close()

## Tokenization of episodes

In [130]:
import nltk
from nltk.tokenize import WordPunctTokenizer
from gensim.parsing.preprocessing import STOPWORDS

In [131]:
# Directory holding one cleaned story file per episode
pages_path = 'episode_stories'
# Keep only the directory entries that are regular files (sub-directories are skipped)
all_files = list(filter(lambda name: isfile(join(pages_path, name)), listdir(pages_path)))

In [165]:
# Load the characters table from CSV.
# NOTE(review): df_characters is not referenced by any other cell visible here —
# confirm it is still needed.
df_characters = pd.read_csv('RaM_characters.csv')

In [191]:
# Stop-word list. Use the STOPWORDS name imported from
# gensim.parsing.preprocessing directly: the bare `gensim` module name is not
# bound at this point of the notebook, so `gensim.parsing...` raises NameError
# on a fresh kernel.
stopwords = STOPWORDS
# tokenization factory
tk = WordPunctTokenizer()
# lemmatization factory
lm = nltk.WordNetLemmatizer()
# substrings to strip from the text before tokenization (family names)
specific_words = ['smith','sanchez']

# for each episode story
for aFile in all_files:
    # Read the file containing the episode's story and set everything to lower case.
    with open(join(pages_path, aFile), encoding="utf-8") as story_file:
        episode_page = story_file.read().lower()
    # Exclude characters names, BEFORE tokenisation.
    # NOTE(review): this is a plain substring removal, so it also alters longer
    # words containing one of these names — confirm that is acceptable.
    for aCharacterName in specific_words:
        episode_page = episode_page.replace(aCharacterName.lower(), '')
    # Tokenize the text
    tokens = tk.tokenize(episode_page)
    # Exclude punctuation and stop words
    tokens = [aToken for aToken in tokens if aToken.isalnum() and aToken not in stopwords]
    # Lemmatize words
    tokens = [lm.lemmatize(w) for w in tokens]
    # Keep only words of at least 3 letters
    tokens = [w for w in tokens if len(w) > 2]
    # One space between tokens, without the trailing space that would later be
    # read back as an empty token
    episode_page = ' '.join(tokens)
    # Save the output for future use; the handle is closed even if the write fails
    with codecs.open('episode_tokens/'+aFile, "w+", "utf-8") as f:
        f.write(episode_page)

# Topic Detection
Reference: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [192]:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

In [193]:
pages_path = 'episode_tokens'
# Sorted so that the document order (and therefore every index into the corpus
# built from it) does not depend on the order the file system lists entries in.
all_files = sorted(f for f in listdir(pages_path) if isfile(join(pages_path, f)))

# token list of each episode, keyed by file name (insertion order == all_files order)
all_tokens = {}

# for each episode tokens list
for aFile in all_files:
    # Read the file containing the tokens of the episode
    with open(join(pages_path, aFile), encoding="utf-8") as token_file:
        episode_tokens = token_file.read()
    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so no empty-string token ends up in the document
    all_tokens[aFile] = episode_tokens.split()

In [194]:
# Build the gensim Dictionary (the id2word mapping passed to the LDA models below)
# from the per-episode token lists stored in all_tokens.
dictionary = gensim.corpora.Dictionary(all_tokens.values())

In [195]:
# Filter out tokens that appear in fewer than 15 documents (no_below is an absolute count)
# or in more than 50% of the documents (no_above is a fraction of the corpus size, not a count).
dictionary.filter_extremes(no_below=15, no_above=0.5)

In [196]:
# Bag-of-words corpus: one doc2bow result per episode, in the same order as
# all_tokens. Each result records which dictionary words occur in the episode
# and how many times.
bow_corpus = [dictionary.doc2bow(doc) for doc in all_tokens.values()]

In [197]:
# Episode metadata; used further down to look up a title from Season_nb / Episode_nb.
df_episodes = pd.read_csv("RaM_episodes.csv")
# Episode ranking table (rank, rate, nb_votes, imdb_link, ...).
df_ep_ranking = pd.read_csv('RaM_imdb_episodes_ranking.csv')
# Preview the first five rows.
df_ep_ranking.head(5)

Unnamed: 0,rank,Title,rate,nb_votes,imdb_link,Season_nb,Episode_nb
0,1,Tales From the Citadel,9.8,29698,https://www.imdb.com/title/tt5218332/,3,7
1,2,The Rickshank Rickdemption,9.6,19948,https://www.imdb.com/title/tt5218228/,3,1
2,3,Total Rickall,9.6,16633,https://www.imdb.com/title/tt4832262/,2,4
3,4,Rickmurai Jack,9.5,8127,https://www.imdb.com/title/tt15041334/,5,10
4,5,The Vat of Acid Episode,9.5,12058,https://www.imdb.com/title/tt10655692/,4,8


In [198]:
def get_seasonNb_and_episodeNb(aFileEpisodeName):
    """Extract the season and episode numbers from an episode code.

    Parameters
    ----------
    aFileEpisodeName : str
        A name starting with ``S<season>E<episode>``, e.g. ``'S01E05'``.
        Anything after the episode digits (such as a ``.txt`` extension) is
        ignored, so a file name can be passed directly.

    Returns
    -------
    tuple of (int, int)
        ``(season_number, episode_number)``.

    Raises
    ------
    ValueError
        If the name does not start with the ``S<digits>E<digits>`` pattern.
    """
    # Parsing with a pattern instead of fixed slice positions avoids feeding
    # non-digit characters (e.g. the '.' of an extension) to int().
    match = re.match(r"[Ss](\d+)[Ee](\d+)", aFileEpisodeName)
    if match is None:
        raise ValueError(
            "Cannot parse season/episode numbers from {!r}".format(aFileEpisodeName)
        )
    return int(match.group(1)), int(match.group(2))

In [199]:
# get a preview for a chosen episode
episode_to_preview = 'S01E05'
# Position of the episode in all_files, which is also its position in bow_corpus
ep_index = all_files.index(episode_to_preview + '.txt')

# Look the title up in the episodes table from the season / episode numbers
seasonNb, epNb = get_seasonNb_and_episodeNb(episode_to_preview)
matching_episodes = df_episodes[(df_episodes.Season_nb == seasonNb) & (df_episodes.Episode_nb == epNb)]
ep_title = matching_episodes.Title.iloc[0]
print("Top10 words for {} ({}):".format(episode_to_preview, ep_title))

# Entries of the episode's bag of words, most frequent first
bow_doc_ep = sorted(bow_corpus[ep_index], key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing 0..9) stays safe when the episode has fewer
# than 10 distinct dictionary words
for token_id, count in bow_doc_ep[:10]:
    print(" '{}' appears {} times.".format(dictionary[token_id], count))

Top10 words for S01E05 (Meeseeks and Destroy):
 'adventure' appears 12 times.
 'help' appears 7 times.
 'quickly' appears 6 times.
 'hand' appears 5 times.
 'lead' appears 5 times.
 'creature' appears 4 times.
 'portal' appears 4 times.
 'point' appears 3 times.
 'world' appears 3 times.
 'having' appears 2 times.


In [200]:
# TF-IDF weighting of the bag-of-words corpus
from gensim import corpora, models

# Fit the TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(bow_corpus)
# ... then apply it to the same corpus to get the TF-IDF weighted version
corpus_tfidf = tfidf[bow_corpus]

In [201]:
# Running LDA using Bag of Words
# random_state seeds the model initialisation; without it the topics change on
# every run. NOTE(review): with several workers the results may still vary
# slightly between runs — confirm against the gensim documentation.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

# Print every topic (-1 = all of them) with its most significant words
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.054*"planet" + 0.035*"adventure" + 0.032*"quickly" + 0.030*"help" + 0.029*"garage" + 0.021*"fight" + 0.021*"explains" + 0.017*"travel" + 0.017*"ship" + 0.016*"open"
Topic: 1 
Words: 0.041*"planet" + 0.032*"people" + 0.029*"world" + 0.027*"life" + 0.027*"ship" + 0.026*"car" + 0.022*"room" + 0.022*"stop" + 0.021*"escape" + 0.021*"killing"
Topic: 2 
Words: 0.034*"portal" + 0.029*"planet" + 0.026*"new" + 0.022*"gun" + 0.021*"going" + 0.020*"despite" + 0.020*"return" + 0.020*"want" + 0.019*"free" + 0.019*"life"
Topic: 3 
Words: 0.039*"fight" + 0.036*"body" + 0.033*"idea" + 0.032*"world" + 0.029*"control" + 0.029*"space" + 0.027*"head" + 0.025*"portal" + 0.022*"notice" + 0.022*"lead"
Topic: 4 
Words: 0.068*"save" + 0.055*"device" + 0.037*"head" + 0.035*"point" + 0.028*"death" + 0.024*"planet" + 0.021*"appears" + 0.019*"instead" + 0.019*"new" + 0.017*"travel"
Topic: 5 
Words: 0.034*"room" + 0.031*"space" + 0.025*"know" + 0.022*"having" + 0.020*"adventure" + 0.019*"plan" + 0

In [202]:
# Running LDA using TF-IDF
# random_state seeds the model initialisation; without it the topics change on
# every run. NOTE(review): with several workers the results may still vary
# slightly between runs — confirm against the gensim documentation.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=42,
)
# Print every topic (-1 = all of them) with its most significant words
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

Topic: 0 
Word: 0.047*"creature" + 0.022*"relationship" + 0.021*"world" + 0.020*"dimension" + 0.017*"head" + 0.017*"place" + 0.017*"new" + 0.017*"life" + 0.017*"device" + 0.017*"want"
Topic: 1 
Word: 0.060*"portal" + 0.047*"dimension" + 0.043*"gun" + 0.033*"return" + 0.021*"head" + 0.021*"house" + 0.020*"new" + 0.019*"shoot" + 0.018*"save" + 0.015*"killing"
Topic: 2 
Word: 0.058*"inside" + 0.028*"house" + 0.023*"making" + 0.019*"friend" + 0.019*"named" + 0.019*"fight" + 0.018*"garage" + 0.018*"attack" + 0.018*"going" + 0.018*"car"
Topic: 3 
Word: 0.037*"house" + 0.034*"car" + 0.032*"body" + 0.025*"space" + 0.023*"free" + 0.023*"talking" + 0.020*"look" + 0.020*"want" + 0.018*"killed" + 0.018*"idea"
Topic: 4 
Word: 0.036*"soon" + 0.030*"travel" + 0.029*"body" + 0.029*"death" + 0.026*"world" + 0.023*"inside" + 0.020*"new" + 0.020*"adventure" + 0.019*"causing" + 0.017*"way"
Topic: 5 
Word: 0.039*"planet" + 0.023*"ship" + 0.022*"open" + 0.019*"work" + 0.017*"know" + 0.017*"people" + 0.017*"

In [3]:
# NOTE(review): the recorded output of this cell is a ParserError ("Expected 11
# fields in line 542, saw 12"), i.e. line 542 of the CSV has one more field than
# the header. Presumably an unquoted comma inside a value — inspect that line
# and fix/re-export the file (or see the read_csv options for malformed lines).
# The result is also not assigned to a variable.
pd.read_csv("df_characters_with_attr_and_communities.csv")

ParserError: Error tokenizing data. C error: Expected 11 fields in line 542, saw 12
