In [None]:
import pandas as pd



def remove_speakers_and_empty_lines(episode_content: str) -> str:
    """Remove speaker names, location markers and empty lines from a raw script.

    e.g. :
    Picard: Make it so.
    becomes:
    Make it so.

    Every line is trimmed of surrounding whitespace and then handled as follows:

    * empty lines are dropped,
    * lines starting with '[' (location information) are dropped,
    * the line '<Back' ends processing; nothing after it is kept,
    * lines containing ':' keep everything after the FIRST ':' (the speaker
      name is dropped, colons inside the spoken text are preserved),
    * all other lines are kept as they are.

    Args:
        episode_content: the raw text of one episode, lines separated by newlines.

    Returns:
        The kept lines concatenated into one string, each one followed by a
        space and a newline character.
    """
    cleaned_lines = []
    for raw_line in episode_content.split('\n'):
        # Trim first so whitespace-only lines and trailing '\r' (Windows line
        # endings) do not slip past the checks below.
        line = raw_line.strip()

        # ignore empty lines
        if not line:
            continue
        # lines that start with square brackets are just information about the location.
        if line.startswith('['):
            continue
        # after this string there are only information about the franchise, we can leave those out.
        if line == '<Back':
            break
        # the actual talking lines always contain a ':' - we will just keep the text, not the talker
        # for this application. Only the first ':' separates the speaker from the
        # text; later colons (e.g. in "10:30") belong to the spoken line.
        if ':' in line:
            line = line.partition(':')[2].strip()

        cleaned_lines.append(line + ' \n')
    return ''.join(cleaned_lines)


# Load the raw scripts from the JSON file into a DataFrame.
all_series_scripts = pd.read_json('all_scripts_raw.json')
# Only The Next Generation episodes are used for now: select the TNG column and
# strip the speaker names and empty lines from every entry.
tng_series_scripts_cleaned = all_series_scripts['TNG'].map(remove_speakers_and_empty_lines)



In [None]:
# after this preprocessing we want to check for outliers in the data,
# things like misspelled words etc.
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize


# Let's make a single text from all episodes. str.join builds the result in one
# pass instead of re-copying the growing string for every episode.
text = ''.join(tng_series_scripts_cleaned)


# tokenize the text
tokenized_text = word_tokenize(text)

fd = FreqDist(tokenized_text)
# hapaxes are the tokens that occur exactly once; compute the list once and reuse it.
hapaxes = fd.hapaxes()

print(len(fd))
print(len(hapaxes))
# As we can see here, our corpus contains 26404 different words - though we did not do any
# normalization like casefolding, stemming or lemmatization, so the actual number of different
# words will be lower.
# What's even more astounding to me is the number of words that occur only once: almost half of all words!
# This approach apparently isn't helpful for finding misspelled words,
# but let's see if we can glimpse whether those are misspelled words.
print(hapaxes[:100])

In [None]:
# so, there are almost no spelling mistakes - that is good news! this should allow us to continue for now.

# now we can have a look at whether it makes sense to combine words into combined tokens based on their co-occurrence.
from gensim.models.phrases import Phrases
from nltk.tokenize import sent_tokenize
# gensim's Phrases class expects a sequence of sentences, where each sentence is a list of tokens
sentences = sent_tokenize(text)
word_tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

bigrams = Phrases(word_tokenized_sentences)

# Let's test that on the first episode. .iloc[0] selects the first entry by
# position, so this works regardless of how the Series index is labelled
# (a plain [0] is a label lookup on integer indexes and a deprecated positional
# fallback on all others).
first_episode_bigrams = bigrams[word_tokenize(tng_series_scripts_cleaned.iloc[0])]
print(first_episode_bigrams)

# now that looks good - not too many bigrams, but a few typical combinations like tractor beam have been found.