In [1]:
%run -i "../util/file_utils.ipynb"

In [3]:
# Read the sample excerpt from disk. read_text_file is not defined in this notebook;
# presumably it comes from the file_utils.ipynb helper notebook run above — TODO confirm.
sherlock_holmes_part_of_text = read_text_file("../data/sherlock_holmes_1.txt")
# Echo the raw text so the reader can see what will be split into sentences.
print(sherlock_holmes_part_of_text)

To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particularly,
were abhorrent to his cold, precise but admirably balanced mind. He
was, I take it, the most perfect reasoning and observing machine that
the world has seen, but as a lover he would have placed himself in a
false position. He never spoke of the softer passions, save with a gibe
and a sneer. They were admirable things for the observer—excellent for
drawing the veil from men’s motives and actions. But for the trained
reasoner to admit such intrusions into his own delicate and finely
adjusted temperament was to introduce a distracting factor which might
throw a doubt upon all his mental results. Grit in a sensitive
instrument, or a crack in one of his own high-power lenses, would not
be more disturbing than a strong em

# Divide into sentences using NLTK

In [3]:
import nltk
#nltk.download('punkt') # Run this line only the first time you run this notebook to download tokenizer data
# Load the English Punkt sentence tokenizer from the locally installed NLTK data.
# NOTE(review): recent NLTK releases distribute Punkt data as "punkt_tab" and may
# refuse to load this pickle — confirm against the installed NLTK version.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
# Split the excerpt into a list of sentence strings.
sentences_nltk = tokenizer.tokenize(sherlock_holmes_part_of_text)
# Show the sentences and how many were found.
print(sentences_nltk)
print(len(sentences_nltk))

['To Sherlock Holmes she is always _the_ woman.', 'I have seldom heard him\nmention her under any other name.', 'In his eyes she eclipses and\npredominates the whole of her sex.', 'It was not that he felt any emotion\nakin to love for Irene Adler.', 'All emotions, and that one particularly,\nwere abhorrent to his cold, precise but admirably balanced mind.', 'He\nwas, I take it, the most perfect reasoning and observing machine that\nthe world has seen, but as a lover he would have placed himself in a\nfalse position.', 'He never spoke of the softer passions, save with a gibe\nand a sneer.', 'They were admirable things for the observer—excellent for\ndrawing the veil from men’s motives and actions.', 'But for the trained\nreasoner to admit such intrusions into his own delicate and finely\nadjusted temperament was to introduce a distracting factor which might\nthrow a doubt upon all his mental results.', 'Grit in a sensitive\ninstrument, or a crack in one of his own high-power lenses, wou

# Divide into sentences using spaCy

In [4]:
import spacy
#!python -m spacy download en_core_web_sm # Run this line only the first time you run this notebook to download the model
# Load the "en_core_web_sm" English pipeline (must already be downloaded, see the line above).
nlp = spacy.load("en_core_web_sm")
# Run the pipeline over the excerpt.
doc = nlp(sherlock_holmes_part_of_text)
# doc.sents iterates over the detected sentences; keep only their text so the
# result is a plain list of strings.
sentences_spacy = [sentence.text for sentence in doc.sents]
# Show the sentences and how many were found.
print(sentences_spacy)
print(len(sentences_spacy))

['To Sherlock Holmes she is always _the_ woman.', 'I have seldom heard him\nmention her under any other name.', 'In his eyes she eclipses and\npredominates the whole of her sex.', 'It was not that he felt any emotion\nakin to love for Irene Adler.', 'All emotions, and that one particularly,\nwere abhorrent to his cold, precise but admirably balanced mind.', 'He\nwas, I take it, the most perfect reasoning and observing machine that\nthe world has seen, but as a lover he would have placed himself in a\nfalse position.', 'He never spoke of the softer passions, save with a gibe\nand a sneer.', 'They were admirable things for the observer—excellent for\ndrawing the veil from men’s motives and actions.', 'But for the trained\nreasoner to admit such intrusions into his own delicate and finely\nadjusted temperament was to introduce a distracting factor which might\nthrow a doubt upon all his mental results.', 'Grit in a sensitive\ninstrument, or a crack in one of his own high-power lenses, wou

In [5]:
# List equality: True only if both libraries produced the same sentence strings
# in the same order.
print(sentences_nltk == sentences_spacy)

True


# Compare the time it takes using both methods

In [6]:
import time

def split_into_sentences_nltk(text):
    """Split ``text`` into sentences with the notebook-level NLTK ``tokenizer``.

    Args:
        text: The string to segment.

    Returns:
        Whatever ``tokenizer.tokenize(text)`` returns (a list of sentence
        strings for the tokenizer loaded earlier in this notebook).
    """
    return tokenizer.tokenize(text)

def split_into_sentences_spacy(text):
    """Split ``text`` into sentences with the notebook-level spaCy pipeline ``nlp``.

    Args:
        text: The string to segment.

    Returns:
        A list with the ``.text`` of every item yielded by ``nlp(text).sents``.
    """
    return [span.text for span in nlp(text).sents]

# Time one call of each splitter on the same excerpt.
# time.perf_counter() is used instead of time.time(): it is the clock intended
# for measuring short durations, whereas time.time() is wall-clock time that can
# be adjusted by the system and may be too coarse for sub-millisecond intervals.
# NOTE: a single run is still noisy; use %timeit for more stable numbers.
start = time.perf_counter()
split_into_sentences_nltk(sherlock_holmes_part_of_text)
print(f"NLTK: {time.perf_counter() - start} s")

start = time.perf_counter()
split_into_sentences_spacy(sherlock_holmes_part_of_text)
print(f"spaCy: {time.perf_counter() - start} s")

NLTK: 0.00019311904907226562 s
spaCy: 0.019065380096435547 s
