In [1]:
import pandas as pd
import re
pd.options.display.max_colwidth = 100

In [2]:
# load in Sherlock Holmes text
sherlock_file = '../Resources/sherlock.txt'

sherlock_df = pd.read_csv(sherlock_file, sep='\n', header=None)
sherlock_df.columns = ['text']
sherlock_df.head()

Unnamed: 0,text
0,The Adventures of Sherlock Holmes
1,by Arthur Conan Doyle
2,Contents
3,I. A Scandal in Bohemia
4,II. The Red-Headed League


In [8]:
# The \s lets us match any white space and {} lets use match exact ranges or characters
# Here we find Holmes followed by a space then a 6 letter word then another space.
p = 'Holmes\s\w{6}\s'
sherlock_df[sherlock_df['text'].str.contains(p)]

Unnamed: 0,text
214,"I rose to go, but Holmes caught me by the wrist and pushed me back into"
759,"Holmes rushed at the bell-pull, tore back a small sliding shutter, and,"
3478,Sherlock Holmes closed his eyes and placed his elbows upon the arms of
3566,"Holmes turned over the leaves of the book upon his knee. “Here it is,”"
4196,"Sherlock Holmes seemed to be embarrassed by the question. “Frankly,"
4208,Sherlock Holmes sprang out of his chair as if he had been galvanised.
4934,Holborn. Holmes pushed open the door of the private bar and ordered two
5020,"Holmes turned to the page indicated. “Here you are, ‘Mrs. Oakshott,"
5027,Sherlock Holmes looked deeply chagrined. He drew a sovereign from his
5080,Sherlock Holmes hailed a four-wheeler which was passing. “In that case


In [10]:
# Adding a second number to the {} lets us find a range
# Here it matches all word of length 4 to 6
p = 'Holmes\s\w{4,6}\s'
sherlock_df[sherlock_df['text'].str.contains(p)]

Unnamed: 0,text
214,"I rose to go, but Holmes caught me by the wrist and pushed me back into"
327,"“Oh, then we have three days yet,” said Holmes with a yawn. “That is"
349,"Holmes took a note of it. “One other question,” said he. “Was the"
650,the voice of Holmes from within assuring them that it was a false
759,"Holmes rushed at the bell-pull, tore back a small sliding shutter, and,"


In [10]:
# Extract all lines of text containing the word Holmes followed
# by some text of length 6 or greater
p = 'Holmes \s*\w{6,}'
sherlock_df[sherlock_df['text'].str.contains(p)]

Unnamed: 0,text
34,I had seen little of Holmes lately. My marriage had drifted us away
214,"I rose to go, but Holmes caught me by the wrist and pushed me back into"
260,"“Kindly look her up in my index, Doctor,” murmured Holmes without"
305,"Holmes laughed. “It is quite a pretty little problem,” said he."
345,Holmes scribbled a receipt upon a sheet of his note-book and handed it
...,...
8137,"“Then I will.” Holmes suddenly bent his strength upon it, but without"
8260,With a dazed face the banker made out the required check. Holmes walked
8738,"“And she would need to be,” said Holmes gravely. “I am much mistaken if"
8832,“Let us have everything in its due order.” Holmes thrust his long thin


In [9]:
# Create capute groups 
p = '(Holmes)(\s*\w{6,})'

# Extract the groups
holmes_df = sherlock_df['text'].str.extractall(p)
holmes_df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
34,0,Holmes,lately
214,0,Holmes,caught
260,0,Holmes,without
305,0,Holmes,laughed
345,0,Holmes,scribbled
...,...,...,...
8137,0,Holmes,suddenly
8260,0,Holmes,walked
8738,0,Holmes,gravely
8832,0,Holmes,thrust


In [11]:
# Grab the value counts for each word in the second capture group
holmes_df[1].value_counts()

 quietly         4
 blandly         3
 pushed          3
 turned          3
 answered        3
 returned        3
 sprang          3
 laughed         3
 walked          2
 gravely         2
 cheerily        2
 pulled          2
 suavely         2
 chuckled        2
 leaned          2
 clapped         2
 sweetly         1
 remarked        1
 continued       1
 closed          1
 changed         1
 without         1
 impatient       1
 laying          1
 sternly         1
 suddenly        1
 glanced         1
 grinned         1
 desired         1
 thrust          1
 thoughtfully    1
 standing        1
 stopped         1
 gently          1
 stepped         1
 unlocked        1
 hailed          1
 interposed      1
 caught          1
 looked          1
 lately          1
 staggered       1
 before          1
 refused         1
 welcomed        1
 demurely        1
 twisted         1
 carelessly      1
 seemed          1
 struck          1
 rushed          1
 coldly          1
 scribbled  

In [18]:
# extract all words that start with 'z' or 'Z'
p = '\W(z\w*)\W'
sherlock_df['text'].str.extractall(p, flags=re.I).dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
1777,0,Zealand
3500,0,zero
5678,0,zest
8514,0,zero


In [29]:
# Find all the words 6 letters or longer that come after "Holmes" in the text and count their frequency
p = '(?:Holmes)(\s*\w{6,})'
sherlock_df['text'].str.extractall(p, flags=re.I)[0].value_counts()

 quietly         4
 turned          3
 returned        3
 pushed          3
 answered        3
 laughed         3
 blandly         3
 sprang          3
 suavely         2
 chuckled        2
 walked          2
 pulled          2
 clapped         2
 leaned          2
 cheerily        2
 gravely         2
 changed         1
 hailed          1
 impatient       1
 gently          1
 stopped         1
 twisted         1
 closed          1
 glanced         1
 struck          1
 unlocked        1
 interposed      1
 sternly         1
 caught          1
 coldly          1
 nodded          1
 grinned         1
 lately          1
 demurely        1
 continued       1
 laying          1
 remarked        1
 standing        1
 sweetly         1
 before          1
 rushed          1
 stepped         1
 without         1
 thrust          1
 seemed          1
 looked          1
 calmly          1
 refused         1
 suddenly        1
 staggered       1
 carelessly      1
 thoughtfully    1
 welcomed   

In [31]:
# Find all the words 6 letters or longer that don't end in "ly" that come after "Holmes" 
# in the text and count their frequency
p = '(?:holmes)(?!.+ly)(\s*\w{6,})'
sherlock_df['text'].str.extractall(p, flags=re.I)[0].value_counts()

 answered      3
 returned      3
 pushed        3
 turned        3
 pulled        2
 sprang        2
 laughed       2
 desired       1
 nodded        1
 chuckled      1
 thrust        1
 stopped       1
 refused       1
 struck        1
 walked        1
 continued     1
 unlocked      1
 staggered     1
 clapped       1
 changed       1
 interposed    1
 welcomed      1
 rushed        1
 before        1
 twisted       1
 leaned        1
 without       1
 impatient     1
 hailed        1
 caught        1
 remarked      1
 closed        1
 laying        1
 grinned       1
 scribbled     1
Name: 0, dtype: int64