# Using Regular Expressions to Extract Information from Obituaries

In [1]:
import re
import os
import pandas as pd
from IPython.display import YouTubeVideo

In [None]:
YouTubeVideo("dkdpvf6XHPQ")

In [4]:
with open(os.path.join(os.path.expanduser("~"),"DATA/Misc/obits.txt")) as f0:
    data = f0.readlines()

In [10]:
data = [d for d in data if d[0] not in "0123456789"]

In [19]:
txt = " ".join(data)

In [18]:
txt

"BAIRD, Alice O., 96, of Pine, died Oct. 16. King Funeral Home, Hampton. (DN) \n\nBECKER, Patricia Joan, 68, of West Deer, died Oct. 16. King Funeral Home, Hampton. (DN) \n\nBELKE, Ronald, died Oct. 13. Thomas J. Gmiter Funeral Home, South Side. (DN) \n\nBELKE, Irene, died Oct. 13. Thomas J. Gmiter Funeral Home, South Side. (DN) \n\nBRADBURN, Arlene A., died Oct. 16. William Slater II Funeral Services, Scott. (DN) \n\nBRAUN, Justina F., 92, of McKees Rocks, died Oct. 15. McKees Rocks Funeral Home, McKees Rocks. (DN) \n\nBROWN, Dorothy C., 77, died Oct. 16. Rowland S. Cooke Funeral Home, East End. (DN) \n\nCHERNEGA, Mary M., 83, of East Pittsburgh, died Oct. 17. Patrick T. Lanigan Funeral Home, East Pittsburgh. (DN) \n\nCLOUSE, Walter J., 79, of Fox chapel, died Oct. 17. Weddell-Ajak Funeral Home, Aspinwall. (DN) \n\nDRURY, Anna H., 93, of Mount Washington, died Oct. 15. Daniel T. D'Alessandro Funeral Home, Lawrenceville. (DN) \n\nDUBOSKY, Mary Hotsur, 85, of North Huntingdon, died Oct.

## Extracting Names

* Do we want to capture first, last, middle names together or separately?
* What patterns can we exploit?
* What are the variations in names?

In [12]:
print(len(txt.split("\n")))

29


In [8]:
print(txt)

BAIRD, Alice O., 96, of Pine, died Oct. 16. King Funeral Home, Hampton. (DN) 
BECKER, Patricia Joan, 68, of West Deer, died Oct. 16. King Funeral Home, Hampton. (DN) 
BELKE, Ronald, died Oct. 13. Thomas J. Gmiter Funeral Home, South Side. (DN) 
BELKE, Irene, died Oct. 13. Thomas J. Gmiter Funeral Home, South Side. (DN) 
BRADBURN, Arlene A., died Oct. 16. William Slater II Funeral Services, Scott. (DN) 
BRAUN, Justina F., 92, of McKees Rocks, died Oct. 15. McKees Rocks Funeral Home, McKees Rocks. (DN) 
BROWN, Dorothy C., 77, died Oct. 16. Rowland S. Cooke Funeral Home, East End. (DN) 
CHERNEGA, Mary M., 83, of East Pittsburgh, died Oct. 17. Patrick T. Lanigan Funeral Home, East Pittsburgh. (DN) 
CLOUSE, Walter J., 79, of Fox chapel, died Oct. 17. Weddell-Ajak Funeral Home, Aspinwall. (DN) 
DRURY, Anna H., 93, of Mount Washington, died Oct. 15. Daniel T. D'Alessandro Funeral Home, Lawrenceville. (DN) 
DUBOSKY, Mary Hotsur, 85, of North Huntingdon, died Oct. 16. William Snyder Funeral Hom

In [20]:
re.findall(r"(?P<last_name>[cA-Z]{2,}), (?P<first_name>[A-Z][a-z]+)( (?P<middle_name>[A-Za-z.]+))?", txt) ##마지막에 ?은 optional group middlename에는 space

[('BAIRD', 'Alice', ' O.', 'O.'),
 ('BECKER', 'Patricia', ' Joan', 'Joan'),
 ('BELKE', 'Ronald', '', ''),
 ('BELKE', 'Irene', '', ''),
 ('BRADBURN', 'Arlene', ' A.', 'A.'),
 ('BRAUN', 'Justina', ' F.', 'F.'),
 ('BROWN', 'Dorothy', ' C.', 'C.'),
 ('CHERNEGA', 'Mary', ' M.', 'M.'),
 ('CLOUSE', 'Walter', ' J.', 'J.'),
 ('DRURY', 'Anna', ' H.', 'H.'),
 ('DUBOSKY', 'Mary', ' Hotsur', 'Hotsur'),
 ('EWART', 'Arthur', ' E.', 'E.'),
 ('FARKAS', 'Laura', '', ''),
 ('FLEMING', 'Howard', ' Lee', 'Lee'),
 ('GAMBLE', 'Albert', ' R.', 'R.'),
 ('GIDDANS', 'Donald', '', ''),
 ('GOLDBERG', 'Elaine', ' Unger', 'Unger'),
 ('GREGO', 'Kimberly', ' A.', 'A.'),
 ('McCUE', 'Cletus', '', ''),
 ('McMULLAN', 'Bernard', ' L.', 'L.'),
 ('ACKLIN', 'William', ' H.', 'H.'),
 ('BEGLY', 'Laurel', ' M.', 'M.'),
 ('BOCK', 'Albert', ' J.', 'J.'),
 ('CARR', 'Alberta', ' Weaver', 'Weaver'),
 ('CHARNOCK', 'Ruth', ' A.', 'A.'),
 ('CHINCHILLA', 'Cecilia', ' C.', 'C.'),
 ('CRATSA', 'Steve', '', ''),
 ('CUNLIFFE', 'Grace', ' Stot

In [24]:
reName = re.compile(r"(?P<last_name>[cA-Z]{2,}), (?P<first_name>[A-Z][a-z]+)( (?P<middle_name>[A-Za-z.]+))?")
for match in reName.finditer(txt):
    print(match.groupdict())

{'last_name': 'BAIRD', 'first_name': 'Alice', 'middle_name': 'O.'}
{'last_name': 'BECKER', 'first_name': 'Patricia', 'middle_name': 'Joan'}
{'last_name': 'BELKE', 'first_name': 'Ronald', 'middle_name': None}
{'last_name': 'BELKE', 'first_name': 'Irene', 'middle_name': None}
{'last_name': 'BRADBURN', 'first_name': 'Arlene', 'middle_name': 'A.'}
{'last_name': 'BRAUN', 'first_name': 'Justina', 'middle_name': 'F.'}
{'last_name': 'BROWN', 'first_name': 'Dorothy', 'middle_name': 'C.'}
{'last_name': 'CHERNEGA', 'first_name': 'Mary', 'middle_name': 'M.'}
{'last_name': 'CLOUSE', 'first_name': 'Walter', 'middle_name': 'J.'}
{'last_name': 'DRURY', 'first_name': 'Anna', 'middle_name': 'H.'}
{'last_name': 'DUBOSKY', 'first_name': 'Mary', 'middle_name': 'Hotsur'}
{'last_name': 'EWART', 'first_name': 'Arthur', 'middle_name': 'E.'}
{'last_name': 'FARKAS', 'first_name': 'Laura', 'middle_name': None}
{'last_name': 'FLEMING', 'first_name': 'Howard', 'middle_name': 'Lee'}
{'last_name': 'GAMBLE', 'first_na

In [26]:
reName = re.compile(r"(?P<last_name>[cA-Z]{2,}), (?P<first_name>[A-Z][a-z]+)( (?P<middle_name>[A-Za-z.]+))?")
matchiter = reName.finditer(txt)

In [36]:
match = next(matchiter)

In [37]:
match

<re.Match object; span=(326, 345), match='BRADBURN, Arlene A.'>

In [38]:
match.start

<function Match.start(group=0, /)>

## Extracting Age



In [None]:
YouTubeVideo("ZGnEW1XQ-dk")

In [None]:
print(txt)

In [None]:
reNameAge = re.compile(r"(?P<last_name>[cA-Z]{2,}), (?P<first_name>[A-Z][a-z]+)( (?P<middle_name>[A-Za-z.]+))?,( (?P<age>\d+))?")
reNameAge.findall(txt)
for match in reNameAge.finditer(txt):
    print(match.groupdict())

## Extracting Place of Death

In [None]:
YouTubeVideo("aaftVUMcWgI")

In [None]:
print(txt)

In [None]:
reNameAgePlace = re.compile(r"(?P<last_name>[cA-Z]{2,}), (?P<first_name>[A-Z][a-z]+)( (?P<middle_name>[A-Za-z.]+))?,( (?P<age>\d+))?(, of (?P<place>[A-Z][a-zA-Z ]+),)?")
for match in reNameAgePlace.finditer(txt):
    print(match.groupdict())

## Extracting Date of Death



In [None]:
YouTubeVideo("Vm1TS9P2Kj4")

In [None]:
print(txt)

In [None]:
reNameAgePlaceDOD = re.compile(r"(?P<last_name>[cA-Z]{2,}), (?P<first_name>[A-Z][a-z]+)( (?P<middle_name>[A-Za-z.]+))?,( (?P<age>\d+))?(, of (?P<place>[A-Z][a-zA-Z ]+))?,( died (?P<dod>[A-Z][a-z.]+ \d+))")
for match in reNameAgePlaceDOD.finditer(txt):
    print(match.groupdict())

In [None]:
pd.DataFrame([match.groupdict() for match in reNameAgePlaceDOD.finditer(txt)])


In [None]:
name_re = re.compile(r'(?P<last_name>\b[A-Z](c|ac)?[A-Z]+),\s(?P<first_name>[A-Z][a-z]+)(\s)?(?P<middle_name>[A-Z]([.a-z]*)?)?,')
names = name_re.findall(txt)
print(names)
print(len(data))
print(len(names))