In [1]:
import re
import urllib.request
from urllib.request import urlopen

# re module
- match(pattern, string, flags): from the beginning, return:
    - MatchObject if match exists or
    - None, otherwise
    
- search(pattern, string, flags): search the entire string for a match, return:
    - MatchObject if exists, or
    - None, otherwise
    
- findall(pattern, string, flags): returns a list of all non-overlapping matches of the pattern in the string

- finditer(pattern, string, flags): returns an iterator of matches of patterns in the string

- fullmatch(pattern, string, flags): applies the pattern to the full string, returns a MatchObject or None

- split(pattern, string, maxsplit, flags): breaks the string up by the regex pattern

- sub(pattern, repl, string, count, flags): finds matches of the pattern and replaces them with repl. Returns a new string

### match() vs. search()
- match() returns:
    - a MatchObject if zero or more characters at the beginning of the string match the regular expression pattern, or
    - None if the string does not match the pattern
- search() scans through the entire string and returns:
    - a corresponding MatchObject, or
    - None if finds no match

In [3]:
f = open('LordOfTheRings.txt')

In [4]:
# Read the whole book into memory, then release the file handle that the
# previous cell opened (it was otherwise left open for the kernel's lifetime).
lor = f.read()
f.close()

In [9]:
re.match('THE',lor)

<re.Match object; span=(0, 3), match='THE'>

In [10]:
# Returns None

re.match('frodo',lor)

In [12]:
# Returns None

re.match('Gandalf',lor)

In [13]:
re.search('Frodo',lor)

<re.Match object; span=(5322, 5327), match='Frodo'>

In [14]:
re.search('Gandalf',lor)

<re.Match object; span=(3837, 3844), match='Gandalf'>

In [15]:
re.search('THE',lor)

<re.Match object; span=(0, 3), match='THE'>

In [18]:
# Raw string: '\s' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning on modern interpreters.
regex_space = re.compile(r'\s+')

In [19]:
# match() only tests the start of each sample, so trailing whitespace
# (as in "Hello   ") does not count as a match.
for string in ["  ","Hello   ","There", "  "]:
    outcome = "match found" if re.match(regex_space, string) else "match NOT found"
    print(repr(string), outcome)

'  ' match found
'Hello   ' match NOT found
'There' match NOT found
'  ' match found


# Using Raw Strings
- Use raw strings to avoid confusion by having to escape characters within a regex string.

In [20]:
a = "abc def"

In [24]:
# In a regular (non-raw) string literal, '\b' is the backspace character,
# not the regex word-boundary token, so this looks for a literal backspace
# at the start of `a`.
if re.match('\b', a):
    print('Match')
else:
    print('Didn\'t match')

Didn't match


In [25]:
b = 'abc\bdef'

In [26]:
# `b` does contain a backspace character, but not at index 0; match() only
# tries the pattern at the start of the string, so this still fails.
if re.match('\b', b):
    print('Match')
else:
    print('Didn\'t match')

Didn't match


In [27]:
# '\\b' is a backslash followed by 'b'; the regex engine reads that as a
# word boundary, which exists at the start of `a`.
if re.match('\\b', a):
    print('Match')
else:
    print('Didn\'t match')

Match


In [28]:
# The raw string r'\b' holds the same two characters as '\\b' without the
# doubled backslash, so it is also treated as a word boundary.
if re.match(r'\b', a):
    print('Match')
else:
    print('Didn\'t match')

Match


##  BEWARE: Python could be TOO helpful.  It keeps the backslash of an unrecognized escape (e.g. \d) for you, but it converts recognized escapes (e.g. \b becomes a backspace). Prefer raw strings.

In [29]:
'\d'

'\\d'

In [30]:
match_nums = ['12345abc','abc12345']

In [31]:
for item in match_nums:
    if re.match('\\d{5}', item):
        print('Starts with 5 numbers')
    else:
        print('Does not start with 5 numbers')

Starts with 5 numbers
Does not start with 5 numbers


In [32]:
# No backslash no problem: '\d' is not a recognized string escape, so Python
# keeps the backslash and the regex engine still receives \d.
# NOTE(review): newer Python versions warn about invalid escapes like this;
# the raw string r'\d{5}' would avoid the warning.

for item in match_nums:
    if re.match('\d{5}', item):
        print('Starts with 5 numbers')
    else:
        print('Does not start with 5 numbers')

Starts with 5 numbers
Does not start with 5 numbers


# Common regex
- ^ From the start
- $ To the end
- \s whitespace
- \S non-whitespace
- \d digit
- \D non-digit
- \w alphanumeric character
- \W non-alphanumeric character
- \b word boundary
- \B non word boundary
- * 0 or more instances
- + 1 or more instances
- ? 0 or 1 instances
- {n} exactly n chars
- {n,m} n to m
- {,m} up to m
- (n|m|l) n, m or l chars
- [abcd] a or b or c or d
- [f-m] anyone from f to m
- [^xyz] not x or y or z
- [a-zA-Z] any one of the letters

## MatchObjects
- A MatchObject is returned from either match() or search() if the pattern is found
- MatchObject methods:
    - start() - returns the index of the start of the match
    - end() - returns the index of the end of the match
    - span() - returns both the start and the end of the match
    - groups() - returns a tuple of all sub-groups (parenthesized items)
    - group(n) - returns a sub-group, zero is for the whole match

# Groupings
- When a match occurs, matchobj.groups() will return a tuple of all the captured sub-groups (not the entire match)
- Use matchobj.group(n) to obtain a particular group

In [33]:
string = "I am just trying to learn REGEX man!"

In [34]:
match_obj = re.search(r'(\w+) (\w+) (\w+) (\w+)', string)

In [35]:
match_obj.groups()

('I', 'am', 'just', 'trying')

In [38]:
# Group 0 is the whole match; groups 1..N are the captured sub-groups.
for i, captured in enumerate((match_obj.group(0), *match_obj.groups())):
    print('group - {}: {}'.format(i, captured))

group - 0: I am just trying
group - 1: I
group - 2: am
group - 3: just
group - 4: trying


## findall() usage
- The findall() method allows for finding multiple occurrences of a regex
- returns a list of strings that match

In [40]:
re.findall(r'\w+', string)

['I', 'am', 'just', 'trying', 'to', 'learn', 'REGEX', 'man']

In [41]:
re.search(r'\w+', string)

<re.Match object; span=(0, 1), match='I'>

# Matching Flags
- Flags can be set to tailor aspects of the search
    - re.IGNORECASE - case insensitive matches
    - re.VERBOSE - use verbose-style regular expressions
    - re.DOTALL - dot(.) can match any char including newlines
    - re.MULTILINE - ^ and $ also match at the start and end of each line (match() itself still only matches at the beginning of the string)

# Verbose Flag
- Regular expressions are difficult to read:
always provide inline documentation
- It can be done with verbose regular expressions.
- A verbose regular expression is different from a compact regular expression in two ways:
    - Whitespace is ignored:
        - Spaces, tabs, and carriage returns are not matched as spaces, tabs, and carriage returns. They're not matched at all.
        - If you want to match a space in a verbose regular expression, you'll need to escape it by putting a backslash in front of it
    - Comments are ignored:
        - A comment in a verbose regular expression is just like a comment in Python code: it starts with a # character and goes until the end of the line.

In [42]:
# Verbose-mode phone-number pattern; it must be matched or compiled with
# re.VERBOSE so that the layout whitespace and inline comments are ignored.
# Note: the first separator is mandatory even when the area code is absent.
pattern = r'''
    (\(?\d{3}\)?)?   # optional 3-digit area code, optionally parenthesized
    [-\s.]           # separator: dash, whitespace, or dot
    \d{3}            # 3-digit exchange
    [-\s.]           # separator: dash, whitespace, or dot
    \d{4}            # 4-digit line number
'''

In [43]:
phones = [
    '123-456-7890',
    '123 456 7890',
    '(123) 456-7890',
    '123.456,7890',
    '123-4567',
    'abc-dfg-7789'
]

In [44]:
valid_numbers = [phone for phone in phones if re.match(pattern,phone, re.VERBOSE)]

In [45]:
valid_numbers

['123-456-7890', '123 456 7890', '(123) 456-7890']

## String Manipulation
- Two methods can be used to manipulate strings after a search has been performed:
    - newstr = re.sub(pattern, replacement, sourcestring)
        - replaces every match in the sourcestring (pass count=N to limit the number of replacements)
    - re.split(pattern, sourcestring)
        - breaks a string into a list based on a specified pattern

In [46]:
re.sub('World', 'There', 'Hello World')

'Hello There'

In [47]:
re.split(r'\d', 'Hello0There9I0am0trying0to0learn0regex')

['Hello', 'There', 'I', 'am', 'trying', 'to', 'learn', 'regex']

## Compiling for Efficiency
- Use the re.compile(pattern) if a regex is used repeatedly

pattern_obj = re.compile(pattern, re.VERBOSE)

In [48]:
regex = re.compile(pattern, re.VERBOSE)

In [50]:
valid_numbers = [phone for phone in phones if regex.match(phone)]

In [51]:
valid_numbers

['123-456-7890', '123 456 7890', '(123) 456-7890']

## str methods and regex
- The regex.search() method operates a lot like str.index() or str.find()

In [59]:
text = "hello there! I am trying to learn regex regex"

In [60]:
text.index('regex')

34

In [61]:
regex = re.compile('regex')

In [62]:
match = regex.search(text)

In [64]:
match.start()

34

In [65]:
match.end()

39

- The regex.sub() method operates much like str.replace():

In [66]:
text.replace('regex','REGEX')

'hello there! I am trying to learn REGEX REGEX'

In [69]:
regex.sub('world',text)

'hello there! I am trying to learn world world'

## Example: A common task of matching email addresses

In [74]:
email = re.compile(r'\w+@\w+\.[a-z]{3}')

In [79]:
doc = "Please mail everything to tdahibh@ncsu.edu and tushar.h.dahibhate@gmail.com"

In [80]:
email.findall(doc)

['tdahibh@ncsu.edu', 'dahibhate@gmail.com']

In [81]:
email.match(doc)

In [82]:
email.search(doc)

<re.Match object; span=(26, 42), match='tdahibh@ncsu.edu'>

In [83]:
#fixed

In [100]:
# The local part may contain word characters and dots. A single character
# class avoids the earlier `\w+[\.\w]+` form, which needed at least two
# characters before the '@' and so rejected addresses such as x@y.org.
email = re.compile(r'[\w.]+@\w+\.[a-z]{3}')

In [101]:
email.findall(doc)

['tdahibh@ncsu.edu', 'tushar.h.dahibhate@gmail.com']

# Basics of RegEx syntax

### Simple strings are matched directly

In [102]:
# Read the whole text into memory; the context manager closes the file
# handle as soon as the read finishes instead of leaking it.
with open('LordOfTheRings.txt') as lor_file:
    lor = lor_file.read()

In [103]:
frodo = re.compile('Frodo')
frodos = frodo.findall(lor)

In [104]:
len(frodos)

1100

### Some characters have special meanings
. ^ $ * + ? { } [ ] \ | ( )
- if you need to match any of these you will have to escape them with a backslash
- use of raw strings is a good practice to avoid clashes with \ use

In [105]:
x = re.compile(r'\$')

In [106]:
x.findall('20$')

['$']

### Square brackets match custom character groups
- If built-in character groups are not sufficient, define your own
- use a dash to specify the range,
- e.g. "[a-m]" will match lower case letters up to m

In [107]:
regex = re.compile('[aeiou]')
regex.split('lajfnsajkvnaskjfgnurg')

['l', 'jfns', 'jkvn', 'skjfgn', 'rg']

### Wildcards match repeated characters
- If you want to match five characters in a row you can "\w\w\w\w\w" or "\w{5}"
- (Ch 14) The following is a table of the repetition markers available for use in regular expressions:
- "ab?" matches "a" or "ab"
- "ab*" matches "a", "ab", "abb", "abbb"...
- "ab+" matches "ab", "abb", "abbb"... but not "a"
- "ab{2}" matches "abb"
- "ab{2,3}" matches "abb" or "abbb"


In [108]:
email = r'''
    [\w.]+
    @\w+\.[a-z]{3}
'''

In [109]:
x = re.compile(email, re.VERBOSE)

In [110]:
x.match('tushar.h.dahibhate@gmail.com')

<re.Match object; span=(0, 28), match='tushar.h.dahibhate@gmail.com'>

### Naming extracted components


In [111]:
# Named groups (?P<name>...) expose each captured component by name through
# groupdict(). The pattern is a raw string because '\w' and '\.' are not
# valid Python string escapes and would otherwise raise a warning.
email4 = re.compile(r'(?P<user>[\w.]+)@(?P<domain>\w+)\.(?P<suffix>[a-z]{3})')
match = email4.match('tushar.h.dahibhate@gmail.com')
match.groupdict()

{'user': 'tushar.h.dahibhate', 'domain': 'gmail', 'suffix': 'com'}

# Additional practice

## re.finditer()

In [112]:
# Fetch the page and decode it as UTF-8; the context manager closes the
# HTTP response once the body has been read.
with urlopen('https://en.wikipedia.org/wiki/LexisNexis') as response:
    html = response.read().decode('utf-8')

In [115]:
pattern = re.compile(r'\b(the\s+\w+)\s+', re.IGNORECASE)

In [117]:
for match in pattern.finditer(html):
    print('{}: {}'.format(match.start(), match.group(0)))

8073: the free 
12650: the electronic 
12811: the company 
16400: The story 
17151: the OSBA 
17641: the contents 
18268: the business 
18299: the Data 
18716: the team 
18974: The resulting 
19009: the nonlegal 
19046: the legal 
20423: the OBAR 
20466: the historical 
20909: the board 
21116: the legacy 
22116: the original 
22239: the late 
22412: the implementation 
22434: the specifications 
22642: the same 
22693: the necessary 
22722: the legal 
23124: the data 
23358: the first 
23371: the early 
23422: the vision 
23746: the 1970s 
23837: the professional 
23879: the early 
24054: the librarian 
24787: the end 
24809: the LEXIS 
24885: the entire 
26298: the LEXIS 
26520: the LEXIS 
26735: the first 
26975: The NEXIS 
27671: the grounds 
27825: the spoken 
27915: the computerized 
28139: the company 
28495: the Second 
28536: the 2nd 
28706: the two 
28867: the Michie 
29145: the LexisNexis 
29397: the sale 
29431: the tax 
29656: the Illinois 
29924: The Court 
29964: the low

### re.fullmatch()

In [118]:
re.fullmatch?

In [124]:
# fullmatch() succeeds only when the pattern consumes the entire word.
pattern = re.compile("d[oh]")
words = ['dog', 'do', 'dh', 'og','d','doh','oh']
for word in words:
    match = pattern.fullmatch(word)
    if match is None:
        print("'{}' does not match pattern".format(word))
        continue
    print("'{}' matches '{}'".format(word,match.string))

'dog' does not match pattern
'do' matches 'do'
'dh' matches 'dh'
'og' does not match pattern
'd' does not match pattern
'doh' does not match pattern
'oh' does not match pattern


## Non capturing groups
- The group is matched but not captured. 