In [1]:
import re

#### <span style='color: red ;'>Positive Lookahead, Lookbehind </span>

In [2]:
re.search(r'(?<=([c]))a(?=(b))', 'cab', flags=re.M).groups()

('c', 'b')

#### <span style='color: blue ;'>Negative Lookahead, Lookbehind </span>

In [3]:
print(re.search(r'(?<!([c]))a(?!(b))', 'cab', flags=re.M))

None


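<span style='color: red ;'>Notice the difference between the two cells below. </span><br/>
<span style='color: red ;'>In the first, <code>[^s]</code> consumes the apostrophe; in the second, the lookbehind <code>(?&lt;!s)</code> consumes nothing, so the apostrophe is excluded. </span><br/>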

In [4]:
re.search(r'\b\w+[^s]\b', 'John\'s').group()

"John'"

In [5]:
re.search(r'\b\w+(?<!s)\b', 'John\'s').group()

'John'

#### <span style="color:#2F4F4F"> "\1" matches the same text that group 1, "(\d+)", captured</span>
#### <span style="color:#2F4F4F"> Because the search is retried at each position, "(\d+)" captures 123, 23, 3 in turn in this case</span>

In [6]:
print(re.search(r'(?=(\d+))\w+\1', '123x12'))

None


In [7]:
re.search(r'(?=(\d+))\w+\1', '56x56').group()

'56x56'

In [8]:
re.search(r'(?=(\d+))\w+\1', '456x56').group()

'56x56'

In [9]:
re.search(r'(?=(\d+))\w+\1', '456x6').group()

'6x6'

In [10]:
# \w already matches digits, so the former `|\d+` alternatives could never be reached.
r = re.compile(r"var (\w+) = (\w+)")
print(r.match("var v_name = 12").groups())

('v_name', '12')


In [11]:
m = re.search('(?P<name>.*) (?P<phone>.*)', 'John 123456'); m.groups()

('John', '123456')

In [12]:
m.group('name')

'John'

In [13]:
# Raw string: \w and \s are regex escapes, not valid Python string escapes.
# (?P=name) is the named back-reference to whatever (?P<name>...) captured.
mm_1 = re.search(r'(?P<name>[\w]+)\s.*?\s(?P=name)', 'Larry is Larry')
mm_1.group()

'Larry is Larry'

In [14]:
mm_2 = re.search(r'(?P<name>[\w]+)\s.*?\s\1', 'Larry is Larry');mm_2.group()

'Larry is Larry'

<span style='color:red;'><b>*</b> mm_2 requires <i>r'...'</i> for the use of <i>\1</i></span>

In [15]:
str_1 = "a    b     c      d"
re.split(" +", str_1)

['a', 'b', 'c', 'd']

In [16]:
str_2 = "Larry's number is 12345.\nSergey's number is 54321.\nSunder's number is 09876."
# \s already covers both spaces and newlines. Inside [...] a '|' is a literal
# pipe character, not alternation, so the former "[ |\s]" also split on '|'.
re.split(r"\s", str_2)[0:2]

["Larry's", 'number']

### <span style='color: red ;'>Search from specific position.</span>

In [17]:
s = re.search('(Sergey\'s number)',str_2).start()
p=re.compile(r'[\d]+')
p.search(str_2,s,).group()

'54321'

In [18]:
## Or simply
p.search(str_2[s:]).group()

'54321'

### <span style='color: red ;'>How to get the last matched group?</span>

In [19]:
s = "123456789 nn nn oo nn nn mlm nn203"
# The negative lookahead rejects any 'nn' that still has another 'nn' after it,
# so only the final occurrence can match.
match = re.search("(nn)(?!.*nn.*)", s)
total = len(s)
# Report the span as negative offsets counted from the end of the string.
print(match.start() - total, match.end() - total)

-5 -3


In [20]:
## Or reversing at first
s = "123456789 nn nn oo nn nn mlm nn203"
m = re.search("(nn)", s[::-1])
-m.end(), -m.start()

(-5, -3)

In [21]:
# Or re.finditer
# NOTE: the lookarounds reject an 'nn' that touches a digit, so the final 'nn'
# (the one in 'nn203') is not reported by this cell, unlike the two cells above.
[(m.group(), m.start(), m.end()) for m in re.finditer(r'(?<![\d])(nn)(?![\d]+)',s)]

[('nn', 10, 12), ('nn', 13, 15), ('nn', 19, 21), ('nn', 22, 24)]

In [22]:
import string
legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
print ('[%s]+' % re.escape(legal_chars))

[abcdefghijklmnopqrstuvwxyz0123456789\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\:]+


### <span style='color: red ;'>Reusing subgroups later</span>

In [23]:
# Raw string so that \w and \. reach the regex engine untouched.
# (?(1)>|) requires a closing '>' only if group 1 ('<') matched;
# (?(2)-|) requires a trailing '-' only if group 2 ('-') matched.
p = r'(<)?(?:\w+@\w+(?:\.\w+)+)(-)?(?(1)>|)(?(2)-|)'
re.search(p, '<user@host.com->-').group()

'<user@host.com->-'

### <span style='color: red ;'>How to get the text of the innermost HTML element (up to five levels of nesting)?</span>

In [24]:
# Up to five optional opening tags are captured as wrap0..wrap4; after the lazy
# `value` group, each conditional consumes a closing tag only when the matching
# opener was found.
pattern = (
    "(?P<wrap0><.*?>)?(?P<wrap1><.*?>)?(?P<wrap2><.*?>)?(?P<wrap3><.*?>)?(?P<wrap4><.*?>)?"
    "(?P<value>.*?)(?(wrap4)<.*?>|)?(?(wrap3)<.*?>|)?(?(wrap2)<.*?>|)?(?(wrap1)<.*?>|)?(?(wrap0)<.*?>|.*?)"
)

string1 = "<div><p><span><a href='#'>target_value</a></span></p></div>"

rs = re.match(pattern, string1)
rs.group('value')

'target_value'

In [25]:
# Build the nested-tag pattern programmatically for `n` levels of nesting.
n = 5
# One optional, named opening-tag group per nesting level: wrap0 .. wrap(n-1).
openers = ''.join('(?P<wrap%d><.*?>)?' % i for i in range(n))
# Closing tags are consumed innermost-first. Only the outermost conditional
# (wrap0) has a non-empty "no" branch, which swallows the rest lazily.
closers = ''.join(
    '(?(wrap0)<.*?>|.*?)' if j == 0 else '(?(wrap%d)<.*?>|)?' % j
    for j in reversed(range(n))
)
ptn_2 = openers + '(?P<value>.*?)' + closers
ptn_2

'(?P<wrap0><.*?>)?(?P<wrap1><.*?>)?(?P<wrap2><.*?>)?(?P<wrap3><.*?>)?(?P<wrap4><.*?>)?(?P<value>.*?)(?(wrap4)<.*?>|)?(?(wrap3)<.*?>|)?(?(wrap2)<.*?>|)?(?(wrap1)<.*?>|)?(?(wrap0)<.*?>|.*?)'

In [26]:
rs_2 = re.match(ptn_2,string1);rs_2.group('value')

'target_value'

### <span style='color: red ;'>Finding doubled words in texts.</span>

In [27]:
def reducing(m):
    """Replacement callback for re.sub: keep a single copy of the repeated word."""
    return m.group('doubt')

# \b...\b anchors the repetition to whole words, so text such as 'this is' is
# left alone (the 'is is' inside it is not a doubled word). The trailing '+'
# also collapses runs of three or more identical words.
p = r'\b(?P<doubt>\w+)(?: (?P=doubt)\b)+'
line = 'I\'m from from the the United United States'
re.sub(p, reducing, line)

"I'm from the United States"

### <span style='color: red ;'>Flags</span>

In [28]:
# match even with NEWLINE with re.S flag
re.search(r'.+','Line1\nLine2\nLine3\n',re.S)

<_sre.SRE_Match object; span=(0, 18), match='Line1\nLine2\nLine3\n'>

#### <span style='color: red ;'>Targeting the first two lines</span>
<span>1. some Varying TEXT</span><br />
<span>2. CAPITALLETTERS</span>

In [29]:
t="""some Varying TEXT\n
CAPITALLETTERS
[more of the above, ending with a newline]
[yep, there is a variable number of lines here]
\n"""

In [30]:
re.search(r"^(.+)\n((?:\n.+)+?)",t,re.M).groups()
#r"^(.+)(?:\n|\r\n?)((?:(?:\n|\r\n?).+)+?)"

('some Varying TEXT', '\nCAPITALLETTERS')

#### <span>re.X/re.VERBOSE</span>

In [31]:
re.search(r"""^(.+)\n      # a line of one or more characters (greedy), then its newline
        ((?:\n.+)+?)  # newline + non-empty line, repeated lazily: +? stops after one repetition here
        # whitespace and comments inside the pattern are ignored with the verbose flag (re.X)
        """,t,re.M|re.X).groups()

('some Varying TEXT', '\nCAPITALLETTERS')

### <span style='color: red ;'>Splitting pitfalls</span>

In [32]:
re.split(r'\W+', 'Words, words, words.')

['Words', 'words', 'words', '']

In [33]:
re.split(r'(\W+)', 'Words, words, words.') 

['Words', ', ', 'words', ', ', 'words', '.', '']

In [34]:
re.split(r'(\W)+', 'Words, words, words.') 

['Words', ' ', 'words', ' ', 'words', '.', '']

In [35]:
re.split(r'(\W+)', '...words, words...')

['', '...', 'words', ', ', 'words', '...', '']

In [36]:
re.split(r'[a-f]+', '0a3B9', flags=re.IGNORECASE)

['0', '3', '9']

In [37]:
re.split(r'([a-f]+)', '0a3B9', flags=re.IGNORECASE)

['0', 'a', '3', 'B', '9']

#### <span style='color: red ;'>Cutting off subgroups</span>

In [38]:
re.split(r'(?:\W+)', 'Words, words, words.') 

['Words', 'words', 'words', '']

In [39]:
re.split(r'(?:\W)+', 'Words, words, words.') 

['Words', 'words', 'words', '']

In [40]:
re.split(r'(?:\W+)', '...words, words...')

['', 'words', 'words', '']

In [41]:
re.split(r'(?:[a-f]+)', '0a3B9', flags=re.IGNORECASE)

['0', '3', '9']

#### <span style='color: red ;'>Finding all words that come with either a prefix or a suffix, or both</span>

In [42]:
m_1 = re.finditer(r'(?P<first>[.]+)?[\w]+(?(first)|[.]+)', '...words, words...',re.I)
for i in m_1: print(i.group())

...words
words...


In [43]:
m_2 = re.finditer(r'(?:[.]+)?[\w]+(?:[.]+)?', '...words, words..., ...words...',re.I)
for i in m_2: print(i.group())

...words
words...
...words...


#### <span style='color: red ;'>Pitfall: Empty splitting trials</span>

In [44]:
re.split(r'[\b|\B]', 'Words, words, words.')

  """Entry point for launching an IPython kernel.


['Words, words, words.']

In [45]:
re.split(r'\W*', '...words...')

  return _compile(pattern, flags).split(string, maxsplit)


['', 'words', '']

In [46]:
re.split(r'(\W*)', '...words...')

  return _compile(pattern, flags).split(string, maxsplit)


['', '...', 'words', '...', '']

### <span style='color: red ;'>Characters replacement</span>

In [47]:
re.sub(r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):',
       r'static PyObject*\npy_\1(void)\n{',
       'def myfunc():{i=0;print(i)}')

'static PyObject*\npy_myfunc(void)\n{{i=0;print(i)}'

### <span style='color: red ;'>Escaping open brackets in a non-capturing group is tricky </span>

#### <span style='color: red ;'>Robust to open brackets, to a different number of digits in each segment, and to a few kinds of separators  </span>

In [48]:
re.sub(r"(?P<first>[(])?[\d]{2,4}(?(first)[)]|)?[-\\_|]{1,5}(?P<second>[(])?[\d]{2,4}(?(second)[)]|)?[-\\_|]{1,5}(?P<third>[(])?[\d]{2,5}(?(third)[)]|)?",
       '*detected*',
       'My number is (0000)-(00-00000), Office number is (1233)|(4567___(890)')

'My number is *detected*), Office number is *detected*'

In [49]:
m=re.compile(r"(?P<first>[(])?([\d]{2,4})(?(first)[)]|)?([-\\_|]{1,5})(?P<second>[(])?([\d]{2,4})(?(second)[)]|)?([-\\_|]{1,5})(?P<third>[(])?([\d]{2,5})(?(third)[)]|)?")

In [50]:
# NOTE: the loop variable reuses the name `m`, so once this loop has run `m` is
# the last match object and no longer the compiled pattern; re-running this
# cell without re-running the compile cell above will fail.
for m in m.finditer('Office number is (1233)|(4567___890)'): print(m.group())

(1233)|(4567___890


### <span style='color: red ;'>groupdict() shows which string each named group matched.</span>

In [51]:
m.groupdict()

{'first': '(', 'second': '(', 'third': None}

In [52]:
def dashrepl(matchobj):
    """Replacement callback: print the matched text, then map a lone '-' to a
    space and anything else (here '--') to a single '-'."""
    matched = matchobj.group(0)
    print(matched)
    return ' ' if matched == '-' else '-'
re.sub('-{1,2}', dashrepl, 'pro----gram-files')

--
--
-


'pro--gram files'

### <span style='color: red ;'>Special sign for sub(): \g</span>

In [53]:
re.sub(r'(?P<n>[\d]+)(?P<c>[\w]+)', r'\g<c>\g<n>','123ABC')

'ABC123'

#### <span style='color: red ;'>Manipulation with \g</span>

In [54]:
def f(first, second):
    """Concatenate the two replacement fragments in swapped order."""
    return second + first
# The replacement template is assembled before re.sub runs, so re.sub sees '\g<c>\g<n>'.
swapped_template = f(r'\g<n>', r'\g<c>')
re.sub(r'(?P<n>[\d]+)(?P<c>[\w]+)', swapped_template, '123ABC')

'ABC123'

In [55]:
def f_2(*args, **kwargs):
    """Return the keyword argument `second` followed by the first positional argument."""
    tail = kwargs['second']
    head = args[0]
    return tail + head
re.sub(r'(?P<n>[\d]+)(?P<c>[\w]+)', f_2(r'\g<n>', second=r'\g<c>'), '123ABC')

'ABC123'

In [56]:
re.sub(r'(?P<myName>potato)(?P<triple>XXX)(?P=myName)', r'YY\g<triple>YY', 'potatoXXXpotato')

'YYXXXYY'

In [57]:
re.sub(r'(?P<myName>potato)(XXX)(?P=myName)', r'YY\2YY', 'potatoXXXpotato')

'YYXXXYY'

#### <span style='color: red ;'>Distinguish \22 and \2 followed by 2 with \g</span>

In [58]:
re.sub(r'(?P<myName>potato)(XXX)(?P=myName)', r'YY\g<2>2YY', 'potatoXXXpotato')

'YYXXX2YY'

In [59]:
digits_re = r'\d+'
sample = '/usr/sbin/sendmail - 0 errors, 12 warnings'
print(re.sub(digits_re, digits_re.replace('\\', r'\\'), sample)) #str.replace(old, new[, max])



In [60]:
pattern = re.compile("o[gh]")
pattern.fullmatch("dog")      # No match as "o" is not at the start of "dog".
pattern.fullmatch("doggie", 1, 3).group()   # Matches within given limits.

'og'

In [61]:
p=re.compile(r'(?P<food1>[\w]+)(XXX)(?P<food2>[\w]+)')
p.groupindex

mappingproxy({'food1': 1, 'food2': 3})

In [62]:
result = p.search('potatoXXXbanana')
result.group('food2')

'banana'

#### <span style='color: red ;'>Groupindex needs group names</span>

In [63]:
p=re.compile(r'(?:[\w]+)(XXX)(?:[\w]+)')
p.groupindex

mappingproxy({})

In [64]:
p.pattern

'(?:[\\w]+)(XXX)(?:[\\w]+)'

In [65]:
import copy
# Older interpreters raise TypeError when deep-copying a compiled pattern.
# Catch and display the error so the demonstration does not abort a
# "Restart & Run All"; on interpreters that support it, show the copy's pattern.
try:
    q = copy.deepcopy(p)
    print(q.pattern)
except TypeError as exc:
    print('TypeError:', exc)

TypeError: cannot deepcopy this pattern object

In [66]:
m=re.match(r'([\w]+) \1', 'abc abc');m.expand

<function SRE_Match.expand>

In [67]:
m=re.match(r"(\w+)\s(\w+)","Isaac Newton, physicist")
m.group(0)

'Isaac Newton'

In [68]:
m.group(1)

'Isaac'

In [69]:
m=re.match(r"(\d+)\.?(\d+)?", "24")
m.groups()

('24', None)

In [70]:
(m.pos,m.endpos)==m.span()

True

In [71]:
m = re.match(r"(\d+)\.?(\d+)?", "24")
# Match.lastindex is the index of the last capturing group that took part in
# the match; the optional second group did not match "24", so it is 1 here.
if m.lastindex != 1:
    print('The number of matched item is more than one')
else:
    print('The number of matched item is {}'.format(m.lastindex))

The number of matched item is 1
