In [1]:
import re

In [2]:
# Positive lookbehind (?<=...) and positive lookahead (?=...): match the 'a'
# that is preceded by 'c' and followed by 'b'. Both assertions are zero-width,
# so only 'a' is consumed, yet the groups inside them still capture 'c' and 'b'.
# No flags are passed: re.M only changes how ^ and $ behave, and this pattern
# uses neither.
re.search(r'(?<=([c]))a(?=(b))', 'cab').groups()

('c', 'b')

In [3]:
# Negative lookbehind (?<!...) and negative lookahead (?!...): match an 'a'
# that is NOT preceded by 'c' and NOT followed by 'b'. The only 'a' in 'cab'
# fails both conditions, so re.search returns None.
# No flags are passed: re.M only affects ^ and $, which this pattern lacks.
print(re.search(r'(?<!([c]))a(?!(b))', 'cab'))

None


In [4]:
# [^s] consumes one character that is not 's', so the apostrophe ends up
# inside the match.
re.compile(r'\b\w+[^s]\b').search("John's").group()

"John'"

In [5]:
# Compare with the [^s] version: (?<!s) is a zero-width lookbehind, so it only
# checks that the last matched character is not 's' and consumes nothing.
# \w+ stops at 'n' because the apostrophe is not a word character, so the
# match is 'John' without the apostrophe.
re.compile(r'\b\w+(?<!s)\b').search("John's").group()

'John'

In [6]:
# The lookahead captures the digit run at the current position without
# consuming it; \w+ must then be followed by that same text (\1).
# For '123x12' no start position satisfies this, so None is printed.
print(re.compile(r'(?=(\d+))\w+\1').search('123x12'))

None


In [7]:
# Group 1 captures '56' at position 0; \w+ settles on '56x' and \1 matches
# the trailing '56'.
re.compile(r'(?=(\d+))\w+\1').search('56x56').group()

'56x56'

In [8]:
# At position 0 group 1 is '456', which never repeats; the search moves on to
# position 1, where group 1 is '56' and the match '56x56' succeeds.
re.compile(r'(?=(\d+))\w+\1').search('456x56').group()

'56x56'

In [9]:
# Neither '456' nor '56' repeats later in the string; the match starts at
# position 2, where group 1 is '6'.
re.compile(r'(?=(\d+))\w+\1').search('456x6').group()

'6x6'

#### <span style="color:#2F4F4F"> "\1" matches the same text that the group "(\d+)" captured</span>
#### <span style="color:#2F4F4F"> And for '123x12', "(\d+)" captures 123, 23, 3 as the search advances through the leading digits</span>

In [10]:
# Two capture groups around " = " pull out the variable name and its value.
# (\w already matches digits, so the \d+ alternative never adds anything.)
r = re.compile(r"var (\w+|\d+) = (\w+|\d+)")
print(re.match(r, "var v_name = 12").groups())

('v_name', '12')


In [11]:
# Named groups: "name" and "phone" are separated by a single space.
m = re.compile('(?P<name>.*) (?P<phone>.*)').search('John 123456')
m.groups()

('John', '123456')

In [12]:
# Fetch a captured group by its name instead of its index.
m.group('name')

'John'

In [13]:
# Named backreference: (?P=name) must match the same text that the group
# "name" captured earlier in the pattern.
# The pattern is a raw string so that \w and \s reach the regex engine as
# written instead of being parsed as (invalid) string escape sequences.
mm_1 = re.search(r'(?P<name>[\w]+)\s.*?\s(?P=name)', 'Larry is Larry')
mm_1.group()

'Larry is Larry'

In [14]:
# Same match with a numbered backreference: \1 repeats whatever group 1
# captured.
mm_2 = re.compile(r'(?P<name>[\w]+)\s.*?\s\1').search('Larry is Larry')
mm_2.group()

'Larry is Larry'

<span style='color:red;'><b>*</b> mm_2 requires a raw string <i>r'...'</i> for the use of <i>\1</i> (in a normal string, '\1' is the control character \x01, not a backreference)</span>

In [15]:
# Split on runs of one or more spaces.
str_1 = "a    b     c      d"
re.compile(" +").split(str_1)

['a', 'b', 'c', 'd']

In [98]:
str_2 = "Larry's number is 12345.\nSergey's number is 54321.\nSunder's number is 09876."
# Split on any single whitespace character (space or newline here) and show
# the first two pieces. \s already covers the space, and inside [...] a "|"
# would be a literal pipe rather than alternation, so no character class is
# needed. The raw string keeps \s from being read as a string escape.
re.split(r"\s", str_2)[0:2]

["Larry's", 'number']

### <span style='color: red ;'>Search from specific position.</span>

In [17]:
# Find the offset where "Sergey's number" begins, then search for a digit run
# starting from that offset rather than from the beginning of the string.
s = re.search("(Sergey's number)", str_2).start()
p = re.compile(r'[\d]+')
p.search(str_2, s).group()

'54321'

### <span style='color: red ;'>How to get the last matched group?</span>

In [18]:
s = "123456789 nn nn oo nn nn mlm nn203"
# The negative lookahead rejects every "nn" that still has another "nn"
# somewhere to its right, which leaves only the last occurrence.
match = re.compile("(nn)(?!.*nn.*)").search(s)
# Offsets are printed relative to the end of the string.
print(*[pos - len(s) for pos in match.span()])

-5 -3


In [19]:
## Alternative: search the reversed string, so the first hit is the last "nn"
s = "123456789 nn nn oo nn nn mlm nn203"
m = re.compile("(nn)").search(s[::-1])
# Negating the offsets found in the reversed string gives end-relative
# offsets into s.
tuple(-pos for pos in reversed(m.span()))

(-5, -3)

In [20]:
# Or use re.finditer to list every "nn" that is neither preceded by a digit
# nor followed by digits, together with its start and end offsets.
[(hit.group(),) + hit.span() for hit in re.finditer(r'(?<![\d])(nn)(?![\d]+)', s)]

[('nn', 10, 12), ('nn', 13, 15), ('nn', 19, 21), ('nn', 22, 24)]

In [21]:
import string

# Allowed characters: lowercase ASCII letters, digits and selected punctuation.
legal_chars = "".join((string.ascii_lowercase, string.digits, "!#$%&'*+-.^_`|~:"))
# re.escape backslash-escapes the characters so they can be embedded in a
# [...] character class.
print('[%s]+' % re.escape(legal_chars))

[abcdefghijklmnopqrstuvwxyz0123456789\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\:]+


In [22]:
# Conditional patterns: (?(1)>|) requires a closing ">" only if group 1
# matched an opening "<", and (?(2)-|) requires another "-" only if group 2
# matched one.
# Written as a raw string so that \w and \. are regex escapes rather than
# (invalid) string escape sequences.
p = r'(<)?(?:\w+@\w+(?:\.\w+)+)(-)?(?(1)>|)(?(2)-|)'
re.search(p, '<user@host.com->-').group()

'<user@host.com->-'

### <span style='color: red ;'>How to get the text from the innermost HTML element (up to five levels of nesting)?</span>

In [34]:
# Up to five optional opening tags (wrap0..wrap4), then the lazily matched
# text ("value"), then one conditional closing tag for each opening tag that
# actually matched.
pattern = (
    "(?P<wrap0><.*?>)?(?P<wrap1><.*?>)?(?P<wrap2><.*?>)?(?P<wrap3><.*?>)?(?P<wrap4><.*?>)?"
    "(?P<value>.*?)"
    "(?(wrap4)<.*?>|)?(?(wrap3)<.*?>|)?(?(wrap2)<.*?>|)?(?(wrap1)<.*?>|)?(?(wrap0)<.*?>|.*?)"
)

string1 = "<div><p><span><a href='#'>target_value</a></span></p></div>"

rs = re.match(pattern, string1)
rs.group('value')

'target_value'

In [67]:
def build_nested_tag_pattern(depth=5):
    """Build a regex that captures the text wrapped in up to ``depth`` tags.

    The pattern has three parts:
      * ``depth`` optional opening tags, captured as wrap0 .. wrap<depth-1>;
      * the lazily matched payload, captured as ``value``;
      * one conditional closing tag per wrapper, innermost first, which is
        only attempted when the matching opening tag was captured.

    Parameters
    ----------
    depth : int
        Maximum nesting level to support. A value of 0 (or less) yields just
        the ``value`` group, so the result is always a usable pattern.

    Returns
    -------
    str
        The regular-expression source string.
    """
    openers = ''.join('(?P<wrap%d><.*?>)?' % level for level in range(depth))
    # The outermost wrapper (level 0) has a ".*?" fallback branch; the inner
    # ones are optional conditionals with an empty "no" branch.
    closers = ''.join(
        ('(?(wrap%d)<.*?>|.*?)' if level == 0 else '(?(wrap%d)<.*?>|)?') % level
        for level in reversed(range(depth))
    )
    return openers + '(?P<value>.*?)' + closers


n = 5
ptn_2 = build_nested_tag_pattern(n)
ptn_2

'(?P<wrap0><.*?>)?(?P<wrap1><.*?>)?(?P<wrap2><.*?>)?(?P<wrap3><.*?>)?(?P<wrap4><.*?>)?(?P<value>.*?)(?(wrap4)<.*?>|)?(?(wrap3)<.*?>|)?(?(wrap2)<.*?>|)?(?(wrap1)<.*?>|)?(?(wrap0)<.*?>|.*?)'

In [68]:
# Apply the generated pattern to string1 and read back the "value" group.
rs_2 = re.match(ptn_2, string1)
rs_2.group('value')

'target_value'

### <span style='color: red ;'>Finding doubled words in texts.</span>

In [96]:
def reducing(m):
    """Return the text captured by the first group of match ``m``.

    Intended as the replacement callback for ``re.sub``: a doubled word or
    phrase is replaced by a single copy of it.
    """
    return m.group(1)


# \b on both sides restricts the repeat to whole words; without it,
# "this is" would be read as "th" + "is is" and collapsed to "this".
p = r'\b(?P<doubt>.+) \1\b'
line = "I'm from from the the United United States"
f = reducing  # alias with the same behaviour as reducing
re.sub(p, reducing, line)

"I'm from the United States"