# Part 3 – Advanced (Lookarounds, Backreferences, Performance) – 30 Tasks

In [1]:
import re

# Capture the numeric part of every weight given in kilograms. The lookahead
# checks for the "kg" unit (optionally after whitespace) without consuming it.
text = "5kg 10 kg 20g 30kg"
kg_amount = re.compile(r"\b\d+(?=\s*kg\b)")
result = [m.group() for m in kg_amount.finditer(text)]
print(result)


['5', '10', '30']


In [2]:
import re

# Find whole numbers that are NOT followed by the unit "kg".
text = "5kg 10 kg 20g 30 40kg 50"
# The `\d` alternative inside the negative lookahead stops the engine from
# backtracking into a number. Without it, "10 kg" yields "1" and "40kg"
# yields "4", because the shortened match is followed by a digit rather
# than by "kg" and therefore passes the lookahead.
result = re.findall(r"\b\d+(?!\d|\s*kg\b)", text)
print(result)


['20', '30', '50']


In [3]:
import re

# Collect each word that sits directly in front of a comma; the lookahead
# keeps the comma itself out of the match.
text = "apple, banana orange, mango, grape"
before_comma = re.compile(r"\b([A-Za-z]+)(?=,)")
result = [hit.group(1) for hit in before_comma.finditer(text)]
print(result)


['apple', 'orange', 'mango']


In [4]:
import re

# Pull out hashtag names without the leading "#". The lookbehind requires a
# preceding "#", and the first character must be a letter or underscore, so a
# purely numeric tag such as "#2025" is skipped.
text = "Loving #Python and #regex in #2025"
tag_name = re.compile(r"(?<=#)[A-Za-z_]\w*")
result = tag_name.findall(text)
print(result)


['Python', 'regex']


In [5]:
import re

# Match "Python" as a standalone word. The negative lookahead rejects a
# directly following "3", and the trailing \b rejects any other attached word
# character, so neither "Python3" nor "Python2" is reported.
text = "Python Python3 Python 3 Python2"
standalone_python = re.compile(r"\bPython(?!3)\b")
result = [m.group() for m in standalone_python.finditer(text)]
print(result)


['Python', 'Python']


In [6]:
import re

# Overlapping search: the whole pattern is a zero-width lookahead, so the
# scan advances one character at a time and the capture group carries the
# text. With exactly one group, findall returns that group's contents.
text = "ababa"
result = re.findall(r"(?=(aba))", text)
print(result)


['aba', 'aba']


In [7]:
import re

# Keep only the numbers with exactly three digits and no leading zero; the
# word boundaries stop a match inside a longer digit run such as "1000".
text = "9 99 100 123 999 1000"
three_digits = re.compile(r"\b[1-9]\d{2}\b")
result = [m.group() for m in three_digits.finditer(text)]
print(result)


['100', '123', '999']


In [8]:
import re

# Extract optionally signed integers and decimals.
text = "+3.14 -0.5 42 -.5 +10 -7 5."
# `\d*\.\d+` accepts decimals with or without a leading digit, so "-.5" is
# kept intact instead of being reduced to a bare "5" (losing both its sign
# and its decimal point). A trailing dot with no fraction ("5.") still
# yields just the integer part.
result = re.findall(r"[+-]?(?:\d*\.\d+|\d+)", text)
print(result)


['+3.14', '-0.5', '42', '-.5', '+10', '-7', '5']


In [9]:
import re

# Report, for each sample, whether it contains no letter "e". The inline
# (?i) flag makes the check case-insensitive, and the negative lookahead
# fails as soon as an "e" exists anywhere in the string.
texts = ["hello", "world", "sky", "PYTHON"]
no_e = re.compile(r"(?i)^(?!.*e).*$")
for word in texts:
    print(word, no_e.fullmatch(word) is not None)


hello False
world True
sky True
PYTHON True


In [10]:
import re
import ipaddress

samples = [
    "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
    "2001:db8::1",
    "abcd::1234:5678::1",
    "gggg::1",
]


def is_valid_ipv6(s):
    """Return True when *s* parses as an IPv6 address, otherwise False.

    Validation is delegated to ``ipaddress.IPv6Address``; a ``ValueError``
    raised by its constructor is treated as "not valid".
    """
    try:
        ipaddress.IPv6Address(s)
    except ValueError:
        return False
    else:
        return True


print(list(map(is_valid_ipv6, samples)))


[True, True, False, False]


In [11]:
import re

# List opening tags only: after "<" the next character must be a letter, and
# the negative lookahead additionally rules out the "/" of a closing tag.
text = "<div><p>Hello</p></div> </br> </span> <img src='x'>"
opening_tag = re.compile(r"<(?!/)[A-Za-z][^>]*>")
result = opening_tag.findall(text)
print(result)


['<div>', '<p>', "<img src='x'>"]


In [12]:
import re

# Detect immediately repeated words: \1 must repeat the captured word after
# some whitespace. Each hit is reported once, as the captured word.
text = "This is is a test test and and words"
doubled = re.compile(r"\b(\w+)\s+\1\b")
result = [hit.group(1) for hit in doubled.finditer(text)]
print(result)


['is', 'test', 'and']


In [None]:
import re

# Find three-letter palindromes: the third letter must repeat the first one
# (backreference \1); the middle letter is unconstrained.
text = "aba abc cdc dad mom pop axe"
# Take the whole match from finditer. findall would return only the capture
# group, which drops the closing letter ("ab" instead of "aba").
result = [m.group(0) for m in re.finditer(r"\b([A-Za-z])[A-Za-z]\1\b", text)]

print(result)


['aba', 'cdc', 'dad', 'mom', 'pop']


In [None]:
import re

# Extract file base names (the part before the extension).
text = "report.pdf image.jpeg notes.txt archive.tar.gz data"

# Single-extension form: a name followed by exactly one ".ext".
simple = re.findall(r"\b([\w-]+)\.[A-Za-z0-9]+\b", text)
print(simple)

# Multi-extension form: the name is restricted to word characters and "-",
# so it cannot start at the whitespace before a file name (the previous
# `.+?` returned ' image', ' notes', ...). The repeated extension group
# consumes ".tar.gz" as a whole.
multi = re.findall(r"\b([\w-]+)(?:\.[A-Za-z0-9]+)+\b", text)
print(multi)


['report', 'image', 'notes', 'archive']
['report', 'image', 'notes', 'archive']


In [15]:
import re

# Validate 16-digit card numbers written either as one digit run or as four
# groups of four separated by "-" or a space.
cards = ["1234567812345678", "1234-5678-1234-5678", "1234 5678 1234 5678", "1234-5678-1234-567"]
pattern = r"^(?:\d{16}|(?:\d{4}[- ]){3}\d{4})$"
card_re = re.compile(pattern)
print([card_re.fullmatch(card) is not None for card in cards])


[True, True, True, False]


In [None]:
import re

# Find words whose letters strictly alternate between vowels and consonants,
# i.e. words with no two adjacent vowels and no two adjacent consonants.
text = "alpha beta cake code AbaBa Ababa hello river"

# Both negative lookaheads scan with `[A-Za-z]*` rather than `.*`, so they
# stop at the end of the current word. With `.*` a double consonant in any
# later word (the "ll" in "hello") rejected every word before it. The
# consonant class is spelled out so that digits and "_" do not count.
result = re.findall(
    r"\b(?![A-Za-z]*[aeiouAEIOU]{2})"
    r"(?![A-Za-z]*[b-df-hj-np-tv-zB-DF-HJ-NP-TV-Z]{2})"
    r"[A-Za-z]+\b",
    text,
)
print(result)


['beta', 'cake', 'code', 'AbaBa', 'Ababa', 'river']


In [17]:
import re

# Check each candidate against an http/https URL shape: a scheme, "://", and
# then a non-empty host part with no whitespace anywhere.
urls = ["https://example.com", "http://site.org/path?q=1", "ftp://bad.com", "https://"]
pattern = r"^https?://[^\s/$.?#].[^\s]*$"
url_re = re.compile(pattern)
checks = [url_re.fullmatch(candidate) is not None for candidate in urls]
print(checks)


[True, True, False, False]


In [18]:
import re

# Keep the words made of at least eleven letters.
text = "short extraordinary responsibilities internationalization tiny"
long_word = re.compile(r"\b[A-Za-z]{11,}\b")
result = [m.group() for m in long_word.finditer(text)]
print(result)


['extraordinary', 'responsibilities', 'internationalization']


In [19]:
import re

# Find words that contain at least three vowels.
text = "queue strength education rhythm cooperation fly"
# The lookahead counts vowels using `[A-Za-z]*` instead of `.*`, so it
# cannot look past the end of the current word. With `.*`, vowels from later
# words were counted too, and "strength" and "rhythm" matched by mistake.
result = re.findall(r"\b(?=(?:[A-Za-z]*[aeiouAEIOU]){3})[A-Za-z]+\b", text)
print(result)


['queue', 'education', 'cooperation']


In [20]:
import re

# Pick the numbers whose last digit is 0 or 5.
text = "1 5 10 23 30 41 55 100 101 205"
ends_in_0_or_5 = re.compile(r"\b\d*[05]\b")
result = ends_in_0_or_5.findall(text)
print(result)


['5', '10', '30', '55', '100', '205']


In [21]:
import re

# Capture whatever lies between each "start" and the nearest following
# "end". Lookbehind and lookahead keep both markers out of the match, and
# the lazy quantifier stops at the first "end". DOTALL lets "." cross
# newlines.
text = "start Hello world end ... start another block end"
between = re.finditer(r"(?<=start)(.*?)(?=end)", text, flags=re.DOTALL)
result = [hit.group(1) for hit in between]
print(list(map(str.strip, result)))


['Hello world', 'another block']


In [None]:
import re

# Find words that contain a doubled letter (the same character twice in a row).
text = "coffee balloon apple committee hello run"
# Compile once and read the whole match from finditer. findall would return
# only the captured letter, so the earlier unused findall call was dropped.
double_letter = re.compile(r"\b\w*(\w)\1\w*\b")

words = [m.group(0) for m in double_letter.finditer(text)]
print(words)


['coffee', 'balloon', 'apple', 'committee', 'hello']


In [23]:
import re

# Match numbers that use well-formed thousands separators: one to three
# leading digits followed by one or more ",ddd" groups.
text = "100 1,000 20,345 123,45 12,345,678 9,99"
grouped_number = re.compile(r"\b\d{1,3}(?:,\d{3})+\b")
result = [m.group() for m in grouped_number.finditer(text)]
print(result)


['1,000', '20,345', '12,345,678']


In [None]:
import re

# Extract well-formed (upper-case) Roman numerals.
text = "I IV V IX XL XC CD CM M MMXXV invalid VX"
# Every part of the numeral grammar is optional, so on its own it also
# matches the empty string at each word boundary. The lookahead requires a
# numeral letter right after the opening boundary and the lookbehind requires
# one right before the closing boundary. Together they force at least one
# character to be consumed. The groups are non-capturing because only the
# whole match is used.
pattern = (
    r"\b(?=[MDCLXVI])"
    r"M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})"
    r"(?<=[MDCLXVI])\b"
)

matches = [m.group(0) for m in re.finditer(pattern, text)]
print(matches)


['I', 'IV', 'V', 'IX', 'XL', 'XC', 'CD', 'CM', 'M', 'MMXXV']


In [None]:
import re

# Collect hashtags (including the "#") whose name starts with a letter or
# underscore; a tag that begins with a digit, like "#2025", is skipped.
text = "#Hello #2025 #regex #_cool #coding123"

hashtag = re.compile(r"#(?!\d)[A-Za-z_]\w*")
result = [m.group() for m in hashtag.finditer(text)]
print(result)


['#Hello', '#regex', '#_cool', '#coding123']


In [26]:
import re

# Find tokens built entirely from one unit repeated two or more times:
# the group captures the unit and \1+ demands at least one more copy.
text = "abcabc xyz abcab abcabcabc 1212 abababx"
repeated_unit = re.compile(r"\b(\w+)\1+\b")
result = [hit[0] for hit in repeated_unit.finditer(text)]
print(result)


['abcabc', 'abcabcabc', '1212']


In [27]:
import re

# Keep the tokens made up solely of the digits 0 and 1.
text = "101 102 11001 abc 000111 210"
binary_token = re.compile(r"\b[01]+\b")
result = [m.group() for m in binary_token.finditer(text)]
print(result)
 

['101', '11001', '000111']


In [28]:
import re

# Validate MAC addresses written as six two-digit hex pairs separated by
# ":" or "-".
samples = ["00:1A:2B:3C:4D:5E", "AA-BB-CC-DD-EE-FF", "001A.2B3C.4D5E", "GG:11:22:33:44:55"]
pattern = r"^(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}$"
mac_re = re.compile(pattern)
verdicts = [mac_re.fullmatch(candidate) is not None for candidate in samples]
print(verdicts)


[True, True, False, False]


In [29]:
import re

# Split a "First Last" string into its two alphabetic parts; the result is
# None when the text does not have exactly that shape.
text = "John Doe"
m = re.compile(r"([A-Za-z]+)\s+([A-Za-z]+)").fullmatch(text)
if m is None:
    result = None
else:
    # The pattern has exactly two groups, so groups() is (first, last).
    result = m.groups()
print(result)


('John', 'Doe')


In [30]:
import re

# Extract HTML comments, including ones that span several lines: DOTALL lets
# "." match newlines and the lazy quantifier ends each comment at the first
# closing "-->".
text = "Text <!-- first comment --> more <!-- second\nmulti-line comment --> end"
html_comment = re.compile(r"<!--.*?-->", re.DOTALL)
result = html_comment.findall(text)
print(result)


['<!-- first comment -->', '<!-- second\nmulti-line comment -->']
