###  Regular Expressions (regex: re)

In [1]:
import re

In [2]:
# List the attribute names available on the re module (flags, functions and internals).
print(dir(re))

['A', 'ASCII', 'DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', 'Match', 'NOFLAG', 'Pattern', 'RegexFlag', 'S', 'Scanner', 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_cache', '_casefix', '_compile', '_compile_repl', '_compiler', '_constants', '_expand', '_parser', '_pickle', '_special_chars_map', '_subx', 'compile', 'copyreg', 'enum', 'error', 'escape', 'findall', 'finditer', 'fullmatch', 'functools', 'match', 'purge', 'search', 'split', 'sub', 'subn', 'template']


#### Try finding only words with a maximum of three characters

In [3]:
# Sample sentence used by the search, split and substitution examples.
quote = "If you do not turn against yourself, the Human Potential is limitless"

In [4]:
# search() scans the whole string and returns only the first match ('you' here).
re.search(r"([a-z]{3})",quote)

<re.Match object; span=(3, 6), match='you'>

In [5]:
# match() only tries at position 0; the quote starts with "If" (capital I),
# so [a-z]{3} fails there, the call returns None and the cell shows no output.
re.match("([a-z]{3})",quote)

In [6]:
re.findall("([a-z]{3})",quote)   # lowercase only: non-overlapping 3-letter chunks, so 'Human' yields 'uma'

['you',
 'not',
 'tur',
 'aga',
 'ins',
 'you',
 'rse',
 'the',
 'uma',
 'ote',
 'nti',
 'lim',
 'itl',
 'ess']

In [7]:
# IGNORECASE lets [a-z] match capitals as well, so 'Human' now yields 'Hum'.
re.findall(r"([a-z]{3})",quote,re.IGNORECASE)

['you',
 'not',
 'tur',
 'aga',
 'ins',
 'you',
 'rse',
 'the',
 'Hum',
 'Pot',
 'ent',
 'ial',
 'lim',
 'itl',
 'ess']

In [8]:
re.findall(r"\w{3}",quote)   # \w matches letters of either case (as well as digits and underscore)

['you',
 'not',
 'tur',
 'aga',
 'ins',
 'you',
 'rse',
 'the',
 'Hum',
 'Pot',
 'ent',
 'ial',
 'lim',
 'itl',
 'ess']

In [9]:
# \s* also matches zero whitespace, so this still returns 3-character chunks
# taken from inside longer words.
re.findall(r"\s*(\w{3})\s*",quote)

['you',
 'not',
 'tur',
 'aga',
 'ins',
 'you',
 'rse',
 'the',
 'Hum',
 'Pot',
 'ent',
 'ial',
 'lim',
 'itl',
 'ess']

In [10]:
# \s+ requires whitespace on both sides, so only whole three-character words
# surrounded by whitespace are returned.
re.findall(r"\s+(\w{3})\s+",quote)

['you', 'not', 'the']

In [11]:
# Same idea with [a-z] instead of \w; the result is identical for this quote.
re.findall(r"\s+([a-z]{3})\s+",quote)

['you', 'not', 'the']

In [12]:
# {,} leaves both bounds open (it behaves like *), so words of any length match.
# Each match consumes the whitespace around it, which is why the neighbouring
# words ('do', 'turn', 'Human', 'is') are skipped in the result.
re.findall(r"\s+(\w{,})\s+",quote)

['you', 'not', 'against', 'the', 'Potential']

In [13]:
# Keep the whitespace-delimited three-letter lowercase words for reporting.
matches = re.findall(r"\s+([a-z]{3})\s+",quote)

In [14]:
# Report the matched words together with how many were found.
print(f"Matched texts: {matches}, Total {len(matches)}")

Matched texts: ['you', 'not', 'the'], Total 3


In [15]:
# Display the quote again for reference.
quote

'If you do not turn against yourself, the Human Potential is limitless'

In [16]:
# Two alternatives, each with its own capture group: findall() returns one tuple
# per match, with '' in the slot of the group that did not participate.
match = re.findall(r"\s+([a-z]{2})\s+|\s+([a-z]{3})\s+",quote)

In [17]:
# Each entry is a (two-letter group, three-letter group) tuple.
print(f"Matched texts: {match}, Total {len(match)}")

Matched texts: [('', 'you'), ('', 'not'), ('', 'the'), ('is', '')], Total 4


In [18]:
# Words of at least two characters that begin with a letter (upper- or lowercase).
matchB = re.findall(r"([A-Z]\w+|[a-z]\w+)",quote)

In [19]:
# NOTE(review): the previous cell assigned matchB, but this cell displays the
# earlier `matches` list — confirm whether `matchB` was intended here.
matches 

['you', 'not', 'the']

#### Chunks using split

In [20]:
# Splitting on the empty pattern breaks the text into single characters;
# note the empty strings at both ends of the resulting list.
characters = re.split(r"",quote.strip())

In [21]:
# Show the per-character list on a single line.
print(characters)

['', 'I', 'f', ' ', 'y', 'o', 'u', ' ', 'd', 'o', ' ', 'n', 'o', 't', ' ', 't', 'u', 'r', 'n', ' ', 'a', 'g', 'a', 'i', 'n', 's', 't', ' ', 'y', 'o', 'u', 'r', 's', 'e', 'l', 'f', ',', ' ', 't', 'h', 'e', ' ', 'H', 'u', 'm', 'a', 'n', ' ', 'P', 'o', 't', 'e', 'n', 't', 'i', 'a', 'l', ' ', 'i', 's', ' ', 'l', 'i', 'm', 'i', 't', 'l', 'e', 's', 's', '']


In [22]:
# Raw string: "\s" inside a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
# Splitting on single whitespace characters keeps punctuation attached ('yourself,').
words = re.split(r"\s", quote)

In [23]:
# Display the whitespace-separated tokens.
words

['If',
 'you',
 'do',
 'not',
 'turn',
 'against',
 'yourself,',
 'the',
 'Human',
 'Potential',
 'is',
 'limitless']

In [24]:
# Number of whitespace-separated tokens in the quote.
print(f"Total words in quote: {len(words)}")

Total words in quote: 12


In [25]:
# Boolean mask: True where a token has exactly three characters.
[len(token) == 3 for token in words]

[False,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False]

In [26]:
# Length of every token; 'yourself,' counts as 9 because the comma is still attached.
list(map(len, words))

[2, 3, 2, 3, 4, 7, 9, 3, 5, 9, 2, 9]

In [27]:
# Raw string: "\W" inside a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
# \W+ splits on runs of non-word characters, so the comma after 'yourself'
# is removed together with the spaces.
wordsOne = re.split(r"\W+", quote)
wordsOne

['If',
 'you',
 'do',
 'not',
 'turn',
 'against',
 'yourself',
 'the',
 'Human',
 'Potential',
 'is',
 'limitless']

In [28]:
# Raw string: "\," inside a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions. The regex itself is unchanged
# (a comma is not a metacharacter, so the backslash is optional).
# Splits the quote into the parts before and after the comma.
wordsA = re.split(r"\,", quote)

In [29]:
# Display the two comma-separated parts; the second keeps its leading space.
wordsA

['If you do not turn against yourself', ' the Human Potential is limitless']

In [30]:
# wordsA holds the comma-separated parts of the quote, so this counts those
# parts (2) rather than individual words.
print(f"Total words in wordsA: {len(wordsA)}")

Total words in wordsA: 2


#### Replace using sub()

In [31]:
# Replace the capitalised word starting with 'H' only when one is present,
# then show the rewritten sentence.
human_word = r"(H[a-z]+)"
if re.search(human_word, quote):
    newQuote = re.sub(human_word, 'HumanBeing', quote)
    print(newQuote)
    

If you do not turn against yourself, the HumanBeing Potential is limitless


In [32]:
# Remove the comma by substituting it with an empty string.
newQuoteA = re.sub(r"\,",'',quote)
newQuoteA

'If you do not turn against yourself the Human Potential is limitless'

#### re.compile(): matching the start and end of programming-language names

In [33]:
# Programming-language names used to demonstrate start/end matching.
languages = [
    "Javascript", "Python", "Go", "Java", "Kotlin",
    "PHP", "C#", "Swift", "R", "Ruby",
    "C and C++", "Matlab", "TypeScript", "Scala", "SQL",
    "HTML", "CSS", "NoSQL", "Rust", "Perl",
]

In [34]:
# Pattern sources: vowel_start lists only capital vowels, while vowel_end
# accepts a vowel of either case as the last character.
vowel_start, vowel_end = r"^[AEIOU]", r".*[aeiouAEIOU]$"
# Compile both expressions once so they can be reused for every language name.
patternS, patternE = (re.compile(expr) for expr in (vowel_start, vowel_end))

In [35]:
# The pattern source is still a plain str at this point.
print(f" Expression {vowel_start}, Type:{type(vowel_start)}")

 Expression ^[AEIOU], Type:<class 'str'>


In [36]:
# After re.compile() the object is an re.Pattern instance.
print(f"{patternS}, {type(patternS)}")

re.compile('^[AEIOU]'), <class 're.Pattern'>


In [37]:
# A compiled pattern exposes the matching functions (match, search, findall, sub, ...) as methods.
print(dir(patternS))

['__class__', '__class_getitem__', '__copy__', '__deepcopy__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'findall', 'finditer', 'flags', 'fullmatch', 'groupindex', 'groups', 'match', 'pattern', 'scanner', 'search', 'split', 'sub', 'subn']


In [38]:
# Call match() on the compiled patterns directly instead of passing them back
# through the module-level re.match().
for language in languages:
    # patternS is anchored with ^, so match() tests the first character.
    if patternS.match(language):
        print(f"{language} starts with vowel character")
    # patternE is ".*[aeiouAEIOU]$": matching from position 0 succeeds only
    # when the name ends with a vowel.
    if patternE.match(language):
        print(f"{language} ends with vowel character")

Go ends with vowel character
Java ends with vowel character
Scala ends with vowel character


### Common Regex Flags: re.MULTILINE | re.IGNORECASE

In [39]:
# Flags used below: re.MULTILINE (alias re.M, inline (?m)) and re.IGNORECASE (alias re.I, inline (?i)).

In [40]:
# Sample HTTP request headers, one header per line.
# NOTE(review): the next cell reassigns `sentence` before this value is used.
sentence = """Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
Accept-Encoding: gzip, deflate
Accept-Language: en-US,en;q=0.9
Cache-Control: max-age=0
Connection: keep-alive
Cookie: ci_session=%22session_id%22404495f061c71aca87121e
Host: seismonepal.com
If-Modified-Since: Sun, 1 Apr 2023 11:22:33 GMT
If-None-Match: 64a-5f9724a6cdcf2-gzip
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"""

In [41]:
# Reassigns `sentence`, replacing the value from the previous cell.
# Headers are separated by explicit \n escapes; where an escape is followed by
# a real line break in the literal, the text contains a blank line.
sentence="""Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7\n
Accept-Encoding: gzip, deflate\nAccept-Language: en-US,en;q=0.9\nCache-Control: max-age=0\nConnection: keep-alive\nCookie: ci_session=%22session_id%22404495f061c71aca87121e\n
Host: anishchapagain.com\nIf-Modified-Since: Sun, 1 Apr 2023 11:22:33 GMT\nIf-None-Match: 64a-5f9724a6cdcf2-gzip\nUpgrade-Insecure-Requests: 1\n
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"""

In [42]:
# Case-sensitive: lowercase 'if' is found inside 'avif' and 'Modified', not at
# the 'If-' headers; .* stops at the end of the line.
re.findall("(if.*)",sentence) 

['if,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'ified-Since: Sun, 1 Apr 2023 11:22:33 GMT']

In [43]:
# Requires a newline directly before lowercase 'if'; no line starts that way,
# so the result is empty.
re.findall("\n(if.*)",sentence) 

[]

In [44]:
# Capital 'If' matches the two If-* header lines.
re.findall("(If.*)",sentence) 

['If-Modified-Since: Sun, 1 Apr 2023 11:22:33 GMT',
 'If-None-Match: 64a-5f9724a6cdcf2-gzip']

In [45]:
# Same case-sensitive search as before; the following cells add IGNORECASE.
re.findall("(if.*)",sentence)

['if,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'ified-Since: Sun, 1 Apr 2023 11:22:33 GMT']

In [46]:
re.findall("(if.*)",sentence,flags = re.IGNORECASE)  # re.IGNORECASE passed by keyword: the 'If-' headers now match as well

['if,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'If-Modified-Since: Sun, 1 Apr 2023 11:22:33 GMT',
 'If-None-Match: 64a-5f9724a6cdcf2-gzip']

In [47]:
# Same flag passed positionally (third argument of findall).
re.findall("(if.*)",sentence,re.IGNORECASE)

['if,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'If-Modified-Since: Sun, 1 Apr 2023 11:22:33 GMT',
 'If-None-Match: 64a-5f9724a6cdcf2-gzip']

In [48]:
# Same flag written inline as (?i) at the start of the pattern.
re.findall(r"(?i)(if.*)",sentence)

['if,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'If-Modified-Since: Sun, 1 Apr 2023 11:22:33 GMT',
 'If-None-Match: 64a-5f9724a6cdcf2-gzip']

In [49]:
re.findall("\n(if.*)",sentence,re.IGNORECASE)  # re.IGNORECASE: only 'if'/'If' directly after a newline, i.e. the If-* header lines

['If-Modified-Since: Sun, 1 Apr 2023 11:22:33 GMT',
 'If-None-Match: 64a-5f9724a6cdcf2-gzip']

In [50]:
re.findall("\n(if.*)",sentence,re.I)    # re.I is the short alias of re.IGNORECASE

['If-Modified-Since: Sun, 1 Apr 2023 11:22:33 GMT',
 'If-None-Match: 64a-5f9724a6cdcf2-gzip']

In [51]:
re.findall(r"(?i)\n(if.*)",sentence)  # inline flag (?i) is equivalent to re.IGNORECASE; it must be placed at the start of the pattern

['If-Modified-Since: Sun, 1 Apr 2023 11:22:33 GMT',
 'If-None-Match: 64a-5f9724a6cdcf2-gzip']

In [52]:
# Without MULTILINE, ^ matches only at the very start of the string, so only
# the first line is returned.
re.findall(r"^Accept.*",sentence)

['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7']

In [53]:
# Repeat of the previous cell (identical call and result).
re.findall(r"^Accept.*",sentence)

['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7']

In [54]:
# MULTILINE alone is still case-sensitive: lowercase 'accept' matches nothing.
re.findall(r"^accept.*",sentence,re.M)

[]

In [55]:
# With MULTILINE, ^ matches at the start of every line, so all three Accept* headers are found.
re.findall(r"^Accept.*",sentence,flags = re.MULTILINE)

['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'Accept-Encoding: gzip, deflate',
 'Accept-Language: en-US,en;q=0.9']

In [56]:
# Same flag passed positionally.
re.findall(r"^Accept.*",sentence,re.MULTILINE)

['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'Accept-Encoding: gzip, deflate',
 'Accept-Language: en-US,en;q=0.9']

In [57]:
# Same flag written inline as (?m).
re.findall(r"(?m)^Accept.*",sentence)

['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'Accept-Encoding: gzip, deflate',
 'Accept-Language: en-US,en;q=0.9']

In [58]:
# Flags combine with |: per-line ^ anchoring plus case-insensitive matching.
re.findall(r"^accept.*",sentence,flags = re.MULTILINE|re.I)

['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'Accept-Encoding: gzip, deflate',
 'Accept-Language: en-US,en;q=0.9']

In [59]:
# Both flags written inline together as (?im).
re.findall(r"(?im)^accept.*",sentence)

['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
 'Accept-Encoding: gzip, deflate',
 'Accept-Language: en-US,en;q=0.9']