A **Regular Expression (RegEx)** is a special sequence of characters that defines a search pattern, which is used to find a string or a set of strings.


Python provides the built-in `re` module for working with regular expressions.


**Methods	Description**

1.   **findall**	:    Returns a list containing all matches
2.   **search**	:    Returns a Match object if there is a match anywhere in the string
3.   **split** :  	 Returns a list where the string has been split at each match
4.   **sub** :	   Replaces one or many matches with a string

**search()**

In [None]:
import re

# re.search() scans the whole string and reports only the first occurrence.
ip = 'Hi Hello World Hello Hello'
hello_pattern = re.compile(r'Hello')
abc = hello_pattern.search(ip)

print(abc)
print(f'Start Index: {abc.start()}')
print(f'End Index: {abc.end()}')

<re.Match object; span=(3, 8), match='Hello'>
Start Index: 3
End Index: 8


In [None]:
import re

# Locate the first white-space character in the input.
ip = 'Hi Hello World'
# Raw string: in a normal literal "\s" is an invalid escape sequence, which
# raises a SyntaxWarning on Python 3.12+.
a = re.search(r"\s", ip)

print("The first white-space character is located in position:", a.start())


The first white-space character is located in position: 2


**findall()**

In [None]:
import re

# findall() gathers every non-overlapping match into a list.
ip = "Mani is awsome"
name_pattern = re.compile("Mani")
a = name_pattern.findall(ip)
print(a)

['Mani']


In [None]:
import re

# The name occurs twice in the input, so two matches are collected.
ip = "Mani is awsome Mani"
x = [found.group() for found in re.finditer("Mani", ip)]
print(x)

['Mani', 'Mani']


In [None]:
# When nothing matches, findall() yields an empty list rather than None.
import re

ip = "Python is awsome"
missing_name = re.compile("Mani")
a = missing_name.findall(ip)
print(a)


[]


**split()**

In [None]:
import re

# Split on white space, but perform at most one split.
txt = "python is awsome"
# Raw string avoids the invalid "\s" escape warning; maxsplit is passed by
# keyword because passing it positionally is deprecated since Python 3.13.
a = re.split(r"\s", txt, maxsplit=1)
print(a)

['python', 'is awsome']


In [None]:
import re

# Split on white space, allowing up to two splits.
txt = "python is awsome"
# Raw string avoids the invalid "\s" escape warning; maxsplit is passed by
# keyword because passing it positionally is deprecated since Python 3.13.
a = re.split(r"\s", txt, maxsplit=2)
print(a)

# The same result using the plain string method, for comparison.
str_txt = "python is awsome"
b = str_txt.split(' ')
print(b)

['python', 'is', 'awsome']
['python', 'is', 'awsome']


In [None]:
import re

# maxsplit may exceed the number of separators; the string is then split at
# every match.
txt = "python is awsome"
# Raw string avoids the invalid "\s" escape warning; maxsplit is passed by
# keyword because passing it positionally is deprecated since Python 3.13.
a = re.split(r"\s", txt, maxsplit=4)
print(a)

['python', 'is', 'awsome']


**sub()**

In [None]:
import re

# Replace every white-space character with a comma.
ip = "Python is awsome"
# Raw string: in a normal literal "\s" is an invalid escape sequence, which
# raises a SyntaxWarning on Python 3.12+.
a = re.sub(r"\s", ",", ip)
print(a)

# The same result using the plain string method, for comparison.
b = ip.replace(' ', ',')
print(b)

Python,is,awsome
Python,is,awsome


**.span()** returns a tuple containing the start and end positions of the match.

**.string** returns the string passed into the function.

**.group()** returns the part of the string where there was a match.

In [None]:
import re

# Find the first word that begins with an upper case "D" and report its position.
ip = "Hi, how are you Doing? I am Doing good."
d_word = re.compile(r"\bD\w+")
a = d_word.search(ip)

print(a)
print(a.span())


<re.Match object; span=(16, 21), match='Doing'>
(16, 21)


In [None]:
import re

# Search for a word that begins with a lower case "z". The input contains no
# such word, so re.search() returns None instead of a Match object.

ip = "Hi, how are you Doing? I am Doing good."
a = re.search(r"\bz\w+", ip)
# a is None here, so calling .span() raises AttributeError; check
# `a is not None` before using a Match method.
print(a.span())

AttributeError: 'NoneType' object has no attribute 'span'

In [None]:
import re

# Match.string holds the full text that was handed to the search call.
ip = "Hi, how are you Doing? I am Doing good."
d_word = re.compile(r"\bD\w+")
a = d_word.search(ip)

print(a)
print(a.string)

<re.Match object; span=(16, 21), match='Doing'>
Hi, how are you Doing? I am Doing good.


In [None]:
import re

# Match.group() returns the part of the text that matched the pattern.
ip = "Hi, how are you Doing? I am Doing good."
d_word = re.compile(r"\bD\w+")
a = d_word.search(ip)
print(a.group())

Doing


In [None]:
import re

# Search for a word that begins with a lower case "z". The input contains no
# such word, so re.search() returns None instead of a Match object.

ip = "Hi, how are you Doing? I am Doing good."
a = re.search(r"\bz\w+", ip)
# a is None here, so calling .group() raises AttributeError; check
# `a is not None` before using a Match method.
print(a.group())

AttributeError: 'NoneType' object has no attribute 'group'

**MetaCharacters	Description**

*   []	Represent a character class
*   \	Used to drop the special meaning of character following it
*   . Matches any character except newline
*   ^	Matches the beginning
*   $	Matches the end
*   |	Means OR (matches any one of the patterns separated by it)
*   ?	Matches zero or one occurrence
*   (*)	Any number of occurrences (including 0 occurrences)
*   (+)	One or more occurrences
*   {}	Indicate the number of occurrences of a preceding regex to match.
*   ()	Enclose a group of Regex

In [None]:
import re

txt = "Python is a high-level, cross-platform, and open-sourced programming language released under a GPL-compatible license."

# A character class with a range: every lower case letter from "a" through "m".
lower_a_to_m = re.compile("[a-m]")
x = lower_a_to_m.findall(txt)
print(x)


['h', 'i', 'a', 'h', 'i', 'g', 'h', 'l', 'e', 'e', 'l', 'c', 'l', 'a', 'f', 'm', 'a', 'd', 'e', 'c', 'e', 'd', 'g', 'a', 'm', 'm', 'i', 'g', 'l', 'a', 'g', 'a', 'g', 'e', 'e', 'l', 'e', 'a', 'e', 'd', 'd', 'e', 'a', 'c', 'm', 'a', 'i', 'b', 'l', 'e', 'l', 'i', 'c', 'e', 'e']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Raw strings hand the backslashes to the regex engine untouched; in a normal
# literal "\d" and "\." are invalid escape sequences (SyntaxWarning on 3.12+).
x = re.findall(r"\d", txt)  # every digit character
y = re.findall(r"\.", txt)  # every literal dot (escaped, so not "any character")
print(x)
print(y)


['1', '9', '9', '1']
['.']


In [None]:
import re

sentence = 'Hi Hello. World!'

# An unescaped dot matches any character, so the first hit is 'H';
# the escaped dot matches only a literal '.'.
for dot_pattern in (r'.', r'\.'):
    match = re.search(dot_pattern, sentence)
    print(match)

<re.Match object; span=(0, 1), match='H'>
<re.Match object; span=(8, 9), match='.'>


In [None]:
import re

txt = "hello world"

# "^" anchors the pattern to the start of the string.
x = re.findall("^hello", txt)
print(x)
print("Yes, the string starts with 'hello'" if x else "No match")

['hello']
Yes, the string starts with 'hello'


In [None]:
import re

txt = "hello world"

# "$" anchors the pattern to the end of the string.
x = re.findall("world$", txt)
print(x)
print("Yes, the string ends with 'world'" if x else "No match")

['world']
Yes, the string ends with 'world'


In [None]:
import re

txt = "hello hi world"

# "he", then zero or more arbitrary characters (".*"), then "o".
# The quantifier is greedy, so the match runs to the last "o" in the text.
greedy_star = re.compile("he.*o")
x = greedy_star.findall(txt)

print(x)

['hello hi wo']


In [None]:
import re

txt = "hello hi world"

# "he", then one or more arbitrary characters (".+"), then "o".
# The quantifier is greedy, so the match runs to the last "o" in the text.
greedy_plus = re.compile("he.+o")
x = greedy_plus.findall(txt)

print(x)

['hello hi wo']


In [None]:
import re

txt = "hello world"

# "hel", then zero or one arbitrary character (".?"), then "o".
# "hello" qualifies: exactly one character ("l") sits between "hel" and "o".
optional_char = re.compile("hel.?o")
x = optional_char.findall(txt)

print(x)


['hello']


In [None]:
import re

txt = "hello world"

# "he", then exactly two arbitrary characters (".{2}"), then "o".
two_between = re.compile("he.{2}o")
x = two_between.findall(txt)

print(x)


['hello']


In [None]:
import re

txt = "Python supports object-oriented programming concepts such as class, inheritance, objects, module, namespace etc."

# "|" is alternation: each hit is whichever alternative was found at that position.
x = re.findall("class|object", txt)

print(x)
print("Yes, there is at least one match!" if x else "No match")

['object', 'class', 'object']
Yes, there is at least one match!


**Python RegEx - Special sequences**

\A	Returns a match if the specified characters are at the beginning of the string	"\AThe"

\b	Returns a match where the specified characters are at the beginning or at the end of a word	r"\bThe", r"ic\b"

\d	Returns a match for every digit character (numbers from 0-9)	"\d"

\D	Returns a match for every character that is NOT a digit	"\D"

\s	Returns a match for every white-space character	"\s"

\S	Returns a match for every character that is NOT a white-space character	"\S"

\w	Returns a match for every word character (letters a-z and A-Z, digits 0-9, and the underscore _ character)	"\w"

\W	Returns a match for every character that is NOT a word character	"\W"

\Z	Returns a match if the specified characters are at the end of the string	"1991\.\Z"



In [None]:
import re

txt = "Extensive basic data types are supported, Variables can be strongly typed as well as dynamic typed"

# Check if the string starts with "Extensive".
# Raw string: in a normal literal "\A" is an invalid escape sequence, which
# raises a SyntaxWarning on Python 3.12+.
x = re.findall(r"\AExtensive", txt)

print(x)

if x:
    print("Yes, there is a match!")
else:
    print("No match")

['Extensive']
Yes, there is a match!


In [None]:
import re

txt = "Extensive basic The data types are supported, Variables can be strongly typed as well as dynamic typed"

# r"\b" is a word boundary: find "The" where it begins a word.
starts_word = re.compile(r"\bThe")
x = starts_word.findall(txt)

print(x)
print("Yes, there is a match!" if x else "No match")



['The']
Yes, there is a match!


In [None]:
import re

txt = "Extensive basic data types are supported, Variables can be strongly typed as well as dynamic typed"

# With r"\b" after the letters, "ic" only matches where it ends a word.
ends_word = re.compile(r"ic\b")
x = ends_word.findall(txt)

print(x)
print("Yes, there is a match!" if x else "No match")



['ic', 'ic']
Yes, there is a match!


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Find all digit characters.
# Raw string: in a normal literal "\d" is an invalid escape sequence, which
# raises a SyntaxWarning on Python 3.12+.
x = re.findall(r"\d", txt)
print(x)

['1', '9', '9', '1']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Return a match at every non-digit character.
# Raw string: in a normal literal "\D" is an invalid escape sequence, which
# raises a SyntaxWarning on Python 3.12+.
x = re.findall(r"\D", txt)
print(x)

['P', 'y', 't', 'h', 'o', 'n', ' ', 'w', 'a', 's', ' ', 'c', 'r', 'e', 'a', 't', 'e', 'd', ' ', 'b', 'y', ' ', 'G', 'u', 'i', 'd', 'o', ' ', 'v', 'a', 'n', ' ', 'R', 'o', 's', 's', 'u', 'm', ',', ' ', 'a', 'n', 'd', ' ', 'r', 'e', 'l', 'e', 'a', 's', 'e', 'd', ' ', 'i', 'n', ' ', '.']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Return a match at every white-space character.
# Raw string: in a normal literal "\s" is an invalid escape sequence, which
# raises a SyntaxWarning on Python 3.12+.
x = re.findall(r"\s", txt)
print(x)

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Return a match at every NON white-space character.
# Raw string: in a normal literal "\S" is an invalid escape sequence, which
# raises a SyntaxWarning on Python 3.12+.
x = re.findall(r"\S", txt)
print(x)

['P', 'y', 't', 'h', 'o', 'n', 'w', 'a', 's', 'c', 'r', 'e', 'a', 't', 'e', 'd', 'b', 'y', 'G', 'u', 'i', 'd', 'o', 'v', 'a', 'n', 'R', 'o', 's', 's', 'u', 'm', ',', 'a', 'n', 'd', 'r', 'e', 'l', 'e', 'a', 's', 'e', 'd', 'i', 'n', '1', '9', '9', '1', '.']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Return a match at every word character (letters, digits 0-9, and the
# underscore _ character).
# Raw string: in a normal literal "\w" is an invalid escape sequence, which
# raises a SyntaxWarning on Python 3.12+.
x = re.findall(r"\w", txt)
print(x)

['P', 'y', 't', 'h', 'o', 'n', 'w', 'a', 's', 'c', 'r', 'e', 'a', 't', 'e', 'd', 'b', 'y', 'G', 'u', 'i', 'd', 'o', 'v', 'a', 'n', 'R', 'o', 's', 's', 'u', 'm', 'a', 'n', 'd', 'r', 'e', 'l', 'e', 'a', 's', 'e', 'd', 'i', 'n', '1', '9', '9', '1']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Return a match at every NON word character (anything that is not a letter,
# digit, or underscore, such as "!", "?", or white space).
# Raw string: in a normal literal "\W" is an invalid escape sequence, which
# raises a SyntaxWarning on Python 3.12+.
x = re.findall(r"\W", txt)
print(x)

[' ', ' ', ' ', ' ', ' ', ' ', ',', ' ', ' ', ' ', ' ', '.']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Check if the string ends with "1991.".
# The dot is escaped so it matches only a literal "."; unescaped it would
# match any character (for example "19915"). The raw string also avoids the
# invalid "\Z" escape sequence warning on Python 3.12+.
pattern = r"1991\.\Z"
x = re.findall(pattern, txt)
print(x)

['1991.']


**Python RegEx - Sets**

In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# A set matches any single character listed inside the brackets: "a", "b" or "c".
abc_set = re.compile("[abc]")
x = abc_set.findall(txt)
print(x)

['a', 'c', 'a', 'b', 'a', 'a', 'a']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# A range inside a set: any single lower case character from "a" through "c".
x = [found.group() for found in re.finditer("[a-c]", txt)]
print(x)

['a', 'c', 'a', 'b', 'a', 'a', 'a']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# A leading "^" negates the set: match every character EXCEPT "a", "b" and "c".
not_abc = re.compile("[^abc]")
x = not_abc.findall(txt)
print(x)

['P', 'y', 't', 'h', 'o', 'n', ' ', 'w', 's', ' ', 'r', 'e', 't', 'e', 'd', ' ', 'y', ' ', 'G', 'u', 'i', 'd', 'o', ' ', 'v', 'n', ' ', 'R', 'o', 's', 's', 'u', 'm', ',', ' ', 'n', 'd', ' ', 'r', 'e', 'l', 'e', 's', 'e', 'd', ' ', 'i', 'n', ' ', '1', '9', '9', '1', '.']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Match any single digit 0, 1, 2, 3, 4 or 5.
# The range form "[0-5]" describes the same set of characters.
low_digits = re.compile("[012345]")
x = low_digits.findall(txt)
print(x)

['1', '1']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991."

# Match any single digit from 0 through 9.
x = [found.group() for found in re.finditer("[0-9]", txt)]
print(x)

['1', '9', '9', '1']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991. Jan 16 12:00 PM"

# Two digit sets in a row: every non-overlapping pair of digits, 00 to 99.
digit_pair = re.compile("[0-9][0-9]")
x = digit_pair.findall(txt)
print(x)

['19', '91', '16', '12', '00']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991. Jan 16 12:00 PM"

# Four digit sets in a row: every run of four consecutive digits, 0000 to 9999.
four_digits = re.compile("[0-9][0-9][0-9][0-9]")
x = four_digits.findall(txt)
print(x)

['1991']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991. Jan 16 12:00 PM"

# Two ranges in one set: any single letter, lower case a-z or upper case A-Z.
any_letter = re.compile("[a-zA-Z]")
x = any_letter.findall(txt)
print(x)

['P', 'y', 't', 'h', 'o', 'n', 'w', 'a', 's', 'c', 'r', 'e', 'a', 't', 'e', 'd', 'b', 'y', 'G', 'u', 'i', 'd', 'o', 'v', 'a', 'n', 'R', 'o', 's', 's', 'u', 'm', 'a', 'n', 'd', 'r', 'e', 'l', 'e', 'a', 's', 'e', 'd', 'i', 'n', 'J', 'a', 'n', 'P', 'M']


In [None]:
import re

txt = "Python was created by Guido van Rossum, and released in 1991 & Jan 16 12:00 PM"

# A set may hold any literal character: find every "&" in the text.
ampersand = re.compile("[&]")
x = ampersand.findall(txt)
print(x)

['&']


**Email address validation using Python RegEx**

In [None]:
ip = 'python support email id is ravichandranr@outlook.com'

# Build the e-mail pattern up in three stages to show what each piece adds.
one_char = re.compile(r'[\w.+-]')                    # a single local-part character
up_to_domain = re.compile(r'[\w.+-]+@[\w-]')         # local part, "@", one domain character
full_address = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')  # the complete address

match1 = one_char.search(ip)
print(match1)
match2 = up_to_domain.search(ip)
print(match2)
match = full_address.search(ip)

print(match)
print(match[0])

<re.Match object; span=(0, 1), match='p'>
<re.Match object; span=(27, 42), match='ravichandranr@o'>
<re.Match object; span=(27, 52), match='ravichandranr@outlook.com'>
ravichandranr@outlook.com


In [None]:
import re

def check(s):
    """Print whether *s* is a valid e-mail address.

    Prints "Valid Email" when the whole string has the shape
    local-part@domain.tld (top-level domain of two or more letters), and
    "Invalid Email" otherwise. Returns None.
    """
    # "[A-Za-z]" rather than "[A-Z|a-z]": inside a character class "|" is a
    # literal pipe, not alternation, so the old class accepted "|" in the TLD.
    # The leading \b keeps the requirement that the address starts with a
    # word character.
    pat = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
    # fullmatch() requires the entire string to match; match() only anchors
    # the start and therefore accepted trailing junk such as "x@y.com!!!".
    if re.fullmatch(pat, s):
        print("Valid Email")
    else:
        print("Invalid Email")

email = "test321@gmail.com"
# check() only prints its verdict and returns None, so this prints "None".
result = check(email)
print(result)

Valid Email
None


In [None]:
import re

txt = "Python released in 1991 & Jan 16 12:00 ._PM"
txt1 = 'hi test_email@gmail.com'
txt2 = 'hi test_email@outlook.com'

# Each piece of an e-mail pattern tried on its own against txt1.
match1 = re.findall(r"[A-Za-z0-9._%+-]", txt1)     # characters allowed in the local part
match2 = re.findall(r"@[A-Za-z0-9.-]{2,6}", txt1)  # "@" plus 2 to 6 domain characters
match3 = re.findall(r"\.[A-Za-z]", txt1)           # a dot followed by one letter

for partial in (match1, match2, match3):
    print(partial)

# With no quantifier on the domain set, only its first character is captured.
reg1 = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]", txt2)
print(reg1)

# Local part, "@", domain, then a top-level domain of 2 to 4 letters.
email_pattern = (
    r"[A-Za-z0-9._%+-]+"
    r"@[A-Za-z0-9.-]+"
    r"\.[A-Za-z]{2,4}"
)
reg2 = re.findall(email_pattern, txt2)
print(reg2)

['h', 'i', 't', 'e', 's', 't', '_', 'e', 'm', 'a', 'i', 'l', 'g', 'm', 'a', 'i', 'l', '.', 'c', 'o', 'm']
['@gmail.']
['.c']
['test_email@o']
['test_email@outlook.com']
