In [1]:
import re

In [2]:
# Sample input for the URL examples. The empty first and last items give the
# text its leading and trailing newline, so the first URL starts at index 1.
urls = "\n".join([
    "",
    "https://www.google.com",
    "http://coreyms.com",
    "https://youtube.com",
    "https://www.nasa.gov",
    "",
])

In [3]:
# Match http/https URLs with an optional "www." prefix.
# The dot after "www" is escaped so that it matches a literal "." only;
# an unescaped "." matches any character (it would accept e.g. "wwwx").
pattern = re.compile(r"https?://(www\.)?\w+\.\w+")
matches = pattern.finditer(urls)
for match in matches:
    # Each item is an re.Match object; printing it shows the span and the text.
    print(match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [4]:
# Capture the domain name (google/nasa) and the top-level domain (.com/.gov)
# by wrapping those parts of the pattern in groups.
# "\." keeps the dot after "www" literal; a bare "." would match any character.
pattern = re.compile(r"https?://(www\.)?(\w+)(\.\w+)")
matches = pattern.finditer(urls)
for match in matches:
    # Adding groups does not change what is matched overall.
    print(match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [5]:
# Above we have 3 groups:
# (www.) , (\w+), (\.\w+)
# There is also group 0, which is the entire text that was matched

In [6]:
# group(0) is the entire match, i.e. the full URL.
# "\." keeps the dot after "www" literal; a bare "." would match any character.
pattern = re.compile(r"https?://(www\.)?(\w+)(\.\w+)")
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(0))

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov


In [7]:
# group(1) is the optional "www." prefix; it is None for URLs where that
# optional group did not take part in the match.
# "\." keeps the dot after "www" literal; a bare "." would match any character.
pattern = re.compile(r"https?://(www\.)?(\w+)(\.\w+)")
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(1))

www.
None
None
www.


In [8]:
# group(2) is the domain name (the word characters before the last dot).
# "\." keeps the dot after "www" literal; a bare "." would match any character.
pattern = re.compile(r"https?://(www\.)?(\w+)(\.\w+)")
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(2))

google
coreyms
youtube
nasa


In [9]:
# group(3) is the top-level domain, including its leading dot.
# "\." keeps the dot after "www" literal; a bare "." would match any character.
pattern = re.compile(r"https?://(www\.)?(\w+)(\.\w+)")
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(3))

.com
.com
.com
.gov


In [10]:
# A backreference refers back to a captured group by its number (\1, \2, ...).
# It is basically a shorthand for accessing a group by its index, e.g. inside a replacement string.

In [11]:
# Replace every URL matched by `pattern` with just groups 2 and 3.
# In the replacement string, \2 is the domain name and \3 is the top-level
# domain, so each match is reduced to "domain.tld"; text outside the
# matches (the newlines) is left as it is.
subbed_urls = re.sub(pattern, r'\2\3', urls)
print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



In [12]:
# finditer() does the best job of showing the matches and the locations of those matches.
# There are some other methods as well.

In [13]:
# finditer() returns match objects, which carry extra information and functionality.
# findall() just returns the matches as a list of strings. If the pattern contains capture groups, it returns only the groups.

In [14]:
# Sample text used by the findall() examples.
# It is a raw string because the metacharacter line contains a lone backslash:
# "\ " is not a valid escape sequence in a normal string literal, and newer
# Python versions warn about it. The resulting value is unchanged.
# NOTE(review): the lowercase alphabet line reads "pqurtuvwxyz" - this looks
# like a typo, but it is runtime data and is left exactly as it was.
text_to_search = r'''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890

Ha HaHa

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

coreyms.com

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

cat
mat
bat
pat

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

# Short single-line input used by the match()/search()/flags examples.
sentence = 'Start a sentence and then bring it to an end'

In [15]:
# The pattern has one capture group (the title), so findall() returns only
# that group's text for each match rather than the whole matched name.
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')
matches = pattern.findall(text_to_search)
for title in matches:
    print(title)

Mr
Mr
Ms
Mrs
Mr


In [16]:
# It printed only the captured group (the title), not the whole match.
# If there are multiple groups, findall() returns a list of tuples, and each tuple contains all of the groups.
# If there are no groups, it returns a list of the full matched strings.

In [17]:
# No capture groups here, so findall() returns the full text of every match.
# The unescaped "." matches any single character, which is why numbers
# separated by "-", "." and "*" are all found.
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
matches = pattern.findall(text_to_search)
for phone_number in matches:
    print(phone_number)

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234


In [18]:
# match() determines if the regex matches at the beginning of the string

In [19]:
# match() returns a single re.Match object (or None), not an iterable, so
# looping over its result raises TypeError. The error is the point of this
# cell, so it is caught and printed instead of stopping a "Run All".
pattern = re.compile(r'Start')
matches = pattern.match(sentence)
try:
    for match in matches:
        print(match)
except TypeError as exc:
    # Prints the same text the uncaught error would end with.
    print(f"{type(exc).__name__}: {exc}")

TypeError: 're.Match' object is not iterable

In [None]:
# Unlike finditer() or findall(), match() does not give back an iterable:
# it returns one match object when the pattern matches at the start of the
# string, and None when it does not.
pattern = re.compile(r'Start')
matches = re.match(pattern, sentence)
print(matches)

In [None]:
# 'sentence' is not at the start of the string, so match() returns None.
pattern = re.compile(r'sentence')
matches = re.match(pattern, sentence)
print(matches)

In [None]:
# match() only checks for the pattern at the beginning of the string.
# 'sentence' does occur in the string above, but match() returns None because it is not at the very start.

In [20]:
# search() scans the entire string for the pattern and returns the first
# match it finds (as a match object), wherever in the string that is.
pattern = re.compile(r'sentence')
matches = re.search(pattern, sentence)
print(matches)

<re.Match object; span=(8, 16), match='sentence'>


In [21]:
# When the pattern occurs nowhere in the string, search() returns None.
pattern = re.compile(r'dne')
matches = re.search(pattern, sentence)
print(matches)

None


In [22]:
# Using Flags

In [23]:
# We want to match a word and match if it is in uppercase, lowercase or mixture of both

# Let's say we want to search for the sentence to start with "Start" in either uppercase or lowercase or mix of both
# re.compile(r'[Ss][Tt][Aa][Rr][Tt]')  --> This works, but it is verbose and clearly not the best solution

In [24]:
# The IGNORECASE flag makes the match case-insensitive, so the lowercase
# pattern 'start' still finds 'Start' in the sentence.
pattern = re.compile(r'start', flags=re.IGNORECASE)
matches = pattern.search(sentence)
print(matches)

<re.Match object; span=(0, 5), match='Start'>


In [25]:
# Although we passed a lowercase 'start' as the pattern and the actual sentence contains 'Start', we were still able to match because of the re.IGNORECASE flag

In [26]:
# re.I is the short spelling of re.IGNORECASE, so we can write I instead.
pattern = re.compile(r'start', flags=re.I)
matches = pattern.search(sentence)
print(matches)

<re.Match object; span=(0, 5), match='Start'>


In [27]:
# Several other such flags exist