In [9]:
import re

text = "My phone once, my phone twice"

# re.search returns a Match object for the FIRST occurrence only
# (it would return None if the pattern were absent).
pattern = "phone"
match = re.search(pattern, text)
print(f"Pattern in the text: {match}")
print(f"The span of the pattern is: {match.span()}")
print(f"The Start of the pattern is: {match.start()}")
print(f"The End of the pattern is: {match.end()}")

# re.findall returns every non-overlapping occurrence as a list of strings.
matches = re.findall(pattern, text)
print(f"All of the patterns in the text: {matches} and total number of matches: {len(matches)}")

# re.finditer yields one Match object per occurrence, which exposes each location.
# The loop uses its own name so `match` stays bound to the first-occurrence
# result above rather than being silently rebound to the last occurrence.
print("Location of the patterns within the text are:")
for occurrence in re.finditer(pattern, text):
    print(occurrence.span())

Pattern in the text: <re.Match object; span=(3, 8), match='phone'>
The span of the pattern is: (3, 8)
The Start of the pattern is: 3
The End of the pattern is: 8
All of the patterns in the text: ['phone', 'phone'] and total number of matches: 2
Location of the patterns within the text are:
(3, 8)
(18, 23)


# PATTERNS

## IDENTIFIERS FOR CHARACTERS IN PATTERNS

Character	Description	Example Pattern Code	Example Match
 \d	          A digit	             file_\d\d	       file_25
 \w	          Alphanumeric or underscore	 \w-\w\w\w	        A-b_1
 \s	          White space	         a\sb\sc	        a b c
 \D	          A non digit	         \D\D\D	            ABC
 \W	          Non-alphanumeric	    \W\W\W\W\W	        *-+=)
 \S	          Non-whitespace	     \S\S\S\S	        Yoyo


## QUANTIFIERS FOR CHARACTERS IN PATTERNS

Character	    Description	         Example Pattern Code	Example Match
+	       Occurs one or more times	  Version \w-\w+	    Version A-b1_1
{3}	       Occurs exactly 3 times	  \D{3}	                abc
{2,4}	   Occurs 2 to 4 times	      \d{2,4}	            123
{3,}	   Occurs 3 or more	          \w{3,}	            anycharacters
\*	       Occurs zero or more times  A\*B\*C*	            AAACC
?	       Once or none	              plurals?	            plural

In [21]:
text = "My phone number is 9511-071-5980"

# Identifier form: each digit is spelled out as its own \d
identifier_pattern = r'\d\d\d\d-\d\d\d-\d\d\d\d'
phone = re.search(identifier_pattern, text)
print(phone)
# The Match object reports that (and where) the pattern matched;
# group() returns the actual text that matched.
print(phone.group())

# Quantifier form: {n} repeats the preceding \d exactly n times
quantifier_pattern = r'\d{4}-\d{3}-\d{4}'
phone = re.search(quantifier_pattern, text)
print(phone)
print(phone.group())

<re.Match object; span=(19, 32), match='9511-071-5980'>
9511-071-5980
<re.Match object; span=(19, 32), match='9511-071-5980'>
9511-071-5980


In [22]:
# To pull out just one part of the match (for example the area code), the
# pattern itself must contain capturing groups written with parentheses.
# The pattern behind `phone` has none, so only group() / group(0) exists and
# group(1) raises IndexError. The error is caught and displayed so the
# notebook still runs from top to bottom.

try:
    phone.group(1)
except IndexError as exc:
    print(f"{type(exc).__name__}: {exc}")

IndexError: no such group

In [28]:
# Each pair of parentheses creates a capturing group; this pattern has three.
phone_pattern = re.compile(r'(\d{4})-(\d{3})-(\d{4})')

# A compiled pattern exposes search() directly.
match = phone_pattern.search(text)

print(match)
print(match.group())
print(match.group(1)) # group numbering starts at 1; group(0) is the whole match
print(match.group(2))
print(match.group(3))
# Only three groups exist, so group(4) raises IndexError. Catch and display
# it so the demonstration does not abort a full notebook run.
try:
    print(match.group(4))
except IndexError as exc:
    print(f"{type(exc).__name__}: {exc}")

<re.Match object; span=(19, 32), match='9511-071-5980'>
9511-071-5980
9511
071
5980


IndexError: no such group

In [31]:
# Alternation: | matches either the pattern on its left or the one on its right
print(re.search(r"man|woman", "This  man was here."))
# Regex literals use raw strings (r"..."); an f-string with no placeholders
# adds nothing. The scan goes left to right, so "woman" is found at index 5
# before the "man" inside it is reached.
print(re.search(r"man|woman", "This womann was here."))



<re.Match object; span=(6, 9), match='man'>
<re.Match object; span=(5, 10), match='woman'>


In [39]:
# Wildcard: "." stands for any single character (other than a newline)
any_char_before_at = re.findall(r".at", "The cat in the hat sat here.")
print(any_char_before_at)

# Five dots demand exactly five characters in front of "at", spaces included
five_chars_before_at = re.findall(r'.....at', "The bat with the splat")
print(five_chars_before_at)

# \S means a non-whitespace character and + means one or more of them before 'at'
whole_words_ending_at = re.findall(r'\S+at', "The bat went splat")
print(whole_words_ending_at)

['cat', 'hat', 'sat']
['The bat', 'e splat']
['bat', 'splat']


In [40]:
# $ anchors the pattern to the end of the string: a trailing digit
trailing_digit = re.findall(r'\d$', 'This ends with a number 2')
print(trailing_digit)

# ^ anchors the pattern to the start of the string: a leading digit
leading_digit = re.findall(r'^\d', "1 is the first number")
print(leading_digit)

['2']
['1']


In [47]:
# Exclusion: a leading ^ inside [] matches any character NOT listed in the set
phrase = "there are 3 numbers 34 inside 5 this sentence."

# One list entry per non-digit character
non_digit_chars = re.findall(r'[^\d]', phrase)
print(non_digit_chars)

# Adding + merges consecutive non-digits into the chunks between the numbers
non_digit_runs = re.findall(r'[^\d]+', phrase)
print(non_digit_runs)

# The same idea removes punctuation: keep runs of anything that is not ! . ? or a space
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

words = re.findall(r'[^!.? ]+', test_phrase)
print(words)

# Joining the words with single spaces gives the cleaned sentence
" ".join(words)

['t', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ', ' ', 'n', 'u', 'm', 'b', 'e', 'r', 's', ' ', ' ', 'i', 'n', 's', 'i', 'd', 'e', ' ', ' ', 't', 'h', 'i', 's', ' ', 's', 'e', 'n', 't', 'e', 'n', 'c', 'e', '.']
['there are ', ' numbers ', ' inside ', ' this sentence.']
['This', 'is', 'a', 'string', 'But', 'it', 'has', 'punctuation', 'How', 'can', 'we', 'remove', 'it']


'This is a string But it has punctuation How can we remove it'

In [48]:
# [] defines a character set; [\w]+ is one or more word characters,
# so the pattern below matches word-hyphen-word
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

hyphenated_word_pattern = r'[\w]+-[\w]+'
re.findall(hyphenated_word_pattern, text)

['hypen-words', 'long-ish']

In [53]:
# Parentheses group alternatives: (a|b|c) matches exactly one of the options

# Look for words that begin with "cat" and finish with 'fish', 'nap' or 'erpillar'
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

# The same pattern is applied to each sentence in turn
cat_word_pattern = r'cat(fish|nap|erpillar)'
for sentence in (text, texttwo, textthree):
    print(re.search(cat_word_pattern, sentence))

<re.Match object; span=(27, 34), match='catfish'>
<re.Match object; span=(32, 38), match='catnap'>
<re.Match object; span=(26, 37), match='caterpillar'>
