In [2]:
import re

### The most common uses of regular expressions are:
* Searching within a string (search and match)
* Finding all occurrences of a pattern in a string (findall)
* Breaking a string into substrings (split)
* Replacing part of a string (sub)

### Various methods of the `re` (regular expression) module:

1. re.match()
2. re.search()
3. re.findall()
4. re.split()
5. re.sub()
6. re.compile()


In [3]:
### re.match(pattern, string) -> succeeds only when the pattern occurs at the very start of the string

text = 'AV Analytica Vidhya - AV'
result = re.match(r'AV', text)

# Print the matched text, its start and end offsets, then the matched text again.
for value in (result.group(0), result.start(), result.end(), result.group(0)):
    print(value)

AV
0
2
AV


In [4]:
# re.search(pattern, string) -> unlike re.match, scans the whole string and returns the first match
sentence = 'Analytics Vidhya - AV'
result = re.search(r'Vidhya', sentence)
matched_text = result.group(0)
print(matched_text)

Vidhya


In [5]:
### re.findall(pattern, string) -> returns every non-overlapping match as a list of strings
# The first sentence contains 'AV' once, the second one twice.
for sentence in ('Analytics Vidhya - AV', 'AV Analytics Vidhya - AV'):
    result = re.findall(r'AV', sentence)
    print(result)

['AV']
['AV', 'AV']


In [6]:
### re.split(pattern, string, maxsplit=0) -> splits the string at each occurrence of the pattern

word = 'Analytics'
phrase = 'Analytics Vidhya'

# Split a single word on 'y'.
result = re.split(r'y', word)
print(result)

# Split the phrase on every 'i'.
result = re.split(r'i', phrase)
print(result)

# maxsplit=1 performs at most one split, so at most two pieces come back.
result = re.split(r'i', phrase, maxsplit=1)
print(result)

['Anal', 'tics']
['Analyt', 'cs V', 'dhya']
['Analyt', 'cs Vidhya']


In [7]:
### re.sub(pattern, repl, string) -> returns the string with every match of the pattern replaced by repl

sentence = 'AV is largest Analytics community of India'
result = re.sub(r'India', 'the world', sentence)
# The bare expression below is what the notebook displays as the cell output.
result

'AV is largest Analytics community of the world'

In [9]:
### re.compile(pattern, flags=0) -> compiles a regular expression into a reusable pattern object.
### The pattern object offers the same operations (findall, search, sub, ...) as methods.

pattern = re.compile('AV')

# 'AV' appears at both ends of this sentence, so two matches are printed.
first_sentence = 'AV Analytics Vidhya AV'
result = pattern.findall(first_sentence)
print(result)

# Here 'AV' appears only once; the bare expression is what the notebook displays.
second_sentence = 'AV is largest Analytics community of India'
result = pattern.findall(second_sentence)
result

['AV', 'AV']


['AV']

# Regular Expressions with Operators

In [12]:
### Problem 1: Return the first word of the given string
### Solution 1 -> Extract each character, first with '.' and then with '\w'

sentence = 'AV is largest Analytics community of India'

# '.' matches any character except a newline, so the spaces are returned as well.
result = re.findall(r'.', sentence)
print(result)

# '\w' matches only word characters (letters, digits, underscore), so the spaces are dropped.
# The label is written as a raw string: '\w' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
print(r'With \w')
result = re.findall(r'\w', sentence)
print(result)

['A', 'V', ' ', 'i', 's', ' ', 'l', 'a', 'r', 'g', 'e', 's', 't', ' ', 'A', 'n', 'a', 'l', 'y', 't', 'i', 'c', 's', ' ', 'c', 'o', 'm', 'm', 'u', 'n', 'i', 't', 'y', ' ', 'o', 'f', ' ', 'I', 'n', 'd', 'i', 'a']
With \w
['A', 'V', 'i', 's', 'l', 'a', 'r', 'g', 'e', 's', 't', 'A', 'n', 'a', 'l', 'y', 't', 'i', 'c', 's', 'c', 'o', 'm', 'm', 'u', 'n', 'i', 't', 'y', 'o', 'f', 'I', 'n', 'd', 'i', 'a']


In [14]:
### Solution 2 -> Extract each word using '*' or '+'

sentence = 'AV is largest Analytics community of India'

# '*' means zero or more word characters, so an empty match is also reported
# wherever no word character is found (at each space and at the end of the string).
result = re.findall(r'\w*', sentence)
print(result)

# '+' means one or more word characters, so only the actual words are returned.
print('With +')
result = re.findall(r'\w+', sentence)
print(result)

['AV', '', 'is', '', 'largest', '', 'Analytics', '', 'community', '', 'of', '', 'India', '']
With +
['AV', 'is', 'largest', 'Analytics', 'community', 'of', 'India']


In [19]:
### Solution 3 -> Extract the first (and then the last) word of the given sentence
sentence = 'AV is largest Analytics community of India'

# '^\w+' is anchored at the start of the string (first word);
# '\w+$' is anchored at the end of the string (last word).
for anchored_pattern in (r'^\w+', r'\w+$'):
    result = re.findall(anchored_pattern, sentence)
    print(result)

['AV']
['India']


In [20]:
### Problem 2: Return the first two characters of each word
### Solution 1 -> Extract consecutive pairs of word characters, excluding spaces (using '\w')
# Without a word boundary this returns every pair inside a word, not only the leading pair.
sentence = 'AV is largest Analytics community of India'
result = re.findall(r'\w\w', sentence)
print(result)

['AV', 'is', 'la', 'rg', 'es', 'An', 'al', 'yt', 'ic', 'co', 'mm', 'un', 'it', 'of', 'In', 'di']


In [24]:
### Solution 2 -> Extract the first two characters of each word using the word boundary '\b'
# '\b' anchors the match at the edge of a word. The second character is matched with
# '\w' rather than '.', because '.' would also accept a space or punctuation mark
# following a one-letter word.
result = re.findall(r'\b\w\w', 'AV is largest Analytics community of India')
print(result)

['AV', 'is', 'la', 'An', 'co', 'of', 'In']


In [30]:
### Problem 3: Return the domain type of the given email ids
emails = 'abc.test@gmail.com, xyz@yahoo.in, test.first@acc.in'

# Step 1: Extract the characters after '@'
result = re.findall(r'@\w+', emails)
print(result)

# Extend the match across the dot. The dot is escaped ('\.') so that it matches a
# literal '.' only; an unescaped '.' would match any character.
result = re.findall(r'@\w+\.\w+', emails)
print(result)

# Step 2: Extract only the domain type by capturing the part after the dot
result = re.findall(r'@\w+\.(\w+)', emails)
print(result)


['@gmail', '@yahoo', '@acc']
['@gmail.com', '@yahoo.in', '@acc.in']
['com', 'in', 'in']


In [31]:
### Problem 4: Return the date from the given string
# Solution 1: '\d' matches one digit and '{n}' repeats it exactly n times,
# so the pattern is two digits, two digits and four digits joined by hyphens.
records = 'Amit 34-2332 12-05-2021, XYX 23-2342 12-12-2020'
result = [match.group(0) for match in re.finditer(r'\d{2}-\d{2}-\d{4}', records)]
print(result)

['12-05-2021', '12-12-2020']


In [32]:
# To print only the year: the four-digit part is wrapped in a capturing group,
# so findall returns just that group for each match.
year_pattern = re.compile(r'\d{2}-\d{2}-(\d{4})')
result = year_pattern.findall('Amit 34-2332 12-05-2021, XYX 23-2342 12-12-2020')
print(result)

['2021', '2020']


In [35]:
### Problem 5: Return all words of a string that start with a vowel
# Solution 1: '\b' anchors at the start of a word and the character class accepts a
# vowel in either case. '\w*' (zero or more) is used for the rest of the word so that
# one-letter words such as 'a' or 'I' are returned too.
result = re.findall(r'\b[aeiouAEIOU]\w*', 'AV is largest Analytics community of India')
print(result)

['AV', 'is', 'Analytics', 'of', 'India']


In [37]:
### Problem 6: Return all words of a string that do not start with a vowel
# Solution 1: '[^\WaeiouAEIOU]' accepts any word character that is not a vowel.
# Excluding '\W' rejects spaces and punctuation as a first character, and '\w*'
# (zero or more) keeps one-letter words.
# Note: digits and '_' are word characters, so words starting with them also match.
result = re.findall(r'\b[^\WaeiouAEIOU]\w*', 'AV is largest Analytics community of India')
print(result)

['largest', 'community']
