### References taken from: 
https://www.analyticsvidhya.com/blog/2015/06/regular-expression-python/

In [1]:
import re

In [2]:
# Sample string reused by the re.search() and re.findall() demos below.
text = "VG Vineet Gupta VG"

### re.search(pattern, string):
search() method is able to find a pattern from any position of the string but it only returns the first occurrence of the search pattern.

In [12]:
# search() scans the whole string and stops at the first hit.
result = re.compile('VG').search(text)
# group() with no argument is the entire match, same as group(0).
print(result.group())

VG


### re.findall (pattern, string):
It returns a list of all non-overlapping matches of the pattern. It is not restricted to the start or the end of the string. If we use findall() to search for ‘VG’ in the given string, it returns both occurrences of VG. When searching a string, re.findall() is a convenient default: with suitable anchors in the pattern it can do the job of both re.search() and re.match().

In [11]:
# Collect every non-overlapping occurrence of 'VG' as a list of strings.
result = re.compile('VG').findall(text)
print(result)

['VG', 'VG']


### re.split(pattern, string, [maxsplit=0]):

In [15]:
# Cut 'Analytics' at every 'y'; the separator itself is dropped.
result = re.compile(r'y').split('Analytics')
result

['Anal', 'tics']

Above, we have split the string “Analytics” by “y”. The split() method has another argument, “maxsplit”, whose default value is zero. With the default it makes as many splits as possible; if we give maxsplit a positive value, it makes at most that many splits and returns the rest of the string as the final element. Let’s look at the example below:

In [16]:
# maxsplit=0 (the default) means "no limit": split on every 'i'.
result = re.compile(r'i').split('Analytics Vidhya')
print(result)
# maxsplit=1 stops after the first split and leaves the rest intact.
result = re.compile(r'i').split('Analytics Vidhya', 1)
print(result)

['Analyt', 'cs V', 'dhya']
['Analyt', 'cs Vidhya']


### re.sub(pattern, replaceWith, string):
It helps to search a pattern and replace with a new sub string. If the pattern is not found, string is returned unchanged.

In [18]:
# Replace every match of the pattern with the replacement text.
result = re.sub(
    pattern=r'India',
    repl='the World',
    string='AV is largest Analytics community of India',
)
result

'AV is largest Analytics community of the World'

### re.compile(pattern):
We can compile a regular expression pattern into a pattern object, which can then be used for pattern matching. This also lets us search for the same pattern again without rewriting it.

In [19]:
# Compile once; the resulting object can be reused for several searches.
pattern = re.compile('VG')
# The module-level functions accept a compiled pattern as well.
result = re.findall(pattern, 'VG Vineet Gupta VG')
result

['VG', 'VG']

### Problem 1: Return the first word of a given string

In [23]:
text = 'AV is largest Analytics community of India'

# '.' consumes one character at a time (anything but a newline), so
# every character comes back as its own item, spaces included.
result = re.compile(r'.').findall(text)
print(result, end='\n\n')

# '\w' consumes one word character (letter, digit or underscore), so
# the spaces disappear; '\W' (upper case) is the complement.
result = re.compile(r'\w').findall(text)
print(result)

['A', 'V', ' ', 'i', 's', ' ', 'l', 'a', 'r', 'g', 'e', 's', 't', ' ', 'A', 'n', 'a', 'l', 'y', 't', 'i', 'c', 's', ' ', 'c', 'o', 'm', 'm', 'u', 'n', 'i', 't', 'y', ' ', 'o', 'f', ' ', 'I', 'n', 'd', 'i', 'a']

['A', 'V', 'i', 's', 'l', 'a', 'r', 'g', 'e', 's', 't', 'A', 'n', 'a', 'l', 'y', 't', 'i', 'c', 's', 'c', 'o', 'm', 'm', 'u', 'n', 'i', 't', 'y', 'o', 'f', 'I', 'n', 'd', 'i', 'a']


### Problem 2: Return words

In [24]:
# '*' allows zero repetitions, so findall also reports an empty match
# wherever no word character starts (the '' entries in the output).
result = re.compile(r'\w*').findall(text)
print(result, end='\n\n')

# '+' demands at least one word character, leaving only whole words.
result = re.compile(r'\w+').findall(text)
print(result)

['AV', '', 'is', '', 'largest', '', 'Analytics', '', 'community', '', 'of', '', 'India', '']

['AV', 'is', 'largest', 'Analytics', 'community', 'of', 'India']


In [25]:
# '^' anchors the match to the very start of the string, so only the
# first word can match.
result = re.compile(r'^\w+').findall(text)
print(result, end='\n\n')

# '$' anchors the match to the end of the string (or just before a
# trailing newline), so only the last word can match.
result = re.compile(r'\w+$').findall(text)
print(result)

['AV']

['India']


### Problem 3: Return the first two characters of each word

In [28]:
# '\w{2}' takes word characters two at a time, anywhere in the text,
# so it yields consecutive pairs rather than the start of each word.
result = re.compile(r'\w{2}').findall(text)
print(result, end='\n\n')
# Equivalent spelling: write '\w' twice instead of using a repeat count.
result = re.compile(r'\w\w').findall(text)
print(result)

['AV', 'is', 'la', 'rg', 'es', 'An', 'al', 'yt', 'ic', 'co', 'mm', 'un', 'it', 'of', 'In', 'di']

['AV', 'is', 'la', 'rg', 'es', 'An', 'al', 'yt', 'ic', 'co', 'mm', 'un', 'it', 'of', 'In', 'di']


Extract two consecutive characters only from the start of each word, using the word boundary “\b”:

In [33]:
# Two characters taken only from the start of each word.
# '\b' matches the empty position between a word character and a
# non-word character; '\B' is its opposite (any position that is not a
# word boundary).
result = re.findall(r'\b\w{2}', text)
print(result)
print()
# OR: spell the two word characters out.  '.' is not a substitute for
# the second '\w' because it also matches spaces and punctuation, so a
# one-letter word would drag its following separator into the result.
result = re.findall(r'\b\w\w', text)
print(result)

['AV', 'is', 'la', 'An', 'co', 'of', 'In']

['AV', 'is', 'la', 'An', 'co', 'of', 'In']


### Problem 4: Return the domain type of given email-ids

In [36]:
text = 'abc.test@gmail.com, xyz@test.in, test.first@analyticsvidhya.com, first.test@rest.biz'

# Without a group, findall returns the whole match: '@' plus the run of
# word characters that follows it.
result = re.compile(r'@\w+').findall(text)
print(result, end='\n\n')

# With one '(...)' group, findall returns only the grouped part. The
# '@' still has to be present for a match but is left out of the output.
result = re.compile(r'@(\w+)').findall(text)
print(result)

['@gmail', '@test', '@analyticsvidhya', '@rest']

['gmail', 'test', 'analyticsvidhya', 'rest']


In [37]:
# '\.' matches a literal dot. A bare '.' would match any character, so
# strings such as '@gmail-com' would be accepted as well.
result = re.findall(r'@\w+\.\w+', text)
print(result)
print()

# Same pattern with a group, so the leading '@' is dropped from the output.
result = re.findall(r'@(\w+\.\w+)', text)
print(result)

['@gmail.com', '@test.in', '@analyticsvidhya.com', '@rest.biz']

['gmail.com', 'test.in', 'analyticsvidhya.com', 'rest.biz']


In [38]:
# Extract only the domain type (the part after the dot) using "( )".
# The dot is escaped so it matches a literal '.' and not any character.
result = re.findall(r'@\w+\.(\w+)', text)
print(result)

['com', 'in', 'com', 'biz']


### Problem 5: Return date and phone no from given string

In [40]:
text = '''Phone No: 0124-236703, Date: 17/12/2021, Name: Vineet
, Phone No: 0171-236474, Date: 18/12/2021, Name: Vaibhav'''

# '\d' matches one digit; '{n}' repeats it exactly n times.
# Dates here are two digits, '/', two digits, '/', four digits.
result = re.compile(r'\d{2}/\d{2}/\d{4}').findall(text)
print(result, end='\n\n')

# Phone numbers here are four digits, a hyphen, then six digits.
result = re.compile(r'\d{4}-\d{6}').findall(text)
print(result)

['17/12/2021', '18/12/2021']

['0124-236703', '0171-236474']


In [45]:
# Area code: match the full phone number but capture only the four
# digits in front of the hyphen.
result = re.compile(r'(\d{4})-\d{6}').findall(text)
print(result, end='\n\n')
# Counter-example: a bare '\d{4}' is not enough, because it also picks
# up four-digit runs inside the rest of the number and the year.
result = re.compile(r'\d{4}').findall(text)
print(result)

['0124', '0171']

['0124', '2367', '2021', '0171', '2364', '2021']


### Problem 6: Return all words of a string that start with a vowel

In [46]:
text = 'AV is largest Analytics community of India'
# First attempt: a vowel followed by word characters. Nothing ties the
# vowel to the start of a word, so matches can begin mid-word
# ('argest', 'ommunity') -- not yet what we want.
result = re.compile(r'[aeiouAEIOU]\w+').findall(text)
print(result)

['AV', 'is', 'argest', 'Analytics', 'ommunity', 'of', 'India']


In [48]:
# All words that start with a vowel: '\b' pins the vowel to the start
# of a word, and '\w*' (rather than '\w+') lets one-letter words such
# as 'a' or 'I' match too.
result = re.findall(r'\b[aeiouAEIOU]\w*', text)
print(result)

['AV', 'is', 'Analytics', 'of', 'India']


In [49]:
# Naive attempt at words that do NOT start with a vowel: negate the
# character class. A space is also "not a vowel", so matches can start
# on the space in front of a word.
result = re.compile(r'\b[^aeiouAEIOU]\w+').findall(text)
print(result)

[' is', ' largest', ' Analytics', ' community', ' of', ' India']


Above we can see that it has returned words with a leading space, e.g. ' is' and ' India'. To drop these from the output, include a space inside the square brackets [].

In [50]:
# Words that do not start with a vowel. Putting '\W' inside the negated
# class rules out every non-word character (the space included, but
# also tabs and punctuation), so the first character must be a
# non-vowel word character; '\w*' keeps one-letter words.
result = re.findall(r'\b[^\WaeiouAEIOU]\w*', text)
print(result)

['largest', 'community']


### Problem 7: Split a string with multiple delimiters

In [55]:
# The sample mixes three separators: space, ';' and ','.
text = 'asdf fjdk;afed,fjek,asdf,foo'
# A character class matches any one of the separators, so a single
# split() call handles them all.
result = re.compile(r'[;,\s]').split(text)
print(result, end='\n\n')
# sub() with the same class turns every separator into a space.
result = re.compile(r'[;,\s]').sub(' ', text)
print(result)
print(result)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

asdf fjdk afed fjek asdf foo


### Problem 8: Validate a phone number (phone number must be of 10 digits and starts with 8 or 9) 

In [58]:
# A valid number is exactly 10 digits and starts with 8 or 9.
# Compiled once, outside the loop, and reused for every candidate.
phone_pattern = re.compile(r'[89][0-9]{9}')

lst = ['9999999999', '8939929991', '99999x9999']
for val in lst:
    # fullmatch() requires the whole string to match the pattern, so no
    # separate length check is needed.
    print('yes' if phone_pattern.fullmatch(val) else 'no')

yes
yes
no
