### Regular expression – learning to use *, +, and ?

In [1]:
import re

In [2]:
def text_match(text, patterns):
    """Report whether the regular expression ``patterns`` occurs in ``text``.

    The check uses ``re.search``, so the pattern may match at any position
    in ``text``, not only at the beginning.

    Returns:
        'Found a match!' when the pattern matches, otherwise 'Not matched!'.
    """
    return 'Found a match!' if re.search(patterns, text) else 'Not matched!'

In [3]:
# `?` makes the preceding `b` optional ("a" then zero or one "b"),
# so every string that contains an "a" matches.
for sample in ("ac", "abc", "abbc"):
    print(text_match(sample, "ab?"))

Found a match!
Found a match!
Found a match!


In [4]:
# `*` lets the preceding `b` repeat zero or more times,
# so every string that contains an "a" matches.
for sample in ("ac", "abc", "abbc"):
    print(text_match(sample, "ab*"))

Found a match!
Found a match!
Found a match!


In [5]:
# `+` requires at least one `b` right after the "a",
# so "ac" is the only string here that does not match.
for sample in ("ac", "abc", "abbc"):
    print(text_match(sample, "ab+"))

Not matched!
Found a match!
Found a match!


In [6]:
# `{2}` repeats the preceding `b` exactly twice: the pattern is "a" followed by "bb".
print(text_match("abbc", "ab{2}"))

Found a match!


In [7]:
# `{3,5}` accepts three to five `b`s; the trailing `?` makes the quantifier
# lazy, i.e. it prefers the fewest repetitions that still produce a match.
print(text_match("aabbbbbbc", "ab{3,5}?"))

Found a match!


### Regular expression – learning to use $ and ^, and the non-start and non-end of a word

In [8]:
import re

In [9]:
def text_match(text, patterns):
    """Tell whether ``patterns`` (a regular expression) is found in ``text``.

    Behaves the same as the ``text_match`` helper defined earlier in this
    notebook: ``re.search`` is used, so a match anywhere in ``text`` counts.

    Returns:
        'Found a match!' on success, 'Not matched!' otherwise.
    """
    # re.search yields None when there is no match; match objects are truthy.
    if re.search(patterns, text) is None:
        return 'Not matched!'
    return 'Found a match!'

In [10]:
print("Pattern to test start and end with")
# `^a` anchors an "a" at the start, `.*` spans any characters in between
# (other than newlines), and `c$` anchors a "c" at the end.
print(text_match("abbc", "^a.*c$"))

Pattern to test start and end with
Found a match!


In [11]:
print("Begin with a word")
# The raw string keeps `\w` as a regex escape rather than an invalid Python
# string escape. `^\w+` needs one or more word characters at the very start.
print(text_match("Tuffy eats pie, Loki eats peas!", r"^\w+"))

Begin with a word
Found a match!


In [12]:
print("End with a word and optional punctuation")
# The raw string keeps `\w` and `\S` as regex escapes rather than invalid
# Python string escapes. `\w+` matches the word, the lazy `\S*?` allows
# trailing non-whitespace such as "!", and `$` anchors at the end of the text.
print(text_match("Tuffy eats pie, Loki eats peas!", r"\w+\S*?$"))

End with a word and optional punctuation
Found a match!


In [13]:
print("Finding a word which contains character, not start or end of the word")
# The raw string keeps `\B` as a regex escape rather than an invalid Python
# string escape. `\B` asserts a position that is NOT a word boundary, so
# `\Bu\B` only finds a "u" that has word characters on both sides.
print(text_match("Tuffy eats pie, Loki eats peas!", r"\Bu\B"))

Finding a word which contains character, not start or end of the word
Found a match!


### Searching multiple literal strings and substring occurrences

In [14]:
# Literal search terms. Note that "Pie" is capitalised here while the text
# below only contains the lowercase "pie".
patterns = [ 'Tuffy', 'Pie', 'Loki' ]
text = 'Tuffy eats pie, Loki eats peas!'

In [15]:
# re.search is case-sensitive by default, so a term only counts as found
# when it appears in the text with the same capitalisation.
for pattern in patterns:
    print('Searching for "%s" in "%s" ->' % (pattern, text))
    print('Found!' if re.search(pattern, text) else 'Not Found!')

Searching for "Tuffy" in "Tuffy eats pie, Loki eats peas!" ->
Found!
Searching for "Pie" in "Tuffy eats pie, Loki eats peas!" ->
Not Found!
Searching for "Loki" in "Tuffy eats pie, Loki eats peas!" ->
Found!


In [16]:
# Sample sentence in which the search term below occurs twice.
text = 'Diwali is a festival of lights, Holi is a festival of colors!'
pattern = 'festival'

In [17]:
# re.finditer yields one match object per non-overlapping occurrence;
# span() gives the (start, end) offsets of that occurrence in the text.
for match in re.finditer(pattern, text):
    s, e = match.span()
    print('Found "%s" at %d:%d' % (match.group(), s, e))

Found "festival" at 12:20
Found "festival" at 42:50


### Learning to create a date regex and a set of characters or ranges of characters

In [18]:
url = "http://www.telegraph.co.uk/formula-1/2017/10/28/mexican-grand-prix-2017-time-does-start-tv-channel-odds-lewis1/2017/05/12/"
# Raw string so `\d` is passed to the regex engine instead of being an
# invalid Python string escape. Captures /YYYY/M(M)/D(D)/ path segments as
# three groups: year, month, day.
date_regex = r'/(\d{4})/(\d{1,2})/(\d{1,2})/'

In [19]:
# Because the pattern has several capture groups, re.findall returns one
# tuple of group values per match.
print("Date found in the URL :", re.findall(date_regex, url))

Date found in the URL : [('2017', '10', '28'), ('2017', '05', '12')]


In [20]:
def is_allowed_specific_char(string):
    """Return True when ``string`` consists only of allowed characters.

    Allowed characters are the letters a-z and A-Z, the digits 0-9, and the
    dot ('.'). The check searches for the first character outside that set;
    an empty string therefore counts as allowed.
    """
    disallowed = re.search(r'[^a-zA-Z0-9.]', string)
    return disallowed is None

In [21]:
# The first string uses only allowed characters; the second is made up
# entirely of characters outside the allowed set.
for candidate in ("ABCDEFabcdef123450.", "*&%@#!}{"):
    print(is_allowed_specific_char(candidate))

True
False


### Find all five-character words and make abbreviations in some sentences

In [22]:
street = '21 Ramkrishna Road'
# re.sub replaces every occurrence of the pattern in the string.
print(re.sub(pattern='Road', repl='Rd', string=street))

21 Ramkrishna Rd


In [23]:
text = 'Diwali is a festival of light, Holi is a festival of color!'
# `\w{5}` between two `\b` word boundaries matches words of exactly five
# word characters.
print(re.compile(r"\b\w{5}\b").findall(text))

['light', 'color']


### Learning to write your own regex tokenizer

In [24]:
raw = "I am big! It's the pictures that got small."
# Splitting on runs of spaces keeps punctuation attached to the words.
print(re.compile(r' +').split(raw))

['I', 'am', 'big!', "It's", 'the', 'pictures', 'that', 'got', 'small.']


In [25]:
# Splitting on runs of non-word characters drops all punctuation; because
# the text ends with ".", the result ends with an empty string.
print(re.split(r'\W+', raw))

['I', 'am', 'big', 'It', 's', 'the', 'pictures', 'that', 'got', 'small', '']


In [26]:
# Each token is either a run of word characters (`\w+`) or one
# non-whitespace character followed by optional word characters (`\S\w*`),
# so punctuation becomes its own token and "'s" stays together.
print(re.findall(r'\w+|\S\w*', raw))

['I', 'am', 'big', '!', 'It', "'s", 'the', 'pictures', 'that', 'got', 'small', '.']


### Learning to write your own regex stemmer

In [27]:
def stem(word):
    """Strip one common English suffix from ``word`` and return what is left.

    The lazy ``(.*?)`` group captures the shortest prefix that leaves one of
    the listed suffixes (or nothing) before the end of the string, so the
    longest applicable suffix is removed, e.g. "enemies" -> "enem".

    Args:
        word: The token to stem.

    Returns:
        The stem as a string. ``word`` is returned unchanged when the
        pattern cannot match at all (for example, when ``word`` contains an
        embedded newline), instead of raising ``IndexError``.
    """
    match = re.match(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', word)
    if match is None:
        # `.` does not cross newlines, so such input has no match to index.
        return word
    return match.group(1)

In [28]:
raw = "Keep your friends close, but your enemies closer."
# Tokens are runs of word characters, or a single non-whitespace character
# followed by optional word characters.
tokens = re.compile(r'\w+|\S\w*').findall(raw)
print(tokens)

['Keep', 'your', 'friends', 'close', ',', 'but', 'your', 'enemies', 'closer', '.']


In [29]:
# Print each stem wrapped in single quotes so empty or whitespace-only
# results would remain visible.
for t in tokens:
    print("'%s'" % stem(t))

'Keep'
'your'
'friend'
'close'
','
'but'
'your'
'enem'
'closer'
'.'
