## RegEx

In [None]:
import re       # the standard package
# import regex  # offers additional functionality and a more thorough Unicode support

# Regex cheat sheet. Every pattern is a raw string so that regex escapes such as
# \w, \d, \[ and \$ reach the regex engine untouched instead of being parsed
# (and warned about) as Python string escapes.
text = '''tarek is good, tarek is studying [whooo123] rearranging spree free preorders'''
re.findall(r'[^a]', text)  # [^...] is a negated set: every single char that is not 'a'
re.findall(r'^abc', text, re.MULTILINE)  # with MULTILINE, '^' anchors at the start of every line
re.match(r'c', 'abc')      # match() only tries at the beginning of the string -> None here
re.search(r'c', 'abc')     # search() scans the whole string for the first match
re.findall(r'[+*.?]', text)  # inside a set the special chars + * . ? are taken literally
re.findall(r'[t][a-u]', text)  # 't' followed by any ONE char in the range a-u
re.findall(r'[ta|tu]', text)  # pitfall: '|' is literal inside a set -> any ONE of t, a, |, u
re.findall(r'ta|tu', text)  # alternation outside a set: 'ta' or 'tu'
re.findall(r'^[^a]', text)  # first char of the string, if it is anything other than 'a'
re.findall(r'o{2,3}', text)  # quantifiers: {min,max} or {count}
re.findall(r'o*', text)  # quantifiers: * (0 or more), + (1 or more), ? (0 or 1); '.' is a wildcard, not a quantifier
re.findall(r'[\w]*,', text)  # a (possibly empty) run of word chars directly followed by ','
re.split(r'[\[]', text)[1]  # '\' escapes a special char; here: everything after the first '['
re.findall(r'(?P<group_name>[\w])', text)  # named group capturing one word char per match
re.findall(r'(?:.* )', text)  # non-capturing group: findall returns the whole (greedy) match up to the last ' '
re.findall(r'Isaac (?=Asimov)', 'Isaac Asimov')  # matches 'Isaac ' only if it is followed by 'Asimov'
re.search(r'abc(?=def)', 'abcdef').group(0)    # look-ahead: 'def' is required but not consumed
re.search(r'(?<=-)\w+', 'spam-egg').group(0)    # look-behind: '-' must precede the match
re.findall(r'\$\d+(?:\.\d+)*', '$20.33')  # group a sub-expression to quantify it; (?:...) keeps findall returning the full match

# Iterate over match objects instead of collecting plain strings
for item in re.finditer(r'\[[\w]*[\d]*\]', text):
    print(item.group())

re.findall(r'o{2,3}', text)  # quantifiers: {min,max} or {count}


In [180]:
import re

text = 'Angeles Bay Sharks'
# Split "<area> <team name>" into two captured groups:
#   group 1 - optional area prefix (Los, St., San, New), one word, optional " Bay"
#   group 2 - everything after the following whitespace, up to the end of the string
# Raw string: \. \s \w are regex escapes, not valid Python string escapes.
pattern = r'(^(?:Los|St\.|San|New)?(?:\s)?(?:\w+)(?: Bay)?)(?:\s)((?:.+$))'
matches = re.findall(pattern, text)

print(matches)


[('Angeles Bay', 'Sharks')]


In [12]:
import re       # the standard package

text = 'tarek saati'
# Two captured words separated by a non-capturing, greedy '.* ' filler.
# For this two-word input the result equals re.findall(r'(\w*) (\w*)', text);
# with more words the greedy filler skips ahead to the last space, so the two
# patterns are not interchangeable in general.
re.findall(r'(\w*)(?:.* )(\w*)', text)


[('tarek', 'saati')]

In [None]:
import pandas as pd
# Start from an empty DataFrame (no rows, no columns).
df = pd.DataFrame()


### flags
- a, L, u  --> ascii, Locale, unicode
- m, s, x, i --> multi-line, dot-matches-all, verbose, ignore-case

In [None]:
# Read a Wikipedia page on FERPA (expects FERPA.txt in the working directory)
with open('FERPA.txt', 'r') as f:
    wiki=f.read()
# Search for section titles: a (possibly empty) run of word characters directly
# followed by the literal "[edit]"; both parts are captured as named groups.
for item in re.finditer('(?P<title>[\w]*)(?P<edit_link>\[edit\])', wiki):
    print(item.groupdict())
# Same idea with a look-ahead: the "[edit..." part must follow the title but is
# not consumed, so only the 'title' group is captured.
# NOTE(review): the literal is not a raw string, so '\n' is a real newline here;
# the look-ahead wants "[edit", a newline, then anything up to "]" - confirm
# that this matches the layout of FERPA.txt.
for item in re.finditer('(?P<title>[\w]+)(?=\[edit\n.*\])', wiki):
    print(item.groupdict()['title'])
# re.VERBOSE lets a pattern span several lines and carry '#' comments.
# NOTE(review): this literal is not raw either, so '\n' becomes a real newline,
# and VERBOSE mode ignores unescaped whitespace in the pattern. The look-ahead
# below therefore effectively reads \[edit.*\], which is NOT the same as the
# one-line pattern above - confirm which behaviour is intended.
pattern = '''
(?P<title>.*)           # the page title group
(?=\[edit\n.*\])        # all comes behind [edit] group
'''
prog = re.compile(pattern, re.VERBOSE)
for item in prog.finditer(wiki):
    print(item.groupdict()['title']) 

{'title': 'Overview', 'edit_link': '[edit]'}
records
records
also
References
links


In [None]:
import random
def repl(m):
    """Return the matched word with its interior characters shuffled.

    ``m`` is a match object exposing groups 1, 2 and 3: the first character,
    the interior of the word and the last character. Only the interior is
    shuffled (in place, via ``random.shuffle``); the outer characters stay put.
    """
    head, middle, tail = m.group(1, 2, 3)
    scrambled = list(middle)
    random.shuffle(scrambled)
    return head + "".join(scrambled) + tail
text = "Professor Abdolmalek, please report your absences promptly."
# \w is a word character (letters, digits, underscore), so the pattern splits
# every word of three or more such characters into first / interior / last.
# sub() calls repl() for each match and splices its return value into the result.
word_re = re.compile(r"(\w)(\w+)(\w)")
word_re.sub(repl, text)
'Poefsrosr Aealmlobdk, pslaee reorpt your abnseces plmrptoy.'

In [None]:
text = "He was carefully disguised but captured quickly by police."
# Report every word ending in "ly" together with its zero-padded start/end offsets
adverb_re = re.compile(r"\w+ly\b")
for m in adverb_re.finditer(text):
    span_label = f"{m.start():02d}-{m.end():02d}"
    print(f"{span_label}: {m.group(0)}")

07-16: carefully
40-47: quickly


In [None]:
text = 'doi:10.1038/nphys1170'
# Match a complete DOI: "doi:", one or more "digits." parts, digits, "/", suffix.
# The repeated part is a non-capturing group (?:...) so that findall returns the
# whole match; with a capturing group it would return only the last repetition
# ('10.'). ':' and '/' need no escaping, and the raw string keeps \d, \. and \w
# as regex escapes.
re.findall(r'doi:(?:\d+\.)+\d+/\w+', text)

In [None]:
import re
loglist = []

# Field patterns, compiled once instead of being looked up for every line.
# Raw strings keep the regex escapes (\d, \s, \w, \[) away from Python's own
# string-escape handling.
host_re = re.compile(r'((?:\d{1,3}\.){3}\d{1,3})')  # four dot-separated groups of 1-3 digits
user_re = re.compile(r'(?<=-\s)\w+')                 # word right after "-" plus one whitespace
time_re = re.compile(r'(?<=\[).+(?=\])')             # text between '[' and the last ']'
request_re = re.compile(r'(?<=").+(?=")')            # text between the first and last '"'

# Read everything up front so the file is closed before parsing starts.
with open("logdata.txt", "r") as file:
    logdata = file.read()

for line in logdata.splitlines():
    if not line.strip():
        # Blank lines hold no record; indexing [0] on them would raise IndexError.
        continue
    username = user_re.findall(line)
    logdict = {
        'host': host_re.findall(line)[0],
        # fall back to '-' when no user name is found on the line
        'user_name': username[0] if username else '-',
        'time': time_re.findall(line)[0],
        'request': request_re.findall(line)[0],
    }
    loglist.append(logdict)

# Show the first parsed record as a quick sanity check (nothing to show for an empty log).
if loglist:
    print(loglist[0])



In [2]:
import re
def result():
    """Return each character of the sample string that is directly followed by 'AAA'.

    The look-ahead ``(?=AAA)`` checks for the three A's without consuming them,
    so every match is exactly one character long.

    Returns:
        list[str]: the matched characters, in order of appearance.
    """
    s = 'ACAABAACAAABACDBADDDFSDDDFFSSSASDAFAAACBAAAFASD'

    # Raw string so that \w is a regex escape rather than an invalid Python
    # string escape.
    pattern = r'(\w(?=AAA))'
    # group() (i.e. group 0) is the whole match, which here equals group 1.
    return [item.group() for item in re.finditer(pattern, s)]

print(result())

['C', 'F', 'B']
