In [None]:
# Raw string: regex tokens such as \d, \w and \: are kept literally and do not
# trigger Python's invalid-escape-sequence warning.
r'''Matching Characters:

.: Matches any single character except newline.
[ ]: Matches any character inside the brackets.
[^ ]: Matches any character not inside the brackets.
\: Escapes special characters.
Quantifiers:

*: Matches zero or more occurrences of the preceding element.
+: Matches one or more occurrences of the preceding element. (at least)
?: Matches zero or one occurrence of the preceding element.
{n}: Matches exactly n occurrences of the preceding element.
{n,}: Matches n or more occurrences of the preceding element.
{n,m}: Matches between n and m occurrences of the preceding element.
Anchors:

^: Matches the start of the string.
$: Matches the end of the string.
Character Classes:

\d: Matches any digit.
\D: Matches any non-digit.
\w: Matches any word character (alphanumeric and underscore).
\W: Matches any non-word character.
\s: Matches any whitespace character.
\S: Matches any non-whitespace character.
Grouping and Alternation:

(): Groups patterns together.
|: Alternation, matches either the pattern before or after it.
Assertions:

(?=...): Positive lookahead assertion.
(?!...): Negative lookahead assertion.
(?<=...): Positive lookbehind assertion.
(?<!...): Negative lookbehind assertion.
Modifiers:

i: Case-insensitive matching.
m: Multiline matching.
s: Allows . to match newline characters.
x: Ignores whitespace and comments within the pattern.'''

In [None]:
#‘[er]’ : set of characters
#character class
#gr[ae]y = gray, grey
#[A-Z] = range of upper case letters. 
#\ to remove special meaning of dash -
# Raw string: \n, \t, \w, ... stay literal two-character tokens in the displayed notes
# (in a normal string \n/\t/\f become control characters and \w, \d, \s are invalid escapes).
r'''
Meta-characters
[]  : Match set of characters
.   : Match any character except the newline character (\n)
^   : 1) Match characters not listed if within set or 
      2) match beginning of string
$   : Match end of string
|   : Functions as an "OR" operator


Specify the number of characters in the pattern:
*: Matches previous character 0 or more times
+: Matches previous character 1 or more times
?: Matches previous character 0 or 1 times (optional)
{}: Matches previous characters however many times specified within:
{n} : Exactly n times 
{n,} : At least n times
{n,m} : Between n and m times


Character classes
\w: Any word character. Equivalent to [A-Za-z0-9_] for ASCII text; with str patterns it also matches Unicode letters and digits
\W: Any non-word character. Equivalent to [^A-Za-z0-9_] for ASCII text
\d: Any numeric character. [0-9]
\D: Any non-numeric character. [^0-9]
\s: Any whitespace character. [ \t\n\r\f\v]
\S: Any non-whitespace character.
'''

In [1]:
import re #reg ex


In [3]:
# Plain literal pattern: collect every occurrence of the substring "pers".
text = "That person wears marvelous trousers."
pattern = 'pers'
strings = re.compile(pattern).findall(text)
print(strings)

['pers']


In [None]:
# Re-run the current pattern; as the cell's last expression the list is shown as the cell output.
re.compile(pattern).findall(text)

In [None]:
# Single-character literal: one list entry per occurrence of the letter.
pattern = "a"
re.findall(pattern, string=text)

In [None]:
# Two-character literal: matches the exact sequence "er".
pattern = "er"
re.compile(pattern).findall(text)

In [4]:
# Character set: matches a single character that is either "e" or "r".
pattern = "[er]"
re.findall(pattern, string=text)

['e', 'r', 'e', 'r', 'r', 'e', 'r', 'e', 'r']

In [None]:
# A set inside a word: [ae] accepts exactly one of the two vowels, so "graey" is not matched.
text = "Is it spelled gray or grey or graey?"

pattern = 'gr[ae]y'
re.compile(pattern).findall(text)

In [None]:
# Range inside a set: [A-Z] matches any single uppercase ASCII letter.
text = 'This is an A and B "conversation" - so C your way out of it.'

pattern = '[A-Z]'
re.findall(pattern, string=text)


In [None]:
# Raw string keeps the backslash literal; in a normal string "\b" would be a backspace character.
text = r"I'm not going to the party - \because 1) Karen is going, 2] I don't like her, and 3) I already have a headache."

# [0-9] is the range of digits 0 through 9; '[0\-9]' escapes the dash and matches only '0', '-' or '9'.
pattern = '[0-9]'
print(re.findall(pattern, text))

In [None]:
'''
Meta-characters
[]  : Match set of characters
.   : Match any character except the newline character (\n)
^   : 1) Match characters not listed if within set or 
      2) match beginning of string
$   : Match end of string
|   : Functions as an "OR" operator
'''
# Emits an empty line; presumably here so the notes string above is not echoed as the cell's output.
print("")

In [5]:
# "." matches every character except the newline embedded in the text.
text = "My boss asked me to turn in my TPS reports. \nI told her they were done, but they are not."
print(text)
pattern = '.'
print(re.compile(pattern).findall(text))

My boss asked me to turn in my TPS reports. 
I told her they were done, but they are not.
['M', 'y', ' ', 'b', 'o', 's', 's', ' ', 'a', 's', 'k', 'e', 'd', ' ', 'm', 'e', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'i', 'n', ' ', 'm', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'e', 'p', 'o', 'r', 't', 's', '.', ' ', 'I', ' ', 't', 'o', 'l', 'd', ' ', 'h', 'e', 'r', ' ', 't', 'h', 'e', 'y', ' ', 'w', 'e', 'r', 'e', ' ', 'd', 'o', 'n', 'e', ',', ' ', 'b', 'u', 't', ' ', 't', 'h', 'e', 'y', ' ', 'a', 'r', 'e', ' ', 'n', 'o', 't', '.']


In [6]:
# Negated set: keep every character that is NOT a lowercase letter a through m
# (spaces, punctuation, uppercase letters and newlines all qualify).
pattern = "[^a-m]"
print(re.compile(pattern).findall(text))

['M', 'y', ' ', 'o', 's', 's', ' ', 's', ' ', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'n', ' ', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'p', 'o', 'r', 't', 's', '.', ' ', '\n', 'I', ' ', 't', 'o', ' ', 'r', ' ', 't', 'y', ' ', 'w', 'r', ' ', 'o', 'n', ',', ' ', 'u', 't', ' ', 't', 'y', ' ', 'r', ' ', 'n', 'o', 't', '.']


In [7]:
# Outside a set, ^ anchors the match to the start of the string.
pattern = "^[Mm]y"
print(re.findall(pattern, string=text))

['My']


In [8]:
# $ anchors the match to the end of the string. The final dot is escaped so that it
# matches a literal period; an unescaped "." would accept any character in that position.
pattern = r'they are not\.$'
print(re.findall(pattern, text))


['they are not.']


In [None]:
# Wrap the text in visible markers so its exact start and end can be seen.
print(f'**{text}**')

In [None]:
# Alternation: "|" lets any one of the listed alternatives match at a given position.
text = "My boss asked me to turn in myyy TPS reports. \n I told my boss they were done, but they are not."
print(text)

pattern = '[Mm]y|TPS|reports'
print(re.compile(pattern).findall(text))

In [None]:
# Plain substring search with str.find (no regex involved): index of the first hit, or -1 if absent.
text.find("reports")

In [None]:
# finditer yields Match objects (text plus position) instead of plain strings.
for x in re.finditer('[Mm]y|TPS|reports', text):
    print(x)

# Materialise the iterator so the Match objects can be inspected as a list.
xlist = list(re.finditer(r'[Mm]y|TPS|reports', text))
xlist

In [None]:
# For every hit, print the index where it starts followed by the matched text.
for match in re.finditer(r'[Mm]y|TPS|reports', text):
    print(match.start())
    print(match[0])

In [None]:
'''
Specify the number of characters in the pattern:
*: Matches previous character 0 or more times
+: Matches previous character 1 or more times
?: Matches previous character 0 or 1 times (optional)
{}: Matches previous characters however many times specified within:
{n} : Exactly n times 
{n,} : At least n times
{n,m} : Between n and m times
'''
# Emits an empty line; presumably here so the notes string above is not echoed as the cell's output.
print("")

In [None]:
# "*" allows zero or more of the preceding "a", so a bare "ct" inside a word also matches.
text = "The complicit cat interacted with the other cats exactly as we expected."

pattern = "ca*t"
print(re.compile(pattern).findall(text))

In [9]:
# "+" requires at least one "a" between "c" and "t", so "ct" no longer matches.
text = "The complicit caaaat interacted with the other cats exactly as we expected."

pattern = "ca+t"
print(re.findall(pattern, string=text))

['caaaat', 'cat']


In [None]:
# "?" makes the preceding "u" optional: zero or one occurrence, never more.
text = "Is the correct spelling color or colour or colouuuur?"

pattern = "colou?r"
print(re.compile(pattern).findall(text))

In [None]:
# {2}: exactly two "w" after the "a"; longer runs still match, but only their first two "w".
text = "Let's see how we can match the following: aw, aww, awww, awwww, awwwww"

pattern = "aw{2}"
print(re.compile(pattern).findall(text))

In [None]:
# {2,}: two or more "w" after the "a"; the quantifier is greedy and takes the whole run.
text = "Let's see how we can match the following: aw, aww, awww, awwww, awwwww"

pattern = "aw{2,}"
print(re.findall(pattern, string=text))

In [None]:
# {2,4}: between two and four "w" after the "a"; a run of five is cut off at four.
text = "Let's see how we can match the following: aw, aww, awww, awwww, awwwww"

pattern = "aw{2,4}"
print(re.compile(pattern).findall(text))

In [None]:
# Raw string: \w, \d, \s, \t, \n, \f stay literal two-character tokens in the displayed notes
# (in a normal string \t/\n/\f become control characters and \w, \d, \s are invalid escapes).
r'''
Character classes
\w: Any word character. Equivalent to [A-Za-z0-9_] for ASCII text; with str patterns it also matches Unicode letters and digits
\W: Any non-word character. Equivalent to [^A-Za-z0-9_] for ASCII text
\d: Any numeric character. [0-9]
\D: Any non-numeric character. [^0-9]
\s: Any whitespace character. [ \t\n\r\f\v]
\S: Any non-whitespace character.
'''

In [None]:
# \w matches one word character (letter, digit or underscore) at a time.
text = "Th1s is going to_be a weird sentence: with @ bunch-of-$tuff in it <3."
# Raw string: "\w" is not a valid Python string escape, so the pattern is written as r'...'.
pattern = r'\w'
print(re.findall(pattern, text))

In [None]:
# \W is the complement of \w: spaces, punctuation and symbols.
text = "Th1s is going to_be a weird sentence with @ bunch-of-$tuff in it <3."

# Raw string: "\W" is not a valid Python string escape, so the pattern is written as r'...'.
pattern = r'\W'
print(re.findall(pattern, text))

In [None]:
# \d+ matches a run of one or more consecutive digits.
text = "Th1s is going to_be a weird sentence with @ bunch-of-$tuff in it <30."
# Raw string: "\d" is not a valid Python string escape, so the pattern is written as r'...'.
pattern = r'\d+'
print(re.findall(pattern, text))

In [None]:
# Extracting words: runs of word characters and apostrophes.
# For str patterns \w is Unicode-aware, so the accented letters at the end match as well.
text = "If you tell the_truth, you don't have to remember anything. ä ü ö á ó í ñ ß "

# Raw string: "\w" is not a valid Python string escape, so the pattern is written as r"...".
pattern = r"[\w']+"
print(re.findall(pattern, text))

In [None]:
# Words of a specified minimum length: at least five word characters / apostrophes in a row.
# Raw string: "\w" is not a valid Python string escape, so the pattern is written as r"...".
pattern = r"[\w']{5,}"
print(re.findall(pattern, text))

In [None]:
# As a curiosity (you are not going to need this):
# words of a specified maximum length need the word boundary \b on both sides.
# \b matches the boundary position itself, not any characters.
# In a raw string it is written as a single \b; in a normal string it would have to be
# doubled ("\\b") because "\b" on its own is the backspace character.
pattern = r"\b[\w']{1,5}\b"
print(re.findall(pattern, text))

In [None]:
# One uppercase letter followed by one or more lowercase letters: capitalised word pieces.
text = "TerraPower, a nuclear-energy company founded by Bill Gates, is unlikely to follow through on building a demonstration reactor in China, due largely to the Trump administration’s crackdown on the country."
print(text)
pattern = '[A-Z][a-z]+'
print(re.compile(pattern).findall(text))

In [None]:
# Two capitalised pieces (optionally separated by one space) OR a single capitalised piece.
# The two-piece alternative is listed first so it is tried before the single-piece fallback.
# Because the pattern contains capture groups, findall returns one tuple per match with one
# slot per group; the group that did not take part in the match is ''.
#
# Ungrouped variants to compare how the order of alternatives changes the result:
# pattern1 = '[A-Z][a-z]+|[A-Z][a-z]+ ?[A-Z][a-z]+'
# pattern2 = '[A-Z][a-z]+ ?[A-Z][a-z]+|[A-Z][a-z]+'
pattern = '([A-Z][a-z]+ ?[A-Z][a-z]+)|([A-Z][a-z]+)'
print(re.findall(pattern, text))


In [None]:
# With capture groups, findall yields one tuple per match; groups that did not take part are ''.
text2 = "My boss asked me to turn in my TPS reports. \n I told him they were done, but they are not."
pattern2 = '(boss)|(reports)|(TPS)'
print(re.compile(pattern2).findall(text2))


In [None]:
# Flatten the per-match tuples into a single list, dropping the empty placeholders
# left by groups that did not take part in a match.
results = [
    hit
    for groups in re.findall(pattern2, text2)
    for hit in groups
    if hit
]
results

In [None]:
# ".*" is greedy and "." does not cross newlines, so the match runs from the first
# double quote on a line to the last double quote on that same line.
text = """
For eight young men the AP tracked down in Seattle, tech obsession has become something much darker, getting in the way of their normal lives.
"We’re talking flunk-your-classes, can’t-find-a-job, live-in-a-dark-hole kinds of problems, with depression, anxiety and sometimes suicidal thoughts part of the mix," the AP's Martha Irvine reports.
"""

print(text)
pattern = '".*"'
re.compile(pattern).findall(text)

In [10]:

# Sample listing used for phone-number extraction: one airline per line followed by its
# phone number(s). Most numbers are digits in a 3-3-4 layout; one line carries two numbers
# separated by a comma, one number is spelled with letters, and one has only three digits
# in its last part.
text = """
Aeromexico 800-237-6639
Air Canada 888-247-2262
Air Canada Rouge 888-247-2262
Air Creebec 800-567-6567
Air Inuit 800-361-2965
Air North 800-661-0407
Air Tindi 888-545-6794
Air Transat 866-847-1112
Alaska Airlines 800-426-0333,866-516-1685
Allegiant Air 702-505-8888
American Airlines 800-433-7300
Bearskin Airlines 807-577-1141
Buffalo Airways 867-874-3333
Calm Air 800-839-2256
Cape Air 800-227-3247
Delta Air Lines 800-455-2720
First Air 800-267-1247
Flair Airlines 204-888-2665
Frontier Airlines 801-401-9000
Harb-or-Air 800-665-0212
Hawaiian Airlines 877-426-4537
Horizon Air 800-547-9308
InterJet 866-285-8307
Island Air 800-388-1105
JetBlue 800-538-2583
Porter Airlines 888-619-8622
Silver Airways 801-401-9100
Southwest Airlines 800-435-9792
Spirit Airlines 801-401-2222
Sun Country Airlines 800-359-6786
Sunwing 877-SUN-WING
Thunder Airlines 800-803-9943
United Airlines 800-864-8331
Virgin America 877-359-8474
VivaAerobus 888-935-988 
Volaris 855-865-2747
WestJet Airlines 888-937-8538
"""

In [11]:
# Three digits, a dash, three digits/uppercase letters, a dash, then three or four
# digits/uppercase letters. Letters are allowed so that vanity numbers are found too.
# Raw string: "\d" is not a valid Python string escape, so the pattern is written as r'...'.
pattern = r'\d{3}-[0-9A-Z]{3}-[0-9A-Z]{3,4}'
re.findall(pattern, text)

['800-237-6639',
 '888-247-2262',
 '888-247-2262',
 '800-567-6567',
 '800-361-2965',
 '800-661-0407',
 '888-545-6794',
 '866-847-1112',
 '800-426-0333',
 '866-516-1685',
 '702-505-8888',
 '800-433-7300',
 '807-577-1141',
 '867-874-3333',
 '800-839-2256',
 '800-227-3247',
 '800-455-2720',
 '800-267-1247',
 '204-888-2665',
 '801-401-9000',
 '800-665-0212',
 '877-426-4537',
 '800-547-9308',
 '866-285-8307',
 '800-388-1105',
 '800-538-2583',
 '888-619-8622',
 '801-401-9100',
 '800-435-9792',
 '801-401-2222',
 '800-359-6786',
 '877-SUN-WING',
 '800-803-9943',
 '800-864-8331',
 '877-359-8474',
 '888-935-988',
 '855-865-2747',
 '888-937-8538']

In [None]:
# Escape handling in ordinary vs. raw string literals.
# "\d" and "\p" are not recognised string escapes, so Python keeps the backslash
# (newer Python versions warn about such invalid escape sequences).
print('\d')  # prints \d
print('\\d')  # prints \d   ("\\" collapses to a single backslash)
print('\\\d')  # prints \\d  (one backslash from "\\" plus the untouched "\d")

print("\p")  # prints \p
print("\\p")  # prints \p
print("\\\p")  # prints \\p

# Raw strings keep every backslash exactly as typed.
print(r'\d')  # prints \d
print(r'\\d')  # prints \\d
print(r'\\\d')  # prints \\\d

In [None]:
# Both literals are raw strings: the text contains two literal backslashes, and the
# pattern (an escaped backslash) matches one literal backslash per hit.
text = r'I am not going to the party - \\ because 1) Karen is going, 2] I do not like her, and 3) I already have a headache.'

pattern = r'\\'
re.compile(pattern).findall(text)

In [None]:
# Length of the second match in the list.
# NOTE(review): assumes pattern/text still come from the backslash example, where each hit is one character. Confirm.
len(re.compile(pattern).findall(text)[1])

In [None]:
# Order of alternatives matters: the regex engine takes the first alternative that matches,
# so the longer phrase is only found when it is listed before the shorter one.
text = "testing testing one two three testing"
pattern1 = 'testing|testing testing'
pattern2 = 'testing testing|testing'
print(re.compile(pattern1).findall(text))
print(re.compile(pattern2).findall(text))
