# Function
findall  Returns a list containing all matches
search   Returns a Match object if there is a match anywhere in the string
split    Returns a list where the string has been split at each match
sub      Replaces one or many matches with a string


 Metacharacters
Metacharacters are characters with a special meaning:

[]	A set of characters	"[a-m]"	
\	Signals a special sequence (can also be used to escape special characters)	"\d"	
.	Any character (except newline character)	"he..o"	
^	Starts with	"^hello"

" Dollar Sign Ends with "planet$"	
" *	Zero or more occurrences	"he.*o"	
" +	One or more occurrences	"he.+o"	
" ?	Zero or one occurrences	"he.?o"	
" {}	Exactly the specified number of occurrences	"he.{2}o"	
" |	Either or	"falls|stays"	
" ()	Capture and group	 

# Set
[arn]	Returns a match where one of the specified characters (a, r, or n) is present	
[a-n]	Returns a match for any lower case character, alphabetically between a and n	
[^arn]	Returns a match for any character EXCEPT a, r, and n	
[0123]	Returns a match where any of the specified digits (0, 1, 2, or 3) are present	
[0-9]	Returns a match for any digit between 0 and 9	
[0-5][0-9]	Returns a match for any two-digit numbers from 00 and 59	
[a-zA-Z]	Returns a match for any character alphabetically between a and z, lower case OR upper case	
[+]	In sets, +, *, ., |, (), $,{} has no special meaning, so [+] means: return a match for any + character in the string

# Special Sequences
A special sequence is a \ followed by one of the characters in the list below, and has a special meaning:

Character	Description	Example	Try it
\A	Returns a match if the specified characters are at the beginning of the string	"\AThe"	
\b	Returns a match where the specified characters are at the beginning or at the end of a word
(the "r" in the beginning is making sure that the string is being treated as a "raw string")	r"\bain"
r"ain\b"	
\B	Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word
(the "r" in the beginning is making sure that the string is being treated as a "raw string")	r"\Bain"
r"ain\B"	
\d	Returns a match where the string contains digits (numbers from 0-9)	"\d"	
\D	Returns a match where the string DOES NOT contain digits	"\D"	
\s	Returns a match where the string contains a white space character	"\s"	
\S	Returns a match where the string DOES NOT contain a white space character	"\S"	
\w	Returns a match where the string contains any word characters (characters from a to Z, digits from 0-9, and the underscore _ character)	"\w"	
\W	Returns a match where the string DOES NOT contain any word characters	"\W"	
\Z	Returns a match if the specified characters are at the end of the string	"Spain\Z"

In [None]:
import re
txt = "The rain in Spain"

# findall	
Returns a list containing all matches

In [None]:
x = re.findall("ai", txt)
print(x)
x = re.findall("Portugal", txt)
print(x)
x = re.findall("[a-m]", txt)
print(x)

# search	
Returns a Match object if there is a match anywhere in the string

In [None]:
#Check if the string starts with "The" and ends with "Spain":
x = re.search("^The.*Spain$", txt)
if x:
  print("YES! We have a match!")
else:
  print("No match")

In [None]:
x = re.search("\s", txt)
print("The first white-space character is located in position:", x.start())

x = re.search("Portugal", txt)
print(x)

x = re.search("ai", txt)
print(x) #this will print an object

x = re.search(r"\bS\w+", txt)
print(x.span())

# split
Returns a list where the string has been split at each match

In [None]:
x = re.split("\s", txt)
print(x)

x = re.split("\s", txt, 1)
print(x)

# sub
Replaces one or many matches with a string

In [None]:
x = re.sub("\s", "9", txt)
print(x)

x = re.sub("\s", "9", txt, 2)
print(x)

In [None]:
import re
sentiment_analysis = "@robot9! @robot4& I have a good feeling that the show isgoing to be amazing! @robot9$ @robot7%"
regex = r"\B@\w{5}\d\D"
# regex = r"@robot\d\W" Datacamp answer

# Find all matches of regex
print(re.findall(regex, sentiment_analysis))


In [None]:
sa = "Unfortunately one of those moments wasn't a giant squid monster. User_mentions:2, likes: 9, number of retweets: 7"
print(re.findall(r"User_mentions:\d", sa))
print(re.findall(r"likes:\s\d", sa))
print(re.findall(r"number.:\s\d", sa))
# print(re.findall(r"number\sof\sretweets:\s\d", sentiment_analysis)) Datacamp

In [None]:
saa = "He#newHis%newTin love with$newPscrappy. #8break%He is&newYmissing him@newLalready"
regex_sentence = r"\B\W\dbreak\W"
print(re.findall(regex_sentence, saa))

In [None]:
NameAge = '''
Janica is 22 and Theon is 33
Gabrial is 44 and Joey is 21
'''
regex = r"[A-Z][a-z]*|\d{1,3}"
ages = re.findall(r'\d{1,3}', NameAge)
name = re.findall(r'[A-Z][a-z]*', NameAge)
nameage = re.findall(regex, NameAge)
xx = dict()
x = 0
for names in name:
    xx[names] = ages[x]
    x += 1      
print(xx)

In [None]:
mt = 'BEER IS NICE'
print(re.findall('BEER', mt))
print(re.search('BEER', mt))
if re.search('BEER', mt):
    print("Beer is here")

In [None]:
str = "we need to inform him with the latest information"
for i in re.finditer("inform", str):
    print(i.span())
print(str[11:17])
print(str[38:44])

In [None]:
str = "Sat, hat, mat, pat"
print(re.findall("[Shmp]at", str)) # word start with 
print(re.findall("[h-z]at", str))  # exists on h se z
print(re.findall("[^h-z]at", str)) # not exists on h se z

In [None]:
food = "hat rat mat pat"
regex = re.compile("[r]at")
food = regex.sub("food", food)
print(food)

In [None]:
randstr = "here is \\mystring"
print(re.search(r"\\mystring", randstr))
print(randstr)

In [None]:
randstr = '''
Keep the blue flag
flying high
clelsea
'''
print(randstr)
print(re.sub("\n", " ", randstr))
regex = re.compile("\n")
randstr = regex.sub(" ", randstr)
print(randstr)

In [None]:
randstr = "12345abc"
print("Matches Digit:", len(re.findall("\d", randstr)))
print("Matches Non Digit:", len(re.findall("\D", randstr)))
num = "123 1234 12345 123456 1234567"
print("Matches:", len(re.findall("\d{3,10}", num)))

In [None]:
phone = "412-555-1212"
if re.search("\d{3}-\d{3}-\d{4}", phone):
    print("Right Phone Format")

if re.search("\w{2,20}\s\w{2,20}", "Shahzad Ahmed"):
    print("valid format")

In [None]:
email = "shaze@gmail.com md @.com @seo.com dc@.com"
print("Email Matches:", len(re.findall("[\w._%+-]{1,20}[\w.-]{2,20}.[A-Za-z]{2,3}", email)))

In [None]:
print(re.findall("[\w._%+-]{1,20}", email))
print(re.findall("[\w._%+-]{1,20}[\w.-]{2,20}", email))
print(re.findall("[\w._%+-]{1,20}[\w.-]{2,20}.[A-Za-z]{2,3}", email))

In [None]:
import urllib.request
from re import findall
url = "http://www.summet.com/dmsi/html/codesamples/addresses.html"
response = urllib.request.urlopen(url)
html = response.read()
htmlstr = html.decode()
pdata = findall("\(\d{3}\) \d{3}-\d{4}", htmlstr)
for item in pdata:
    print(item)

In [1]:
mystr = '''abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890

Ha HaHa

MetaCharacters (Need to be escaped):
.[{()\^$|?*+


321-555-4321
123.555.1234 

Mr. Shahzad
Mr Qasim
Ms Davis
Mrs. Robinson
Mr. T
Saylani@gmail.com
shaze.ahmed@university.edu
shahzad-321-Ahmed@my-work.net

https://www.google.com
http://github.com
https://youtube.com
https://www.nasa.gov
'''

In [2]:
import re
# regex = "\d{3}[.-]\d{3}[.-]\d{4}"
# regex = "M[r|s][rs]?\.?\s[A-Za-z]*\w+"
regex = "[A-Za-z0-9.-]+?@[A-Za-z-.]+"
print(re.findall(regex,mystr ))

['Saylani@gmail.com', 'shaze.ahmed@university.edu', 'shahzad-321-Ahmed@my-work.net']


In [177]:
# Getting Web Address
import re
# regex = "https?://[www\.]?"
regex = "https?://\w+\.\w+[\.\w+]?\w+"
xx = re.findall(regex, mystr)
print(xx)

['https://www.google.com', 'http://coreyms.com', 'https://youtube.com', 'https://www.nasa.gov']


In [214]:
# Phone Number
import re
# regex = "\d{3}[\.-]\d{3}[\.-]\d{4}
regex = "\d{3}[-|\.|\s]\d{3}[-|\.|\s]\d{4}"
xx = re.findall(regex, mystr)
print(xx)

['321-555-4321', '123.555.1234', '152 555 5452']


In [217]:
# Getting Names
import re
regex = "M[rs|r|s]\.?\s[A-Z]\w*"
xx = re.findall(regex, mystr)
print(xx)

['Mr. Schafer', 'Mr Smith', 'Ms Davis', 'Mr. T']


In [134]:
# Getting Emails Address
import re
regex = "[A-Za-z0-9.-]+?@[A-Za-z-.]+"
xx = re.findall(regex, mystr)
for x in xx:
    print(x)

CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
