In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Sample paragraph used as the input for the regex examples in this notebook.
# It embeds two MM-DD-YYYY dates, a phone number, an e-mail address, a 16-digit
# card number, indented continuation lines and a run of extra spaces
# ("guys.   Yayyy!").
text = """Berkeley MFE is the best financial engineering program. 
     You can study disciplines like data science, derivatives, fixed income, calculus, optimization, statistics etc. 
    The previous batch of MFE started on 03-16-2019. 
    and the class graduated on on 03-13-2020. The phone number of the office is +91-510-999-9912. Also, phone is spelt this way, not as phon.
    The email address of the office is mfe@berkeley.edu.
    Congrats on making it here guys.   Yayyy!
    PS : If you would like to know, my VISA Credit card details are : 4111111111111111"""


# Trying the basics

1. Get the word that starts with a capital letter at the very beginning of the text (`^` anchors the match to the start of the string)

In [3]:
# ^ anchors the match at the very start of the string; [A-Z] requires the first
# character to be an uppercase letter and \w* consumes the rest of that word.
# (A literal such as r'^Berkeley' would only ever find that one hard-coded word.)
pattern = re.compile(r'^[A-Z]\w*')
# Run the compiled pattern over the sample text; the list of matches is the
# cell's displayed output.
pattern.findall(text)


['Berkeley']

2. Get all words that have one or more capital letters

In [9]:
# [A-Z] matches a single uppercase letter and \w+ requires at least one more
# word character (letter, digit or underscore) right after it, so one-letter
# words such as "I" are not returned.
re.findall(r'[A-Z]\w+', text)

['Berkeley',
 'MFE',
 'You',
 'The',
 'MFE',
 'The',
 'Also',
 'The',
 'Congrats',
 'Yayyy',
 'PS',
 'If',
 'VISA',
 'Credit']

3. Using the question mark (?) operator - it makes the character immediately preceding it optional (zero or one occurrence), so `phone?` matches both 'phon' and 'phone'

In [6]:
# 'phon' is mandatory; the '?' makes the 'e' right before it optional,
# so both 'phon' and 'phone' are matched.
re.findall(r'phone?', text)

['phone', 'phone', 'phon']

4. Using the {}. This is used to indicate the repetition of a character. In the word 'Yayyy', we can indicate that y occurs 2 or more times.

In [5]:
# {2,} applies to the character right before it: 'Ya' followed by two or more 'y'.
re.findall(r'Yay{2,}', text)

['Yayyy']

# Identifying Phone Numbers from a transcript : They come in a format 

1. In the standard phone number format, we first need a country code, prefixed with a +. This means that we need **\+[0-9]+**. The escape sequence makes the first + a literal character and not an operator. Then [0-9] matches a single digit, and the plus sign at the end indicates that one or more such digits must be present.

2. Then we need a sequence of XXX-XXX-XXXX. This can be done by \d{3}-\d{3}-\d{4}. Here \d is a placeholder for a single digit, and the number within {} says how many times it must repeat.

In [7]:
# Phone number pattern, assembled from adjacent raw-string pieces (Python joins
# them into one pattern string).
pattern_mobile = re.compile(
    r'\+[0-9]+'            # \+ is a literal plus sign, then the country code: one or more digits
    r'-\d{3}-\d{3}-\d{4}'  # "-XXX-XXX-XXXX": {n} repeats the preceding \d exactly n times
)

In [8]:
# List every phone number found in the sample text.
pattern_mobile.findall(text)

['+91-510-999-9912']

# Identifying dates

We will attempt to identify dates in format of MM-DD-YYYY.
1. For the first part (MM), we indicate the possible combinations - 0 followed by digits from 1-9, or 1 followed by 0,1 or 2. This covers all months from 1-12. This gives us 0[1-9]|1[012].

2. For the second part (DD), the possible combinations are - 0 followed by digits from 1-9, 1/2 followed by 0-9 or 3 followed by 0 or 1. This covers all dates from 01-31. This gives us 0[1-9]|[12][0-9]|3[01]

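3. For the last part, we cover all years from 1900 to 2099. The pattern for this is (19|20)\d\d. The parentheses matter: without them, 19|20\d\d means "19" or "20\d\d", so a year such as 1999 would only match as 19.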


In [173]:
# MM<sep>DD<sep>YYYY date pattern. findall() returns one 5-tuple per date:
# (month, separator, day, separator, year).
pattern_date = re.compile(
    r'(0[1-9]|1[012])'          # month: 0 followed by 1-9, or 1 followed by 0, 1 or 2
    r'(\.|–|-|/)'               # separator: dot, en dash, hyphen or slash
    r'(0[1-9]|[12][0-9]|3[01])' # day: 01-09, 10-29, or 30/31
    r'(\.|–|-|/)'               # separator again (not required to equal the first one)
    # Year 1900-2099. The alternation is wrapped in (?:...) so that \d\d applies
    # to both centuries; a bare (19|20\d\d) means "19" OR "20\d\d" and would
    # capture only '19' for any 19xx year.
    r'((?:19|20)\d\d)'
)

# Keep the matches in a variable so the second print does not depend on a
# `results` name left over from an earlier kernel session.
results = pattern_date.findall(text)

# Each match is a tuple of the pattern's capture groups ...
print(results)
# ... which are joined back into the date string as it appears in the text.
print(["".join(x) for x in results])

[('03', '-', '16', '-', '2019'), ('03', '-', '13', '-', '2020')]
['03-16-2019', '03-13-2020']


# Removing Extra Whitespace

\t indicates a tab. We match every run of spaces and/or tabs and replace it with a single space to remove the extra whitespace.

In [13]:
# Inside square brackets any one of the listed characters may match - here a
# space or a tab - and the trailing + extends the match over a whole run of
# them. Newlines are not in the class, so line breaks are left alone.
pattern3 = re.compile(
    r'[ \t]+'
)
# Collapse each run matched by pattern3 into a single space.
pattern3.sub(" ", text)

'Berkeley MFE is the best financial engineering program. \n You can study disciplines like data science, derivatives, fixed income, calculus, optimization, statistics etc. \n The previous batch of MFE started on 03-16-2019. \n and the class graduated on on 03-13-2020. The phone number of the office is +91-510-999-9912. Also, phone is spelt this way, not as phon.\n The email address of the office is mfe@berkeley.edu.\n Congrats on making it here guys. Yayyy!\n PS : If you would like to know, my VISA Credit card details are : 4111111111111111'

In [14]:
# \s matches any whitespace character - spaces, tabs and newlines alike - so
# listing \t next to it in a character class adds nothing. With the trailing +
# the pattern covers a whole run of mixed whitespace, line breaks included.
pattern3 = re.compile(r'\s+')
# Collapse each run matched by pattern3 into a single space.
pattern3.sub(" ", text)

'Berkeley MFE is the best financial engineering program. You can study disciplines like data science, derivatives, fixed income, calculus, optimization, statistics etc. The previous batch of MFE started on 03-16-2019. and the class graduated on on 03-13-2020. The phone number of the office is +91-510-999-9912. Also, phone is spelt this way, not as phon. The email address of the office is mfe@berkeley.edu. Congrats on making it here guys. Yayyy! PS : If you would like to know, my VISA Credit card details are : 4111111111111111'

# Detecting Email Addresses

We tackle this by breaking it into 3 parts : username, domain (gmail) and top-level domain (.com). [blackrock.com etc]

The username can be a combination of alphabets, numbers and some special characters (like \_) : \[A-Z0-9._\%+-\] <br>
The domain name can have alphabets, numbers, periods (.) and hyphens(-) : \[A-Z0-9.-\] <br>
The top-level domain will have alphabets and will be 2 or more characters long : \[A-Z\]{2,} <br>


In [15]:
# E-mail pattern, assembled from adjacent raw-string pieces (Python joins them
# into one pattern string). The classes only list A-Z; re.I (the short alias of
# re.IGNORECASE) makes them match lowercase letters as well.
pattern_email = re.compile(
    r'[A-Z0-9._%+-]+'  # username: one or more letters, digits or . _ % + -
    r'@'
    r'[A-Z0-9.-]+'     # domain: one or more letters, digits, dots or hyphens
    r'\.[A-Z]{2,}',    # top-level domain: a literal dot, then two or more letters
    re.I,
)
# List every e-mail address found in the sample text.
pattern_email.findall(text)

['mfe@berkeley.edu']

# Detecting Credit Card Numbers

This is a popular problem, however the most feasible way to do this via regex is to consider the patterns with respect to different card providers : Visa, Mastercard, AmEx. <br>
Here we have taken the example of VISA, whose card numbers start with a 4 and are either 16 digits long (new cards) or 13 digits long (older cards)


In [177]:
# VISA card number: a leading 4 plus 12 more digits (13-digit older cards),
# optionally followed by 3 further digits (16-digit newer cards).
# - The trailing ? on the non-capturing group (?:[0-9]{3}) is what makes those
#   last 3 digits optional; without it only 16-digit numbers can match.
# - The \b anchors stop the optional tail from letting the pattern match the
#   first 13 digits of a longer run of digits.
# - The outer capturing group is kept so findall() still returns plain strings
#   and group(1) still holds the card number.
pattern_credit = re.compile(r'\b(4[0-9]{12}(?:[0-9]{3})?)\b')
# List every card number found in the sample text.
pattern_credit.findall(text)

['4111111111111111']

# Easier Way to find dates

In [178]:
# Alternative to the hand-written date regex: let the datefinder package
# locate the dates in the sample text.
import datefinder
# The result of find_dates is iterated exactly once by the loop below.
matches = datefinder.find_dates(text)
for date in matches:
    # Each item prints as a full timestamp (e.g. "2019-03-16 00:00:00").
    print(date)

2019-03-16 00:00:00
2020-03-13 00:00:00
