In [9]:
import re

In [8]:
# Sample chat messages used to practise extracting phone numbers and e-mail
# addresses. Regex playground for experimenting: https://regex101.com/
# (In Jupyter, Ctrl + Enter runs the current cell.)
chat1 = "codebasics: you ask lot of questions 😠  1235678912, abc@xyz.com, 9998881234"
chat2 = "codebasics: here it is: (123)-567-8912, abc@xyz.com"
chat3 = "codebasics: yes, phone: 1235678912 email: abc@xyz.com"

In [10]:
# \d        : any single digit
# \d{10}    : exactly ten digits in a row
# \( \) \.  : (, ) and . are regex metacharacters; put \ in front to match them literally
# a|b       : alternation - match either a or b
# The pattern is a raw string (r'...') so the backslashes reach the regex engine
# unchanged; '\d' and '\(' in a normal string literal are invalid escape
# sequences and trigger a warning on recent Python versions.
pattern = r'\d{10}|\(\d{3}\)-\d{3}-\d{4}'
# Find every phone number in chat3: either 10 plain digits or the (xxx)-xxx-xxxx form.
matches = re.findall(pattern, chat3)
matches

['1235678912']

In [16]:
# [a-z]         : one character in the range a-z
# [a-z0-9A-Z_]  : one lowercase letter, digit, uppercase letter or underscore
# X*            : zero or more repetitions of X (can match nothing at all)
# X+            : one or more repetitions of X
# + is used instead of * so that a string with an empty name or domain, such as
# '@.', is not reported as an e-mail address.
# Raw string (r'...') so that \. reaches the regex engine as a literal-dot escape.
pattern = r'[a-z0-9A-Z_]+@[a-z0-9A-Z]+\.[a-zA-Z]+'
matches = re.findall(pattern, chat3)

# First address found in the message.
email = matches[0]
email

'abc@xyz.com'

In [2]:
# Sample support messages that mention an order number in three different phrasings.
# Note: these rebind chat1..chat3, so cells that read them afterwards see these texts.
chat1 = "codebasics: Hello, I am having an issue with my order # 412889912"
chat2 = "codebasics: I have a problem with my order number 412889912"
chat3 = "codebasics: My order 412889912 is having an issue, I was charged 300$ when online it says 280$"

In [5]:
# [^abc]  : any single character except a, b or c (negated set)
# [^\d]*  : zero or more non-digit characters (skips ' # ', ' number ', ...)
# (...)   : capture group - findall returns only what the group matched
# (\d*)   : the run of digits that follows, i.e. the order number
# Raw string (r'...') so that \d reaches the regex engine unchanged.
pattern = r'order[^\d]*(\d*)'
matches = re.findall(pattern, chat3)

# Stored under its own name so the value is not mislabelled as an e-mail and
# does not overwrite the `email` variable from the e-mail example.
order_number = matches[0]
order_number

'412889912'

In [10]:
# Multi-line biography sample used as input for the extraction cells that read `text`.
# The literal contains tab characters between labels and values and some
# zero-width characters on otherwise empty-looking lines; leave it as-is.
text='''
Born	Elon Reeve Musk
June 28, 1971 (age 50)
Pretoria, Transvaal, South Africa
Citizenship	
South Africa (1971–present)
Canada (1971–present)
United States (2002–present)
Education	University of Pennsylvania (BS, BA)
Title	
Founder, CEO and Chief Engineer of SpaceX
CEO and product architect of Tesla, Inc.
Founder of The Boring Company and X.com (now part of PayPal)
Co-founder of Neuralink, OpenAI, and Zip2
Spouse(s)	
Justine Wilson
​
​(m. 2000; div. 2008)​
Talulah Riley
​
​(m. 2010; div. 2012)​
​
​(m. 2013; div. 2016)
'''

In [11]:
# a+     : one or more of a
# \d+    : one or more digits
# (\d+)  : capture group around the digits, so findall returns just the age
# Raw string (r'...') so that \d reaches the regex engine unchanged.
pattern = r'age (\d+)'

matches = re.findall(pattern, text)
matches

['50']

In [22]:
# .     : any single character except a line break
# X*    : zero or more repetitions of X
# .*    : therefore "the rest of the current line"
# (.*)  : capture everything after 'Born' on that line -> the name
# The trailing \n is optional here; the pattern works with or without it.
pattern = 'Born(.*)\n'
matches = re.compile(pattern).findall(text)

# Take the first hit as a plain string (not a list) and trim the
# surrounding whitespace.
str.strip(matches[0])

'Elon Reeve Musk'

In [18]:
# Born.*\n  : the rest of the 'Born' line, up to and including its line break
# (.*)      : capture what follows on the next line (June 28, ...)
# \(age     : ...up to the literal '(age'; ( is a metacharacter, hence the backslash
# Raw string (r'...') so that \( reaches the regex engine unchanged.
pattern = r'Born.*\n(.*)\(age'

matches = re.findall(pattern, text)
matches[0].strip()

'June 28, 1971'

In [19]:
# Start at the literal '(age', skip to the end of that line, then capture the
# whole following line => birthplace.
# Raw string (r'...') so that \( reaches the regex engine unchanged.
pattern = r'\(age.*\n(.*)'

matches = re.findall(pattern, text)
matches[0].strip()

'Pretoria, Transvaal, South Africa'

In [20]:
def get_pattern_match(pattern, text):
    """Return the first result of ``re.findall(pattern, text)``, or None.

    Args:
        pattern: Regular expression to search for.
        text: String to search in.

    Returns:
        The first element of the ``re.findall`` result list, or None when
        the list is empty.
    """
    found = re.findall(pattern, text)
    return found[0] if found else None

In [21]:
# Same birthplace lookup as before, now through the helper.
# Raw string (r'...') so that \( reaches the regex engine unchanged.
get_pattern_match(r'\(age.*\n(.*)', text)

'Pretoria, Transvaal, South Africa'

In [27]:
def get_personal_information(text):
    """Extract age, name, birth date and birth place from a biography text.

    Each field is looked up with ``get_pattern_match``. A field whose pattern
    does not match is reported as None instead of raising, so a partially
    formatted text still yields the fields that could be found.

    Args:
        text: Biography text containing a 'Born' line followed by a line with
            the birth date and '(age NN)', then a line with the birth place.

    Returns:
        dict with keys 'age' (int or None), 'name', 'birth_date' and
        'birth_place' (whitespace-stripped str or None).
    """
    # Raw strings keep the regex backslashes (\d, \() intact.
    age = get_pattern_match(r'age (\d+)', text)
    full_name = get_pattern_match(r'Born(.*)\n', text)
    birth_day = get_pattern_match(r'Born.*\n(.*)\(age', text)
    birth_place = get_pattern_match(r'\(age.*\n(.*)', text)

    def _clean(value):
        # get_pattern_match returns None when nothing matched; pass that through.
        return value.strip() if value is not None else None

    return {
        # The age is captured as a string of digits; convert it to int.
        'age': int(age) if age is not None else None,
        'name': _clean(full_name),
        'birth_date': _clean(birth_day),
        'birth_place': _clean(birth_place)
    }

In [28]:
# Run the extractor on the biography currently bound to `text`.
get_personal_information(text=text)

{'age': 50,
 'name': 'Elon Reeve Musk',
 'birth_date': 'June 28, 1971',
 'birth_place': 'Pretoria, Transvaal, South Africa'}

In [29]:
# Second biography sample, used to check the extractor against a different
# date format ('19 April 1957'). This rebinds `text`, so cells that read `text`
# after this one see this biography. The literal contains tab characters
# and zero-width characters; leave it as-is.
text = '''
Born	Mukesh Dhirubhai Ambani
19 April 1957 (age 64)
Aden, Colony of Aden
(present-day Yemen)[1][2]
Nationality	Indian
Alma mater	
St. Xavier's College, Mumbai
Institute of Chemical Technology (B.E.)
Stanford University (drop-out)
Occupation	Chairman and MD, Reliance Industries
Spouse(s)	Nita Ambani ​(m. 1985)​[3]
Children	3
Parent(s)	
Dhirubhai Ambani (father)
Kokilaben Ambani (mother)
Relatives	Anil Ambani (brother)
Tina Ambani (sister-in-law)
'''

In [30]:
# Run the extractor on the biography currently bound to `text`.
# Only the single line after the '(age ...)' line is captured as the birth place.
get_personal_information(text=text)

{'age': 64,
 'name': 'Mukesh Dhirubhai Ambani',
 'birth_date': '19 April 1957',
 'birth_place': 'Aden, Colony of Aden'}