<a href="https://colab.research.google.com/github/VimalChamyal/Regular-Expressions/blob/main/Regular_Expression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regular Expressions (RegEx)

In [1]:
import re
# Python's built-in module for working with regular expressions

## Extract just the phone number

In [2]:
text = "Patient's phone no. is 9123445667. Bill amount is 1240$."
# Raw string (r'...') so the backslash in \d reaches the regex engine untouched;
# in a plain string '\d' is an invalid escape and newer Python versions warn about it.
# The lookarounds (?<!\d) and (?!\d) reject a digit on either side, so only runs
# of exactly 10 digits match -- a bare \d{10} would also match inside a longer number.
pattern = r'(?<!\d)\d{10}(?!\d)'

# re.findall(find_what, find_where) returns a list of every non-overlapping match
match = re.findall(pattern, text)
match

['9123445667']

## Extract the amount

In [3]:
text = "Patient's phone no. is 9123445667. Bill amount is 1240$."
# Raw string so \d and \$ are passed to the regex engine as written
# (in a plain string they are invalid escapes and newer Python versions warn).
# \d+ is one or more digits; \$ is a literal dollar sign (a bare $ means "end of string").
pattern = r'\d+\$'

# Only digits immediately followed by $ match, so the phone number is skipped.
# re.findall(find_what, find_where)
match = re.findall(pattern, text)
match

['1240$']

## Extract the amount (without currency symbol)
### Applying the grouping concept: parentheses `()` capture just the part of the match we want

In [4]:
text = "Patient's phone no. is 9123445667. Bill amount is 1240$."
# Raw string so \d and \$ are passed to the regex engine as written.
# The $ must still follow the digits for a match, but only the part inside the
# parentheses (the capture group) is returned.
pattern = r'(\d+)\$'

# With exactly one capture group, re.findall(find_what, find_where) returns
# the group's text for each match instead of the whole match.
match = re.findall(pattern, text)
match

['1240']

## Extract the phone numbers with all the available formats

In [5]:
text = "Patient's phone number is (732)-201-2020. His cousin's number is 9193124959"
# Two alternatives separated by | :
#   \(\d{3}\)-\d{3}-\d{4}   -> (NNN)-NNN-NNNN; the parentheses are escaped so they are
#                              literal characters, and the middle block is fixed at
#                              3 digits (\d+ would accept any length)
#   (?<!\d)\d{10}(?!\d)     -> a run of exactly 10 digits, not part of a longer number
# Raw string so every backslash reaches the regex engine untouched.
pattern = r'\(\d{3}\)-\d{3}-\d{4}|(?<!\d)\d{10}(?!\d)'

# re.findall(find_what, find_where) returns the matches in the order they appear
match = re.findall(pattern, text)
match

['(732)-201-2020', '9193124959']

## Extract the phone number and the amount with a single search

In [6]:
text = "Patient's phone no. is 9123445667. Bill amount is 1240$."
# Group 1 captures the 10-digit phone number, \D+ skips the non-digit text in
# between, and group 2 captures the digits immediately before the literal $.
# Raw string so every backslash reaches the regex engine untouched.
pattern = r'(\d{10})\D+(\d+)\$'

# re.search(find_what, find_where) scans for the first location that matches
match = re.search(pattern, text)
match

# findall returns a list, whereas search returns a single re.Match object
# (or None when nothing matches)

<re.Match object; span=(23, 55), match='9123445667. Bill amount is 1240$'>

In [7]:
# Pull both capture groups out of the Match object in a single call:
# group 1 is the phone number, group 2 is the bill amount.
phone_number, bill_amount = match.group(1, 2)

In [8]:
# Display the first captured group; captured text is always a str, even when it is all digits
phone_number

'9123445667'

In [9]:
# Display the second captured group (a str; the $ is outside the group, so it is not included)
bill_amount

'1240'

## Working on the real text

In [10]:
# Sample prescription used by the extraction cells that follow.
# The triple-quoted literal starts and ends with a newline, and its sections are
# separated by blank lines -- several of the later patterns rely on that layout.
text = '''
Dr John Smith, M.D
2 Non-Important Street,
New York, Phone (000)-111-2222

Name: Marta Sharapova Date: 5/11/2022

Address: 9 tennis court, new Russia, DC

Prednisone 20 mg
Lialda 2.4 gram

Directions:

Prednisone, Taper 5 mg every 3 days,
Finish in 2.5 weeks a
Lialda - take 2 pill everyday for 1 month

Refill: 2 times
'''

## Extract the name

In [11]:
# Capture whatever sits between the "Name:" and "Date:" labels. Without DOTALL
# '.' does not match a newline, so both labels must be on the same line.
pattern = "Name:(.+)Date:"
match = re.compile(pattern).findall(text)
match

[' Marta Sharapova ']

In [12]:
# Strip the leading and trailing whitespace from the first match
match[0].strip()

'Marta Sharapova'

## Extract the address

In [13]:
# Capture the rest of the line after "Address: " (up to, but not including, the newline)
pattern = "Address: (.*)\n"

match = [found.group(1) for found in re.finditer(pattern, text)]
print(match[0].strip())

9 tennis court, new Russia, DC


## Extract the medicines

In [14]:
# Skip the address line and the blank line after it, then capture each of the
# next two lines. With two capture groups every result is a (first, second) tuple.
pattern = "Address:.*\n\n(.*)\n(.*)\n"

match = re.compile(pattern).findall(text)
match

[('Prednisone 20 mg', 'Lialda 2.4 gram')]

## Extract the directions

In [15]:
# DOTALL lets '.' match newlines too, so the group spans every line between the
# "Directions:" label (after any newlines that directly follow it) and "Refill:".
pattern = "Directions:\n*(.*)Refill:"

match = re.compile(pattern, re.DOTALL).findall(text)
print(match[0].strip())

Prednisone, Taper 5 mg every 3 days,
Finish in 2.5 weeks a
Lialda - take 2 pill everyday for 1 month


In [16]:
# Capture the rest of the line that follows "Refill: "
pattern = "Refill: (.*)"

match = [found.group(1) for found in re.finditer(pattern, text)]
match

['2 times']

In [17]:
# Alternative way to get the medicines: [^\n]* consumes the remainder of the
# address line, then (with '.' matching newlines) the group captures everything
# up to the "Directions" label.
pattern = "Address:[^\n]*(.*)Directions"
match = re.findall(pattern, text, re.S)  # re.S is the short alias of re.DOTALL

print(match[0].strip())

Prednisone 20 mg
Lialda 2.4 gram
