# Exercise: Regular Expressions

In [4]:
import re

In [5]:
#1 Question: Match all email addresses (SOLVED)

inputs = ["My email is john@email.com and mary@otherplace.net",
          "Visit us at support@ourcompany.co.uk for help",
          "No emails here"]

pattern = r"(\w+)@(\w+)\.(\w+)"

matches = []
for input in inputs:
    matches += re.findall(pattern, input)

for m in matches:
    username = m[0]
    mailserver = m[1]
    domain = m[2]
    print(f"Username: {username}")
    print(f"Mailserver: {mailserver}")
    print(f"Domain: {domain}")
    print('')

Username: john
Mailserver: email
Domain: com

Username: mary
Mailserver: otherplace
Domain: net

Username: support
Mailserver: ourcompany
Domain: co



In [6]:
#2 Question: Extract domain from email


def extract_domains(emails):
    pattern = r"@(.+)$"
    matches = [re.search(pattern, email).group(1) for email in emails if re.search(pattern, email)]
    return matches

inputs = ["john@email.com",
          "mary+newsletter@gmail.com",
          "support@ourcompany.co.uk"]

domains = extract_domains(inputs)

for domain in domains:
    print(f"Domain: {domain}")
    print('')

Domain: email.com

Domain: gmail.com

Domain: ourcompany.co.uk



In [7]:
#3 Question: Validate phone number

def validate_phone_numbers(texts):
    pattern = r'\b(\d{3}[-.\s]?\d{3}[-.\s]?\d{4}|\(\d{3}\)\s?\d{3}[-.\s]?\d{4})\b'
    phone_numbers = [match for text in texts for match in re.findall(pattern, text)]
    return phone_numbers

inputs = ["555-123-4567",
          "1 (234) 567-8910",
          "notaphonenumber"]

phone_numbers = validate_phone_numbers(inputs)

for phone_number in phone_numbers:
    print(phone_number)

555-123-4567


In [8]:
#4 Question: Extract area code

inputs = ["(555) 123-4567",
          "1 (234) 567-8910",
          "5551234567"]

pattern = r'\((\d{3})\)|(\d{3})\s|-|\s'

def extract_area_code(phone_number):
    matches = [group for match in re.findall(pattern, phone_number) for group in match if group]
    return matches[0] if matches else None

for phone_number in inputs:
    area_code = extract_area_code(phone_number)
    print(area_code)

555
234
None


In [9]:
#5 Question: Match URLs and extract host

inputs = ["Visit https://www.example.com for more info",
          "Our website is example.com",
          "No URLs here"]

pattern = r'https?://([A-Za-z0-9.-]+)'

def extract_host_from_urls(texts):
    hosts = [re.findall(pattern, text)[0] for text in texts if re.findall(pattern, text)]
    return hosts

hosts = extract_host_from_urls(inputs)

for host in hosts:
    print(host)

www.example.com


In [10]:
#6 Question: Remove non-alphabetic characters


inputs = ["Hello world!",
          "123 Main St.",
          "greetings&more"]

def remove_non_alphabetic(texts):
    cleaned_texts = [re.sub(r'[^a-zA-Z ]', '', text) for text in texts]
    return cleaned_texts

cleaned_inputs = remove_non_alphabetic(inputs)

for cleaned_text in cleaned_inputs:
    print(cleaned_text)

Hello world
 Main St
greetingsmore


In [11]:
#7 Question: Find words containing "tion"

inputs = ["This is a test sentence with the word station in it.",
          "No words containing tion here",
          "motion activation vacation"]

# your code here ...:

def find_words_with_tion(texts):
    return [re.findall(r'\b\w*tion\w*\b', text) for text in texts]

tion_words_list = find_words_with_tion(inputs)

for tion_words in tion_words_list:
    if tion_words:
        print(tion_words)

['station']
['tion']
['motion', 'activation', 'vacation']


In [12]:
#8 Question: Replace all occurrences of "hello" with "goodbye"

inputs = ["hello world",
          "hello there",
          "no match"]

 # your code here ...:


def modify_texts(texts, search_pattern, replacement):
    return [re.sub(search_pattern, replacement, text) for text in texts]

modified_texts = modify_texts(inputs, 'hello', 'goodbye')

for text in modified_texts:
    print(text)

goodbye world
goodbye there
no match


In [13]:
#9 Question: Extract date strings in ISO8601 format

inputs = ["Log from 2023-01-15",
          "Meeting on 2023-02-01T13:00:00Z",
          "No dates"]

 # your code here ...:


def extract_iso8601_dates(texts):
    return [re.findall(r'\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}Z)?', text) for text in texts]

iso8601_dates_list = extract_iso8601_dates(inputs)

for iso8601_dates in iso8601_dates_list:
    if iso8601_dates:
        print(iso8601_dates)

['2023-01-15']
['2023-02-01T13:00:00Z']


In [14]:
#10 Question: Validate correctly formatted date

inputs = ["2023-01-15",
          "02/01/2023",
          "invalid date"]

 # your code here ...:



def validate_dates(dates):
    return [re.match(r'\d{4}-\d{2}-\d{2}', date) is not None for date in dates]

validity_list = validate_dates(inputs)

for date, is_valid in zip(inputs, validity_list):
    print(f'"{date}" is valid: {is_valid}')

"2023-01-15" is valid: True
"02/01/2023" is valid: False
"invalid date" is valid: False


In [16]:
#11 Question: Remove punctuation except hyphens

inputs = ["Hello! World?",
          "123-Main_St.",
          "Hi there."]

def remove_punctuation_except_hyphens(text):
    cleaned_text = re.sub(r'[^\w\s-]', '', text)
    return cleaned_text

for i in range(len(inputs)):
    inputs[i] = remove_punctuation_except_hyphens(inputs[i])

for cleaned_text in inputs:
    print(cleaned_text)

Hello World
123-Main_St
Hi there


In [17]:
#12 Question: Count occurrences of a word

inputs = ["Hello world. Hello!",
          "Hello hello world",
          "no match"]

# your code here ...:


def count_word_occurrences(word, text_list):
    word = word.lower()
    count = 0

    pattern = r'\b' + re.escape(word) + r'\b'

    for text in text_list:
        matches = re.findall(pattern, text.lower())
        count += len(matches)

    return count

word_to_count = "hello"
occurrences = count_word_occurrences(word_to_count, inputs)
print(f'The word "{word_to_count}" appears {occurrences} times.')


The word "hello" appears 4 times.


In [19]:
#13 Question: Extract IP addresses from log

inputs = ["127.0.0.1 - GET /",
          "User logged in from 192.168.1.1",
          "No IPs"]

# your code here ...:
def extract_ip_addresses(texts):
    return [re.findall(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', text) for text in texts]

ip_addresses_list = extract_ip_addresses(inputs)

for ip_addresses in ip_addresses_list:
    if ip_addresses:
        print(ip_addresses)

['127.0.0.1']
['192.168.1.1']


In [20]:
#14 Question: Redact credit card and SSN numbers

inputs = ["Visa: 4111-1111-1111-1111",
          "My SSN is 111-11-1111",
          "No numbers"]

# your code here ...:
def redact_numbers(texts):
    return [re.sub(r'\b\d{4}-\d{4}-\d{4}-\d{4}\b', 'XXXX-XXXX-XXXX-XXXX',
                   re.sub(r'\b\d{3}-\d{2}-\d{4}\b', 'XXX-XX-XXXX', text)) for text in texts]

redacted_texts = redact_numbers(inputs)

for redacted_text in redacted_texts:
    print(redacted_text)

Visa: XXXX-XXXX-XXXX-XXXX
My SSN is XXX-XX-XXXX
No numbers
