Read the text file and perform all the regular expression tasks:

Find all occurrences of the word "to".
Extract all email addresses.
Validate US phone numbers.
Find dates in the formats YYYY-MM-DD and DD/MM/YYYY.
Extract words starting with the letter 't'.
Remove all HTML tags.
Validate IPv4 addresses.
Find duplicate words.
Extract all hashtags.
Validate URLs.

In [1]:
import re

def read_file(file_path, encoding="utf-8"):
    """Return the full text content of *file_path*, or ``None`` on failure.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        The file content as a string, or ``None`` if the file is missing or
        cannot be read/decoded (an error message is printed in that case).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except Exception as e:
        # Best-effort boundary: report any other read/decode problem and let
        # the caller continue with ``None`` instead of crashing.
        print(f"Error reading file: {str(e)}")
        return None

# 1. Find all occurrences of the word "to"
def find_to(text):
    """Print every standalone word "to" in *text* (any letter case) and the total count."""
    # Word boundaries keep "to" inside longer words (e.g. "tomorrow") out.
    to_regex = re.compile(r'\bto\b', re.IGNORECASE)
    hits = to_regex.findall(text)
    total = len(hits)
    print("1. Occurrences of 'to':")
    print(hits)
    print(f"Count: {total}\n")

# 2. Extract email addresses
def extract_emails(text):
    """Print all email addresses found in *text*.

    An address is a local part of letters, digits and ``._%+-``, an ``@``,
    a domain of letters, digits, dots and hyphens, and a top-level domain of
    at least two letters.
    """
    # The TLD class is letters only; a '|' inside a character class is a
    # literal pipe, not alternation, and would let "co|m" pass as a TLD.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    matches = re.findall(pattern, text)
    print("2. Email addresses:")
    print(matches, "\n")

# 3. Validate US phone numbers
def validate_phone_numbers(text):
    """Print US phone numbers found in *text*.

    Accepted shapes are ``(123) 456-7890`` (optional whitespace after the
    closing parenthesis) and ``123-456-7890``.
    """
    # The lookarounds stop a phone-shaped slice of a longer digit run
    # (e.g. "99123-456-78901") from being reported as a valid number.
    pattern = r'(?:\(\d{3}\)\s*|(?<!\d)\d{3}-)\d{3}-\d{4}(?!\d)'
    matches = re.findall(pattern, text)
    print("3. Valid US phone numbers:")
    print(matches, "\n")

# 4. Find dates
def find_dates(text):
    """Print dates in *text* written as ``YYYY-MM-DD`` or ``DD/MM/YYYY``.

    Months must be 01-12 and days 01-31. Day/month combinations are not
    cross-checked, so a string such as ``31/02/2021`` is still reported.
    """
    day = r'(?:0[1-9]|[12]\d|3[01])'
    month = r'(?:0[1-9]|1[0-2])'
    # The digit lookarounds prevent matching a date-shaped slice of a longer
    # number such as "12021-08-155".
    pattern = (
        r'(?<!\d)(?:'
        + r'\d{4}-' + month + r'-' + day
        + r'|'
        + day + r'/' + month + r'/\d{4}'
        + r')(?!\d)'
    )
    matches = re.findall(pattern, text)
    print("4. Dates found:")
    print(matches, "\n")

# 5. Extract words starting with 't'
def find_t_words(text):
    """Print the ASCII-letter words in *text* that begin with 't' or 'T'."""
    t_word_regex = re.compile(r'\b[tT][a-zA-Z]*\b')
    t_words = t_word_regex.findall(text)
    print("5. Words starting with 't':")
    print(t_words, "\n")

# 6. Remove HTML tags
def remove_html_tags(text):
    """Print *text* with every ``<...>`` tag stripped out."""
    # A tag is '<', one or more non-'>' characters, then '>'.
    tag_regex = re.compile(r'<[^>]+>')
    stripped = tag_regex.sub('', text)
    print("6. Text with HTML tags removed:")
    print(stripped, "\n")

# 7. Validate IPv4 addresses
def validate_ipv4(text):
    """Print the valid dotted-quad IPv4 addresses found in *text*.

    Each of the four octets must be a decimal number in the range 0-255
    without leading zeros, so strings such as ``999.999.999.999`` or
    ``256.0.0.1`` are rejected.
    """
    # Alternatives, most specific first: 250-255, 200-249, 100-199, 0-99.
    octet = r'(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)'
    pattern = r'\b(?:' + octet + r'\.){3}' + octet + r'\b'
    matches = re.findall(pattern, text)
    print("7. IPv4 addresses:")
    print(matches, "\n")

# 8. Find duplicate words
def find_duplicates(text):
    """Print each word in *text* that is immediately followed by itself."""
    # Group 1 captures a word; the backreference requires the same word
    # again after whitespace. findall returns only the captured word.
    repeat_regex = re.compile(r'\b(\w+)\s+\1\b')
    repeated = repeat_regex.findall(text)
    print("8. Duplicate words:")
    print(repeated, "\n")

# 9. Extract hashtags
def extract_hashtags(text):
    """Print every hashtag in *text*: a '#' followed by ASCII letters, digits or underscores."""
    hashtag_regex = re.compile(r'#[a-zA-Z0-9_]+')
    tags = hashtag_regex.findall(text)
    print("9. Hashtags:")
    print(tags, "\n")

# 10. Validate URLs
def validate_urls(text):
    """Print the http/https URLs found in *text*.

    A URL is the scheme, an optional ``www.`` prefix, a host ending in a
    top-level domain of two or more letters, and an optional path that runs
    to the next whitespace character.
    """
    url_regex = re.compile(
        r'https?://(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/\S*)?'
    )
    urls = url_regex.findall(text)
    print("10. URLs:")
    print(urls, "\n")

def run_all_regex(text):
    """Run the ten regex tasks on *text* in order; do nothing when *text* is ``None``."""
    if text is None:
        return

    # Tasks are listed in the order their numbered output should appear.
    tasks = (
        find_to,
        extract_emails,
        validate_phone_numbers,
        find_dates,
        find_t_words,
        remove_html_tags,
        validate_ipv4,
        find_duplicates,
        extract_hashtags,
        validate_urls,
    )
    for task in tasks:
        task(text)

# Main execution block
if __name__ == "__main__":
    # This entry point targets Google Colab: the input file is supplied
    # through Colab's browser upload widget.
    from google.colab import files
    print("Please upload your input file...")
    uploaded = files.upload()

    if uploaded:
        # Use the first uploaded file as the input.
        filename = next(iter(uploaded))

        # Read the content of the uploaded file
        text = read_file(filename)

        # Run all regex operations (a no-op if the file could not be read)
        run_all_regex(text)
    else:
        # Guard against an empty upload result, where next(iter(...)) would
        # raise StopIteration.
        print("Error: No file was uploaded.")

Please upload your input file...


Saving Assignment 1.txt to Assignment 1.txt
1. Occurrences of 'to':
['to', 'to', 'to', 'to']
Count: 4

2. Email addresses:
['contact@hotel.com'] 

3. Valid US phone numbers:
['(123) 456-7890', '123-456-7890'] 

4. Dates found:
['2021-08-15'] 

5. Words starting with 't':
['trip', 'to', 'They', 'their', 'them', 'travel', 'the', 'the', 'to', 'their', 'they', 'to', 'the', 'the', 'they', 'through', 'They', 'travel', 'travelblog', 'their', 'they', 'the', 'that', 'to', 'the', 'their'] 

6. Text with HTML tags removed:
"Alice and Bob are planning a trip to Spain. They booked their flights for 2021-08-15 and reserved a hotel at contact@hotel.com. You can reach them at (123) 456-7890 or 123-456-7890. Alice loves sharing updates on social media, using hashtags like #travel and #Spain. Bob, on the other hand, is focused on finding the best places to eat and has a list saved at http://bestrestaurants.com. During their stay, they plan to visit several sites, including the famous Sagrada Familia at 