In [23]:
import re

# Read corpus from external file
with open('corpus.txt', 'r') as file:
    corpus = file.read()

# Function to demonstrate regex matching
def demonstrate_regex(pattern, description):
    matches = re.findall(pattern, corpus)
    print(f"\n{description}\nPattern: {pattern}\nMatches: {matches}")


In [24]:
# 1. Hyphen [2-5] and [b-f]
demonstrate_regex(r'[2-5]', "Matches any single digit from 2 to 5")
demonstrate_regex(r'[b-f]', "Matches any single lowercase letter from b to f")


Matches any single digit from 2 to 5
Pattern: [2-5]
Matches: ['3', '4', '2', '3', '2', '3', '4', '5']

Matches any single lowercase letter from b to f
Pattern: [b-f]
Matches: ['e', 'e', 'e', 'c', 'd', 'd', 'e', 'd', 'd', 'c', 'c', 'b', 'c', 'b', 'b', 'c', 'b', 'b', 'b', 'c', 'b', 'c', 'c', 'd', 'e', 'c', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e']


In [42]:
# 2. Caret symbol (^)
demonstrate_regex(r'^H', "Matches lines starting with 'H'")


Matches lines starting with 'H'
Pattern: ^H
Matches: []


In [26]:
# 3. Question mark (?) operator
demonstrate_regex(r'colou?r', "Matches 'colo' followed by 'r' or 'ur'")


Matches 'colo' followed by 'r' or 'ur'
Pattern: colou?r
Matches: ['color', 'colour']


In [27]:
# 4. Kleene star (*) operator
demonstrate_regex(r'ab*', "Matches 'a' followed by zero or more 'b's")


Matches 'a' followed by zero or more 'b's
Pattern: ab*
Matches: ['a', 'a', 'a', 'ab', 'abb', 'abbb', 'a', 'a', 'a', 'a', 'a']


In [28]:
# 5. Kleene plus (+) operator
demonstrate_regex(r'ab+', "Matches 'a' followed by one or more 'b's")


Matches 'a' followed by one or more 'b's
Pattern: ab+
Matches: ['ab', 'abb', 'abbb']


In [29]:
# 6. Dot (.) operator
demonstrate_regex(r'.1', "Matches any single character followed by '1'")


Matches any single character followed by '1'
Pattern: .1
Matches: ['a1']


In [38]:
# 7. Pipe (|) symbol
demonstrate_regex(r'dog|hat', "Matches 'cat' or 'dog'")


Matches 'cat' or 'dog'
Pattern: dog|hat
Matches: ['dog', 'dog', 'hat']


In [31]:
# 8. Word boundary (\b)
demonstrate_regex(r'\bthe\b', "Matches the word 'the'")


Matches the word 'the'
Pattern: \bthe\b
Matches: ['the', 'the', 'the']


In [32]:
# 9. Non-word boundary (\B)
demonstrate_regex(r'\Bthe\B', "Matches 'the' not at the word boundary")


Matches 'the' not at the word boundary
Pattern: \Bthe\B
Matches: ['the']


In [33]:
# 10. Regex /[^a-zA-Z][tT]he[^a-zA-Z]/
demonstrate_regex(r'[^a-zA-Z][tT]he[^a-zA-Z]', "Matches 'the' or 'The' surrounded by non-letter characters")


Matches 'the' or 'The' surrounded by non-letter characters
Pattern: [^a-zA-Z][tT]he[^a-zA-Z]
Matches: ['\nthe ', ' the ', ' the ', '1the ', '2The ', '3the4', '5the ', '6The7']
