In [None]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import re


paragraph1 = """
Parsing means taking an input and producing some sort of linguistic structure for it. 
We will use the term parsing very broadly throughout this book, including many kinds 
of structures that might be produced; morphological, syntactic, semantic, discourse; in
the form of a string, or a tree, or a network. Morphological parsing or stemming applies 
to many affixes other than plurals; for example we might need to take any English verb
form ending in-ing (going, talking, congratulating) and parse it into its verbal stem
plus the-ing morpheme.
"""

# Tokenize and clean words
words = word_tokenize(paragraph1.lower())
words = [word for word in words if word.isalpha()]

# Total and unique words
total_words = len(words)
unique_words = len(set(words))
print("Total words:", total_words)
print("Unique words:", unique_words)

# Word frequencies
freq = Counter(words)
print("\nWord Frequencies:")
print(freq)

# Most frequent word
most_freq = freq.most_common(1)[0]
print("\nMost frequent word:", most_freq)

# Least frequent word
least_freq = freq.most_common()[-1]
print("Least frequent word:", least_freq)

# Longest word
longest_word = max(words, key=len)
print("Longest word:", longest_word)

Total words: 85
Unique words: 67

Word Frequencies:
Counter({'parsing': 3, 'of': 3, 'a': 3, 'or': 3, 'and': 2, 'for': 2, 'it': 2, 'we': 2, 'the': 2, 'many': 2, 'might': 2, 'morphological': 2, 'form': 2, 'to': 2, 'means': 1, 'taking': 1, 'an': 1, 'input': 1, 'producing': 1, 'some': 1, 'sort': 1, 'linguistic': 1, 'structure': 1, 'will': 1, 'use': 1, 'term': 1, 'very': 1, 'broadly': 1, 'throughout': 1, 'this': 1, 'book': 1, 'including': 1, 'kinds': 1, 'structures': 1, 'that': 1, 'be': 1, 'produced': 1, 'syntactic': 1, 'semantic': 1, 'discourse': 1, 'in': 1, 'string': 1, 'tree': 1, 'network': 1, 'stemming': 1, 'applies': 1, 'affixes': 1, 'other': 1, 'than': 1, 'plurals': 1, 'example': 1, 'need': 1, 'take': 1, 'any': 1, 'english': 1, 'verb': 1, 'ending': 1, 'going': 1, 'talking': 1, 'congratulating': 1, 'parse': 1, 'into': 1, 'its': 1, 'verbal': 1, 'stem': 1, 'plus': 1, 'morpheme': 1})

Most frequent word: ('parsing', 3)
Least frequent word: ('morpheme', 1)
Longest word: congratulating


In [7]:
paragraph2 = """
Lorem ipsum dolor sit amet. Et quia voluptas et deleniti delectus ea obcaecati perferendis et veniam eveniet. Ea vero unde rem internos impedit et dicta fuga ut dolorem error et facere eius eos laboriosam vero. Ex debitis provident id repudiandae pariatur eos quia dolor vel dolore voluptatum. Ad Quis quas non dolores dolorem aut possimus cupiditate rem cumque ipsum ut sint voluptate aut dolores similique.
"""

# Tokenize and clean words
words = word_tokenize(paragraph2.lower())
words = [word for word in words if word.isalpha()]

# Total and unique words
total_words = len(words)
unique_words = len(set(words))
print("Total words:", total_words)
print("Unique words:", unique_words)

# Word frequencies
freq = Counter(words)
print("\nWord Frequencies:")
print(freq)

# Most frequent word
most_freq = freq.most_common(1)[0]
print("\nMost frequent word:", most_freq)

# Least frequent word
least_freq = freq.most_common()[-1]
print("Least frequent word:", least_freq)

# Longest word
longest_word = max(words, key=len)
print("Longest word:", longest_word)

Total words: 65
Unique words: 50

Word Frequencies:
Counter({'et': 5, 'ipsum': 2, 'dolor': 2, 'quia': 2, 'ea': 2, 'vero': 2, 'rem': 2, 'ut': 2, 'dolorem': 2, 'eos': 2, 'dolores': 2, 'aut': 2, 'lorem': 1, 'sit': 1, 'amet': 1, 'voluptas': 1, 'deleniti': 1, 'delectus': 1, 'obcaecati': 1, 'perferendis': 1, 'veniam': 1, 'eveniet': 1, 'unde': 1, 'internos': 1, 'impedit': 1, 'dicta': 1, 'fuga': 1, 'error': 1, 'facere': 1, 'eius': 1, 'laboriosam': 1, 'ex': 1, 'debitis': 1, 'provident': 1, 'id': 1, 'repudiandae': 1, 'pariatur': 1, 'vel': 1, 'dolore': 1, 'voluptatum': 1, 'ad': 1, 'quis': 1, 'quas': 1, 'non': 1, 'possimus': 1, 'cupiditate': 1, 'cumque': 1, 'sint': 1, 'voluptate': 1, 'similique': 1})

Most frequent word: ('et', 5)
Least frequent word: ('similique', 1)
Longest word: perferendis


In [9]:

print("2.1 (a) - All alphabetic strings")
regex_a = r'\b[a-zA-Z]+\b'
text_a = "Hello123 this is NLP_lab@2025"
print("Matches:", re.findall(regex_a, text_a))
print()

print("2.1 (b) - Lowercase strings ending in 'b'")
regex_b = r'\b[a-z]*b\b'
text_b = "cab grab slab Crib"
print("Matches:", re.findall(regex_b, text_b))
print()

print("2.1 (c) - Two consecutive repeated words")
regex_c = r'\b(\w+)\s+\1\b'
text_c = "He said the the word twice, but not like bug bugged him."
print("Matches:", re.findall(regex_c, text_c))
print()

print("2.1 (d) - Each 'a' is immediately preceded and followed by 'b'")
regex_d = r'^(b|bab)*$'
test_strings_d = ["babbbbab", "baab", "bbababb", "bab"]
for s in test_strings_d:
    print(f"'{s}':", bool(re.fullmatch(regex_d, s)))
print()

print("2.1 (e) - Line starts with integer and ends with a word")
regex_e = r'^\d+\b.*\b[a-zA-Z]+$'
text_e = "42 the answer is always life"
print("Matches:", bool(re.match(regex_e, text_e)))
print()

print("2.1 (f) - String contains both 'grotto' and 'raven'")
regex_f = r'(?=.*\bgrotto\b)(?=.*\braven\b)'
text_f1 = "The raven flew above the dark grotto at night."
text_f2 = "The grottos were creepy but no raven was seen."
print(f"text_f1: {bool(re.search(regex_f, text_f1))}")
print(f"text_f2: {bool(re.search(regex_f, text_f2))}")
print()

print("2.1 (g) - Capture first word of a sentence")
regex_g = r'^[\"\'(]*([A-Z][a-z]*)'
text_g = '"Hello there, how are you?"'
match_g = re.match(regex_g, text_g)
print("First word:", match_g.group(1) if match_g else "No match")

2.1 (a) - All alphabetic strings
Matches: ['this', 'is']

2.1 (b) - Lowercase strings ending in 'b'
Matches: ['cab', 'grab', 'slab']

2.1 (c) - Two consecutive repeated words
Matches: ['the']

2.1 (d) - Each 'a' is immediately preceded and followed by 'b'
'babbbbab': True
'baab': False
'bbababb': False
'bab': True

2.1 (e) - Line starts with integer and ends with a word
Matches: True

2.1 (f) - String contains both 'grotto' and 'raven'
text_f1: True
text_f2: False

2.1 (g) - Capture first word of a sentence
First word: Hello


In [None]:
text = """
Hello! How's NLP2025 treating you?
dog CAT mouse Mouse fish
Paris is in France and Earth is round.
This test will find four words like done.
He said the the thing was weird, not go go now.
I was singing and running while eating snacks.
This book has a letter and a cool story.
"""

# 2.2 - Regular expressions for word-based patterns

# (a) Match a single alphabetic word
print("2.2 (a): Single alphabetic words")   
print(re.findall(r'\b[a-zA-Z]+\b', text), '\n')

# (b) Match only lowercase alphabetic words
print("2.2 (b): Lowercase words")
print(re.findall(r'\b[a-z]+\b', text), '\n')

# (c) Match words starting with a capital letter
print("2.2 (c): Capitalized words")
print(re.findall(r'\b[A-Z][a-z]*\b', text), '\n')

# (d) Match all 4-letter words
print("2.2 (d): Words exactly 4 letters long")
print(re.findall(r'\b[a-zA-Z]{4}\b', text), '\n')

# (e) Match repeated words (like "go go")
print("2.2 (e): Repeated consecutive words")
print(re.findall(r'\b(\w+)\s+\1\b', text), '\n')

# (f) Match words ending in 'ing'
print("2.2 (f): Words ending in 'ing'")
print(re.findall(r'\b\w+ing\b', text), '\n')

# (g) Match words with at least one double letter
print("2.2 (g): Words with double letters")
matches = re.finditer(r'\b\w*(\w)\1\w*\b', text)
print([m.group(0) for m in matches])

2.2 (a): Single alphabetic words
['Hello', 'How', 's', 'treating', 'you', 'dog', 'CAT', 'mouse', 'Mouse', 'fish', 'Paris', 'is', 'in', 'France', 'and', 'Earth', 'is', 'round', 'This', 'test', 'will', 'find', 'four', 'words', 'like', 'done', 'He', 'said', 'the', 'the', 'thing', 'was', 'weird', 'not', 'go', 'go', 'now', 'I', 'was', 'singing', 'and', 'running', 'while', 'eating', 'snacks', 'This', 'book', 'has', 'a', 'letter', 'and', 'a', 'cool', 'story'] 

2.2 (b): Lowercase words
['s', 'treating', 'you', 'dog', 'mouse', 'fish', 'is', 'in', 'and', 'is', 'round', 'test', 'will', 'find', 'four', 'words', 'like', 'done', 'said', 'the', 'the', 'thing', 'was', 'weird', 'not', 'go', 'go', 'now', 'was', 'singing', 'and', 'running', 'while', 'eating', 'snacks', 'book', 'has', 'a', 'letter', 'and', 'a', 'cool', 'story'] 

2.2 (c): Capitalized words
['Hello', 'How', 'Mouse', 'Paris', 'France', 'Earth', 'This', 'He', 'I', 'This'] 

2.2 (d): Words exactly 4 letters long
['fish', 'This', 'test', 'wil