In [19]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [20]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hema\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hema\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hema\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
# Tokenization

with open('Document.txt', 'r') as file:
    document = file.readlines()
tokenized_lines = [word_tokenize(line.strip()) for line in document]

for i, tokens in enumerate(tokenized_lines):
    print(f"Tokens for line {i+1}: {tokens}")

Tokens for line 1: ['In', 'the', 'heart', 'of', 'the', 'bustling', 'city', ',', 'there', 'lies', 'a', 'small', 'park', 'that', 'serves', 'as', 'a', 'refuge', 'for', 'weary', 'souls', '.', 'The', 'park', 'is', 'adorned', 'with', 'vibrant', 'flowers', ',', 'towering', 'trees', ',', 'and', 'a', 'serene', 'pond', 'that', 'reflects', 'the', 'sky', '.', 'Every', 'morning', ',', 'joggers', 'can', 'be', 'seen', 'making', 'their', 'way', 'along', 'the', 'winding', 'paths', ',', 'while', 'children', 'play', 'joyfully', 'on', 'the', 'swings', '.', 'The', 'sound', 'of', 'laughter', 'fills', 'the', 'air', ',', 'mingling', 'with', 'the', 'chirping', 'of', 'birds', 'perched', 'on', 'branches', '.']
Tokens for line 2: ['As', 'the', 'sun', 'rises', 'higher', ',', 'the', 'park', 'becomes', 'a', 'gathering', 'place', 'for', 'people', 'from', 'all', 'walks', 'of', 'life', '.', 'Some', 'come', 'to', 'read', 'their', 'favorite', 'books', ',', 'while', 'others', 'engage', 'in', 'deep', 'conversations', 'with

In [22]:
# Stemming

stemmer = PorterStemmer()
stemmed_lines = [[stemmer.stem(token) for token in tokens] for tokens in tokenized_lines]

# Output the stemmed words
for i, stemmed in enumerate(stemmed_lines):
    print(f"Stemmed words for line {i + 1}: {stemmed}")

# Stemmed words to a new text file
with open('stemmed_words.txt', 'w') as output_file:
    for i, stemmed in enumerate(stemmed_lines):
        output_file.write(f"Stemmed words for line {i + 1}: {stemmed}\n")

Stemmed words for line 1: ['in', 'the', 'heart', 'of', 'the', 'bustl', 'citi', ',', 'there', 'lie', 'a', 'small', 'park', 'that', 'serv', 'as', 'a', 'refug', 'for', 'weari', 'soul', '.', 'the', 'park', 'is', 'adorn', 'with', 'vibrant', 'flower', ',', 'tower', 'tree', ',', 'and', 'a', 'seren', 'pond', 'that', 'reflect', 'the', 'sky', '.', 'everi', 'morn', ',', 'jogger', 'can', 'be', 'seen', 'make', 'their', 'way', 'along', 'the', 'wind', 'path', ',', 'while', 'children', 'play', 'joy', 'on', 'the', 'swing', '.', 'the', 'sound', 'of', 'laughter', 'fill', 'the', 'air', ',', 'mingl', 'with', 'the', 'chirp', 'of', 'bird', 'perch', 'on', 'branch', '.']
Stemmed words for line 2: ['as', 'the', 'sun', 'rise', 'higher', ',', 'the', 'park', 'becom', 'a', 'gather', 'place', 'for', 'peopl', 'from', 'all', 'walk', 'of', 'life', '.', 'some', 'come', 'to', 'read', 'their', 'favorit', 'book', ',', 'while', 'other', 'engag', 'in', 'deep', 'convers', 'with', 'friend', '.', 'the', 'elderli', 'often', 'sit

In [23]:
# Lemmatization

from nltk import pos_tag

sample_sentence = "For a sentence containing  words like visit, visitor, visiting, visited"
tokens = word_tokenize(sample_sentence)

token_tags = pos_tag(tokens)
print(token_tags)

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Convert treebank tags to wordnet tags."""
    if treebank_tag.startswith('J'):
        return 'a'  
    elif treebank_tag.startswith('V'):
        return 'v'  
    elif treebank_tag.startswith('N'):
        return 'n'  
    elif treebank_tag.startswith('R'):
        return 'r'  
    else:
        return None  
    
lemmatized_words = []
for token, tag in token_tags:
    wordnet_pos = get_wordnet_pos(tag) or 'n' # Default to noun if no tag found
    lemmatized_words.append(lemmatizer.lemmatize(token, pos=wordnet_pos))

print("Lemmatized words:")
print(lemmatized_words)

with open('lemmatized_words.txt', 'w') as output_file:
    output_file.write("Lemmatized words:\n")
    output_file.write(', '.join(lemmatized_words) + '\n')

[('For', 'IN'), ('a', 'DT'), ('sentence', 'NN'), ('containing', 'VBG'), ('words', 'NNS'), ('like', 'IN'), ('visit', 'NN'), (',', ','), ('visitor', 'NN'), (',', ','), ('visiting', 'VBG'), (',', ','), ('visited', 'VBD')]
Lemmatized words:
['For', 'a', 'sentence', 'contain', 'word', 'like', 'visit', ',', 'visitor', ',', 'visit', ',', 'visit']


In [27]:
# Stop Words Removal

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
print(stop_words)

tokenized_lines = [word_tokenize(line.strip()) for line in document]

filtered_lines = []
for tokens in tokenized_lines:
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    filtered_lines.append(filtered_tokens)

for i, filtered in enumerate(filtered_lines):
    print(f"Filtered words for line {i + 1}: {filtered}")

{"wasn't", 'here', 'can', 'from', 'you', "you've", 'those', 'their', 'being', 'not', 'mightn', 'why', 'about', 'are', 'him', "she's", 'there', 'ma', 'so', 'they', 'now', 'm', 'more', 'up', 'was', "you'll", 'for', 'only', 'were', 'a', 'against', 'out', 'itself', 'the', 'few', 'just', 'by', 'who', 't', "wouldn't", 'above', 'any', 'such', 've', 'won', 'because', 'it', "isn't", 'both', 'what', 'but', 'before', 'wasn', 'same', 'over', 'hasn', 'yourselves', 'shan', 'should', 'don', "weren't", 'while', 'wouldn', 'this', 'd', 'have', 'aren', 'himself', 'o', "shouldn't", "you'd", 'or', 'down', 'doesn', "mightn't", 'am', 'needn', 'further', 'hadn', 'themselves', 'where', 're', "shan't", 'mustn', 'some', "couldn't", 'his', 'its', 'yourself', "should've", 'when', 'that', 'each', 'off', 'how', 'until', 'your', "it's", 'does', 'between', 'ourselves', 'has', "that'll", 'ain', 'didn', 'with', 's', 'through', 'll', 'isn', 'whom', 'under', "doesn't", 'no', 'than', 'our', 'as', 'we', 'myself', 'of', 'own