In [3]:
pip install nltk




In [4]:
import nltk
# Download the 'punkt' NLTK data package; presumably this is the model that
# sent_tokenize (used below) loads — TODO confirm.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

def split_sentences(document):
    """Split ``document`` into sentences.

    Thin wrapper around NLTK's ``sent_tokenize``.

    Args:
        document: The text to segment.

    Returns:
        Whatever ``sent_tokenize`` returns for ``document``, unchanged
        (the sentences in document order).
    """
    return sent_tokenize(document)

if __name__ == "__main__":
    # Demo input: four short sentences in a single string.
    document = "This is a sample document. It has multiple sentences. We want to split it. We will identify the POS tags from the given sentences and words."

    # Segment the text into sentences.
    sentences = split_sentences(document)

    # Show the input first, then one numbered line per detected sentence
    # (numbering starts at 1).
    print("Original Document:", document, sep="\n")
    print("\nSplit Sentences:")
    for i, sentence in enumerate(sentences, start=1):
        print(f"{i}. {sentence}")


Original Document:
This is a sample document. It has multiple sentences. We want to split it. We will identify the POS tags from the given sentences and words.

Split Sentences:
1. This is a sample document.
2. It has multiple sentences.
3. We want to split it.
4. We will identify the POS tags from the given sentences and words.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download the 'punkt' NLTK data package; presumably required by
# word_tokenize (used below) — TODO confirm.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

def tokenize_and_stem(text):
    """Tokenize ``text`` and stem every token with a Porter stemmer.

    Args:
        text: The string to process.

    Returns:
        A ``(tokens, stems)`` tuple: ``tokens`` is the result of
        ``word_tokenize(text)`` and ``stems`` is a list holding the Porter
        stem of each token, in the same order.
    """
    tokens = word_tokenize(text)

    # A fresh stemmer is created per call; only its bound ``stem`` method
    # is needed.
    stem = PorterStemmer().stem

    return tokens, list(map(stem, tokens))

if __name__ == "__main__":
    # Demo sentence to run through the tokenizer and stemmer.
    input_string = "Tokenization involves breaking text into individual words or tokens, and stemming is the process of reducing words to their base or root form."

    # Unpack the (tokens, stems) pair returned by the helper.
    original_tokens, stemmed_tokens = tokenize_and_stem(input_string)

    # Each section is a heading line followed by the list on its own line.
    print("Original Tokens:", original_tokens, sep="\n")
    print("\nStemmed Tokens:", stemmed_tokens, sep="\n")


Original Tokens:
['Tokenization', 'involves', 'breaking', 'text', 'into', 'individual', 'words', 'or', 'tokens', ',', 'and', 'stemming', 'is', 'the', 'process', 'of', 'reducing', 'words', 'to', 'their', 'base', 'or', 'root', 'form', '.']

Stemmed Tokens:
['token', 'involv', 'break', 'text', 'into', 'individu', 'word', 'or', 'token', ',', 'and', 'stem', 'is', 'the', 'process', 'of', 'reduc', 'word', 'to', 'their', 'base', 'or', 'root', 'form', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Download the NLTK data packages used by this cell: the 'stopwords' corpus
# (read via stopwords.words('english') below) and 'punkt', presumably
# required by word_tokenize — TODO confirm.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords_and_rare_words(document, stop_words, rare_threshold=1, drop_punctuation=False):
    """Lower-case and tokenize ``document``, then drop stop words and rare tokens.

    Args:
        document: Text to filter. It is lower-cased before tokenization so
            matching against ``stop_words`` is case-insensitive.
        stop_words: Collection of (lower-case) words to discard. A list or
            tuple is converted to a frozenset once so each membership test
            is O(1) instead of a linear scan.
        rare_threshold: A token is kept only if it occurs strictly more than
            this many times among the tokens that survive the earlier
            filters. The default of 1 removes tokens seen only once.
        drop_punctuation: When True, tokens that contain no alphanumeric
            character (for example ``"."`` or ``","``) are discarded too.
            Defaults to False, which keeps the historical behaviour where
            frequent punctuation tokens are returned alongside real words.

    Returns:
        A list of the surviving tokens in document order, duplicates kept.
    """
    # Only plain sequences are converted; sets and other containers are used as given.
    if isinstance(stop_words, (list, tuple)):
        stop_words = frozenset(stop_words)

    tokens = word_tokenize(document.lower())

    kept = [token for token in tokens if token not in stop_words]

    if drop_punctuation:
        # A token counts as punctuation when none of its characters is
        # alphanumeric; tokens such as "don't" or "3.5" are kept.
        kept = [token for token in kept if any(ch.isalnum() for ch in token)]

    # Frequencies are computed after filtering, matching the original order
    # of operations.
    counts = Counter(kept)

    return [token for token in kept if counts[token] > rare_threshold]

if __name__ == "__main__":
    # Demo text containing stop words, repeated words and one-off words.
    document = "This is a sample document. It has some stop words and rare words. We want to remove them. WE will perform POS Tagging on sentences and words."

    # NLTK's English stop-word list, held as a set.
    stop_words = set(stopwords.words('english'))

    # With the default threshold, tokens that occur only once are dropped
    # along with the stop words.
    filtered_words = remove_stopwords_and_rare_words(document, stop_words)

    # Each section is a heading line followed by its value on the next line.
    print("Original Document:", document, sep="\n")
    print("\nFiltered Words:", filtered_words, sep="\n")


Original Document:
This is a sample document. It has some stop words and rare words. We want to remove them. WE will perform POS Tagging on sentences and words.

Filtered Words:
['.', 'words', 'words', '.', '.', 'words', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download the NLTK data packages used by this cell: 'punkt' (presumably
# required by word_tokenize) and 'averaged_perceptron_tagger' (presumably
# the model behind pos_tag) — TODO confirm.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed
# NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def identify_parts_of_speech(document):
    """Tag every token of ``document`` with its part of speech.

    The text is tokenized with ``word_tokenize`` and the tokens are handed
    to NLTK's ``pos_tag``.

    Args:
        document: The text to tag.

    Returns:
        The result of ``pos_tag`` for the tokens of ``document``; the caller
        in this file iterates it as ``(word, tag)`` pairs.
    """
    return pos_tag(word_tokenize(document))

if __name__ == "__main__":
    # Example document
    document = "This is a sample document. It has multiple sentences. We want to identify the parts of speech. We perform POS Tagging on sentences and words."

    # Identify parts of speech
    pos_tags = identify_parts_of_speech(document)

    # Print the result
    print("Original Document:")
    print(document)

    print("\nParts of Speech:")
    # The loop variable is named `tag`, not `pos_tag`: this block runs at
    # module scope, so reusing the name would rebind the imported
    # `pos_tag` function to a string and make every later call to
    # identify_parts_of_speech() fail with "'str' object is not callable".
    for word, tag in pos_tags:
        print(f"{word}: {tag}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Original Document:
This is a sample document. It has multiple sentences. We want to identify the parts of speech. We perform POS Tagging on sentences and words.

Parts of Speech:
This: DT
is: VBZ
a: DT
sample: JJ
document: NN
.: .
It: PRP
has: VBZ
multiple: JJ
sentences: NNS
.: .
We: PRP
want: VBP
to: TO
identify: VB
the: DT
parts: NNS
of: IN
speech: NN
.: .
We: PRP
perform: VBP
POS: NNP
Tagging: NNP
on: IN
sentences: NNS
and: CC
words: NNS
.: .
