In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at this path.
# NOTE(review): no visible cell reads anything from the mount — confirm it is still needed.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


# NLTK resources this notebook needs, fetched in the listed order.
NLTK_RESOURCES = (
    'punkt_tab',                       # Tokenizer model
    'stopwords',                       # Stop words list
    'averaged_perceptron_tagger_eng',  # POS tagger model
    'wordnet',                         # Lemmatizer dictionary
    'omw-1.4',                         # Lemmatizer corpora
)

for resource_name in NLTK_RESOURCES:
    last_download_status = nltk.download(resource_name)

# Leave the status of the final download as the cell's displayed value.
last_download_status

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
import shutil

# Delete the `tokenizers/punkt` directory under the NLTK data folder. Any
# failure (for example, the directory not existing) is ignored.
# NOTE(review): presumably this clears an older `punkt` model so that only the
# `punkt_tab` resource downloaded earlier is used — confirm the intent.
PUNKT_DIR = '/root/nltk_data/tokenizers/punkt'
shutil.rmtree(PUNKT_DIR, ignore_errors=True)

In [None]:
# Word-level tokenization of a short sample sentence.
# NOTE(review): `sententce` looks like a typo for `sentence` (the name used by
# the following cells) — consider renaming.
sententce = "The stripped bats are hanging on best for test"
word_tokens = word_tokenize(sententce)
print(word_tokens)

['The', 'stripped', 'bats', 'are', 'hanging', 'on', 'best', 'for', 'test']


In [None]:
# Tokenize a second sample sentence. This rebinds `word_tokens`, and it is this
# token list that the stop-word filtering and POS-tagging cells further down use.
sentence = "There are lots of tress but the tress with green leaves are very few"
word_tokens= word_tokenize(sentence)
print(word_tokens)

['There', 'are', 'lots', 'of', 'tress', 'but', 'the', 'tress', 'with', 'green', 'leaves', 'are', 'very', 'few']


In [None]:
# Sentence-level tokenization: split the text into a list of sentence strings.
sentence = "Sure! Let's go through each core NLP concept with explanations and Python code examples using NLTK and spaCy (two widely used NLP libraries). We’ll use a sample sentence:"
sent_tokesn = sent_tokenize(sentence)
print(sent_tokesn)

['Sure!', "Let's go through each core NLP concept with explanations and Python code examples using NLTK and spaCy (two widely used NLP libraries).", 'We’ll use a sample sentence:']


In [None]:
# Sentence-level tokenization of a multi-sentence paragraph.
# `sentence` is not reassigned after this cell, so this paragraph is also the
# text the spaCy cell later parses with `nlp(sentence)`.
sentence = "Sentence tokenization is the process of dividing a text document or a large block of text into individual sentences called tokens. It is a fundamental step in Natural Language Processing (NLP) that allows algorithms to handle and analyze text at the sentence level rather than as a continuous stream. By breaking text into sentences, NLP models can perform tasks like sentiment analysis, summarization, or translation more effectively on each distinct sentence."
sent_tokesn = sent_tokenize(sentence)
print(sent_tokesn)

['Sentence tokenization is the process of dividing a text document or a large block of text into individual sentences called tokens.', 'It is a fundamental step in Natural Language Processing (NLP) that allows algorithms to handle and analyze text at the sentence level rather than as a continuous stream.', 'By breaking text into sentences, NLP models can perform tasks like sentiment analysis, summarization, or translation more effectively on each distinct sentence.']


In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# NOTE(review): 'wordnet' is already requested by the resource-download cell at
# the top of the notebook, so this call looks redundant — confirm and drop.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# One WordNetLemmatizer instance, reused by the lemmatization cells that follow.
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatize one word with no `pos` argument and one explicitly tagged as a verb.
bats_lemma = lemmatizer.lemmatize("bats")
hanging_lemma = lemmatizer.lemmatize("hanging", pos=wordnet.VERB)

print("lammatized 'bats':", bats_lemma)
print("lammatized 'hanging':", hanging_lemma)

lammatized 'bats': bat
lammatized 'hanging': hang


In [None]:
# Same demonstration on two more words: one with no `pos` argument, one as a verb.
sentences_lemma = lemmatizer.lemmatize("sentences")
dividing_lemma = lemmatizer.lemmatize("dividing", pos=wordnet.VERB)

print("lemmatize 'sentences':", sentences_lemma)
print("lemmatze 'dividing':", dividing_lemma)

lemmatize 'sentences': sentence
lemmatze 'dividing': divide


In [None]:
from nltk.corpus import stopwords

In [None]:
# English stop words as a set, for constant-time membership checks.
stop_words = set(stopwords.words('english'))

# Keep each token of `word_tokens` whose lower-cased form is not a stop word;
# the token itself is stored with its original casing.
filtered_words = []
for candidate in word_tokens:
    if candidate.lower() in stop_words:
        continue
    filtered_words.append(candidate)

print(filtered_words)

['lots', 'tress', 'tress', 'green', 'leaves']


In [None]:
# NOTE(review): this resource is already requested by the resource-download cell
# at the top of the notebook, so the call below looks redundant — confirm and drop.
nltk.download('averaged_perceptron_tagger_eng')
# Tag every token in `word_tokens` with a part-of-speech label; the printed
# result is a list of (token, tag) pairs.
pos_tages = nltk.pos_tag(word_tokens)
print(pos_tages)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('There', 'EX'), ('are', 'VBP'), ('lots', 'NNS'), ('of', 'IN'), ('tress', 'NN'), ('but', 'CC'), ('the', 'DT'), ('tress', 'NN'), ('with', 'IN'), ('green', 'JJ'), ('leaves', 'NNS'), ('are', 'VBP'), ('very', 'RB'), ('few', 'JJ')]


In [None]:
import spacy

In [None]:
# Load the spaCy pipeline named "en_core_web_sm" and run it over `sentence`,
# which at this point holds the multi-sentence paragraph assigned in the last
# sent_tokenize cell. The cells that follow read token attributes from `doc`.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

In [None]:
# Text of every token in the parsed document, in document order.
token_texts = [tok.text for tok in doc]
print("Tokens:", token_texts)

Tokens: ['Sentence', 'tokenization', 'is', 'the', 'process', 'of', 'dividing', 'a', 'text', 'document', 'or', 'a', 'large', 'block', 'of', 'text', 'into', 'individual', 'sentences', 'called', 'tokens', '.', 'It', 'is', 'a', 'fundamental', 'step', 'in', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'that', 'allows', 'algorithms', 'to', 'handle', 'and', 'analyze', 'text', 'at', 'the', 'sentence', 'level', 'rather', 'than', 'as', 'a', 'continuous', 'stream', '.', 'By', 'breaking', 'text', 'into', 'sentences', ',', 'NLP', 'models', 'can', 'perform', 'tasks', 'like', 'sentiment', 'analysis', ',', 'summarization', ',', 'or', 'translation', 'more', 'effectively', 'on', 'each', 'distinct', 'sentence', '.']


In [None]:
# Lemma string (`lemma_`) of every token in the parsed document.
token_lemmas = [tok.lemma_ for tok in doc]
print("LemmaS:", token_lemmas)

LemmaS: ['sentence', 'tokenization', 'be', 'the', 'process', 'of', 'divide', 'a', 'text', 'document', 'or', 'a', 'large', 'block', 'of', 'text', 'into', 'individual', 'sentence', 'call', 'token', '.', 'it', 'be', 'a', 'fundamental', 'step', 'in', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'that', 'allow', 'algorithm', 'to', 'handle', 'and', 'analyze', 'text', 'at', 'the', 'sentence', 'level', 'rather', 'than', 'as', 'a', 'continuous', 'stream', '.', 'by', 'break', 'text', 'into', 'sentence', ',', 'NLP', 'model', 'can', 'perform', 'task', 'like', 'sentiment', 'analysis', ',', 'summarization', ',', 'or', 'translation', 'more', 'effectively', 'on', 'each', 'distinct', 'sentence', '.']


In [None]:
# (text, part-of-speech tag) pair for every token in the parsed document.
text_pos_pairs = [(tok.text, tok.pos_) for tok in doc]
print("POS Tage:", text_pos_pairs)

POS Tage: [('Sentence', 'NOUN'), ('tokenization', 'NOUN'), ('is', 'AUX'), ('the', 'DET'), ('process', 'NOUN'), ('of', 'ADP'), ('dividing', 'VERB'), ('a', 'DET'), ('text', 'NOUN'), ('document', 'NOUN'), ('or', 'CCONJ'), ('a', 'DET'), ('large', 'ADJ'), ('block', 'NOUN'), ('of', 'ADP'), ('text', 'NOUN'), ('into', 'ADP'), ('individual', 'ADJ'), ('sentences', 'NOUN'), ('called', 'VERB'), ('tokens', 'NOUN'), ('.', 'PUNCT'), ('It', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('fundamental', 'ADJ'), ('step', 'NOUN'), ('in', 'ADP'), ('Natural', 'PROPN'), ('Language', 'PROPN'), ('Processing', 'PROPN'), ('(', 'PUNCT'), ('NLP', 'PROPN'), (')', 'PUNCT'), ('that', 'PRON'), ('allows', 'VERB'), ('algorithms', 'NOUN'), ('to', 'PART'), ('handle', 'VERB'), ('and', 'CCONJ'), ('analyze', 'VERB'), ('text', 'NOUN'), ('at', 'ADP'), ('the', 'DET'), ('sentence', 'NOUN'), ('level', 'NOUN'), ('rather', 'ADV'), ('than', 'ADP'), ('as', 'ADP'), ('a', 'DET'), ('continuous', 'ADJ'), ('stream', 'NOUN'), ('.', 'PUNCT'), ('By'

In [None]:
# Text of each token that spaCy does not flag as a stop word (`is_stop` is False).
non_stop_texts = [tok.text for tok in doc if not tok.is_stop]
print("stop words removal:", non_stop_texts)

stop words removal: ['Sentence', 'tokenization', 'process', 'dividing', 'text', 'document', 'large', 'block', 'text', 'individual', 'sentences', 'called', 'tokens', '.', 'fundamental', 'step', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'allows', 'algorithms', 'handle', 'analyze', 'text', 'sentence', 'level', 'continuous', 'stream', '.', 'breaking', 'text', 'sentences', ',', 'NLP', 'models', 'perform', 'tasks', 'like', 'sentiment', 'analysis', ',', 'summarization', ',', 'translation', 'effectively', 'distinct', 'sentence', '.']
