# NLTK (Natural Language Toolkit)

In [None]:
! pip install nltk



In [None]:
import nltk

# Fetch the complete NLTK data collection, which also bundles resources for
# languages other than English. Left as the final expression so the cell
# displays the call's return value.
nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

# Tokenization
Tokenization means breaking text down into smaller units: it splits paragraphs into sentences and sentences into words. It is one of the first steps of any NLP pipeline. Let us look at the two major kinds of tokenization that NLTK provides:

In [None]:
from nltk import sent_tokenize, word_tokenize

sentence = "Natural Language Processing is a very popular topic in Machine Learning. It is very interesting"

# Run the word-level tokenizer first, then the sentence-level one, and print
# the list each of them returns.
for tokenize in (word_tokenize, sent_tokenize):
    print(tokenize(sentence))


['Natural', 'Language', 'Processing', 'is', 'a', 'very', 'popular', 'topic', 'in', 'Machine', 'Learning', '.', 'It', 'is', 'very', 'interesting']
['Natural Language Processing is a very popular topic in Machine Learning.', 'It is very interesting']


# Stemming
Stemming generates the base word from the given word by removing the affixes of the word.

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Stem three inflected forms of the same verb, one result per line.
for word in ("playing", "plays", "played"):
    print(stemmer.stem(word))

play
play
play


 # Lemmatization
Lemmatization involves grouping together the inflected forms of the same word. Unlike stemming which simply removes prefixes or suffixes, it considers the word's meaning and part of speech (POS) and ensures that the base form is a valid word. This makes lemmatization more accurate as it avoids generating non-dictionary words.

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize three inflected forms of the same verb; 'v' tells the lemmatizer
# to treat each word as a verb.
for word in ("playing", "plays", "played"):
    print(lemmatizer.lemmatize(word, 'v'))


play
play
play


# Part of Speech Tagging
Part of Speech (POS) tagging refers to assigning each word of a sentence to its part of speech. It is significant as it helps to give a better syntactic overview of a sentence.

In [None]:
from nltk import pos_tag, word_tokenize

text = "GeeksforGeeks is best platform for Computer Science Students"

# Tokenize first, then tag: pos_tag works on a list of tokens and yields
# (token, tag) pairs.
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)
print(tagged_tokens)

[('GeeksforGeeks', 'NNP'), ('is', 'VBZ'), ('best', 'JJS'), ('platform', 'NN'), ('for', 'IN'), ('Computer', 'NNP'), ('Science', 'NNP'), ('Students', 'NNS')]


# Tokenization Using spaCy
We use spaCy's blank model (`spacy.blank("en")`), which initializes a minimal pipeline without pre-trained components such as part-of-speech tagging or named entity recognition.

In [None]:
import spacy

# Tokenizer-only English pipeline (no trained components).
nlp = spacy.blank("en")

doc = nlp("Natural Language Processing is a very popular topic in Machine Learning. It is very interesting")

# Print every token of the document on its own line.
print(*doc, sep="\n")

Natural
Language
Processing
is
a
very
popular
topic
in
Machine
Learning
.
It
is
very
interesting


In [None]:
# List every attribute and method available on the first token of `doc`.
first_token = doc[0]
dir(first_token)

In [None]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/,
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

doc = nlp(text)

# Collect the text of every token the tokenizer flags as URL-like, then print
# the matches one per line in document order.
url_like_texts = [token.text for token in doc if token.like_url]
for url_text in url_like_texts:
    print(url_text)



http://www.data.gov/
http://www.science
http://data.gov.uk/.
http://www3.norc.org/gss+website/
http://www.europeansocialsurvey.org/.


In [None]:
# Two toy transactions. "\u20ac" is the euro sign, written as an escape so the
# literal survives files that are saved or read with the wrong encoding.
transactions = "Tony gave two $ to Peter, Bruce gave 500 \u20ac to Steve"
doc = nlp(transactions)

# Walk adjacent token pairs and report every number-like token that is
# immediately followed by a currency symbol.
for amount, symbol in zip(doc, doc[1:]):
    if amount.like_num and symbol.is_currency:
        print(amount.text, ' ', symbol.text)

two   $
500   €


# Displaying the Pipeline Components
We use the pre-trained `en_core_web_sm` model, which includes various components for NLP tasks. After loading the model, we can display the components available in the pipeline.

In [None]:
# Rebind `nlp` from the blank pipeline to the small pre-trained English
# pipeline; every later cell that calls `nlp(...)` uses this one.
nlp = spacy.load("en_core_web_sm")
# Last expression of the cell: displays the names of the pipeline components.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

| Pipeline Step       | Purpose                              | Example                       |
| ------------------- | ------------------------------------ | ----------------------------- |
| **tok2vec**         | Converts tokens into numeric vectors | “Apple” → [0.1, -0.2, 0.5…]   |
| **tagger**          | Assigns part-of-speech tags          | “buying” → VERB               |
| **parser**          | Finds dependency relations           | “Apple” → subject of “buying” |
| **attribute_ruler** | Adjusts linguistic attributes        | Fixes contractions, rules     |
| **lemmatizer**      | Finds base form of words             | “running” → “run”             |
| **ner**             | Detects named entities               | “Apple” → ORG                 |

| Model            | Vector Source                       | Dimension  | Type               |
| ---------------- | ----------------------------------- | ---------- | ------------------ |
| `en_core_web_sm` | ⚠️ **No pre-trained word vectors**  | 0D (empty) | Small, lightweight |
| `en_core_web_md` | Medium-sized pre-trained embeddings | 300D       | GloVe-style        |
| `en_core_web_lg` | Large pre-trained embeddings        | 300D       | Better quality     |


In [None]:
# Build the Doc in this cell with the pipeline that is currently loaded.
# Otherwise `doc` is whatever an earlier cell left behind, and the result
# depends on the order in which the cells were executed.
doc = nlp("Natural Language Processing is a very popular topic in Machine Learning. It is very interesting")

print(doc[0].text, doc[0].vector[:5])

print(doc[0].vector.shape)

# Shape of the pipeline's static word-vector table.
# NOTE(review): for a pipeline without pre-trained vectors, token.vector may
# fall back to the context tensor instead of being empty - confirm against the
# spaCy docs; the table shape below is the direct check.
print(nlp.vocab.vectors.shape)


Natural []
(0,)


In [None]:
# Download and install the en_core_web_md pipeline package via the spaCy CLI.
# The installer output advises restarting the kernel/runtime afterwards so the
# newly installed package can be loaded.
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m33.5/33.5 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
nlp = spacy.load("en_core_web_md")

doc = nlp("Natural Language Processing is a very popular topic in Machine Learning. It is very interesting")

# First token: its text, the leading five vector components, and the full
# vector shape.
first_token = doc[0]
print(first_token.text, first_token.vector[:5], first_token.vector.shape)

# Similarity between the tokens at positions 15 and 6 of the sentence
# ("interesting" and "popular").
token_a, token_b = doc[15], doc[6]
print(token_a.similarity(token_b))

Natural [-0.66059   0.2348   -0.021227 -0.32737  -0.062493] (300,)
0.497653067111969


In [None]:
# One line per token: surface text followed by its fine-grained POS tag.
for tok in doc:
  text_part, tag_part = tok.text, tok.tag_
  print(text_part, "   ", tag_part)

Natural     NNP
Language     NNP
Processing     NNP
is     VBZ
a     DT
very     RB
popular     JJ
topic     NN
in     IN
Machine     NNP
Learning     NNP
.     .
It     PRP
is     VBZ
very     RB
interesting     JJ


In [None]:
# Dependency parse: one line per token with its relation label and its head.
# "\u2192" is the right-arrow character, written as an escape so the separator
# survives files that are saved or read with the wrong encoding.
for token in doc:
    print(f"{token.text:<10} {token.dep_:<10} \u2192 {token.head.text}")


Natural    compound   → Language
Language   compound   → Processing
Processing nsubj      → is
is         ROOT       → is
a          det        → topic
very       advmod     → popular
popular    amod       → topic
topic      attr       → is
in         prep       → topic
Machine    compound   → Learning
Learning   pobj       → in
.          punct      → is
It         nsubj      → is
is         ROOT       → is
very       advmod     → interesting
interesting acomp      → is


In [None]:
# One line per token: surface text followed by its lemma.
for tok in doc:
    surface, lemma = tok.text, tok.lemma_
    print(surface, " ", lemma)

Natural   Natural
Language   Language
Processing   Processing
is   be
a   a
very   very
popular   popular
topic   topic
in   in
Machine   Machine
Learning   Learning
.   .
It   it
is   be
very   very
interesting   interesting


In [None]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# One line per recognised entity: its text, its label, and spaCy's
# human-readable explanation of that label.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, label, "->", spacy.explain(label))

Tesla Inc ORG -> Companies, agencies, institutions, etc.
$45 billion MONEY -> Monetary values, including unit


# TextBlob
TextBlob is a simple, beginner-friendly NLP library built on top of NLTK and Pattern.
It provides an easy API for common text-processing tasks — without needing to deal with complex NLP pipelines.

| Purpose                                 | Description                                   | Example                                     |
| --------------------------------------- | --------------------------------------------- | ------------------------------------------- |
| 🗣️ **Sentiment Analysis**              | Detect positive/negative/neutral tone in text | “I love this phone” → `polarity = 0.5`      |
| 🧩 **Tokenization**                     | Split text into words/sentences               | `"I like NLP." → ['I', 'like', 'NLP', '.']` |
| 🏷️ **POS Tagging**                     | Identify parts of speech (noun, verb, etc.)   | `"running" → VERB`                          |
| 🧠 **Noun Phrase Extraction**           | Extract important phrases                     | `"the smart student"`                       |
| 🔤 **Lemmatization / Word Inflection**  | Convert between word forms                    | `"better" → "good"`                         |
| 🌍 **Translation & Language Detection** | Translate text (using online API)             | English → Spanish                           |


In [None]:

from textblob import TextBlob

blob = TextBlob("TextBlob makes NLP simple. It is also fun to use!")
blob2 = TextBlob("I hate bugs in my code.")

# Sentiment analysis for both blobs.
#   polarity:     from -1 (negative) to 1 (positive)
#   subjectivity: from 0 (objective) to 1 (subjective)
first_sentiment = blob.sentiment
second_sentiment = blob2.sentiment
print(first_sentiment, second_sentiment)

# Tokenization: the word list and the sentence list of the first blob.
words, sentences = blob.words, blob.sentences
print(words, sentences)

# Noun phrase extraction on the first blob.
noun_phrases = blob.noun_phrases
print(noun_phrases)

Sentiment(polarity=0.1875, subjectivity=0.2785714285714286) Sentiment(polarity=-0.8, subjectivity=0.9)
['TextBlob', 'makes', 'NLP', 'simple', 'It', 'is', 'also', 'fun', 'to', 'use'] [Sentence("TextBlob makes NLP simple."), Sentence("It is also fun to use!")]
['textblob', 'nlp']
