<a href="https://colab.research.google.com/github/ShubhamVermaDev9/Natural_Language_Processing-/blob/main/NLP_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Natural Language Processing — 9 Key Techniques Explained**
Natural language processing (NLP) is a branch of artificial intelligence that helps computers comprehend and interact with human language.

# **1. Tokenization**: Split text


In [None]:
import nltk
from nltk.tokenize import word_tokenize
import spacy

# Download the 'punkt' and 'punkt_tab' NLTK data packages before tokenizing.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

text = "Tokenization is a key process in NLP. Let's see how it's #working."

# Tokenize the same sentence with both libraries so the results can be
# compared side by side.
nltk_tokens = word_tokenize(text)

nlp = spacy.load("en_core_web_sm")
spacy_sent = nlp(text)
spacy_tokens = []
for token in spacy_sent:
    spacy_tokens.append(token.text)

print("NLTK Tokens:", nltk_tokens)
print("spaCy Tokens:", spacy_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK Tokens: ['Tokenization', 'is', 'a', 'key', 'process', 'in', 'NLP', '.', 'Let', "'s", 'see', 'how', 'it', "'s", '#', 'working', '.']
spaCy Tokens: ['Tokenization', 'is', 'a', 'key', 'process', 'in', 'NLP', '.', 'Let', "'s", 'see', 'how', 'it', "'s", '#', 'working', '.']


# **2. Part-of-Speech (POS) Tagging**: Grammar labeling


In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download the tagger data package before calling nltk.pos_tag.
nltk.download('averaged_perceptron_tagger_eng')

sentence = "John likes to watch action movies. I like playing soccer"

# Split the sentence into tokens, then pair every token with its POS tag.
tokens = word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)

print(pos_tags)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('John', 'NNP'), ('likes', 'VBZ'), ('to', 'TO'), ('watch', 'VB'), ('action', 'NN'), ('movies', 'NNS'), ('.', '.'), ('I', 'PRP'), ('like', 'IN'), ('playing', 'VBG'), ('soccer', 'NN')]


# **3. Stemming and Lemmatization**: Normalize words


In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import LancasterStemmer

text = ("Lemmatization important language techniques. "
        "Caring testable eaten was beautifully better driver")
tokens = word_tokenize(text)

# Three stemmers applied to the same tokens for a side-by-side comparison.
porter = PorterStemmer()
snowball = SnowballStemmer(language='english')
lancaster = LancasterStemmer()

# One shared row template keeps the header and every data row aligned.
row_template = "{0:16}{1:16}{2:18}{3:18}"
print(row_template.format("Word", "Porter", "Snowball", "Lancaster"))
for w in tokens:
    stems = (porter.stem(w), snowball.stem(w), lancaster.stem(w))
    print(row_template.format(w, *stems))

Word            Porter          Snowball          Lancaster         
Lemmatization   lemmat          lemmat            lem               
important       import          import            import            
language        languag         languag           langu             
techniques      techniqu        techniqu          techn             
.               .               .                 .                 
Caring          care            care              car               
testable        testabl         testabl           test              
eaten           eaten           eaten             eat               
was             wa              was               was               
beautifully     beauti          beauti            beauty            
better          better          better            bet               
driver          driver          driver            driv              


# **4. Stop Words**: Remove common words


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


nltk.download('stopwords')

text = "How did you collect those books in Eugene?"
tokens = word_tokenize(text)

# Hold the English stop-word list in a set for the membership checks below.
stop_words = set(stopwords.words('english'))

# Compare in lower case so capitalised tokens such as "How" are matched too.
filtered_tokens = []
for w in tokens:
    if w.lower() in stop_words:
        continue
    filtered_tokens.append(w)

print(f"Original Tokens: {tokens}")
print(f"After Removing Stop Words: {filtered_tokens}")

[nltk_data] Downloading package stopwords to /root/nltk_data...


Original Tokens: ['How', 'did', 'you', 'collect', 'those', 'books', 'in', 'Eugene', '?']
After Removing Stop Words: ['collect', 'books', 'Eugene', '?']


[nltk_data]   Unzipping corpora/stopwords.zip.


# **5. Bag of Words (BoW) with Count Vector**: Word frequency


In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Four short documents; the last one repeats "was" so that one count exceeds 1.
text_data = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness was"
]

# Fit the vocabulary and build the document-term matrix in one step.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(text_data)

# Column labels of the matrix, and a dense copy of it for printing.
feature_names = vectorizer.get_feature_names_out()
bow_array = bow_matrix.toarray()

print("Vocabulary:\n", feature_names)
print("BoW Document Term Matrix:\n", bow_array)

Vocabulary:
 ['age' 'best' 'foolishness' 'it' 'of' 'the' 'times' 'was' 'wisdom' 'worst']
BoW Document Term Matrix:
 [[0 1 0 1 1 1 1 1 0 0]
 [0 0 0 1 1 1 1 1 0 1]
 [1 0 0 1 1 1 0 1 1 0]
 [1 0 1 1 1 1 0 2 0 0]]


# **6. TF-IDF**: Word importance


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third third third one.',
    'Is this the first document?',
]

vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(documents)
words = vectorizer.get_feature_names_out()

# Map each document index to a {term: tf-idf score} dict, keeping only the
# terms that have a non-zero entry in that document's row.
tfidf_scores = {}
for i in range(len(documents)):
    feature_index = tfidf_vectors[i, :].nonzero()[1]
    tfidf_scores[i] = {words[idx]: tfidf_vectors[i, idx]
                       for idx in feature_index}

# Report the scores per document, numbering documents from 1.
for doc_id, scores in tfidf_scores.items():
    print(f"Document {doc_id + 1}:")
    for word, score in scores.items():
        print(f"\t{word}: {score:.4f}")

Document 1:
	this: 0.3841
	is: 0.3841
	the: 0.3841
	first: 0.5803
	document: 0.4698
Document 2:
	this: 0.2811
	is: 0.2811
	the: 0.2811
	document: 0.6876
	second: 0.5386
Document 3:
	this: 0.1518
	is: 0.1518
	the: 0.1518
	and: 0.2909
	third: 0.8727
	one: 0.2909
Document 4:
	this: 0.3841
	is: 0.3841
	the: 0.3841
	first: 0.5803
	document: 0.4698


# **7. N-grams**: Capture context


In [None]:
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
N = 2  # number of consecutive tokens in each n-gram

text = "I love reading books about human history."
tokens = word_tokenize(text)

# ngrams() yields tuples lazily; materialise them so they can be reused.
word_ngrams = [*ngrams(tokens, N)]

print(f"{N}-grams:")
for gram in word_ngrams:
    print(gram)

2-grams:
('I', 'love')
('love', 'reading')
('reading', 'books')
('books', 'about')
('about', 'human')
('human', 'history')
('history', '.')


# **8. Word Embeddings**: Capture context


In [None]:
!pip install gensim
from gensim.models import Word2Vec
import nltk

nltk.download('brown')
from nltk.corpus import brown

sents = brown.sents()

model = Word2Vec(sentences=sents, vector_size=100, window=3, min_count=1)
print(model)
word_vectors = model.wv
output = word_vectors.most_similar(positive=['woman', 'king'],
                                   negative=['man'], topn=3)
print(f"king - man + woman = {output}")

print(f"similarity('king', 'queen'):{model.wv.similarity('king','queen')}")
print(f"similarity('best', 'worst'):{model.wv.similarity('best','worst')}")
print(f"similarity('best', 'was'): {model.wv.similarity('best', 'was')}")
similar_words = word_vectors.most_similar("Paris", topn=3)

print(f"Words similar to '{word}': {similar_words}")



[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Word2Vec<vocab=56057, vector_size=100, alpha=0.025>
king - man + woman = [('heroes', 0.9279902577400208), ('throats', 0.924342930316925), ('inventor', 0.9220629334449768)]
similarity('king', 'queen'):0.9242252111434937
similarity('best', 'worst'):0.7875627279281616
similarity('best', 'was'): 0.38989323377609253
Words similar to 'document': [('London', 0.984093427658081), ('Rome', 0.9817825555801392), ('Boston', 0.9792966842651367)]


# **9. Named Entity Recognition (NER)**: Extract entities


In [2]:
import spacy

NER = spacy.load("en_core_web_sm")

text = ("Apple is a technology company based in Cupertino, California, "
        "founded by Steve Jobs 48 years ago.")
# Additional sample sentences; only `text` is analysed below.
text2 = "Daniel McDonald's son went to McDonald's and ordered a Happy Meal"
text3 = "hi my name is shubham verma"

doc = NER(text)

# Collect (entity text, entity label) pairs from the processed document.
named_entities = []
for ent in doc.ents:
    named_entities.append((ent.text, ent.label_))

# Print each entity with its label and spaCy's description of that label.
for entity, label in named_entities:
    description = spacy.explain(label)
    print(f"Entity: {entity}, Label: {label} ({description})")

Entity: Apple, Label: ORG (Companies, agencies, institutions, etc.)
Entity: Cupertino, Label: GPE (Countries, cities, states)
Entity: California, Label: GPE (Countries, cities, states)
Entity: Steve Jobs, Label: PERSON (People, including fictional)
Entity: 48 years ago, Label: DATE (Absolute or relative dates or periods)
