<a href="https://colab.research.google.com/github/Raaj-4320/NLP/blob/main/NLP_Techniques_Revision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!python -m spacy info
!python -m pip freeze | grep spacy
!python -m spacy download en_core_web_sm


[1m

spaCy version    3.7.4                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-6.1.58+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.7.1), en_core_web_lg (3.7.1)



# Bag Of Words

## Define some training utterances


In [63]:
class Category:
  BOOKS = "BOOKS"
  CLOTHING = "CLOTHING"

train_x = ["i love the book", "this is a great book", "the fit is great", "i love the shoes"]
train_y = [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]

## Fit vectorizer to transform text to bag-of-words vectors

In [64]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)
train_x_vectors = vectorizer.fit_transform(train_x)

print(vectorizer.get_feature_names_out())
print(train_x_vectors.toarray())

['book' 'fit' 'great' 'is' 'love' 'shoes' 'the' 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]


## Train SVM Model


In [65]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

### Test new utterances on trained model


In [66]:
test_x = vectorizer.transform(['i love the books'])

clf_svm.predict(test_x)

array(['CLOTHING'], dtype='<U8')

# Word Vectors


In [67]:
import spacy
nlp = spacy.load("en_core_web_sm")


In [68]:
print(train_x)


['i love the book', 'this is a great book', 'the fit is great', 'i love the shoes']


In [69]:
docs = [nlp(text) for text in train_x]
train_x_word_vectors = [x.vector for x in docs]

In [70]:
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_word_vectors, train_y)

In [71]:
test_x = ["The fitting is good"]
test_docs = [nlp(text) for text in test_x]
test_x_word_vectors =  [x.vector for x in test_docs]

clf_svm_wv.predict(test_x_word_vectors)

array(['CLOTHING'], dtype='<U8')

# Regexes

* **Pattern matching of strings in Python** ---------- 123-124-123, 444-555-555, +1-(123)-124-1223

* Password checkers, phone numbers, emails and more!


In [72]:
import re

In [73]:
regexp = re.compile(r"\bread\b|\bstory\b|book")

phrases = ["I liked that story","I like the car that treated up the hill", "This hat is nice"]

matches = []

for phrase in phrases:
  if re.search(regexp, phrase):
    matches.append(phrase)


print(matches)

['I liked that story']



# Stemming and lemmatization

Techniques to normalize the text

reading ---> read
Books   ---> Book
Stories ---> `Stori` for `stemming` and  `story` for `lemmatizing`



In [74]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [75]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

phrase = "reading the books"
words = word_tokenize(phrase)
print(words)

stemmed_words = []
for word in words:
  stemmed_words.append(stemmer.stem(word))

" ".join(stemmed_words)

['reading', 'the', 'books']


'read the book'

In [77]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = "reading the books"
words = word_tokenize(phrase)

lemmatized_words = []
for word in words:
  lemmatized_words.append(lemmatizer.lemmatize(word))

" ".join(lemmatized_words)


'reading the book'

# Stopwords removal

* The a set of most common words in english
* This, that, he, it

In [78]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
# print(stop_words)

phrase = "Here is an example sentence demonstrating the removal of stopwords"

words = word_tokenize(phrase)

stripped_phrase = []
for word in words:
  if word not in stop_words:
    stripped_phrase.append(word)

" ".join(stripped_phrase)

'Here example sentence demonstrating removal stopwords'

# Various other techniques (spell correction, sentiment,& pos (part of speech tagging)

In [55]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [79]:
from textblob import TextBlob

phrase = "I book sucked it was the worst"

tb_phrase = TextBlob(phrase)

tb_phrase.correct()

tb_phrase.tags

tb_phrase.sentiment

Sentiment(polarity=-1.0, subjectivity=1.0)

# Recurrent Neural Networks

* **Drawbacks:**
* Longer dependendencies don't always perfotm well
* "I need to go to the bank today so that i can make a deposite, i hope it's not closed"

* sequential nature of RNNs make it tough to parallelize and effectively use modern GPUs.




## Transformner architecture

In [80]:
!pip install spacy-transformers

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/197.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/197.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<4.37.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (313 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.0/

In [81]:
!python -m spacy download en_trf_bertbaseuncased_lg


[38;5;1m✘ No compatible package found for 'en_trf_bertbaseuncased_lg' (spaCy
v3.7.4)[0m

