## Bag of words approach

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
train_x = ["i love the book and book", "this is a great book", "the fit is great", "i love the shoes"]

vectorizer = CountVectorizer(binary=True, ngram_range=(1,2)) # ngram for the numbers of words
train_x_vectors = vectorizer.fit_transform(train_x) # fix a dictionary from our data

print(vectorizer.get_feature_names())
print(train_x_vectors.toarray())

['and', 'and book', 'book', 'book and', 'fit', 'fit is', 'great', 'great book', 'is', 'is great', 'love', 'love the', 'shoes', 'the', 'the book', 'the fit', 'the shoes', 'this', 'this is']
[[1 1 1 1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0]
 [0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 1 1 1 0 1 1 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0]]


In [3]:
class Category:
  BOOKS = "BOOKS"
  CLOTHING = "CLOTHING"

train_y = [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]

In [4]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors,train_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [5]:
test_x = vectorizer.transform(['i like the book'])
test_x1 = vectorizer.transform(['i do not like shoes'])

print(clf_svm.predict(test_x))
print(clf_svm.predict(test_x1))

['BOOKS']
['CLOTHING']


## Word Vectors

In [None]:
!pip install spacy
!python -m spacy download en_core_web_md

In [7]:
import spacy
import en_core_web_md
nlp = en_core_web_md.load()

In [8]:
print(train_x)
print(train_y)

['i love the book and book', 'this is a great book', 'the fit is great', 'i love the shoes']
['BOOKS', 'BOOKS', 'CLOTHING', 'CLOTHING']


In [9]:
docs = [nlp(text) for text in train_x]

train_x_word_vector = [x.vector for x in docs]

In [10]:
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_word_vector,train_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:

test_x = ["i love the books"]
test_docs = [nlp(text) for text in test_x]
test_x_word_vectors = [x.vector for x in test_docs] 

clf_svm_wv.predict(test_x_word_vectors)

array(['BOOKS'], dtype='<U8')

## Regexes

In [12]:
import re

regexp = re.compile(r"\bread\b|\bstory\b|book")

phrases = ["i like that story", "i like that book", "this hat is nice", "the car tread up the hill"]

matches = []
for phrase in phrases:
  if re.search(regexp, phrase):
    matches.append(phrase)
print(matches)

['i like that story', 'i like that book']


## Stemming/Lemmatization

In [13]:
# technique to normalize text
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

phrase = "reading the books."

words = word_tokenize(phrase)
print(words)

stemmed_words = []
for word in words:
  stemmed_words.append(stemmer.stem(word))

" ".join(stemmed_words)

['reading', 'the', 'books', '.']


'read the book .'

In [15]:
from nltk.stem import WordNetLemmatizer # reduce words simple form

lemmatizer = WordNetLemmatizer()

# need to tokenize everything

phrase = "reading the books."

words = word_tokenize(phrase)
print(words)

lemmatizer_words = []
for word in words:
  lemmatizer_words.append(lemmatizer.lemmatize(word, pos='v')) # expect the part of speech

" ".join(lemmatizer_words)


['reading', 'the', 'books', '.']


'read the book .'

## Stopwords removal

In [16]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')  # caputring more meaning from the phrase
print(len(stop_words))


phrase = "Here is an example sentecne demonstrating the removal of stepwords"

words = word_tokenize(phrase)

stripped_phrase = []
for word in words:
  if word not in stop_words:
    stripped_phrase.append(word)

" ".join(stripped_phrase)


179


'Here example sentecne demonstrating removal stepwords'

## Others techniques (spell correction, sentiment, pos tagging)

In [17]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [18]:
from textblob import TextBlob

phrase = "i read the bo0ok and it was great"

tb_phrase = TextBlob(phrase)

print(tb_phrase.correct())
print(tb_phrase.tags)
print(tb_phrase.sentiment)


i read the book and it was great
[('i', 'NN'), ('read', 'VBP'), ('the', 'DT'), ('bo0ok', 'NN'), ('and', 'CC'), ('it', 'PRP'), ('was', 'VBD'), ('great', 'JJ')]
Sentiment(polarity=0.8, subjectivity=0.75)


## Transformer architecture

In [None]:
!pip install spacy-transformers
!python -m spacy download en_trf_bertbaseuncased_lg

In [33]:
import spacy
import torch

nlp = spacy.load("en_trf_bertbaseuncased_lg")
doc = nlp("Here is some text to encode.")


In [34]:
class Category:
  BOOKS = "BOOKS"
  BANK = "BANK"

train_x = ["good characters and plot progression", "check out the book", "good story. would recommend", "novel recommendation", "need to make a deposit to the bank", "balance inquiry savings", "save money"]
train_y = [Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BANK, Category.BANK, Category.BANK]

In [35]:
from sklearn import svm

docs = [nlp(text) for text in train_x]
train_x_vectors = [doc.vector for doc in docs]
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x = ["check this story out"]
docs = [nlp(text) for text in test_x]
test_x_vectors = [doc.vector for doc in docs]

clf_svm.predict(test_x_vectors)

array(['BOOKS'], dtype='<U5')