# Setup



## Install necessary libraries & download models here

In [1]:
!pip install spacy
!python -m spacy download en_core_web_md

ERROR: Exception:
Traceback (most recent call last):
  File "D:\ProgramData\Anaconda3\envs\torch\lib\site-packages\pip\_internal\cli\base_command.py", line 173, in _main
    status = self.run(options, args)
  File "D:\ProgramData\Anaconda3\envs\torch\lib\site-packages\pip\_internal\cli\req_command.py", line 203, in wrapper
    return func(self, options, args)
  File "D:\ProgramData\Anaconda3\envs\torch\lib\site-packages\pip\_internal\commands\install.py", line 315, in run
    requirement_set = resolver.resolve(
  File "D:\ProgramData\Anaconda3\envs\torch\lib\site-packages\pip\_internal\resolution\resolvelib\resolver.py", line 94, in resolve
    result = self._result = resolver.resolve(
  File "D:\ProgramData\Anaconda3\envs\torch\lib\site-packages\pip\_vendor\resolvelib\resolvers.py", line 472, in resolve
    state = resolution.resolve(requirements, max_rounds=max_rounds)
  File "D:\ProgramData\Anaconda3\envs\torch\lib\site-packages\pip\_vendor\resolvelib\resolvers.py", line 341, in res

# Bag of Words

#### Define some training utterances

In [2]:
class Category:
  """String labels for the two utterance classes used in this section."""
  BOOKS = "BOOKS"
  CLOTHING = "CLOTHING"

# Training utterances: the first two are about books, the last two about clothing.
train_x = [
  "i love the book",
  "this is a great book",
  "the fit is great",
  "i love the shoes",
]
# One label per utterance, in the same order as train_x.
train_y = [Category.BOOKS] * 2 + [Category.CLOTHING] * 2

#### Fit vectorizer to transform text to bag-of-words vectors

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records only whether a word occurs in an utterance, not how often.
vectorizer = CountVectorizer(binary=True)
train_x_vectors = vectorizer.fit_transform(train_x)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is the
# replacement (available since 1.0). It returns an ndarray, so convert it to a plain
# list to keep the printed vocabulary in the familiar ['book', 'fit', ...] form.
print(vectorizer.get_feature_names_out().tolist())
print(train_x_vectors.toarray())

['book', 'fit', 'great', 'is', 'love', 'shoes', 'the', 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]




#### Train SVM Model

In [4]:
from sklearn import svm

# Linear-kernel support vector classifier, fitted on the bag-of-words vectors and labels.
clf_svm = svm.SVC(kernel='linear')
# Kept as the cell's last expression so the fitted estimator is echoed as output.
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

#### Test new utterances on trained model

In [5]:
# Encode an unseen utterance with the vocabulary learned from train_x.
# Note: the fitted vocabulary printed above contains 'book' but not 'books', so the
# plural presumably contributes nothing to the vector -- a limitation of exact-match
# bag-of-words features.
test_x = vectorizer.transform(['i love the books'])

# Last expression of the cell: the predicted label array is displayed.
clf_svm.predict(test_x)

array(['CLOTHING'], dtype='<U8')

# Word Vectors

In [6]:
import spacy

# Load the "en_core_web_md" pipeline; it must have been downloaded beforehand
# (see the install cell in the Setup section).
nlp = spacy.load("en_core_web_md")

ModuleNotFoundError: No module named 'spacy'

In [None]:
# Show the training utterances that the following cells turn into word vectors.
print(train_x)

In [None]:
# Run each training utterance through the spaCy pipeline ...
docs = list(map(nlp, train_x))
# ... and keep the `.vector` attribute of every processed document.
train_x_word_vectors = [doc.vector for doc in docs]

In [None]:
from sklearn import svm

# Same linear-kernel SVC as in the bag-of-words section, but fitted on the
# spaCy document vectors instead of word-presence vectors.
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_word_vectors, train_y)

In [None]:
# Unseen utterances to classify with the word-vector model.
test_x = ["I went to the bank and wrote a check", "let me check that out"]
test_docs = list(map(nlp, test_x))
test_x_word_vectors = [test_doc.vector for test_doc in test_docs]

# Last expression of the cell: the predicted labels are displayed.
clf_svm_wv.predict(test_x_word_vectors)

# Regexes

In [None]:
import re

# "read" and "story" are wrapped in \b so they only match as whole words;
# "book" has no boundaries and may match inside a longer word.
regexp = re.compile(r"\bread\b|\bstory\b|book")

phrases = [
  "I liked that story.",
  "the car treaded up the hill",
  "this hat is nice",
]

# Keep every phrase in which the pattern is found anywhere.
matches = [candidate for candidate in phrases if regexp.search(candidate)]

print(matches)




# Stemming/Lemmatization

### Setup

In [None]:
import nltk

# Resources used by the cells below: WordNet (lemmatizer), the stopword lists,
# and the Punkt models behind word_tokenize.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data from 'punkt_tab' instead of 'punkt';
# without it word_tokenize raises LookupError. Fetching both keeps old and new
# NLTK versions working.
nltk.download('punkt_tab')

### Stemming

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

phrase = "reading the books"
words = word_tokenize(phrase)

# Stem every token of the phrase.
stemmed_words = [stemmer.stem(token) for token in words]

# Last expression of the cell: the re-joined, stemmed phrase is displayed.
" ".join(stemmed_words)

### Lemmatizing

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = "reading the books"
# word_tokenize comes from the import in the Stemming cell above.
words = word_tokenize(phrase)

# pos='v' asks the lemmatizer to treat every token as a verb.
lemmatized_words = [lemmatizer.lemmatize(token, pos='v') for token in words]

# Last expression of the cell: the re-joined, lemmatized phrase is displayed.
" ".join(lemmatized_words)



# Stopwords
### Tokenize, then remove Stopwords

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# A set gives constant-time membership tests in the filter below.
stop_words = set(stopwords.words('english'))

phrase = "Here is an example sentence demonstrating the removal of stopwords"

words = word_tokenize(phrase)

# The stopword list is lowercase, so compare on the lowercased token; otherwise a
# capitalised stopword such as "Here" would slip through. The original casing of
# the kept words is preserved.
stripped_phrase = [word for word in words if word.lower() not in stop_words]

# Last expression of the cell: the phrase with stopwords removed is displayed.
" ".join(stripped_phrase)


# Various other techniques (spell correction, sentiment analysis, & POS tagging)

In [None]:
!python -m textblob.download_corpora

In [None]:
from IPython.display import display
from textblob import TextBlob

phrase = "the book was horrible"

tb_phrase = TextBlob(phrase)

# A notebook cell only echoes its *last* expression, so the intermediate results
# have to be displayed explicitly or they are computed and silently thrown away.
display(tb_phrase.correct())  # spell-corrected copy of the phrase
display(tb_phrase.tags)       # part-of-speech tags

# Last expression of the cell: the sentiment is displayed.
tb_phrase.sentiment

# Transformer Architecture

### Setup

In [None]:
!pip install spacy-transformers
!python -m spacy download en_trf_bertbaseuncased_lg

### Using spaCy to utilize a BERT model

In [None]:
import spacy
# NOTE(review): torch is not referenced in this cell; presumably imported to make
# sure the backend is installed -- confirm whether it is still needed.
import torch

# Rebinds `nlp` (previously the en_core_web_md pipeline) to the BERT-based pipeline
# downloaded in the Setup cell of this section.
nlp = spacy.load("en_trf_bertbaseuncased_lg")
# Encode a sample sentence with the newly loaded pipeline.
doc = nlp("Here is some text to encode.")

In [None]:
class Category:
  """String labels for the two utterance classes used in the transformer section."""
  BOOKS = "BOOKS"
  BANK = "BANK"

# Training utterances: four about books followed by three about banking.
train_x = [
  "good characters and plot progression",
  "check out the book",
  "good story. would recommend",
  "novel recommendation",
  "need to make a deposit to the bank",
  "balance inquiry savings",
  "save money",
]
# One label per utterance, in the same order as train_x.
train_y = [Category.BOOKS] * 4 + [Category.BANK] * 3

In [None]:
from sklearn import svm

# Embed the training utterances with the currently loaded pipeline.
docs = list(map(nlp, train_x))
train_x_vectors = [train_doc.vector for train_doc in docs]

# Fit a linear-kernel SVC on the document vectors.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Embed an unseen utterance the same way.
test_x = ["check this story out"]
docs = list(map(nlp, test_x))
test_x_vectors = [test_doc.vector for test_doc in docs]

# Last expression of the cell: the predicted label is displayed.
clf_svm.predict(test_x_vectors)