# This notebook trials bag-of-words (BoW) and embedding representations as input for training classifiers and making predictions

In [4]:
from collections import Counter
import re
import nltk
nltk.download('wordnet')
from nltk import tokenize

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [54]:
# Toy corpus from which the bag-of-words vocabulary is built below.
sentences = [
    'This is a house.',
    'The car is blue.',
    'The boy ran quickly.',
    'The girl wore a nice dress',
]

In [55]:
def preprocess(message):
    """Turn a raw text message into a list of lemmatised tokens.

    The message is lower-cased, every character that is neither a word
    character nor whitespace (and every underscore) is replaced by a space,
    and the result is split on whitespace. Tokens of a single character are
    discarded; the remaining tokens are passed through NLTK's
    ``WordNetLemmatizer``.

    Parameters
    ----------
    message : str
        The text message to be preprocessed.

    Returns
    -------
    list of str
        The preprocessed tokens, in their original order.
    """
    lowered = message.lower()
    # Punctuation and underscores become spaces so that splitting on
    # whitespace separates the words cleanly.
    cleaned = re.sub(r'[^\w\s]|_', ' ', lowered)
    words = cleaned.split()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for word in words:
        # Skip one-character tokens (e.g. the article "a").
        if len(word) > 1:
            lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [56]:
# Preprocess every sentence of the corpus into its own list of tokens.
tokenized = list(map(preprocess, sentences))
tokenized

[['this', 'is', 'house'],
 ['the', 'car', 'is', 'blue'],
 ['the', 'boy', 'ran', 'quickly'],
 ['the', 'girl', 'wore', 'nice', 'dress']]

In [57]:
# Working variant: flatten the per-sentence token lists into one stream of
# tokens and count how often each token occurs across the whole corpus.
bow1 = Counter(
    token
    for sentence_tokens in tokenized
    for token in sentence_tokens
)
bow1

Counter({'this': 1,
         'is': 2,
         'house': 1,
         'the': 3,
         'car': 1,
         'blue': 1,
         'boy': 1,
         'ran': 1,
         'quickly': 1,
         'girl': 1,
         'wore': 1,
         'nice': 1,
         'dress': 1})

In [12]:
# Failed variant, kept for reference: the `for` clauses are in the wrong order.
# Comprehension clauses nest left to right, so `for j in i` is evaluated first
# and looks up `i` before the later `for i in tokenized` clause has bound it,
# which raises NameError.
bow2 = Counter([j for j in i for i in tokenized])
bow2

NameError: name 'i' is not defined

In [15]:
# Failed variant, kept for reference: the inner `(j for j in i)` is a generator
# expression, so the outer list holds one generator object per sentence.
# Counter therefore counts the generator objects themselves (each exactly
# once) rather than the tokens.
bow3 = Counter([(j for j in i) for i in tokenized])
bow3
# Using [] instead of () for the inner expression raises an unhashable type
# error, because each element would then be a list.

Counter({<generator object <listcomp>.<genexpr> at 0x000001F751D63D00>: 1,
         <generator object <listcomp>.<genexpr> at 0x000001F751D63CA8>: 1,
         <generator object <listcomp>.<genexpr> at 0x000001F751D63DB0>: 1,
         <generator object <listcomp>.<genexpr> at 0x000001F751D63E08>: 1})

In [14]:
# Failed variant, kept for reference: the outer comprehension iterates over
# `i`, but `i` is only bound by the `for i in tokenized` clause of the inner
# comprehension, so the outer lookup of `i` raises NameError.
bow4 = Counter([[j for i in tokenized] for j in i])
bow4

NameError: name 'i' is not defined

In [58]:
# View of the occurrence counts only, in the same order as the Counter's keys.
bow1.values()

dict_values([1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [59]:
# View of the vocabulary only: the distinct tokens counted in bow1.
bow1.keys()

dict_keys(['this', 'is', 'house', 'the', 'car', 'blue', 'boy', 'ran', 'quickly', 'girl', 'wore', 'nice', 'dress'])

In [99]:
# Unseen sentences, some of whose words are not in the vocabulary built
# above; they go through the same preprocessing as the original corpus.
new_sentences = [
    'The car drove fast',
    'The house is blue',
]
new_tokenized = list(map(preprocess, new_sentences))
new_tokenized

[['the', 'car', 'drove', 'fast'], ['the', 'house', 'is', 'blue']]

In [89]:
# Explicit-loop variant: for each new sentence keep only the words that are
# already part of the bag-of-words vocabulary, collecting the results in a
# fresh list so that new_tokenized itself is left untouched.
filtered_new_tokenized2 = []
for sentence in new_tokenized:
    known_words = [word for word in sentence if word in bow1]
    filtered_new_tokenized2.append(known_words)
filtered_new_tokenized2

[['the', 'car'], ['the', 'house', 'is', 'blue']]

In [91]:
# Nested-comprehension variant of the same vocabulary filter. The author noted
# this as the fastest method (no timings are recorded alongside this cell).
filtered_new_tokenized = [
    [word for word in sentence if word in bow1]
    for sentence in new_tokenized
]
filtered_new_tokenized

[['the', 'car'], ['the', 'house', 'is', 'blue']]

In [93]:
# In-place variant: drop every out-of-vocabulary word from each sentence of
# new_tokenized.
#
# The previous version called sentence.remove(word) while iterating over the
# same list. Removing an element shifts the remaining ones left, so the loop
# skipped the element directly after each removal (e.g. 'fast' survived after
# 'drove' was removed). Slice assignment replaces the list's contents in one
# step, so the same list objects are still mutated in place but no element is
# skipped.
for sentence in new_tokenized:
    sentence[:] = [word for word in sentence if word in bow1]

new_tokenized

[['the', 'car'], ['the', 'house', 'is', 'blue']]

In [100]:
# filter()-based variant: for each sentence keep only the words that occur in
# the bag-of-words vocabulary.
[
    list(filter(lambda word: word in bow1, sentence))
    for sentence in new_tokenized
]


[['the', 'car'], ['the', 'house', 'is', 'blue']]