In [1]:
# load data
filename = 'Book_data.txt'
file = open(filename, 'rt')
text = file.read()
file.close()
# split into words
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)

# Lesson 03: Bag-of-Words Model

# Sklearn bag-of-words

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vectorizer.fit(text)

# summarize
print("Vocabulary: ",vectorizer.vocabulary_,"\n")
print("idf: ",vectorizer.idf_,"\n")

# encode document
vector = vectorizer.transform([text[0]])

# summarize encoded vector
print("Shape: ",vector.shape,"\n")
print("Array: ",vector.toarray(),"\n")

Vocabulary:  {'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1} 

idf:  [1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ] 

Shape:  (1, 8) 

Array:  [[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]] 



# Keras bag-of-words

In [3]:
from keras.preprocessing.text import Tokenizer
# define 5 documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!']
# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)
# summarize what was learned
print("Word count: ",t.word_counts,"\n")
print("Document count: ",t.document_count,"\n")
print("Word index: ",t.word_index,"\n")
print("Word docs: ",t.word_docs,"\n")
# integer encode documents
encoded_docs = t.texts_to_matrix(docs, mode='count')
print("Document encoded :\n",encoded_docs)

Word count:  OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)]) 

Document count:  5 

Word index:  {'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8} 

Word docs:  defaultdict(<class 'int'>, {'well': 1, 'done': 1, 'good': 1, 'work': 2, 'great': 1, 'effort': 1, 'nice': 1, 'excellent': 1}) 

Document encoded :
 [[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


# Personal implementation

In [4]:
#Import
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [5]:
# load data
filename = 'Book_data.txt'
file = open(filename, 'rt')
text = file.read()
file.close()

# Data cleaning

In [6]:
# split into words
tokens = word_tokenize(text)

# convert to lower case
tokens = [w.lower() for w in tokens]

# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]

# Keras Bag of word (BOW)

In [7]:
from keras.preprocessing.text import Tokenizer

# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(words)
# summarize what was learned
print("Word count: ",t.word_counts,"\n")

Word count:  OrderedDict([('early', 7), ('history', 3), ('original', 1), ('inhabitants', 1), ('new', 105), ('jersey', 70), ('h', 1), ('indians', 2), ('delaware', 50), ('tribe', 1), ('belonged', 1), ('algonquin', 1), ('family', 1), ('white', 3), ('settlers', 2), ('different', 4), ('nationalities', 1), ('dutch', 9), ('swedes', 2), ('english', 2), ('french', 1), ('geographical', 2), ('names', 2), ('state', 147), ('interesting', 1), ('reminders', 1), ('various', 2), ('occupants', 1), ('first', 14), ('exploration', 1), ('made', 12), ('henry', 1), ('hudson', 22), ('employ', 1), ('east', 14), ('india', 2), ('company', 1), ('sailed', 2), ('week', 1), ('waters', 8), ('bay', 36), ('river', 58), ('called', 5), ('south', 18), ('search', 1), ('northwest', 6), ('passage', 1), ('failing', 1), ('find', 1), ('north', 17), ('entered', 1), ('york', 50), ('still', 8), ('founded', 3), ('amsterdam', 2), ('afterwards', 2), ('time', 8), ('established', 4), ('trading', 1), ('post', 1), ('county', 30), ('later'

In [8]:
print("Document count: ",t.document_count,"\n")

Document count:  6363 



In [9]:
print("Word index: ",t.word_index,"\n")

Word index:  {'state': 1, 'new': 2, 'city': 3, 'jersey': 4, 'river': 5, 'many': 6, 'delaware': 7, 'york': 8, 'products': 9, 'cities': 10, 'part': 11, 'industries': 12, 'bay': 13, 'water': 14, 'important': 15, 'population': 16, 'towns': 17, 'illustration': 18, 'county': 19, 'parts': 20, 'newark': 21, 'large': 22, 'located': 23, 'manufacturing': 24, 'states': 25, 'manufacture': 26, 'industrial': 27, 'trenton': 28, 'miles': 29, 'great': 30, 'industry': 31, 'passaic': 32, 'hudson': 33, 'along': 34, 'facilities': 35, 'goods': 36, 'one': 37, 'extensive': 38, 'also': 39, 'south': 40, 'northern': 41, 'atlantic': 42, 'iron': 43, 'north': 44, 'west': 45, 'transportation': 46, 'found': 47, 'counties': 48, 'section': 49, 'near': 50, 'railroad': 51, 'residential': 52, 'line': 53, 'raritan': 54, 'land': 55, 'paterson': 56, 'ocean': 57, 'philadelphia': 58, 'first': 59, 'east': 60, 'town': 61, 'among': 62, 'rock': 63, 'may': 64, 'feet': 65, 'streams': 66, 'long': 67, 'kinds': 68, 'camden': 69, 'purpos

In [10]:
print("Word docs: ",t.word_docs,"\n")

Word docs:  defaultdict(<class 'int'>, {'early': 7, 'history': 3, 'original': 1, 'inhabitants': 1, 'new': 105, 'jersey': 70, 'h': 1, 'indians': 2, 'delaware': 50, 'tribe': 1, 'belonged': 1, 'algonquin': 1, 'family': 1, 'white': 3, 'settlers': 2, 'different': 4, 'nationalities': 1, 'dutch': 9, 'swedes': 2, 'english': 2, 'french': 1, 'geographical': 2, 'names': 2, 'state': 147, 'interesting': 1, 'reminders': 1, 'various': 2, 'occupants': 1, 'first': 14, 'exploration': 1, 'made': 12, 'henry': 1, 'hudson': 22, 'employ': 1, 'east': 14, 'india': 2, 'company': 1, 'sailed': 2, 'week': 1, 'waters': 8, 'bay': 36, 'river': 58, 'called': 5, 'south': 18, 'search': 1, 'northwest': 6, 'passage': 1, 'failing': 1, 'find': 1, 'north': 17, 'entered': 1, 'york': 50, 'still': 8, 'founded': 3, 'amsterdam': 2, 'afterwards': 2, 'time': 8, 'established': 4, 'trading': 1, 'post': 1, 'county': 30, 'later': 1, 'attempted': 1, 'settlements': 4, 'southwestern': 5, 'part': 40, 'expelled': 1, 'places': 11, 'proved': 

In [11]:
# integer encode documents
encoded_docs = t.texts_to_matrix(docs, mode='count')
print("Document encoded :\n",encoded_docs)

Document encoded :
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Sklearn BOW

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vectorizer.fit(words)

# summarize
print("Vocabulary: ",vectorizer.vocabulary_,"\n")

Vocabulary:  {'early': 479, 'history': 777, 'original': 1155, 'inhabitants': 832, 'new': 1096, 'jersey': 862, 'indians': 822, 'delaware': 406, 'tribe': 1767, 'belonged': 133, 'algonquin': 36, 'family': 572, 'white': 1876, 'settlers': 1522, 'different': 435, 'nationalities': 1075, 'dutch': 475, 'swedes': 1680, 'english': 519, 'french': 661, 'geographical': 693, 'names': 1069, 'state': 1621, 'interesting': 846, 'reminders': 1403, 'various': 1821, 'occupants': 1124, 'first': 609, 'exploration': 554, 'made': 960, 'henry': 767, 'hudson': 793, 'employ': 509, 'east': 482, 'india': 820, 'company': 305, 'sailed': 1470, 'week': 1866, 'waters': 1855, 'bay': 117, 'river': 1438, 'called': 210, 'south': 1600, 'search': 1493, 'northwest': 1111, 'passage': 1185, 'failing': 569, 'find': 601, 'north': 1106, 'entered': 523, 'york': 1912, 'still': 1631, 'founded': 647, 'amsterdam': 49, 'afterwards': 27, 'time': 1728, 'established': 536, 'trading': 1749, 'post': 1275, 'county': 363, 'later': 901, 'attempte

In [13]:
print("idf: ",vectorizer.idf_,"\n")

idf:  [8.37211803 9.06526521 9.06526521 ... 9.06526521 7.19346303 8.37211803] 



In [14]:
# encode document
vector = vectorizer.transform([text[0]])
# summarize encoded vector
print("Shape: ",vector.shape,"\n")

Shape:  (1, 1918) 



In [15]:
print("Array: ",vector.toarray(),"\n")

Array:  [[0. 0. 0. ... 0. 0. 0.]] 



# Lesson 4:  Word  Embeddings

# Task

In [7]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
            ['this', 'is', 'the', 'second', 'sentence'],
            ['yet', 'another', 'sentence'],
            ['one', 'more', 'sentence'],
            ['and', 'the', 'final', 'sentence']]

# train model
model = Word2Vec(sentences, min_count=1)

# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)

for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

[['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'], ['this', 'is', 'the', 'second', 'sentence'], ['yet', 'another', 'sentence'], ['one', 'more', 'sentence'], ['and', 'the', 'final', 'sentence']]


In [3]:
#Imports
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import sent_tokenize

In [4]:
# load data
filename = 'Book_data.txt'
file = open(filename, 'rt')
text = file.read()
file.close()
# split into words
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)

In [5]:
# split into words
tokens = sent_tokenize(text)

# convert to lower case
tokens = [w.lower() for w in tokens]

# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# filter out stop words
stop_words = set(stopwords.words('english'))
sentence = [s for s in stripped ]
words = [w for w in sentence ]
print(sentence[0])

early history


the original inhabitants of new jersey h were indians of the delaware
tribe which belonged to the algonquin family


In [None]:
# split into words
tokens = word_tokenize(text)

# convert to lower case
tokens = [w.lower() for w in tokens]

# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]

import numpy as np

arr = np.array(words[:200])

sentences = np.array_split(arr, 20)



# train model
model = Word2Vec(sentences, min_count=1)

# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)

for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

Exception in thread Thread-24:
Traceback (most recent call last):
  File "/home/david/anaconda3/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/david/anaconda3/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/david/anaconda3/lib/python3.8/site-packages/gensim/models/base_any2vec.py", line 210, in _worker_loop
    tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem)
  File "/home/david/anaconda3/lib/python3.8/site-packages/gensim/models/word2vec.py", line 638, in _do_train_job
    tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
  File "gensim/models/word2vec_inner.pyx", line 628, in gensim.models.word2vec_inner.train_batch_cbow
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


# Lesson 05: Learned Embedding

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

# define problem
vocab_size = 100
max_length = 32

# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summarize the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 8)             800       
_________________________________________________________________
flatten (Flatten)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 1,057
Trainable params: 1,057
Non-trainable params: 0
_________________________________________________________________
None


In [3]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        
        'Could have done better.']

# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])

# integer encode the documents
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summarize the model
print(model.summary())

# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)

# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

[[28, 27], [16, 21], [44, 43], [24, 21], [25], [1], [11, 43], [4, 16], [11, 21], [44, 46, 27, 16]]
[[28 27  0  0]
 [16 21  0  0]
 [44 43  0  0]
 [24 21  0  0]
 [25  0  0  0]
 [ 1  0  0  0]
 [11 43  0  0]
 [ 4 16  0  0]
 [11 21  0  0]
 [44 46 27 16]]
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 89.999998


# Lesson 07: Movie Review Sentiment Analysis Project