In [1]:
# load data
filename = 'Book_data.txt'
file = open(filename, 'rt')
text = file.read()
file.close()
# split into words
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)

# Lesson 03: Bag-of-Words Model

# Sklearn bag-of-words

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vectorizer.fit(text)

# summarize
print("Vocabulary: ",vectorizer.vocabulary_,"\n")
print("idf: ",vectorizer.idf_,"\n")

# encode document
vector = vectorizer.transform([text[0]])

# summarize encoded vector
print("Shape: ",vector.shape,"\n")
print("Array: ",vector.toarray(),"\n")

ModuleNotFoundError: No module named 'sklearn'

# Keras bag-of-words

In [3]:
from keras.preprocessing.text import Tokenizer
# define 5 documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!']
# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)
# summarize what was learned
print("Word count: ",t.word_counts,"\n")
print("Document count: ",t.document_count,"\n")
print("Word index: ",t.word_index,"\n")
print("Word docs: ",t.word_docs,"\n")
# integer encode documents
encoded_docs = t.texts_to_matrix(docs, mode='count')
print("Document encoded :\n",encoded_docs)

Word count:  OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)]) 

Document count:  5 

Word index:  {'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8} 

Word docs:  defaultdict(<class 'int'>, {'well': 1, 'done': 1, 'work': 2, 'good': 1, 'great': 1, 'effort': 1, 'nice': 1, 'excellent': 1}) 

Document encoded :
 [[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


# Personal implementation

In [4]:
#Import
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [5]:
# load data
filename = 'Book_data.txt'
file = open(filename, 'rt')
text = file.read()
file.close()

# Data cleaning

In [6]:
# split into words
tokens = word_tokenize(text)

# convert to lower case
tokens = [w.lower() for w in tokens]

# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]

# Keras Bag of word (BOW)

In [7]:
from keras.preprocessing.text import Tokenizer

# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(words)
# summarize what was learned
print("Word count: ",t.word_counts,"\n")

Word count:  OrderedDict([('early', 7), ('history', 3), ('original', 1), ('inhabitants', 1), ('new', 105), ('jersey', 70), ('h', 1), ('indians', 2), ('delaware', 50), ('tribe', 1), ('belonged', 1), ('algonquin', 1), ('family', 1), ('white', 3), ('settlers', 2), ('different', 4), ('nationalities', 1), ('dutch', 9), ('swedes', 2), ('english', 2), ('french', 1), ('geographical', 2), ('names', 2), ('state', 147), ('interesting', 1), ('reminders', 1), ('various', 2), ('occupants', 1), ('first', 14), ('exploration', 1), ('made', 12), ('henry', 1), ('hudson', 22), ('employ', 1), ('east', 14), ('india', 2), ('company', 1), ('sailed', 2), ('week', 1), ('waters', 8), ('bay', 36), ('river', 58), ('called', 5), ('south', 18), ('search', 1), ('northwest', 6), ('passage', 1), ('failing', 1), ('find', 1), ('north', 17), ('entered', 1), ('york', 50), ('still', 8), ('founded', 3), ('amsterdam', 2), ('afterwards', 2), ('time', 8), ('established', 4), ('trading', 1), ('post', 1), ('county', 30), ('later'

In [8]:
print("Document count: ",t.document_count,"\n")

Document count:  6363 



In [9]:
print("Word index: ",t.word_index,"\n")

Word index:  {'state': 1, 'new': 2, 'city': 3, 'jersey': 4, 'river': 5, 'many': 6, 'delaware': 7, 'york': 8, 'products': 9, 'cities': 10, 'part': 11, 'industries': 12, 'bay': 13, 'water': 14, 'important': 15, 'population': 16, 'towns': 17, 'illustration': 18, 'county': 19, 'parts': 20, 'newark': 21, 'large': 22, 'located': 23, 'manufacturing': 24, 'states': 25, 'manufacture': 26, 'industrial': 27, 'trenton': 28, 'miles': 29, 'great': 30, 'industry': 31, 'passaic': 32, 'hudson': 33, 'along': 34, 'facilities': 35, 'goods': 36, 'one': 37, 'extensive': 38, 'also': 39, 'south': 40, 'northern': 41, 'atlantic': 42, 'iron': 43, 'north': 44, 'west': 45, 'transportation': 46, 'found': 47, 'counties': 48, 'section': 49, 'near': 50, 'railroad': 51, 'residential': 52, 'line': 53, 'raritan': 54, 'land': 55, 'paterson': 56, 'ocean': 57, 'philadelphia': 58, 'first': 59, 'east': 60, 'town': 61, 'among': 62, 'rock': 63, 'may': 64, 'feet': 65, 'streams': 66, 'long': 67, 'kinds': 68, 'camden': 69, 'purpos

In [10]:
print("Word docs: ",t.word_docs,"\n")

Word docs:  defaultdict(<class 'int'>, {'early': 7, 'history': 3, 'original': 1, 'inhabitants': 1, 'new': 105, 'jersey': 70, 'h': 1, 'indians': 2, 'delaware': 50, 'tribe': 1, 'belonged': 1, 'algonquin': 1, 'family': 1, 'white': 3, 'settlers': 2, 'different': 4, 'nationalities': 1, 'dutch': 9, 'swedes': 2, 'english': 2, 'french': 1, 'geographical': 2, 'names': 2, 'state': 147, 'interesting': 1, 'reminders': 1, 'various': 2, 'occupants': 1, 'first': 14, 'exploration': 1, 'made': 12, 'henry': 1, 'hudson': 22, 'employ': 1, 'east': 14, 'india': 2, 'company': 1, 'sailed': 2, 'week': 1, 'waters': 8, 'bay': 36, 'river': 58, 'called': 5, 'south': 18, 'search': 1, 'northwest': 6, 'passage': 1, 'failing': 1, 'find': 1, 'north': 17, 'entered': 1, 'york': 50, 'still': 8, 'founded': 3, 'amsterdam': 2, 'afterwards': 2, 'time': 8, 'established': 4, 'trading': 1, 'post': 1, 'county': 30, 'later': 1, 'attempted': 1, 'settlements': 4, 'southwestern': 5, 'part': 40, 'expelled': 1, 'places': 11, 'proved': 

In [11]:
# integer encode documents
encoded_docs = t.texts_to_matrix(docs, mode='count')
print("Document encoded :\n",encoded_docs)

Document encoded :
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Sklearn BOW

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vectorizer.fit(words)

# summarize
print("Vocabulary: ",vectorizer.vocabulary_,"\n")

ModuleNotFoundError: No module named 'sklearn'

In [13]:
print("idf: ",vectorizer.idf_,"\n")

NameError: name 'vectorizer' is not defined

In [30]:
# encode document
vector = vectorizer.transform([text[0]])
# summarize encoded vector
print("Shape: ",vector.shape,"\n")

Shape:  (1, 1918) 



In [31]:
print("Array: ",vector.toarray(),"\n")

Array:  [[0. 0. 0. ... 0. 0. 0.]] 



In [32]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
			['this', 'is', 'the', 'second', 'sentence'],
			['yet', 'another', 'sentence'],
			['one', 'more', 'sentence'],
			['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(sentences, min_count=1)
# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
	pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject