https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/


/content/drive/My Drive/Work_FTFL_GIGATECH/FTFL_codes/txt_sentoken

In [41]:
# NLTK supplies the English stop-word list used when cleaning reviews
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from string import punctuation
from os import listdir
from collections import Counter

# mount Google Drive so the review files under /content/drive can be read
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: Path of the file to read.

	Returns:
		The full text of the file.
	"""
	# the context manager closes the handle even if read() raises
	with open(filename, 'r') as file:
		return file.read()
 
# turn a doc into clean tokens
def clean_doc(doc):
	"""Tokenise a review and return the list of words worth counting.

	The text is split on white space and punctuation characters are removed
	from every token. A token is kept only if it is purely alphabetic, is not
	in NLTK's English stop-word list, and is longer than one character.
	"""
	pieces = doc.split()
	strip_punct = str.maketrans('', '', punctuation)
	ignored = set(stopwords.words('english'))
	return [
		tok for tok in (piece.translate(strip_punct) for piece in pieces)
		if tok.isalpha() and tok not in ignored and len(tok) > 1
	]
 
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	"""Count the cleaned tokens of one file into ``vocab``.

	The file is read with load_doc, tokenised with clean_doc, and the
	resulting tokens are handed to ``vocab.update``. Nothing is returned.
	"""
	vocab.update(clean_doc(load_doc(filename)))
 
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	"""Add every selected file of ``directory`` to ``vocab``.

	File names starting with 'cv9' form the held-out split: when ``is_trian``
	is truthy those files are skipped, otherwise they are the only files
	processed. Each selected file is passed to add_doc_to_vocab.
	"""
	want_train = bool(is_trian)
	for name in listdir(directory):
		is_held_out = name.startswith('cv9')
		# training uses the non-'cv9' files; otherwise only 'cv9' files are used
		if is_held_out == want_train:
			continue
		add_doc_to_vocab(directory + '/' + name, vocab)


# build the vocabulary from the training split of both sentiment folders
vocab = Counter()
for review_dir in (
	'/content/drive/My Drive/Work_FTFL_GIGATECH/FTFL_codes/txt_sentoken/neg',
	'/content/drive/My Drive/Work_FTFL_GIGATECH/FTFL_codes/txt_sentoken/pos',
):
	process_docs(review_dir, vocab, True)
# report the vocabulary size, then the 50 most frequent words
print(len(vocab))
print(vocab.most_common(50))

41865
[('film', 7128), ('one', 4387), ('movie', 4123), ('like', 2823), ('even', 1983), ('good', 1853), ('time', 1823), ('story', 1703), ('films', 1700), ('much', 1616), ('also', 1608), ('would', 1605), ('characters', 1552), ('get', 1510), ('character', 1498), ('two', 1468), ('first', 1412), ('see', 1370), ('way', 1358), ('well', 1354), ('life', 1228), ('make', 1218), ('really', 1213), ('little', 1162), ('people', 1124), ('could', 1088), ('movies', 1086), ('scene', 1086), ('plot', 1077), ('best', 1067), ('never', 1056), ('many', 1032), ('man', 1030), ('new', 1029), ('bad', 1018), ('scenes', 989), ('doesnt', 950), ('great', 939), ('dont', 937), ('know', 919), ('hes', 903), ('another', 889), ('us', 883), ('love', 870), ('action', 853), ('still', 846), ('go', 842), ('seems', 835), ('something', 835), ('back', 827)]


In [50]:
# snapshot the counter; both prints show the object type
print(type(vocab))
backup_vocab = vocab.copy()
print(type(backup_vocab))

# keep only the words seen at least min_occurane times
min_occurane = 2
tokens = [word for word, count in vocab.items() if min_occurane <= count]
print(len(tokens))

# save list to file
def save_list(lines, filename):
	"""Write ``lines`` to ``filename``, one item per line.

	Args:
		lines: Iterable of strings to write.
		filename: Path of the file to create or overwrite.

	The items are joined with a newline character, so no trailing newline
	is written after the last item.
	"""
	data = '\n'.join(lines)
	# the context manager closes the handle even if write() raises
	with open(filename, 'w') as file:
		file.write(data)
 
# save tokens to a vocabulary file: one token per line in 'vocab.txt',
# a relative path, so it lands in the current working directory
save_list(tokens, 'vocab.txt')

<class 'collections.Counter'>
<class 'collections.Counter'>
24217


In [51]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: Path of the file to read.

	Returns:
		The full text of the file.
	"""
	# the context manager closes the handle even if read() raises
	with open(filename, 'r') as file:
		return file.read()
 
# turn a doc into clean tokens
def clean_doc(doc, vocab):
	"""Return ``doc`` reduced to its in-vocabulary words, joined by spaces.

	The text is split on white space, punctuation characters are removed
	from each token, and only tokens that are members of ``vocab`` are kept.
	The surviving tokens are returned as a single space-separated string.
	"""
	strip_punct = str.maketrans('', '', punctuation)
	kept = []
	for raw in doc.split():
		word = raw.translate(strip_punct)
		if word in vocab:
			kept.append(word)
	return ' '.join(kept)
 
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	"""Load and clean every selected file of ``directory``.

	File names starting with 'cv9' form the held-out split: when ``is_trian``
	is truthy those files are skipped, otherwise they are the only files
	used. Each selected file is read with load_doc and filtered against
	``vocab`` with clean_doc.

	Returns:
		A list with one cleaned string per selected file, in the order
		listdir reports the files.
	"""
	want_train = bool(is_trian)
	return [
		clean_doc(load_doc(directory + '/' + name), vocab)
		for name in listdir(directory)
		# training keeps the non-'cv9' files; otherwise only 'cv9' files are kept
		if name.startswith('cv9') != want_train
	]
 
# read the saved vocabulary back in as a set of words
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# gather the training split of each class; negatives are placed ahead of positives
positive_docs = process_docs('/content/drive/My Drive/Work_FTFL_GIGATECH/FTFL_codes/txt_sentoken/pos', vocab, True)
negative_docs = process_docs('/content/drive/My Drive/Work_FTFL_GIGATECH/FTFL_codes/txt_sentoken/neg', vocab, True)
train_docs = negative_docs + positive_docs
print(len(train_docs))

# create the tokenizer and fit it on the training documents
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)

# pad every review to the word count of the longest training review
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels: 0 for each negative review, 1 for each positive one.
# The counts come from the loaded lists rather than a hard-coded 900/900, so
# the labels stay aligned with train_docs (negatives first) for any dataset size.
ytrain = array([0] * len(negative_docs) + [1] * len(positive_docs))
print(ytrain)

1800
[0 0 0 ... 1 1 1]


In [52]:
# load all test reviews (only the files whose names start with 'cv9')
positive_docs = process_docs('/content/drive/My Drive/Work_FTFL_GIGATECH/FTFL_codes/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('/content/drive/My Drive/Work_FTFL_GIGATECH/FTFL_codes/txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs
print(len(test_docs))

# sequence encode with the tokenizer that was fitted on the training documents
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences to the same max_length used for the training data
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels: 0 for each negative review, 1 for each positive one.
# The counts come from the loaded lists rather than a hard-coded 100/100, so
# the labels stay aligned with test_docs (negatives first) for any dataset size.
ytest = array([0] * len(negative_docs) + [1] * len(positive_docs))

# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)


200
24218


In [53]:
# define model: embedding -> 1D convolution -> max pooling -> dense classifier
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
# single sigmoid unit: the labels are binary (0 = negative, 1 = positive)
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output
model.summary()

# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

# evaluate on the held-out reviews and report accuracy as a percentage
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1313, 100)         2421800   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1306, 32)          25632     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 653, 32)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 20896)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 10)                208970    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 11        
Total params: 2,656,413
Trainable params: 2,656,413
Non-trainable params: 0
____________________________________________

## 4. Train word2vec Embedding