<a href="https://colab.research.google.com/github/Saranyanv/test/blob/master/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import re  # regular expressions
from collections import defaultdict

import nltk
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from sklearn.model_selection import train_test_split

# NOTE(review): reset_graph() also expects TensorFlow to be bound to the name
# `tf`; it is not imported anywhere in this file -- confirm where it comes from.

# Seed used as the default for reset_graph() and as `random_state` for the
# train/test split in simplify(), so runs are repeatable.
RANDOM_SEED = 42

def reset_graph(seed=RANDOM_SEED):
    """Reset TensorFlow's default graph and re-seed the random generators.

    `seed` is handed to both `tf.set_random_seed` and `np.random.seed`; it
    defaults to the module-level RANDOM_SEED.
    """
    # The graph is reset first so the TensorFlow seed is set after the reset.
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
	
def load_embedding_from_disks(embeddings_filename, with_indexes=True):
    """
    Read an embeddings text file.

    Each non-blank line is expected to hold a word followed by its vector
    components, all separated by single spaces.

    If `with_indexes=True`, return a tuple of two objects
    `(word_to_index_dict, index_to_embedding_array)`: a dictionary mapping
    each word to its row index, and a 2-D numpy array with one embedding per
    row. Otherwise return a single `word_to_embedding_dict` dictionary
    mapping each word directly to its numpy array.

    Both kinds of dictionary are `defaultdict`s. Looking up an unknown word
    yields the index of an extra all-zero last row (indexed mode) or an
    all-zero list (direct mode). Note that such a lookup also stores the
    missing key in the dictionary, as `defaultdict` always does.

    Raises:
        ValueError: if the file contains no embeddings at all, or if a
            vector component cannot be parsed as a float.
    """
    if with_indexes:
        word_to_index_dict = dict()
        index_to_embedding_array = []
    else:
        word_to_embedding_dict = dict()

    embedding_dim = None

    # Embedding files such as GloVe are UTF-8; do not depend on the locale.
    with open(embeddings_filename, 'r', encoding='utf-8') as embeddings_file:
        for line in embeddings_file:
            if not line.strip():
                # Skip blank lines (e.g. a trailing empty line) instead of
                # storing an empty word with a zero-length vector.
                continue

            split = line.split(' ')
            word = split[0]
            # float() ignores the trailing newline on the last component.
            representation = np.array([float(val) for val in split[1:]])
            embedding_dim = len(representation)

            if with_indexes:
                # Index by the number of rows stored so far, so indexes stay
                # aligned with the array even when blank lines were skipped.
                word_to_index_dict[word] = len(index_to_embedding_array)
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representation

    if embedding_dim is None:
        raise ValueError(
            'No embeddings found in {!r}'.format(embeddings_filename))

    # Empty (all-zero) representation for unknown words.
    _WORD_NOT_FOUND = [0.0] * embedding_dim
    if with_indexes:
        # The unknown-word vector is appended as the last row.
        _LAST_INDEX = len(index_to_embedding_array)
        word_to_index_dict = defaultdict(
            lambda: _LAST_INDEX, word_to_index_dict)
        index_to_embedding_array = np.array(
            index_to_embedding_array + [_WORD_NOT_FOUND])
        return word_to_index_dict, index_to_embedding_array
    else:
        # Seed the defaultdict with the loaded mapping; without it every
        # word read from the file would be lost.
        word_to_embedding_dict = defaultdict(
            lambda: _WORD_NOT_FOUND, word_to_embedding_dict)
        return word_to_embedding_dict

def default_factory():
    """Return the index handed out for words missing from the limited vocabulary.

    The value is the module-level EVOCABSIZE, i.e. the position of the
    last/unknown-word row in limited_index_to_embedding.
    """
    unknown_word_index = EVOCABSIZE
    return unknown_word_index


def listdir_no_hidden(path):
    """List the entries of `path`, leaving out names that start with a dot.

    Entries come back in the order `os.listdir` reports them.
    """
    return [entry for entry in os.listdir(path) if not entry.startswith('.')]

def text_parse(string, codelist=None, stoplist=None, remove_stopwords=None):
    """Normalize raw text into lower-case words separated by single blanks.

    Steps, in order: every non-letter character becomes two blanks, the
    given codes are removed, single-character words surrounded by
    whitespace are removed, the text is lower-cased, stop words surrounded
    by blanks are optionally removed, and runs of whitespace are collapsed
    to one blank.

    Args:
        string: the text to clean.
        codelist: literal codes to strip. When None, a module-level
            `codelist` is used if one exists, otherwise
            ['\\r', '\\n', '\\t'].
        stoplist: literal stop words to strip. When None, the module-level
            `stoplist` is used. Only consulted when stop-word removal is on.
        remove_stopwords: whether to strip stop words. When None, the
            module-level REMOVE_STOPWORDS flag decides.

    Returns:
        The cleaned string. A leading or trailing blank may remain.

    Raises:
        NameError: if stop-word removal is requested but no stop list was
            passed in and no module-level `stoplist` exists.
    """
    if codelist is None:
        # Fall back to a module-level list when present; otherwise use
        # built-in defaults so the function works without any global set-up.
        codelist = globals().get('codelist', ['\r', '\n', '\t'])
    if remove_stopwords is None:
        remove_stopwords = REMOVE_STOPWORDS

    # replace every non-alphabetic character (digits, punctuation and
    # whitespace included) with two blanks
    temp_string = re.sub(r'[^a-zA-Z]', '  ', string)
    # replace codes with blanks; codes are literal text, so escape them
    for code in codelist:
        stopstring = ' ' + re.escape(code) + '  '
        temp_string = re.sub(stopstring, '  ', temp_string)
    # replace single-character words with a blank
    temp_string = re.sub(r'\s.\s', ' ', temp_string)
    # convert uppercase to lowercase
    temp_string = temp_string.lower()
    if remove_stopwords:
        if stoplist is None:
            if 'stoplist' not in globals():
                raise NameError("name 'stoplist' is not defined")
            stoplist = globals()['stoplist']
        # replace selected character strings/stop-words with a blank
        for stop_word in stoplist:
            stopstring = ' ' + re.escape(str(stop_word)) + ' '
            temp_string = re.sub(stopstring, ' ', temp_string)
    # replace multiple blank characters with one blank character
    temp_string = re.sub(r'\s+', ' ', temp_string)
    return temp_string

def read_data(filename):
    """Read a text file and return its contents as a list of word tokens.

    The text is lower-cased, cleaned with `text_parse`, and then split with
    NLTK's Penn Treebank word tokenizer.
    """
    # A file opened in text mode already yields text, so no conversion helper
    # is needed. The file is closed before the text is processed.
    with open(filename) as f:
        data = f.read()

    data = data.lower()
    data = text_parse(data)
    return TreebankWordTokenizer().tokenize(data)  # The Penn Treebank

In [0]:
# Flag read by text_parse() and simplify(): when True, stop words are removed
# from the review text.
REMOVE_STOPWORDS = False  # no stopword removal 
# Number of leading embedding rows kept by simplify(); also the index that
# default_factory() returns for out-of-vocabulary words.
#EVOCABSIZE = 20000  # specify desired size of pre-defined embedding vocabulary
EVOCABSIZE = 400000

# Location of the pre-trained embeddings file; joined with os.path.join()
# inside simplify(). The commented-out pair is an alternative embedding set.
embeddings_directory = 'embeddings/glove.6B'
filename = 'glove.6B.50d.txt'
#embeddings_directory = 'embeddings/glove.twitter.27B'
#filename = 'glove.twitter.27B.100d.txt'



def _load_review_documents(dir_name):
    """Read and tokenize every non-hidden file found directly in `dir_name`.

    Returns a list with one entry per file; each entry is the list of word
    tokens produced by `read_data`.
    """
    filenames = listdir_no_hidden(path=dir_name)
    for name in filenames:
        path = os.path.join(dir_name, name)
        assert os.path.isfile(path), 'Not a regular file: {}'.format(path)
    print('\nDirectory:', dir_name)
    print('%d files found' % len(filenames))

    print('\nProcessing document files under', dir_name)
    return [read_data(os.path.join(dir_name, name)) for name in filenames]


def simplify(embeddings_directory, filename, negative_dir=None,
             positive_dir=None):
    """Turn the movie-review files into train/test arrays of word embeddings.

    The function loads the embeddings file, prints a few sanity checks,
    restricts the vocabulary to at most EVOCABSIZE words, reads the negative
    and positive reviews, keeps the first 20 and last 20 words of each
    review, looks up the embedding of every kept word, and finally splits
    the result 80/20 into training and test sets.

    Args:
        embeddings_directory: directory that holds the embeddings file.
        filename: name of the embeddings file inside that directory.
        negative_dir: directory with the negative reviews; defaults to the
            Google Drive location used originally.
        positive_dir: directory with the positive reviews; defaults to the
            Google Drive location used originally.

    Returns:
        `(X_train, X_test, y_train, y_test)` as produced by
        `train_test_split`. Labels are 0 for negative and 1 for positive
        reviews.

    Side effects:
        Sets the module-level names `codelist` and `stoplist`, which
        `text_parse` reads while the reviews are being cleaned.
    """
    # text_parse() (reached through read_data) looks these two names up as
    # module globals, so they must not stay local to this function.
    global codelist, stoplist

    if negative_dir is None:
        negative_dir = ("/content/gdrive/My Drive/MSPA/Predict 422/Week 8/"
                        "movie-reviews-negative")
    if positive_dir is None:
        positive_dir = ("/content/gdrive/My Drive/MSPA/Predict 422/Week 8/"
                        "movie-reviews-positive")

    embeddings_filename = os.path.join(embeddings_directory, filename)
    print('\nLoading embeddings from', embeddings_filename)
    word_to_index, index_to_embedding = \
        load_embedding_from_disks(embeddings_filename, with_indexes=True)
    print("Embedding loaded from disks.")

    embedding_dim = index_to_embedding.shape[1]
    print("Embedding is of shape: {}".format(index_to_embedding.shape))
    print("This means (number of words, number of dimensions per word)\n")
    print("The first words are words that tend occur more often.")

    print("Note: for unknown words, the representation is an empty vector,\n"
          "and the index is the last one. The dictionnary has a limit:")
    print("    {} --> {} --> {}".format("A word", "Index in embedding",
          "Representation"))
    word = "worsdfkljsdf"  # a word obviously not in the vocabulary
    idx = word_to_index[word]  # unknown words map to the last row
    complete_vocabulary_size = idx
    embd = list(np.array(index_to_embedding[idx], dtype=int))  # "int" compact print
    print("    {} --> {} --> {}".format(word, idx, embd))
    word = "the"
    idx = word_to_index[word]
    embd = list(index_to_embedding[idx])
    print("    {} --> {} --> {}".format(word, idx, embd))

    a_typing_test_sentence = 'The quick brown fox jumps over the lazy dog'
    print('\nTest sentence: ', a_typing_test_sentence, '\n')
    words_in_test_sentence = a_typing_test_sentence.split()

    print('Test sentence embeddings from complete vocabulary of',
          complete_vocabulary_size, 'words:\n')
    for word in words_in_test_sentence:
        word_ = word.lower()
        embedding = index_to_embedding[word_to_index[word_]]
        print(word_ + ": ", embedding)

    # Never keep more rows than the file provided; otherwise the
    # unknown-word index would point past the end of the limited table.
    vocabulary_limit = min(EVOCABSIZE, complete_vocabulary_size)

    # Unknown words map to the extra zero row appended right after the
    # `vocabulary_limit` kept rows.
    limited_word_to_index = defaultdict(
        lambda: vocabulary_limit,
        {k: v for k, v in word_to_index.items() if v < vocabulary_limit})

    limited_index_to_embedding = np.append(
        index_to_embedding[0:vocabulary_limit, :],
        index_to_embedding[index_to_embedding.shape[0] - 1, :]
        .reshape(1, embedding_dim),
        axis=0)

    del index_to_embedding  # the full table is no longer needed

    # Verify the new vocabulary: should get same embeddings for test sentence
    # Note that a small EVOCABSIZE may yield some zero vectors for embeddings
    print('\nTest sentence embeddings from vocabulary of', vocabulary_limit,
          'words:\n')
    for word in words_in_test_sentence:
        word_ = word.lower()
        embedding = limited_index_to_embedding[limited_word_to_index[word_]]
        print(word_ + ": ", embedding)

    codelist = ['\r', '\n', '\t']

    more_stop_words = ['cant', 'didnt', 'doesnt', 'dont', 'goes', 'isnt',
                       'hes', 'shes', 'thats', 'theres', 'theyre', 'wont',
                       'youll', 'youre', 'youve', 'br', 've', 're', 'vs']

    some_proper_nouns_to_remove = ['dick', 'ginger', 'hollywood', 'jack',
                                   'jill', 'john', 'karloff', 'kudrow',
                                   'orson', 'peter', 'tcm', 'tom', 'toni',
                                   'welles', 'william', 'wolheim', 'nikita']

    if REMOVE_STOPWORDS:
        english_stop_words = nltk.corpus.stopwords.words('english')
        print(english_stop_words)
        stoplist = (english_stop_words + more_stop_words +
                    some_proper_nouns_to_remove)
    else:
        # The NLTK stop-word corpus is only needed when removal is enabled.
        stoplist = []

    # Each document is a list of words; each collection is a list of
    # documents.
    negative_documents = _load_review_documents(negative_dir)
    positive_documents = _load_review_documents(positive_dir)

    # -----------------------------------------------------
    # convert positive/negative documents into numpy array
    # the original notes say reviews vary from 22 to 1052 words,
    # so we use the first 20 and last 20 words of each review
    # as our word sequences for analysis
    # -----------------------------------------------------
    all_documents = negative_documents + positive_documents
    review_lengths = [len(doc) for doc in all_documents]
    max_review_length = max(review_lengths) if review_lengths else 0
    print('max_review_length:', max_review_length)
    min_review_length = min(review_lengths) if review_lengths else 0
    print('min_review_length:', min_review_length)

    # One 40-word list per review: negatives first, then positives.
    # NOTE: a review shorter than 20 words yields fewer than 40 words, which
    # makes the rows uneven.
    documents = [
        list(doc[0:20]) + list(doc[len(doc) - 20:len(doc)])
        for doc in all_documents
    ]

    # list (documents) of lists (words) of embedding vectors
    embeddings = [
        [limited_index_to_embedding[limited_word_to_index[word]]
         for word in doc]
        for doc in documents
    ]
    embeddings_array = np.array(embeddings)

    # Labels follow the document order: negative (0) first, then positive
    # (1). Sizes come from the data actually read.
    thumbs_down_up = np.concatenate(
        (np.zeros(len(negative_documents), dtype=np.int32),
         np.ones(len(positive_documents), dtype=np.int32)),
        axis=0)

    # Random splitting of the data in to training (80%) and test (20%)
    X_train, X_test, y_train, y_test = \
        train_test_split(embeddings_array, thumbs_down_up, test_size=0.20,
                         random_state=RANDOM_SEED)

    return (X_train, X_test, y_train, y_test)