In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Get corpus from URL's > Word2vec each word in each sentence from each of the 3 books > Create X and y data from it

In [4]:
'''
Author: Olac Fuentes
Modified by: R Noah Padilla

The goal of this assignment is written below.

The program read_sentences.py reads sentences form online classic books and 
converts them to a list of sentences, where each sentence is a list of words.
    


    [x]1. Write a function that receives a sentence and returns a 2D array containing 
        the embeddings of the words in the sentence. Your function should receive the embeddings 
        dictionary, the sentence and the desired length of the representation; if the 
        sentence is shorter than the desired length, path the array with zeros; if it’s longer, 
        truncate the representation.
    
    [x]2. Apply the function to produce an embedding representation of each of the 
        sentences in the three books used in the read_sentences.py program and generate
        a dataset containing examples of 3 classes, one for each book.
        
                > apply function to each sentence from each book and save all of them into a data set X and y
    
    [x]3. Randomly split the data into training and testing.
    
    [x]4. Train and test a system to determine the book each sentence belongs to.
    
    CLASSIFICATION PROBLEM

'''

import bs4 as bs
import urllib.request
import numpy as np
import sys

'''
TODO
sent_embedder():
        - receives a sentence, word embeddings dictionary, and the desired length of the sentence represention
        - returns a 2D array containing each of the words embeddings for a given sentence| row = word and columns are the embedding values

'''
def sent_embedder(sent, emb, desLen):
    
    #>>> Trim/Truncate sentences based on 'uSeRs' desires
    if len(sent) < desLen:
        extraZeros = [0]*(desLen - len(sent))#add zeros instead
        extraZeros = [str(x) for x in extraZeros] 
        sent.extend(extraZeros)
    elif len(sent) > desLen:    #truncate
        sent = sent[:desLen]
    
    #>>> Get embeddings for each word | word = row , col = emb values
    sent_emb = [] #contains all a word embeddings for each word in 'sent' that we will return
    for word in sent:
        if emb.get(word) is None:
          sent_emb.append(emb.get(str(0)))
        else:
          sent_emb.append(emb.get(word))
    return sent_emb

def read_embeddings(n=1000):
    #Fuentes: Reads n embeddings from file
    #Fuentes: Returns a dictionary were embedding[w] is the embeding of string w
    embedding = {}
    count = 0
    with open('/content/drive/MyDrive/MachineLearningData/glove.6B.50d.txt', encoding="utf8") as f: 
        for line in f: 
            count+=1
            ls = line.split(" ")
            emb = [np.float32(x) for x in ls[1:]]
            embedding[ls[0]]=np.array(emb)
            if count>= n:
                break
    return embedding

def get_words(st):
    st = st.lower()
    st = st.replace('\r\n', ' ')
    st = ''.join( c for c in st if  c in lowercase)
    words = st.split()
    return words

def get_sentence_list(url):
    paragraphs = []
    word_lists = []
    sentence_list = []
    data = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(data,'lxml')
    count = 0
    for paragraph in soup.find_all('p'):
        par  = paragraph.string
        if par:
            par = par.replace('\r\n', ' ')
            sent = par.split('.')
            for s in sent:
                sentence_list.append(s+'.')         
                words = get_words(s)
                if len(words)>0:
                    word_lists.append(words)
    return word_lists

if __name__ == "__main__":  
    url_list = ['http://www.gutenberg.org/files/215/215-h/215-h.htm', 'http://www.gutenberg.org/files/345/345-h/345-h.htm', 'http://www.gutenberg.org/files/1661/1661-h/1661-h.htm']
    lowercase = ''.join(chr(i) for i in range(97,123)) + ' '       
    
    allSentences = [] #contains all the sentences or word lists from all 3 books combined
    numSentEachBook = [] # number of sentences in each book | index 0 = num sentences in book 0
    for u, url in enumerate(url_list):
        word_lists = get_sentence_list(url)
        print('Book {} contains {} sentences'.format(u,len(word_lists)))
        lengths = np.array([len(wl) for wl in word_lists])
        print('Sentence length stats (min,max and mean words in a sentence):')
        print('min = {} max = {} mean = {:4f}'.format(np.min(lengths),np.max(lengths),np.mean(lengths)))
        numSentEachBook.append(len(lengths)) #len(lengths) = total number of sentences for book 'u'
        allSentences.extend(word_lists)
        
    print('Total number of sentences in all 3 books: ', len(allSentences))
    
    vocabulary_size = 22500        
    embedding = read_embeddings(vocabulary_size)
    '''
    #Fuentes: See if the protagonists appear in the embedding list    
    #Fuentes: I recommend increasing vocabulary size until all 3 appear in vocabulary
    for w in ['buck','dracula','holmes']:
        try:
            print(w,'embedding:\n',embedding[w])
        except:
            print(w,'is not in dictionary')
            pass
    '''
    
    #Regarding y > Each sentence is mapped to a book > 2D array mapped to a number(1-3)
    
    #get each word embedding from each book - contains duplicated embeddings
    desiredLength = 7
    all_word_emb = [] #should be a list of 2D arrays where each 2D array is a sentence embedding (represents the Word2vec)
    for sent in allSentences:
        all_word_emb.append(sent_embedder(sent, embedding,desiredLength))

    print("Total word embeddings calculated: ",len(all_word_emb)) #should be 12260 bc thats how many sentences are in all 3 books combined -
    

Book 0 contains 1618 sentences
Sentence length stats (min,max and mean words in a sentence):
min = 1 max = 122 mean = 19.241656
Book 1 contains 4219 sentences
Sentence length stats (min,max and mean words in a sentence):
min = 1 max = 125 mean = 17.345105
Book 2 contains 6423 sentences
Sentence length stats (min,max and mean words in a sentence):
min = 1 max = 101 mean = 15.205511
Total number of sentences in all 3 books:  12260
Total word embeddings calculated:  12260


Seperate the data into X and y

In [5]:
X = all_word_emb
y = [] # create a one hot rep of the data | [1,0,0] means book 0, [0,1,0] means book 1, [0,0,1] means book 2

#numSentEachBook is a list where index 0(book zero) contains number of sentences for that book and so on
for book in range(len(numSentEachBook)):
  for sent in range(numSentEachBook[book]):
    ohRep = np.zeros(3) #3 bc there are 3 classes | book 0,1,2
    ohRep[book] = 1
    y.append(ohRep)

#print(len(X)) # used for debugging
#print(len(y)) # used for debugging

Split into training and testing

In [6]:
from sklearn.model_selection import train_test_split

X = np.array(X)
y = np.array(y)

print(X.shape)
print(type(X))
print(y.shape)
print(type(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2020)

print(X_train.shape)
print(type(X_train))
print(y_train.shape)
print(type(y_train))

print(X_test.shape)
print(type(X_test))
print(y_test.shape)
print(type(y_test))


(12260, 7, 50)
<class 'numpy.ndarray'>
(12260, 3)
<class 'numpy.ndarray'>
(9195, 7, 50)
<class 'numpy.ndarray'>
(9195, 3)
<class 'numpy.ndarray'>
(3065, 7, 50)
<class 'numpy.ndarray'>
(3065, 3)
<class 'numpy.ndarray'>


Use a CNN to classify what book a given sentence belongs to

In [14]:
import tensorflow as tf
from tensorflow.keras.layers import *
from keras.models import Model
from keras.optimizers import Adam, SGD
import numpy as np
import matplotlib.pyplot as plt
import os
import distutils

'''
tf.keras.layers.Conv1D(
    filters,
    kernel_size,
    strides=1,
    padding="valid",
    data_format="channels_last",
    dilation_rate=1,
    groups=1,
    activation=None,
    use_bias=True,
    kernel_initializer="glorot_uniform",
    bias_initializer="zeros",
    kernel_regularizer=None,
    bias_regularizer=None,
    activity_regularizer=None,
    kernel_constraint=None,
    bias_constraint=None,
    **kwargs
)
'''



def create_model():
  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Conv1D(32, 3, input_shape=(X_train.shape[1],X_train.shape[2]), activation='relu'))
  model.add(tf.keras.layers.MaxPooling1D(pool_size=2))

  model.add(tf.keras.layers.Dropout(.2))

  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Conv1D(64, 3, input_shape=(X_train.shape[1],X_train.shape[2]), activation='relu'))
  model.add(tf.keras.layers.MaxPooling1D(pool_size=2))


  model.add(tf.keras.layers.Flatten())
  model.add(tf.keras.layers.Dense(64,activation ='relu'))
  model.add(tf.keras.layers.Dense(3,activation = 'softmax'))
  return model

In [15]:
model = create_model()
model.summary()

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=[tf.keras.metrics.])

history = model.fit(X_train, y_train, batch_size=50, epochs=20, validation_data=(X_test, y_test))

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 5, 64)             9664      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2, 64)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 195       
Total params: 18,115
Trainable params: 18,115
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch