# Project By Varun Gehlot

## Objective:
### Text Generation Project to predict the next word(s). 

## Corpus:
### The Republic by Plato 

In [2]:
# loading the file

def load_doc(filename):
    with open(filename,'r') as file:
        text = file.read()
    return text

doc = load_doc('republic_clean.txt')
print(doc[:200])

BOOK I

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what 


In [3]:
# cleaning the file

from string import punctuation
import re

def clean_doc(doc):
    doc = doc.replace('-',' ')
    tokens = doc.split()
    re_punc = re.compile(f"[{re.escape(punctuation)}]")
    tokens = [re_punc.sub('',w) for w in tokens]
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t.lower() for t in tokens]
    return tokens

tokens = clean_doc(doc)
print(f"Total tokens: {len(tokens)}")
print(f"Unique tokens: {len(set(tokens))}")

Total tokens: 117584
Unique tokens: 7234


In [4]:
# organizing into sequence of tokens and saving it into a file (training data)

length = 50 + 1
sequences = []

for i in range(length,len(tokens)):
    seq = tokens[i-length : i] # slecting sequence of tokens
    line = ' '.join(seq)  # converting into a line
    sequences.append(line) # adding the sequence to the list 

print(len(sequences))

def save_doc(sequences,filename):
    data = '\n'.join(sequences)
    with open(filename,'w') as file:
        text = file.write(data)

save_doc(sequences = sequences, filename = 'republic_sequences.txt')

117533


In [5]:
# fitting a language model to this data

doc = load_doc('republic_sequences.txt')
lines = doc.split('\n')

In [6]:
# integer encoding sequences of words 

from keras.preprocessing.text import Tokenizer 

tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines) # so that each word is encoded with unique integers
sequences = tokenizer.texts_to_sequences(lines)
print(len(sequences))


117533


In [13]:
# vocabulary size (for defining the embedding layer) # input_dim
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7235

In [8]:
# separating input and output
import numpy as np
from keras.utils import to_categorical # for one hot encoding (grond truth for model to refer and learn)

sequences = np.array(sequences)
X,y = sequences[:,:-1] , sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size) # num_classes, number of unique categories
seq_length = X.shape[1] # 50, learned embedding needs to know vocab_size and length of input sequences

In [9]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

def define_model(vocab_size, seq_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length = seq_length)) # 50 is output_dim (vector space)
    model.add(LSTM(100, return_sequences = True)) # returns full sequence (often used when we use another layer lstm layer next)
    model.add(LSTM(100))
    model.add(Dense(100, activation = 'relu'))
    model.add(Dense(vocab_size, activation = 'softmax'))
    # configure network
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
    model.summary()
    return model

In [10]:
model = define_model(vocab_size,seq_length) # 7235, 117533
# aiming for accuracy to be little more than 50% not 100% since I want the model to understand the essence of text than to memorize.
model.fit(X,y,epochs = 125, batch_size = 128)

# saving the model for later use
model.save('model.h5')



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            361750    
                                                                 
 lstm (LSTM)                 (None, 50, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 7235)              730735    
                                                                 
Total params: 1243385 (4.74 MB)
Trainable params: 1243385 (4.74 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/125


Ep

  saving_api.save_model(
