### **Mounting To Drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

### **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle

### **Read in the text corpus**
This text corpus was built by scraping the transcripts from the site:
https://clinic-duty.livejournal.com/12225.html. The data was then cleaned by removing non-ascii characters, blank lines, etc.

In [None]:
# Load the cleaned transcript corpus and lowercase it. Every newline is then
# surrounded by spaces so that "\n" becomes a separate token when the text is
# later split on single spaces.
corpus_path = '/content/drive/My Drive/House MD Text Generation/Final Merged UTF8 SymbolCorrected.txt'
with open(corpus_path, mode='r', encoding="utf8") as f:
  text = f.read().lower()
text = text.replace('\n', ' \n ')
print(text[:2000])

In [None]:
# Split on single spaces only (not on all whitespace) so "\n" tokens are kept
# as words; the empty strings produced by consecutive spaces are dropped.
words = [token for token in text.split(' ') if token]

In [None]:
words[0:10]

In [None]:
len(words)

### **Store word frequencies in a dictionary**

In [None]:
# Count how many times each token occurs in the corpus.
word_frequencies = {}
for token in words:
    if token in word_frequencies:
        word_frequencies[token] += 1
    else:
        word_frequencies[token] = 1

### **Remove infrequent words from corpus**

In [None]:
# Tokens that occur fewer than min_frequency times are collected here so they
# can be filtered out of the corpus.
min_frequency = 10
ignored_words = {
  token for token, count in word_frequencies.items() if count < min_frequency
}

In [None]:
len(ignored_words)

In [None]:
def remove_values_from_list(the_list, ignored_words):
  """Return a new list with every item of `the_list` not found in `ignored_words`.

  The relative order and any duplicates of the kept items are preserved, and
  `the_list` itself is left unmodified.
  """
  kept = []
  for item in the_list:
    if item in ignored_words:
      continue
    kept.append(item)
  return kept

In [None]:
words = remove_values_from_list(words,ignored_words)

In [None]:
len(words)

In [None]:
len(set(words))

In [None]:
# Persist the frequency-filtered corpus. The encoding is given explicitly so
# the file is always written as UTF-8 (the encoding the source corpus is read
# with) instead of whatever the platform default happens to be.
text_edited = ' '.join(words)
with open("/content/drive/My Drive/House MD Text Generation/House Edited Frequency and Symbols.txt",'w',encoding = "utf8") as file:
  file.write(text_edited)

### **Create vectorization dictionaries and vectorize the corpus**

In [None]:
# Build the vocabulary once, in sorted order, and derive both lookup tables
# from that single list. Iterating a bare set of strings gives an order that
# depends on Python's per-process string hash randomisation, so the
# word <-> index mapping would change between sessions and weights saved in one
# session would refer to different words after being reloaded in another.
vocabulary = sorted(set(words))
word_indices = {word: index for index, word in enumerate(vocabulary)}
indices_word = {index: word for index, word in enumerate(vocabulary)}

In [None]:
# Replace every token of the corpus by its integer vocabulary index.
vectorized_words = [word_indices[token] for token in words]

In [None]:
vectorized_words = np.array(vectorized_words)

In [None]:
vectorized_words.shape

In [None]:
vectorized_words

### **Create sequences of words**
Here the window size is 50. Sentences and their corresponding next words are stored in X_train and y_train respectively.

In [None]:
#Hyperparameters definition
# Number of consecutive words in each input window fed to the model.
seq_length = 50
# Number of windows the batch generator yields per step.
batch_size = 100
# Learning rate handed to the Adam optimizer when the model is compiled.
learning_rate = 0.001

In [None]:
# Sliding-window dataset: each X_train entry is a window of seq_length word
# indices and the matching y_train entry is the index of the word that follows
# it. The fit call in this notebook draws its batches from get_batch instead.
window_ends = range(seq_length, len(vectorized_words))
X_train = [vectorized_words[end - seq_length:end] for end in window_ends]
y_train = [vectorized_words[end] for end in window_ends]

### **Custom batch generator**
Due to the size of the corpus, we cannot load the entire set of sequences into memory at once as this will result in a crash. Therefore, we build a custom generator to yield batches during the fit call.

In [None]:
def get_batch(vectorized_words, seq_length, batch_size, num_classes=None):
  """Yield random (input, target) training batches forever.

  Args:
    vectorized_words: 1-D NumPy array holding the word index of every token
      in the corpus.
    seq_length: number of consecutive words in each input window.
    batch_size: number of windows per batch.
    num_classes: width of the one-hot target vectors. When omitted it is
      taken as the largest index in `vectorized_words` plus one, which equals
      the vocabulary size when indices are 0..V-1 and all occur in the corpus.

  Yields:
    A tuple `(x_batch, y_batch)`: `x_batch` has shape
    (batch_size, seq_length) and holds word indices; `y_batch` is a float32
    one-hot array of shape (batch_size, num_classes) encoding the word that
    immediately follows each window.

  Raises:
    ValueError: on the first iteration, if the corpus does not contain more
      than `seq_length` words.
  """
  # A window starting at s covers s .. s+seq_length-1 and its target sits at
  # s+seq_length, so valid starts are 0 .. len-seq_length-1.
  n_starts = vectorized_words.shape[0] - seq_length
  if n_starts <= 0:
    raise ValueError("corpus must contain more than seq_length words")

  # Computed once: these do not change between batches.
  if num_classes is None:
    num_classes = int(vectorized_words.max()) + 1
  offsets = np.arange(seq_length)
  rows = np.arange(batch_size)

  while True:
    starts = np.random.choice(n_starts, batch_size)
    # Broadcasting gathers every window in one indexing operation.
    x_batch = vectorized_words[starts[:, None] + offsets]
    # The target is the word directly after the window (not one further on).
    targets = vectorized_words[starts + seq_length]
    y_batch = np.zeros((batch_size, num_classes), dtype='float32')
    y_batch[rows, targets] = 1.0
    yield x_batch, y_batch

### **Model Structure**

In [None]:
def build_model(batch_size):
  """Build and compile the next-word prediction network.

  The network is an Embedding layer, a bidirectional LSTM and a softmax
  Dense layer over the vocabulary. It reads the module-level `words`,
  `seq_length` and `learning_rate`.

  Args:
    batch_size: not used by the architecture; kept so existing callers that
      pass it keep working.

  Returns:
    The compiled `tf.keras.Sequential` model.
  """
  # Deduplicating the whole corpus is expensive, so do it once.
  vocab_size = len(set(words))

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(input_dim = vocab_size,output_dim = 128,input_length = seq_length))
  # The LSTM is not the first layer, so it takes its input shape from the
  # Embedding output; no input_shape argument is given here.
  model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)))
  model.add(tf.keras.layers.Dense(vocab_size))
  model.add(tf.keras.layers.Activation('softmax'))
  # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
  model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate), loss = 'categorical_crossentropy',metrics = ['accuracy'])
  return model

In [None]:
model = build_model(batch_size)
model.summary()

### **Load pretrained weights if any (for further training) and fit the model**


In [None]:
# Weights file to resume training from; set to an empty string to start from
# freshly initialised weights.
load_path = "/content/drive/My Drive/house_weights_50-100-200000.hdf5"
if load_path:
  model.load_weights(load_path)

In [None]:
# Jupyter magic: load the TensorBoard notebook extension.
%load_ext tensorboard
import os
import datetime
# Each run logs into its own timestamped sub-directory of ./logs.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
# NOTE(review): this is a directory path with no file name, and it is the same
# folder the corpus files are read from - confirm checkpoints belong here.
checkpoint = "/content/drive/My Drive/House MD Text Generation/"
# save_best_only=True: a checkpoint is written only when the monitored
# 'accuracy' metric is the best seen so far.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint,monitor = 'accuracy',save_best_only = True)

In [None]:
# get_batch never stops yielding, so the length of an epoch is set explicitly
# as a number of batches.
steps_per_epoch = 1000
# Jupyter magic: open the TensorBoard UI on the ./logs directory.
%tensorboard --logdir logs
# Train on batches drawn from the generator; the returned History object is
# kept for plotting the accuracy curve.
history = model.fit(get_batch(vectorized_words, seq_length, batch_size),steps_per_epoch = steps_per_epoch,epochs = 500,verbose = 1,callbacks=[tensorboard_callback,model_checkpoint])

In [None]:
import matplotlib.pyplot as plt
plt.plot(history.history['accuracy'])

In [None]:
save_path = "/content/drive/My Drive/house_weights_{}-{}-{}-second.hdf5".format(seq_length,batch_size,steps_per_epoch)
model.save_weights(save_path)

### **Build the predictor**

In [None]:
# Rebuild the network, restore the weights saved at save_path, and export the
# complete model to a single .h5 file.
# NOTE(review): build_model never reads its batch_size argument, so the 1
# passed here does not change the architecture.
model_predict = build_model(1)
model_predict.load_weights(save_path)
model_predict.save('/content/drive/My Drive/model-predict-second.h5')

In [None]:
model_predict.summary()

### **Helper Function**
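This function is taken from the Keras text-generation example. It samples the next word index from the predicted probability distribution after rescaling it by a temperature, which adds randomness to the text generation process. Otherwise, the generated text can get repetitive.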

In [None]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature rescaling.

    Args:
        preds: 1-D array-like of non-negative scores (normally the softmax
            output of the model). It is renormalised, so it need not sum
            to exactly 1.
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. A value of 0 or less disables sampling and returns
            the index of the largest entry.

    Returns:
        The sampled index as an int.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Limit of the rescaling as temperature -> 0; avoids dividing by zero.
        return int(np.argmax(preds))
    # log(0) is a legitimate -inf here (probability stays 0), so silence the
    # divide-by-zero warning instead of letting it reach the caller.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0: without this, small temperatures make
    # every exp() underflow to 0 and the normalisation becomes 0/0.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    draw = np.random.multinomial(1, probs, 1)
    return int(np.argmax(draw))

### **The Text Generator**

In [None]:
def generate_text(model, prompt_string, gen_len = 100,temperature = 0.45):
  """Generate `gen_len` words that continue `prompt_string`.

  Reads the module-level `word_indices`, `indices_word` and `seq_length`, and
  uses `sample` to draw each next word.

  Args:
    model: model whose `predict` accepts an array of shape (1, seq_length)
      of word indices and returns one probability vector per row.
    prompt_string: seed text; it is lowercased and split on single spaces.
      Words missing from the vocabulary are replaced by a random word.
    gen_len: number of words to generate.
    temperature: sampling temperature forwarded to `sample`.

  Returns:
    The lowercased prompt followed by the generated words, joined by spaces.
  """
  prompt_string = prompt_string.lower()
  vocab_size = len(word_indices)

  tokens = []
  for word in prompt_string.split(" "):
    if word == '':
      continue
    index = word_indices.get(word)
    if index is None:
      # Out-of-vocabulary word: substitute a random vocabulary entry.
      index = np.random.randint(0, vocab_size)
    tokens.append(index)

  # Keep only the most recent seq_length tokens and put the filler in FRONT
  # of the prompt, so the prompt is the context right before the prediction.
  tokens = tokens[-seq_length:]
  filler = np.random.randint(0, vocab_size)
  window = np.array([filler] * (seq_length - len(tokens)) + tokens, dtype='int32')

  pieces = [prompt_string]
  for _ in range(gen_len):
    # The window is kept 1-D and given its batch axis only for the call, so
    # every step predicts from the full seq_length-word context.
    preds = model.predict(window[np.newaxis, :], verbose=0)[0]
    next_index = sample(preds, temperature)
    pieces.append(indices_word[next_index])
    # Slide the window: drop the oldest word, append the new one.
    window = np.append(window[1:], next_index)

  return " ".join(pieces)

In [None]:
text = generate_text(model_predict, 'wilson:', gen_len = 100,temperature = 0.65)
text

### **Name Capitalization**
Since all words in our corpus are converted to lowercase before training, we need to capitalize the names in the output text. For this, the entire list of character names was scraped from the IMDb site on House M.D. and saved to a file. The capitalize_names function checks for names and returns the desired output.

In [None]:
# Load the pickled collection of character names used by capitalize_names.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# comes from a trusted source.
with open("/content/drive/My Drive/House MD Text Generation/Names-List.pkl","rb") as f:
  names_list = pickle.load(f)

In [None]:
def capitalize_names(text, names=None):
  """Capitalize the first letter of every word in `text` that is a known name.

  A word counts as a name when it, or the word without its last character
  (e.g. a trailing ':' or '.'), is in the name collection.

  Args:
    text: space-separated text to process.
    names: iterable of names to recognise. Defaults to the module-level
      `names_list`.

  Returns:
    The text with matching words capitalized; all spacing is preserved.
  """
  # A set makes each membership test O(1) instead of scanning a list per word.
  known_names = set(names_list if names is None else names)
  final_output = []
  for word in text.split(" "):
    # Empty tokens (from consecutive spaces) are passed through untouched so
    # that word[0] is never evaluated on an empty string.
    if word and (word in known_names or word[:-1] in known_names):
      final_output.append(word[0].upper()+word[1:])
    else:
      final_output.append(word)
  return ' '.join(final_output)

In [None]:
text = capitalize_names(text)
text