In [250]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
import re
import os

In [0]:
articles = pd.read_csv("/content/gdrive/My Drive/Word learn/all_data.csv")
text = articles['Title'].str.cat(sep='\n')

In [0]:
text = re.sub("\"", "", text)
text = text.lower()

In [0]:
with open('/content/gdrive/My Drive/Word learn/input.txt', 'w', encoding = "utf-8") as wfile:
  wfile.write(text)

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential, load_model
from keras.utils import to_categorical
import numpy as np
import math
from pickle import dump

In [0]:
#with open('/content/gdrive/My Drive/Word learn/input.txt', 'r', encoding = 'utf-8') as rfile:
#  data = rfile.read()
with open('/content/gdrive/My Drive/Word learn/test.txt', 'r', encoding = 'utf-8') as rfile:
  data = rfile.read()

data = data.lower().split('\n')

lens = [len(x) for x in data]

In [296]:
max_seq_len = max(lens)
batchsize = 32
total_words = sum(lens)
seq_num = len(data)
print(math.ceil(seq_num / batchsize))
data_batches = []

for i in range(0, len(data), batchsize):
  data_batches.append(data[i:i+batchsize])

batchnum = len(data_batches)
last_batch_size = len(data_batches[-1])
print("Number of sequences:", seq_num)
print("The longest article (number of words):", max_seq_len)
print("The size of a single batch (number of sequences):", batchsize)
print("The amount of batches:", batchnum)
print("The size of a last batch (number of sequences):", last_batch_size)
print("The total amount of words:", total_words)

6
Number of sequences: 172
The longest article (number of words): 29
The size of a single batch (number of sequences): 32
The amount of batches: 6
The size of a last batch (number of sequences): 12
The total amount of words: 4237


In [0]:
tokenizer = Tokenizer()
def myGenerator(data_batches):
  X = []
  Y = []
  for batch in data_batches[:-1]:
    tokenizer.fit_on_texts(batch)
    total_batch_words = len(tokenizer.word_index) + 1

    input_sequences = []

    for line in batch:
      token_list = tokenizer.texts_to_sequences([line])[0]
      for i in range(1, len(token_list)):
        n_gram_sequences = token_list[:i+1]
        input_sequences.append(n_gram_sequences)

    #pad
    input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_seq_len, padding = 'pre'))

    #predictors and label
    predictors, label = input_sequences[:,:-1], input_sequences[:,-1]

    predictors = np.array(predictors)

    #predictors = np.reshape(predictors, (predictors.shape[0], 1, predictors.shape[1]))
    label = to_categorical(label, num_classes = total_batch_words)
    #print(predictors, label)
    #print(predictors.shape, label.shape)
    #print("X shape: ", predictors.shape)
    #print("Y shape: ", label.shape)
    X.append(predictors)
    Y.append(label)
  return (X, Y)
      

In [0]:
def gen_myGenerator(data_batches):
  while True:
    for batch in data_batches[:-1]:
      tokenizer.fit_on_texts(batch)
      total_batch_words = len(tokenizer.word_index) + 1

      input_sequences = []

      for line in batch:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
          n_gram_sequences = token_list[:i+1]
          input_sequences.append(n_gram_sequences)

      #pad
      input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_seq_len, padding = 'pre'))

      #predictors and label
      predictors, label = input_sequences[:,:-1], input_sequences[:,-1]

      predictors = np.array(predictors)

      predictors = np.reshape(predictors, (predictors.shape[0], 1, predictors.shape[1]))
      label = to_categorical(label, num_classes = total_batch_words)
      #print(predictors, label)
      #print(predictors.shape, label.shape)
      #print("X shape: ", predictors.shape)
      #print("Y shape: ", label.shape)
      yield (predictors, label)

In [0]:
def create_model(my_generator, max_sequence_len, total_words, verb = 1):
  input_len = max_sequence_len - 1

  model = Sequential()
  model.add(Embedding(28, 10, input_length=max_sequence_len-1))
  model.add(LSTM(300, return_sequences = True))
  #model.add(Dropout(0.2))
  model.add(LSTM(100))
  model.add(Dense(28, activation='softmax'))

  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

  checkpoint_path = r"/content/gdrive/My Drive/Word learn/Checkpoints/checkpt--{epoch:02d}.hdf5"
  checkpoint = ModelCheckpoint(checkpoint_path, monitor='acc', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=5)
  earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=1, mode='auto')
  callbacks_list = [checkpoint, earlystop]

  model.fit_generator(my_generator, steps_per_epoch = batchnum, epochs=100, callbacks = callbacks_list, verbose=verb)

  full_model_path = r"/content/gdrive/My Drive/Word learn/2layers_model.h5"
  print(model.summary())
  model.save(full_model_path)

  return model

In [0]:
model = Sequential()
model.add(Embedding(total_words, 512, input_length = max_seq_len - 1))
#model.add(LSTM(34))#, return_sequences = True))

model.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [0]:
all_vec = myGenerator(data_batches)

all_vec_reshaped = [(np.reshape(x[0], (x[0].shape[0], 1, x[0].shape[1])), np.reshape(x[1], (x[1].shape[0], 1, x[1].shape[1]))) for x in all_vec]

for vec in all_vec:
  model.train_on_batch(vec[0], vec[1])

In [298]:
model.fit_generator(gen_myGenerator(data_batches), steps_per_epoch = batchnum, epochs = 10)

Epoch 1/10


ValueError: ignored