<a href="https://colab.research.google.com/github/PearlSikka/language-ninja/blob/master/Text_generation_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses the New York Times Comments and Headlines dataset to train a text-generation language model that can generate news headlines.


In [None]:
import os
import pathlib

# Upload the API token.
def get_kaggle():
  """Import and return the ``kaggle`` module, uploading an API token first if required.

  If ``import kaggle`` raises ``OSError`` (presumably the package's signal that
  no credentials were found -- confirm), a ``kaggle.json`` file is requested
  through Colab's upload dialog, written to ``~/.kaggle/kaggle.json`` with mode
  0o600, and the import is attempted again.

  Returns:
    The imported ``kaggle`` module.

  Raises:
    ValueError: if ``google.colab`` cannot be imported, or the upload does not
      contain a non-empty file named ``kaggle.json``.
  """
  try:
    import kaggle
  except OSError:
    pass  # fall through to the token upload below
  else:
    return kaggle

  credentials_path = pathlib.Path("~/.kaggle/kaggle.json").expanduser()
  credentials_path.parent.mkdir(exist_ok=True, parents=True)

  try:
    from google.colab import files
  except ImportError:
    raise ValueError("Could not find kaggle token.")

  token_content = files.upload().get('kaggle.json', None)
  if not token_content:
    raise ValueError('Need a file named "kaggle.json"')

  # Store the token and restrict it to owner read/write.
  credentials_path.write_bytes(token_content)
  credentials_path.chmod(0o600)

  import kaggle
  return kaggle


# Obtain the kaggle module now; this may open an upload prompt for kaggle.json.
kaggle = get_kaggle()

In [None]:
# Fetch the aashita/nyt-comments dataset with the Kaggle command-line tool.
!kaggle datasets download -d aashita/nyt-comments

In [None]:
# Extract the downloaded archive into ./train. -o overwrites existing files without
# prompting, so re-running this cell cannot stall on unzip's interactive "replace?" question.
!unzip -o nyt-comments.zip -d train

In [None]:
import pandas as pd                                                              #importing libraries
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from numpy.random import seed

# Seed TensorFlow's and NumPy's global random number generators.
tf.random.set_seed(2)
seed(1)

In [None]:
import os

curr_dir='/content/train/'

# Load a single "Articles" CSV from curr_dir (the loop stops at the first match).
# The listing is sorted so the file chosen does not depend on the order in which
# the filesystem happens to return directory entries.
docs=[]
for filename in sorted(os.listdir(curr_dir)):
  if 'Articles' in filename:
    data=pd.read_csv(curr_dir+filename)
    docs.append(data)
    break

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not docs:
  raise FileNotFoundError("No file with 'Articles' in its name found in "+curr_dir)
frame=pd.concat(docs,axis=0)

# Preview the first ten rows using the notebook's rich DataFrame display.
frame.head(10)


In [None]:
# List the column labels of the loaded articles frame.
frame.columns

In [None]:
# Preview the first ten values of the 'headline' column.
frame.headline[:10]

In [None]:
# (rows, columns) of the loaded frame.
frame.shape

In [None]:
# Keep only the 'headline' column; the list selector keeps the result a DataFrame.
frame=frame[['headline']]

In [None]:
# Preview the headline-only frame.
frame.head()

In [None]:
# Discard rows whose headline is exactly the string 'Unknown'.
frame = frame.loc[frame['headline'].ne('Unknown')]

In [None]:
# Word-level tokenizer for the headlines: lower-cases text and removes the characters
# listed in `filters`. The list is spelled out in full (Keras's default set) -- do not
# abbreviate it with "...", which would leave @ [ \ ] ^ _ ` { | } ~, tabs and newlines
# attached to tokens.
tokenizer=Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True)

In [None]:
# Fit the tokenizer on the headline strings.
tokenizer.fit_on_texts(frame.headline.values)

In [None]:
# Mapping of word -> integer index held by the fitted tokenizer.
dict_words=tokenizer.word_index

In [None]:
def get_sequence_of_tokens(corpus):
    """Turn each text in ``corpus`` into its growing prefixes of token ids.

    Relies on the notebook-level ``tokenizer``, which must already be fitted.
    A text whose token ids are ``[a, b, c]`` contributes ``[a, b]`` and
    ``[a, b, c]``; a text with fewer than two tokens contributes nothing.

    Args:
        corpus: iterable of text strings.

    Returns:
        (input_sequences, total_words): the list of prefix sequences, and
        ``len(tokenizer.word_index) + 1``.
    """
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in corpus:
        ids = tokenizer.texts_to_sequences([text])[0]
        # Every prefix of length 2..len(ids), shortest first.
        prefixes.extend(ids[:end] for end in range(2, len(ids) + 1))
    return prefixes, vocab_size

# Build the prefix sequences for every headline and preview the first ten.
inp_sequences, total_words = get_sequence_of_tokens(frame.headline.values)
inp_sequences[:10]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras.utils as ku 

In [None]:
def generate_padded_sequences(input_sequences):
    """Left-pad the sequences to a common length and split them into inputs and one-hot labels.

    Reads the notebook-level ``total_words`` for the number of label classes.

    Args:
        input_sequences: list of token-id lists.

    Returns:
        (predictors, label, max_sequence_len): every column but the last of the
        padded array, the last column one-hot encoded over ``total_words``
        classes, and the length of the longest input sequence.
    """
    longest = max(len(seq) for seq in input_sequences)
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))

    # The final token of each row is the prediction target; the rest is the input.
    features = padded[:, :-1]
    targets = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return features, targets, longest

# Pad the prefix sequences and split them into model inputs and one-hot labels.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
# Show the padded input sequences.
print(predictors)

In [None]:
# Show the one-hot encoded labels.
print(label)

In [None]:
from tensorflow.keras.models import Sequential            
from tensorflow.keras.layers import Embedding,Dropout,Dense,LSTM

In [None]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Layers, in order: an Embedding of ``total_words`` ids into 10 dimensions
    with input length ``max_sequence_len - 1``, an LSTM with 100 units,
    Dropout at rate 0.1, and a softmax Dense layer with ``total_words`` units.
    The model is compiled with categorical cross-entropy loss and the Adam
    optimizer.

    Args:
        max_sequence_len: length of the padded sequences including the label token.
        total_words: vocabulary size, used for the embedding input and the output layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        # One position fewer than the padded length: the model input is
        # max_sequence_len - 1 tokens long.
        Embedding(total_words, 10, input_length=max_sequence_len - 1),
        LSTM(100),
        Dropout(0.1),
        Dense(total_words, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

# Build the model for this sequence length and vocabulary, then print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

In [None]:
# Train the model for 100 epochs on the padded sequences and their one-hot labels.
# NOTE(review): verbose=5 is not one of the values Keras documents for fit()
# (0, 1, 2, "auto") -- confirm the intended logging level.
model.fit(predictors, label, epochs=100, verbose=5)

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` with ``next_words`` greedily predicted words.

    Each round tokenizes the text so far with the notebook-level ``tokenizer``,
    pads it on the left to ``max_sequence_len - 1`` ids, and appends the word
    whose predicted probability is highest.

    Args:
        seed_text: starting text.
        next_words: number of words to append; 0 appends nothing.
        model: trained model whose ``predict`` returns one probability per word id.
        max_sequence_len: padded sequence length used during training
            (the model input is one token shorter).

    Returns:
        The extended text, title-cased with ``str.title()``.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # Sequential.predict_classes() was removed in TensorFlow 2.6; take the
        # arg-max over the softmax output instead.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # index_word is the tokenizer's id -> word mapping. An id with no entry
        # (such as the padding id 0) contributes an empty word.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
# Sample: extend the seed 'Trump' by 2 predicted words.
generate_text('Trump',2,model,max_sequence_len)

In [None]:
# Sample: extend the seed 'India and China' by 3 predicted words.
generate_text('India and China',3,model,max_sequence_len)

In [None]:
# Sample: extend the seed 'President Trump' by 5 predicted words.
generate_text('President Trump',5,model,max_sequence_len)

In [None]:
# Sample: extend the seed 'South Africa' by 3 predicted words.
generate_text('South Africa',3,model,max_sequence_len)