<a href="https://colab.research.google.com/github/scaperex/My_Projects/blob/master/Next_Word_Prediction_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing The Required Libraries:

In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np
import os
import string
from tensorflow import keras
from keras.utils.vis_utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from gensim.parsing.preprocessing import remove_stopwords


# Mount Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
with open('train_orig.txt', 'r') as f:
    sentences = []
    for line in f:
        outline = line
        line = line.strip()
        try:
            label, sentence = line.split('\t')[0:2] # remove label and \n
            if label in ['1', '7']: 
                if len(sentence.split()) <=30:
                    sentences.append(outline)
        except:
            print(line)
        
    print('num sentences:',len(sentences))
with open('train_clean.txt', 'a') as f:
    f.writelines(sentences)

num sentences: 19420


In [33]:
def preprocess_sentence(base_path = '/content/drive/My Drive/NewB-master', file_name='example.txt',news_id='0'):
    """
    news_id  1 - New York Times {Liberal}, 7 - New York Post {Conservative}
    """

    with open(os.path.join(base_path,file_name), 'r') as f:
        sentences = []
        for line in f:
            line = line.strip()
            label, sentence = line.split('\t')[0:2] # remove label and \n
            if label == news_id: 
                if len(sentence.split()) <=30:
                    sentences.append(remove_stopwords(sentence))
        print('num sentences:',len(sentences))

    # Tokenization
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)

    # saving the tokenizer for predict function.
    pickle.dump(tokenizer, open(f'tokenizer_{file_name.split(".")[0]}.pkl', 'wb'))

    sequence_data = tokenizer.texts_to_sequences(sentences)

    vocab_size = len(tokenizer.word_index) + 1
    print('vocab_size: ',vocab_size)

    # compute targets
    targets = [sentence[1:]+[0] for sentence in sequence_data]

    # Add padding
    padded_targets = pad_sequences(targets, padding="post")
    padded_inputs = pad_sequences(sequence_data, padding="post")

    return  padded_inputs, padded_targets, vocab_size

### Creating the Model:

In [16]:
class nwp_model():
    def __init__(self,vocab_size,model=None, embedding_dim=20):
        if model:
            self.model = model
        else:
            model = Sequential()
            model.add(Embedding(vocab_size, output_dim=embedding_dim, mask_zero=True, input_length=26))
            model.add(LSTM(80, return_sequences=True))
            model.add(Dense(80, activation="relu"))
            model.add(Dense(vocab_size, activation="softmax"))
            self.model=model
        print(model.summary())


    def train(self, feature, target, num_epochs=50):
        checkpoint = ModelCheckpoint("nextword1.h5", monitor='loss', verbose=1, save_best_only=True, mode='auto')

        # reduce_LR = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose = 1)

        # logdir='logsnextword1'
        # tensorboard_Visualization = TensorBoard(log_dir=logdir)
        self.model.compile(optimizer=Adam(lr=0.001), loss='SparseCategoricalCrossentropy', metrics=['acc'])
        self.model.fit(feature, target, epochs=num_epochs, batch_size=24, validation_split=0.1, callbacks=[checkpoint])#, reduce_LR, tensorboard_Visualization])



In [50]:
X1,y1, vocab_size1 = preprocess_sentence(base_path='', file_name='train_clean.txt',news_id='1')

num sentences: 19420
vocab_size:  18772


In [51]:
classifier = nwp_model(vocab_size=vocab_size1,embedding_dim=100)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 26, 100)           1877200   
_________________________________________________________________
lstm_2 (LSTM)                (None, 26, 80)            57920     
_________________________________________________________________
dense_4 (Dense)              (None, 26, 80)            6480      
_________________________________________________________________
dense_5 (Dense)              (None, 26, 18772)         1520532   
Total params: 3,462,132
Trainable params: 3,462,132
Non-trainable params: 0
_________________________________________________________________
None


In [52]:
classifier.train(X1,y1, num_epochs=8)

Epoch 1/8
Epoch 00001: loss improved from inf to 2.29771, saving model to nextword1.h5
Epoch 2/8
Epoch 00002: loss improved from 2.29771 to 2.13322, saving model to nextword1.h5
Epoch 3/8
Epoch 00003: loss improved from 2.13322 to 2.06775, saving model to nextword1.h5
Epoch 4/8
Epoch 00004: loss improved from 2.06775 to 2.00441, saving model to nextword1.h5
Epoch 5/8
Epoch 00005: loss improved from 2.00441 to 1.94415, saving model to nextword1.h5
Epoch 6/8
Epoch 00006: loss improved from 1.94415 to 1.88665, saving model to nextword1.h5
Epoch 7/8
Epoch 00007: loss improved from 1.88665 to 1.82826, saving model to nextword1.h5
Epoch 8/8
Epoch 00008: loss improved from 1.82826 to 1.76602, saving model to nextword1.h5


### Plot The Model:

In [None]:
keras.utils.plot_model(model, to_file='model.png', show_layer_names=True)

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
%tensorboard --logdir="./logsnextword1"

In [82]:
# Importing the Libraries

from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer


def Predict_Next_Words(model, tokenizer, text, num_words_to_complete):
    """
        In this function we are using the tokenizer and models trained
        and we are creating the sequence of the text entered and then
        using our model to predict and return the the predicted word.
    
    """
    res = ""
    text_len = len(text)
    for _ in range(num_words_to_complete):
        sequence = tokenizer.texts_to_sequences([text])
        padded_sequence = pad_sequences(sequence, padding="post", maxlen=26)
        output = model.predict(padded_sequence)
        preds = np.argmax(output, axis=-1)
        # print(preds)
        predicted_word = tokenizer.sequences_to_texts(preds)[0].split()[text_len-1]
        # print(predicted_word)
        res += predicted_word + ' '
        text += [predicted_word]
        text_len+=1
        # print(text)

    print(res)

In [87]:
"""
    We are testing our model and we will run the model
    until the user decides to stop the script.
    While the script is running we try and check if 
    the prediction can be made on the text. If no
    prediction can be made we just continue.

"""
model = load_model('nextword111.h5')
tokenizer = pickle.load(open('tokenizer_train_clean11.pkl', 'rb'))

num_words_to_complete = int(input("Enter number of words to complete:"))

while(True):

    text = input("Enter beginning of sentence: ") 
    if text == "x":
        print("Ending The Program.....")
        break
    
    else:
        text = text.split(" ")
        Predict_Next_Words(model, tokenizer, text, num_words_to_complete)
        

Enter number of words to complete:5
Enter beginning of sentence: trump
trump trump trump trump trump 
Enter beginning of sentence: x
Ending The Program.....


In [86]:
"""
    We are testing our model and we will run the model
    until the user decides to stop the script.
    While the script is running we try and check if 
    the prediction can be made on the text. If no
    prediction can be made we just continue.

"""
model = load_model('nextword177.h5')
tokenizer = pickle.load(open('tokenizer_train_clean7.pkl', 'rb'))

num_words_to_complete = int(input("Enter number of words to complete:"))

while(True):

    text = input("Enter beginning of sentence: ") 
    if text == "x":
        print("Ending The Program.....")
        break
    
    else:
        text = text.split(" ")
        Predict_Next_Words(model, tokenizer, text, num_words_to_complete)
        

Enter number of words to complete:5
Enter beginning of sentence: hillary clinton
trump probes disturbing new standard 
Enter beginning of sentence: trump
said hes monitoring republican presidential 
Enter beginning of sentence: president trump
rose garden nations plan military 
Enter beginning of sentence: clinton
slim crowned clinton criticized donald 
Enter beginning of sentence: x
Ending The Program.....


In [35]:
X7,y7, vocab_size7 = preprocess_sentence(base_path='', file_name='train_clean.txt',news_id='7')

num sentences: 17502
vocab_size:  17009


In [37]:
classifier2 = nwp_model(embedding_dim=40,vocab_size=vocab_size7)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 26, 40)            680360    
_________________________________________________________________
lstm_1 (LSTM)                (None, 26, 80)            38720     
_________________________________________________________________
dense_2 (Dense)              (None, 26, 80)            6480      
_________________________________________________________________
dense_3 (Dense)              (None, 26, 17009)         1377729   
Total params: 2,103,289
Trainable params: 2,103,289
Non-trainable params: 0
_________________________________________________________________
None


In [38]:
classifier2.train(X7,y7, num_epochs=20)

Epoch 1/20
Epoch 00001: loss improved from inf to 2.93213, saving model to nextword1.h5
Epoch 2/20
Epoch 00002: loss improved from 2.93213 to 2.72927, saving model to nextword1.h5
Epoch 3/20
Epoch 00003: loss improved from 2.72927 to 2.65286, saving model to nextword1.h5
Epoch 4/20
Epoch 00004: loss improved from 2.65286 to 2.57733, saving model to nextword1.h5
Epoch 5/20
Epoch 00005: loss improved from 2.57733 to 2.50147, saving model to nextword1.h5
Epoch 6/20
Epoch 00006: loss improved from 2.50147 to 2.42158, saving model to nextword1.h5
Epoch 7/20
Epoch 00007: loss improved from 2.42158 to 2.34140, saving model to nextword1.h5
Epoch 8/20
Epoch 00008: loss improved from 2.34140 to 2.26186, saving model to nextword1.h5
Epoch 9/20
Epoch 00009: loss improved from 2.26186 to 2.17895, saving model to nextword1.h5
Epoch 10/20
Epoch 00010: loss improved from 2.17895 to 2.09274, saving model to nextword1.h5
Epoch 11/20
Epoch 00011: loss improved from 2.09274 to 2.00563, saving model to nex

In [72]:
model = load_model('nextword111.h5')
tokenizer = pickle.load(open('tokenizer_train_clean11.pkl', 'rb'))

def Predict_Next_Word_options(model, tokenizer, text, num_words_to_complete):
    """
        In this function we are using the tokenizer and models trained
        and we are creating the sequence of the text entered and then
        using our model to predict and return the the predicted word.
    
    """
    
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, padding="post", maxlen=26)
    output = model.predict(padded_sequence)[0,len(text)-1,:]
    outs = np.argpartition(output, -num_words_to_complete,axis=-1)[-num_words_to_complete:]
    predicted_word = tokenizer.sequences_to_texts([outs])[0].split()

    print(predicted_word)

In [None]:
model = load_model('nextword111.h5')
tokenizer = pickle.load(open('tokenizer_train_clean11.pkl', 'rb'))
num_words_to_complete = int(input("Enter number of words to complete:"))

while(True):

    text = input("Enter beginning of sentence: ") 
    if text == "x":
        print("Ending The Program.....")
        break
    
    else:
        text = text.split(" ")
        Predict_Next_Word_options(model, tokenizer, text, num_words_to_complete)

(18772,)
(20,)
['point', 'change', 'partner', 'past', 'issue', 'similar', 'seen', 'court', 'looking', 'leaders', 'trump', 'said', 'president', 'campaign', 'says', 'served', 'officials', 'record', 'got']




['offered', 'wrote', 'administration', 'appeared', 'won', 'organization', 'asked', 'mrs', 'says', 'republican', 'president', 'campaign', 'said', 'called', 'repeatedly', 'took', 'mr', 'told', 'realdonaldtrump']

Enter beginning of sentence: clinton

['administration', 'held', 'tried', 'think', 'asserted', 'says', 'melania', 'new', 'asked', 'ivanka', 'worked', 'campaign', 'president', 'said', 'like', 'donald', 'told', 'trump', 'later']

Enter beginning of sentence: president trump

['rally', 'going', 'won', 'presidency', 'win', 'took', 'campaign', 'supporters', 'said', 'told', 'organization', 'called', 'jr', 'says', 'administration', 'tower', 'wants', 'supporter', 'university']

Enter beginning of sentence: president trump accused

['presidential', 'immigration', 'comment', 'years', 'night', 'ms', 'year', 'percent', 'reporters', 'white', 'meeting', 'twitter', 'president', 'said', 'wednesday', 'republican', 'new', 'mr', 'times']

Enter beginning of sentence: x
Ending The Program.....

In [94]:
model = load_model('nextword177.h5')
tokenizer = pickle.load(open('tokenizer_train_clean7.pkl', 'rb'))
num_words_to_complete = int(input("Enter number of words to complete:"))

while(True):

    text = input("Enter beginning of sentence: ") 
    if text == "x":
        print("Ending The Program.....")
        break
    
    else:
        text = text.split(" ")
        Predict_Next_Word_options(model, tokenizer, text, num_words_to_complete)

Enter number of words to complete:20
Enter beginning of sentence: trump
(17009,)
(20,)
['won', 'supporters', 'wins', 'didnt', 'says', 'called', 'doesnt', 'jr', 'administration', 'tweeted', 'organization', 'campaign', 'told', 'added', 'went', 'later', 'took', 'said', 'wants']
Enter beginning of sentence: clinton
(17009,)
(20,)
['clobbering', 'easier', 'opened', '41', 'coolly', '43', 'gets', 'cleverly', 'pols', 'clinton', 'trump', 'cruz', 'underperforming', 'accused', 'regaining', 'said', 'undeserving', 'called', 'ex', 'slim']
Enter beginning of sentence: president trump
(17009,)
(20,)
['weighed', 'thundered', 'taking', 'sought', 'kept', 'learned', 'personally', 'oklahoma', 'dumping', 'instituted', 'careful', 'strengthened', 'trump', 'reportedly', 'says', 'said', 'rose', 'signed', 'ramp']
Enter beginning of sentence: president trump accused
(17009,)
(20,)
['people', 'said', 'attacking', 'intelligence', 'president', 'russian', 'politicians', 'ex', 'twitter', 'yellen', 'women', 'secret', '