<a href="https://colab.research.google.com/github/scaperex/My_Projects/blob/master/nlp_political_bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing The Required Libraries:

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os
import string
from tensorflow import keras
from keras.utils.vis_utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

# Mount Data

In [46]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Next Word Prediction:

In [3]:
def preprocess(base_path = '/content/drive/My Drive/NewB-master', file_name='example.txt'):
    
    data = ""

    with open(os.path.join(base_path,file_name), 'r') as f:
        for line in f:
            line = line.strip()
            label, sentence = line.split('\t')[0:2] # remove label and \n
            if label == '1': # 1 - New York Times {Liberal}, 7 - New York Post {Conservative}
                data += sentence
            # data += ' *start* '
            
    
    # Tokenization
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([data])

    # saving the tokenizer for predict function.
    pickle.dump(tokenizer, open(f'tokenizer_{file_name.split(".")[0]}.pkl', 'wb'))

    sequence_data = tokenizer.texts_to_sequences([data])[0]

    vocab_size = len(tokenizer.word_index) + 1
    print('vocab_size: ',vocab_size)

    sequences = []
    for i in range(1, len(sequence_data)):
        words = sequence_data[i-1:i+1]
        sequences.append(words)
        
    print("The Length of sequences are: ", len(sequences))
    sequences = np.array(sequences)
    return  sequences[:,0], sequences[:,1], vocab_size

X,y, vocab_size = preprocess(base_path='', file_name='train_orig.txt')

vocab_size:  34676
The Length of sequences are:  392778


In [131]:
# data, vocab_size = preprocess()

vocab_size:  113263
The Length of sequences are:  2547197


### Creating the Model:

In [4]:
class nwp_model():
    def __init__(self,model=None, embedding_dim=10):
        if model:
            self.model = model
        else:
            model = Sequential()
            model.add(Embedding(vocab_size, output_dim=embedding_dim,input_length=1))
            model.add(LSTM(50))
            model.add(Dense(50, activation="relu"))
            model.add(Dense(vocab_size, activation="softmax"))
            self.model=model
        print(model.summary())

    # def load_and_process_data(self:)

    def train(self, feature, target, num_epochs=50):
        checkpoint = ModelCheckpoint("nextword1.h5", monitor='loss', verbose=1, save_best_only=True, mode='auto')

        # reduce_LR = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=10, min_lr=0.0001, verbose = 1)

        # logdir='logsnextword1'
        # tensorboard_Visualization = TensorBoard(log_dir=logdir)
        self.model.compile(optimizer=Adam(lr=0.001), loss='SparseCategoricalCrossentropy')
        self.model.fit(feature, target, epochs=num_epochs, batch_size=64, callbacks=[checkpoint])#, reduce_LR, tensorboard_Visualization])

classifier = nwp_model(embedding_dim=10)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             346760    
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 34676)             1768476   
Total params: 2,129,986
Trainable params: 2,129,986
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
classifier.train(X,y, num_epochs=10)

Epoch 1/10
Epoch 00001: loss improved from inf to 7.34979, saving model to nextword1.h5
Epoch 2/10
Epoch 00002: loss improved from 7.34979 to 6.83392, saving model to nextword1.h5
Epoch 3/10
Epoch 00003: loss improved from 6.83392 to 6.52115, saving model to nextword1.h5
Epoch 4/10
Epoch 00004: loss improved from 6.52115 to 6.28298, saving model to nextword1.h5
Epoch 5/10
Epoch 00005: loss improved from 6.28298 to 6.08493, saving model to nextword1.h5
Epoch 6/10
Epoch 00006: loss improved from 6.08493 to 5.92296, saving model to nextword1.h5
Epoch 7/10
Epoch 00007: loss improved from 5.92296 to 5.79419, saving model to nextword1.h5
Epoch 8/10
Epoch 00008: loss improved from 5.79419 to 5.69008, saving model to nextword1.h5
Epoch 9/10
Epoch 00009: loss improved from 5.69008 to 5.60384, saving model to nextword1.h5
Epoch 10/10

### Plot The Model:

In [None]:
keras.utils.plot_model(model, to_file='model.png', show_layer_names=True)

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
%tensorboard --logdir="./logsnextword1"

## Observation:
### We are able to develop a decent next word prediction model and are able to get a declining loss and an overall decent performance.

In [147]:
# Importing the Libraries

from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer

model = load_model('nextword1.h5')
tokenizer = pickle.load(open('tokenizer_train_orig.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text):
    """
        In this function we are using the tokenizer and models trained
        and we are creating the sequence of the text entered and then
        using our model to predict and return the the predicted word.
    
    """
    for i in range(3):
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = np.array(sequence)
        preds = np.argmax(model.predict(sequence), axis=-1)
        # preds = model.predict_classes(sequence)
#         print(preds)
        predicted_word = ""
        
        for key, value in tokenizer.word_index.items():
            if value == preds:
                predicted_word = key
                break
        
        print(predicted_word)
        return(predicted_word)

In [149]:
"""
    We are testing our model and we will run the model
    until the user decides to stop the script.
    While the script is running we try and check if 
    the prediction can be made on the text. If no
    prediction can be made we just continue.

"""

while(True):

    text = input("Enter your line: ")
    
    if text == "x":
        print("Ending The Program.....")
        break
    
    else:
        text = text.split(" ")
        text = text[-1]

        text = ''.join(text)
        for _ in range(9):
            text = Predict_Next_Words(model, tokenizer, text)
        

Enter your line: hillary
clinton
and
the
trump
and
the
trump
and
the
Enter your line: trump
and
the
trump
and
the
trump
and
the
trump
Enter your line: is
a
trump
and
the
trump
and
the
trump
and
Enter your line: donald
trump
and
the
trump
and
the
trump
and
the
Enter your line: x
Ending The Program.....
