<a href="https://colab.research.google.com/github/scaperex/My_Projects/blob/master/political_bias_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing The Required Libraries:

In [27]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np
import os
import string
from tensorflow import keras
from keras.utils.vis_utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from nltk.corpus import stopwords
# `nltk` itself must be imported before `nltk.download` can be called; without
# this import the cell raises NameError on a fresh kernel.
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Mount Data

In [4]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
def preprocess_sentence(base_path = '/content/drive/My Drive/NewB-master', file_name='example.txt',news_id='0',
                        max_len=30, tokenizer_path=None):
    """
    Build next-word-prediction training arrays for one news source.

    news_id  1 - New York Times {Liberal}, 7 - New York Post {Conservative}

    The input file is read line by line; each line is expected to hold
    tab-separated fields where the first is the source label and the second
    is the sentence. Only sentences whose label equals `news_id` and that
    have at most `max_len` whitespace-separated words are kept. Stop words
    are removed, a Tokenizer is fitted on the result and pickled to disk.

    Args:
        base_path: directory containing the data file.
        file_name: name of the data file inside `base_path`.
        news_id: label (as a string) of the news source to keep.
        max_len: maximum number of words per sentence and the width every
            sequence is padded (or truncated) to.
        tokenizer_path: where to pickle the fitted tokenizer. Defaults to
            'tokenizer_<file_name without extension>.pkl'. Pass an explicit
            path to keep tokenizers of different news sources apart, since
            the default depends on `file_name` only.

    Returns:
        (padded_inputs, padded_targets, vocab_size) where both arrays have
        `max_len` columns, the targets are the inputs shifted left by one
        token (0 in the final slot), and vocab_size is the number of
        tokenizer words plus one for the padding id 0.

    Raises:
        ValueError: if no sentence matches `news_id`.
    """
    # Build the lookup once; membership tests on a set are O(1).
    # No language is given, so this is the union over every language shipped
    # in the NLTK stop-word corpus.
    # NOTE(review): stopwords.words('english') may be the intended list - confirm.
    stop_words = set(stopwords.words())

    sentences = []
    with open(os.path.join(base_path,file_name), 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or malformed line: no label/sentence pair to unpack.
                continue
            label, sentence = fields[0:2]
            if label != news_id:
                continue
            words = sentence.split()
            if len(words) > max_len:
                continue
            # Filter word by word (iterating the string itself would yield
            # single characters). Lower-case for the comparison because the
            # Tokenizer lower-cases the text as well.
            kept_words = [word for word in words if word.lower() not in stop_words]
            if not kept_words:
                # Nothing left to learn from once stop words are gone.
                continue
            sentences.append(" ".join(kept_words))
    print('num sentences:',len(sentences))

    if not sentences:
        raise ValueError(f'no sentences found for news_id={news_id!r} in {file_name!r}')

    # Tokenization
    tokenizer = Tokenizer(oov_token='oov_word')
    tokenizer.fit_on_texts(sentences)

    # saving the tokenizer for predict function.
    if tokenizer_path is None:
        tokenizer_path = f'tokenizer_{file_name.split(".")[0]}.pkl'
    with open(tokenizer_path, 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

    sequence_data = tokenizer.texts_to_sequences(sentences)

    vocab_size = len(tokenizer.word_index) + 1
    print('vocab_size: ',vocab_size)

    # compute targets: the next token at every position, 0 after the last one
    targets = [sentence[1:]+[0] for sentence in sequence_data]

    # Add padding. The width is pinned to max_len so the arrays always match
    # the fixed sequence length used by the model and at prediction time; the
    # tokenizer can split a word on punctuation, so a sequence may come out
    # longer than max_len and is then cut at the end.
    padded_targets = pad_sequences(targets, padding="post", truncating="post", maxlen=max_len)
    padded_inputs = pad_sequences(sequence_data, padding="post", truncating="post", maxlen=max_len)

    return  padded_inputs, padded_targets, vocab_size

### Creating the Model:

In [29]:
class nwp_model():
    """
    Next-word-prediction network: Embedding -> LSTM -> Dense(relu) -> Dense(softmax).

    Every layer keeps the time dimension, so the model emits a distribution
    over the vocabulary for each of the 30 input positions.
    """

    def __init__(self,vocab_size,model=None, embedding_dim=20, ):
        """
        Args:
            vocab_size: number of token ids (including the padding id 0);
                sizes both the embedding table and the softmax output.
            model: an already built Keras model to wrap instead of creating
                a new one.
            embedding_dim: size of the word embedding vectors.
        """
        if model is not None:
            self.model = model
        else:
            model = Sequential()
            # mask_zero=True makes downstream layers ignore the 0 padding id.
            model.add(Embedding(vocab_size, output_dim=embedding_dim, mask_zero=True, input_length=30))
            model.add(LSTM(40, return_sequences=True))
            model.add(Dense(40, activation="relu"))
            model.add(Dense(vocab_size, activation="softmax"))
            self.model=model
        # summary() prints the table itself and returns None, so it is not
        # wrapped in print().
        self.model.summary()


    def train(self, feature, target, num_epochs=50, checkpoint_path="nextword1.h5", extra_callbacks=None):
        """
        Compile and fit the wrapped model.

        Args:
            feature: padded input token ids.
            target: padded next-token ids aligned with `feature`.
            num_epochs: number of training epochs.
            checkpoint_path: file the best model is saved to. Use a distinct
                path per model so one training run does not overwrite the
                checkpoint of another.
            extra_callbacks: optional iterable of additional Keras callbacks
                (e.g. ReduceLROnPlateau, TensorBoard) appended after the
                checkpoint callback.

        The Keras History returned by fit() is stored on `self.history`.
        """
        # NOTE(review): the checkpoint tracks the training loss although 10% of
        # the data is held out for validation; 'val_loss' may be the better monitor.
        checkpoint = ModelCheckpoint(checkpoint_path, monitor='loss', verbose=1, save_best_only=True, mode='auto')
        callbacks = [checkpoint] + list(extra_callbacks or [])

        # `learning_rate` is the supported argument name; `lr` is deprecated.
        self.model.compile(optimizer=Adam(learning_rate=0.001), loss='SparseCategoricalCrossentropy', metrics=['acc'])
        self.history = self.model.fit(feature, target, epochs=num_epochs, batch_size=64, validation_split=0.1, callbacks=callbacks)



In [None]:
# Build the training arrays for news_id '1' (New York Times per the docstring of preprocess_sentence).
# base_path='' makes 'train_orig.txt' resolve relative to the current working directory.
X1,y1, vocab_size1 = preprocess_sentence(base_path='', file_name='train_orig.txt',news_id='1')

In [11]:
# Build a new model sized to the news_id '1' vocabulary, using 40-dimensional embeddings.
classifier = nwp_model(vocab_size=vocab_size1,embedding_dim=40)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 40)            923120    
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 40)            12960     
_________________________________________________________________
dense_4 (Dense)              (None, 30, 40)            1640      
_________________________________________________________________
dense_5 (Dense)              (None, 30, 23078)         946198    
Total params: 1,883,918
Trainable params: 1,883,918
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
# Train for 20 epochs; train() saves the best checkpoint to nextword1.h5.
classifier.train(X1,y1, num_epochs=20)

Epoch 1/20
Epoch 00001: loss improved from inf to 3.44265, saving model to nextword1.h5
Epoch 2/20
Epoch 00002: loss improved from 3.44265 to 3.07212, saving model to nextword1.h5
Epoch 3/20
Epoch 00003: loss improved from 3.07212 to 3.01167, saving model to nextword1.h5
Epoch 4/20
Epoch 00004: loss improved from 3.01167 to 2.90537, saving model to nextword1.h5
Epoch 5/20
Epoch 00005: loss improved from 2.90537 to 2.81168, saving model to nextword1.h5
Epoch 6/20
Epoch 00006: loss improved from 2.81168 to 2.74575, saving model to nextword1.h5
Epoch 7/20
Epoch 00007: loss improved from 2.74575 to 2.69689, saving model to nextword1.h5
Epoch 8/20
Epoch 00008: loss improved from 2.69689 to 2.65640, saving model to nextword1.h5
Epoch 9/20
Epoch 00009: loss improved from 2.65640 to 2.62030, saving model to nextword1.h5
Epoch 10/20
Epoch 00010: loss improved from 2.62030 to 2.58746, saving model to nextword1.h5
Epoch 11/20
Epoch 00011: loss improved from 2.58746 to 2.55753, saving model to nex

### Plot The Model:

In [None]:
# The Keras model lives on the wrapper as `classifier.model`; a bare `model`
# name is not defined until the later cell that reloads the checkpoint.
keras.utils.plot_model(classifier.model, to_file='model.png', show_layer_names=True)

In [None]:
# Load the TensorBoard notebook extension
# NOTE: nwp_model.train() does not register a TensorBoard callback by itself, so
# ./logsnextword1 only contains data if such a callback (log_dir='logsnextword1') is added.
%load_ext tensorboard
%tensorboard --logdir="./logsnextword1"

In [24]:
# Importing the Libraries

from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer

model = load_model('nextword1.h5')
# The context manager closes the file handle as soon as the tokenizer is read.
# Only unpickle tokenizer files written by this notebook: pickle.load can run
# arbitrary code from an untrusted file.
with open('tokenizer_train_orig.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

def Predict_Next_Words(model, tokenizer, text, num_words_to_complete, max_len=30):
    """
    Greedily extend `text` with the model's most likely next words.

    On every step the current words are tokenized and padded to `max_len`,
    the model is run once, and the highest-scoring token at the position of
    the last real word is appended to the sentence.

    Args:
        model: object whose predict(batch) returns scores shaped
            (batch, max_len, vocab_size).
        tokenizer: fitted tokenizer offering texts_to_sequences() and the
            id -> word mapping `index_word`.
        text: beginning of the sentence, as a list of words or a string
            (a string is split on whitespace). It is never modified.
        num_words_to_complete: number of words to generate.
        max_len: padded sequence length the model expects.

    Returns:
        The generated words, each followed by a single space (the same
        string that is printed). Empty when nothing could be generated.
    """
    # Copy so appending predictions never touches the caller's list.
    words = text.split() if isinstance(text, str) else list(text)
    res = ""
    if not words:
        # Nothing to condition the prediction on.
        print(res)
        return res

    for _ in range(num_words_to_complete):
        sequence = tokenizer.texts_to_sequences([words])
        if not sequence[0]:
            # The tokenizer mapped no word to an id, so there is no position to read.
            break
        padded_sequence = pad_sequences(sequence, padding="post", maxlen=max_len)
        output = model.predict(padded_sequence)
        preds = np.argmax(output, axis=-1)
        # With post-padding the last real token sits at index len-1; when the
        # sentence is longer than max_len only its last max_len tokens are
        # kept, so that index is capped at max_len-1.
        last_position = min(len(sequence[0]), max_len) - 1
        predicted_id = int(preds[0][last_position])
        # Look the id up directly instead of decoding the whole row: decoding
        # skips ids without a word, which would shift every later position.
        predicted_word = tokenizer.index_word.get(predicted_id)
        if predicted_word is None:
            # The model chose an id with no word (e.g. the padding id 0).
            break
        print(predicted_word)
        res += predicted_word + ' '
        # append() adds the whole word; `list += str` would add its characters.
        words.append(predicted_word)

    print(res)
    return res

In [25]:
"""
    We are testing our model and we will run the model
    until the user decides to stop the script.
    While the script is running we try and check if 
    the prediction can be made on the text. If no
    prediction can be made we just continue.

"""
num_words_to_complete = int(input("Enter number of words to complete:"))

while(True):

    text = input("Enter beginning of sentence: ") 
    if text == "x":
        print("Ending The Program.....")
        break
    
    else:
        text = text.split(" ")
        Predict_Next_Words(model, tokenizer, text, num_words_to_complete)
        

Enter number of words to complete:5
Enter beginning of sentence: donald
trump
said
and
and
oov_word
trump said and and oov_word 
Enter beginning of sentence: trump campaign
and
new
oov_word
oov_word
oov_word
and new oov_word oov_word oov_word 
Enter beginning of sentence: x
Ending The Program.....


In [None]:
# Build the training arrays for news_id '7' (New York Post per the docstring of preprocess_sentence).
# NOTE: with the same file_name this call pickles its tokenizer to tokenizer_train_orig.pkl,
# the same path the news_id '1' call wrote to.
X7,y7, vocab_size7 = preprocess_sentence(base_path='', file_name='train_orig.txt',news_id='7')

In [None]:
# vocab_size is a required argument of nwp_model; size the new model to the
# news_id '7' vocabulary computed in the previous cell.
classifier = nwp_model(vocab_size=vocab_size7, embedding_dim=40)

In [None]:
# Train the news_id '7' model for 20 epochs.
# NOTE: train() checkpoints to nextword1.h5 by default, the same file the first model's training wrote.
classifier.train(X7,y7, num_epochs=20)

In [26]:
import nltk