In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
np.random.seed(42)

---
Hinglish Chatbot
---


In [4]:
import re
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# clean_text function is used to convert text to lower and handle words such that:
# -> if there are punctiation it would add space across it.
# -> if there is some unnecessary character then it could convert it into space
# -> Strip the sentence

def clean_text(text):
    text = text.lower()
    text = re.sub(r"([@#'।,?!])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z0-9@#'।,?!]+", ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
with open('chatbot_tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

model = tf.keras.models.load_model('/content/chatbot_attention.keras')

In [5]:
# tested on various inputs
MAXLEN_QUESTIONS = 96
MAXLEN_ANSWERS = 97
# result from : '/content/chatbot_attention.keras'
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM,Concatenate, Embedding, Dense, Dot, Activation
hidden_units = 512
def build_inference_models(model, hidden_units):
    # Encoder
    encoder_inputs = model.input[0]
    encoder_outputs, state_h_enc, state_c_enc = model.get_layer("lstm_6").output
    encoder_model = Model(encoder_inputs, [encoder_outputs, state_h_enc, state_c_enc])

    # Decoder Inputs
    decoder_inputs = Input(shape=(1,), name="decoder_input_infer")
    decoder_state_input_h = Input(shape=(hidden_units,), name="decoder_h")
    decoder_state_input_c = Input(shape=(hidden_units,), name="decoder_c")
    encoder_outputs_input = Input(shape=(None, hidden_units), name="encoder_outputs_infer")

    # Decoder Embedding
    decoder_embedding_layer = model.get_layer("embedding_8")
    decoder_embedding = decoder_embedding_layer(decoder_inputs)

    # Decoder LSTM
    decoder_lstm = model.get_layer("lstm_7")
    decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
        decoder_embedding, initial_state=[decoder_state_input_h, decoder_state_input_c]
    )

    # Attention
    score = Dot(axes=[2, 2])([decoder_outputs, encoder_outputs_input])
    attention_weights = Activation('softmax')(score)
    context_vector = Dot(axes=[2, 1])([attention_weights, encoder_outputs_input])
    decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_outputs])

    # Dense Output
    decoder_dense = model.get_layer("dense_3")
    decoder_outputs_final = decoder_dense(decoder_combined_context)

    # Decoder Model
    decoder_model = Model(
        [decoder_inputs, encoder_outputs_input, decoder_state_input_h, decoder_state_input_c],
        [decoder_outputs_final, state_h_dec, state_c_dec]
    )

    return encoder_model, decoder_model
encoder_model, decoder_model = build_inference_models(model, hidden_units)


def decode_sequence(input_text, tokenizer, maxlen_questions, maxlen_answers, temperature=0.8):
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=maxlen_questions, padding='post')

    # Run encoder
    encoder_outputs, state_h, state_c = encoder_model.predict(input_seq)
    states_value = [state_h, state_c]

    start_token = tokenizer.word_index.get("<start>")
    end_token = tokenizer.word_index.get("<end>")
    target_seq = np.array([[start_token]])

    decoded_sentence = []

    for _ in range(maxlen_answers):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, encoder_outputs] + states_value
        )

        output_distribution = output_tokens[0, -1, :]
        #output_distribution = np.log(output_distribution + 1e-10) / temperature
        output_distribution = np.log(output_distribution + 1e-10) / temperature
        exp_preds = np.exp(output_distribution)
        output_distribution = exp_preds / np.sum(exp_preds)


        sampled_token_index = np.argmax(output_distribution)
        #sampled_token_index = np.random.choice(len(output_distribution), p=output_distribution)

        sampled_word = tokenizer.index_word.get(sampled_token_index, "?")

        if sampled_token_index == end_token or sampled_word == "?":
            break

        decoded_sentence.append(sampled_word)
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return " ".join(decoded_sentence)

input_texts = [
    "Hey Radhika! Kaisi ho?",
    "Mujhe teri kuch paintings dekhni hai. Kya hum ek din art exhibition pe milke chal sakte hai?",
    "tumhara naam kya hai?",
    "aaj mausam kaisa hai?",
    "kahan ja rahe ho?"
]

# Iterate through the list of input texts
for i, input_text in enumerate(input_texts, 1):
    input_text = clean_text(input_text)
    print(f"User (Input {i}): {input_text}")
    response = decode_sequence(input_text, tokenizer, MAXLEN_QUESTIONS, MAXLEN_ANSWERS)
    print(f"Bot (Response {i}): {response}\n")


User (Input 1): hey radhika ! kaisi ho ?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Bot (Response 1): mai

---
NEWS SUMMARIZER
---

In [13]:
input = "By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in Italy last month. Symptoms of hepatitis A include fever, tiredness, loss of appetite, nausea and abdominal discomfort. Fargo Catholic Diocese in North Dakota (pictured) is where the bishop is located ."
import re
contractions_map =  {
    "isn't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "i'm": "i am",
    "i've": "i have",
    "you've": "you have",
    "they're": "they are",
    "we're": "we are",
    "we've": "we have",
    "won't": "will not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "wasn't": "was not",
    "weren't": "were not",
    "there's": "there is",
    "that's": "that is",
    "what's": "what is",
    "who's": "who is",
    "let's": "let us",
    "mustn't": "must not",
    "shan't": "shall not",
}
with open('summary_tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
def clean_text(raw_text):

    def expand_contractions(text, contractions_map):
        if not contractions_map:
            return text
        pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in contractions_map.keys()) + r')\b',
                             flags=re.IGNORECASE)
        return pattern.sub(lambda m: contractions_map.get(m.group(0).lower(), m.group(0)), text)

    # apply strop and lowering the text
    text = re.sub(r'\s+', ' ', raw_text).strip().lower()
    # make changes for short form words to full like isn't to is not
    text = expand_contractions(text, contractions_map)

    text = re.sub(r"\b(?!(" + "|".join(re.escape(k) for k in contractions_map) + r"))(\w+)'s\b", r"\2", text, flags=re.IGNORECASE)

    # remove published and updated words from the starting
    text = re.sub(r'By\s+.*?PUBLISHED:.*?UPDATED:.*?\.\s*', '', text, flags=re.IGNORECASE)

    # remove  "By . Associated Press . PUBLISHED"
    text = re.sub(r'By\s+\.?\s*[A-Za-z\s]*\.?\s*PUBLISHED:.*?UPDATED:.*?\.\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'^By\s+\.*\s*[a-z\s]+\.?\s*', '', text, flags=re.IGNORECASE)
    # remove date  and time stamp
    text = re.sub(r'\d{1,2}:\d{2}\s*[A-Z]{2,4},\s*\d{1,2}\s+\w+\s+\d{4}\s*\.', '', text,  flags=re.IGNORECASE)
    text = re.sub(r'\b(?:last\s+updated\s+at\s+)?\d{1,2}:\d{2}\s*[APap][Mm]\s+on\s+\d{1,2}(?:st|nd|rd|th)?\s+\w+\s+\d{4}\s*,?', '', text, flags=re.IGNORECASE)

    # if any part is there in brackets we don't consider it for highlights
    text = re.sub(r'\s*[\(\[].*?[\)\]]', '', text)

    # there were this words like video and read more which is unnecessary for highlight creation
    text = re.sub(r'scroll down for video.*?(?=\s[a-z])', '', text)
    text = re.sub(r'watch the video above.*?(?=\s[a-z])', '', text)
    text = re.sub(r'read more:.*?(?=\s[a-z])', '', text)

    # replace unnecessary punctuation marks like repetititve
    text = re.sub(r'--+', ' ', text)
    text = re.sub(r'[“”"]', '', text)
    text = re.sub(r"[^\w\s.,?!$£€₹\-']", '', text)

    # replace multiple commas and dots to single
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r',{2,}', ',', text)
    text = re.sub(r'^\.\s*', '', text)
    # remove whitespave from start and end
    text = re.sub(r'\s+', ' ', text).strip()

    return text
input = clean_text(input)
# preparing articles and highlights for training
max_len_articles = 400
max_len_highlights = 199
VOCAB_SIZE = len(tokenizer.word_index) + 1

tokenized_articles = tokenizer.texts_to_sequences([input])

# padding and truncating both articles and highlights
padded_articles = pad_sequences(
    tokenized_articles,
    maxlen=max_len_articles,
    padding='post',
    truncating='post'
)


encoder_input_data = np.array(padded_articles, dtype=np.int32)

print("Encoder Input Shape:", encoder_input_data.shape)


model = tf.keras.models.load_model('/content/summary_lstm.keras')

Encoder Input Shape: (1, 400)


In [16]:

hidden_units = 256
MAXLEN_ANSWERS = 199
MAXLEN_QUESTIONS = 400
def build_inference_models(model, hidden_units):
    # Encoder
    encoder_inputs = model.input[0]
    encoder_outputs, state_h_enc, state_c_enc = model.get_layer("lstm_4").output
    encoder_model = Model(encoder_inputs, [encoder_outputs, state_h_enc, state_c_enc])

    # Decoder Inputs
    decoder_inputs = Input(shape=(1,), name="decoder_input_infer")
    decoder_state_input_h = Input(shape=(hidden_units,), name="decoder_h")
    decoder_state_input_c = Input(shape=(hidden_units,), name="decoder_c")
    encoder_outputs_input = Input(shape=(None, hidden_units), name="encoder_outputs_infer")

    # Decoder Embedding
    decoder_embedding_layer = model.get_layer("embedding_3")
    decoder_embedding = decoder_embedding_layer(decoder_inputs)

    # Decoder LSTM
    decoder_lstm = model.get_layer("lstm_5")
    decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
        decoder_embedding, initial_state=[decoder_state_input_h, decoder_state_input_c]
    )

    # Attention
    score = Dot(axes=[2, 2])([decoder_outputs, encoder_outputs_input])
    attention_weights = Activation('softmax')(score)
    context_vector = Dot(axes=[2, 1])([attention_weights, encoder_outputs_input])
    decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_outputs])

    # Dense Output
    decoder_dense = model.get_layer("dense_2")
    decoder_outputs_final = decoder_dense(decoder_combined_context)

    # Decoder Model
    decoder_model = Model(
        [decoder_inputs, encoder_outputs_input, decoder_state_input_h, decoder_state_input_c],
        [decoder_outputs_final, state_h_dec, state_c_dec]
    )

    return encoder_model, decoder_model
encoder_model, decoder_model = build_inference_models(model, hidden_units)


def decode_sequence(input_text, tokenizer, maxlen_questions, maxlen_answers, temperature=1):
    input_seq = encoder_input_data[0].reshape(1,maxlen_questions)  # shape = (1, 400)

    # Run encoder
    encoder_outputs, state_h, state_c = encoder_model.predict(input_seq)
    states_value = [state_h, state_c]

    start_token = tokenizer.word_index.get("startseq")
    end_token = tokenizer.word_index.get("endseq")
    target_seq = np.array([[start_token]])

    decoded_sentence = []

    for _ in range(maxlen_answers):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, encoder_outputs] + states_value
        )

        output_distribution = output_tokens[0, -1, :]
        #output_distribution = np.log(output_distribution + 1e-10) / temperature
        output_distribution = np.log(output_distribution + 1e-10) / temperature
        exp_preds = np.exp(output_distribution)
        output_distribution = exp_preds / np.sum(exp_preds)


        sampled_token_index = np.argmax(output_distribution)
        #sampled_token_index = np.random.choice(len(output_distribution), p=output_distribution)

        sampled_word = tokenizer.index_word.get(sampled_token_index, "?")

        if sampled_token_index == end_token or sampled_word == "?":
            break

        decoded_sentence.append(sampled_word)
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return " ".join(decoded_sentence)

input_seq = encoder_input_data[0]

print(input)
response = decode_sequence(input_seq, tokenizer, MAXLEN_QUESTIONS, MAXLEN_ANSWERS)
print("Bot:", response)

the bishop of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a virus in late september and early october. the state health department has issued an advisory of exposure for anyone who attended five churches and took communion. bishop john folda of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a . state immunization program manager molly howell says the risk is low, but officials feel it is important to alert people to the possible exposure. the diocese announced on monday that bishop john folda is taking time off after being diagnosed with hepatitis a. the diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in italy last month. symptoms of hepatitis a include fever, tiredness, loss of appetite, nausea and abdominal dis