## 1.Importing the required libraries

In [1]:
# Basic libraries
import os
import time
import pickle
import numpy as np

# URL Request libraries
import json
import requests
from requests.compat import urljoin

# Text Processing Libraries
import re
import nltk
import gensim
import gensim.downloader as api
from nltk.corpus import stopwords

# Library for similarity score computation  
from sklearn.metrics.pairwise import pairwise_distances_argmin

## 2.Creating a class BotHandler

The BotHandler class takes care of all the back-end tasks of the bot. It has three main functions:

**get_updates** — checks for new messages  
**send_message** – posts new message to user  
**get_answer** — computes the most relevant answer to a user's question

In [2]:
class BotHandler(object):
    """Small client for the Telegram Bot HTTP API.

    It polls for new messages (``get_updates``), posts replies
    (``send_message``) and decides what to reply (``get_answer``), delegating
    everything except the ``/start`` greeting to the dialogue manager.
    """

    def __init__(self, token, dialogue_manager):
        """Store the token, build the API base URL and keep the dialogue manager.

        Args:
            token: Telegram bot access token, embedded in the API URL.
            dialogue_manager: object exposing ``generate_answer(question)``.
        """
        self.token = token
        self.api_url = "https://api.telegram.org/bot{}/".format(token)
        self.dialogue_manager = dialogue_manager

    def get_updates(self, offset=None, timeout=30):
        """Fetch pending updates via the ``getUpdates`` endpoint.

        Args:
            offset: value sent as the ``offset`` query parameter.
            timeout: value sent as the ``timeout`` query parameter
                (server-side long-poll window, in seconds).

        Returns:
            The ``result`` entry of the JSON response, or an empty list when
            the request fails, the body is not valid JSON, or the payload has
            no ``result`` key.
        """
        params = {"timeout": timeout, "offset": offset}

        # The HTTP timeout has to outlast the long-poll window, otherwise every
        # idle poll would be aborted client-side. Without any HTTP timeout a
        # stalled connection would block the bot forever.
        http_timeout = None if timeout is None else timeout + 10

        try:
            raw_resp = requests.get(
                urljoin(self.api_url, "getUpdates"), params, timeout=http_timeout
            )
        except requests.exceptions.RequestException as e:
            # Only the exception type is printed: the message of a requests
            # error usually contains the URL, which embeds the bot token.
            print("Failed to fetch updates ({}).".format(type(e).__name__))
            return []

        try:
            resp = raw_resp.json()
        except ValueError as e:
            # Every JSON decode error raised by requests/json/simplejson is a
            # ValueError subclass, so this covers all of them.
            print("Failed to parse response {}: {}.".format(raw_resp.content, e))
            return []

        if not isinstance(resp, dict) or "result" not in resp:
            return []
        return resp["result"]

    def send_message(self, chat_id, text):
        """Post ``text`` to ``chat_id`` via the ``sendMessage`` endpoint.

        Returns:
            The ``requests`` response object of the POST call.
        """
        params = {"chat_id": chat_id, "text": text}
        return requests.post(urljoin(self.api_url, "sendMessage"), params)

    def get_answer(self, question):
        """Return the reply for ``question``.

        ``/start`` gets a fixed greeting; any other text is forwarded to the
        dialogue manager's ``generate_answer``.
        """
        if question == '/start':
            return "Hi, I am your project bot. How can I help you today?"
        return self.dialogue_manager.generate_answer(question)

## 3.Creating a function which performs basic text preprocessing tasks

This function basically performs the following tasks on the question posed by the user :-

a) Replace characters such as [/(){}[]|@,;] with a space.  
b) Keep only the alphanumeric ones (and some special ones).  
c) Removing the english stopwords as these do not serve any purpose.

In [3]:
# Separator-like characters that are turned into a single space.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase letters, space, '#', '+' and '_' is dropped.
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Filled on first use so the stopword corpus is read only once per process.
_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the NLTK English stopwords as a set, loading them on first call."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def text_prepare(text):
    """Lowercase ``text``, strip unwanted symbols and remove English stopwords.

    Steps, in order: lowercase; replace separator characters with a space;
    delete every remaining character that is not ``0-9``, ``a-z``, space,
    ``#``, ``+`` or ``_``; split on whitespace and drop stopwords.

    Args:
        text: the raw string to clean.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)

    stopwords_set = _english_stopwords()
    # str.split() never yields empty tokens, so the joined result has no
    # leading/trailing whitespace to strip.
    return ' '.join(token for token in text.split() if token not in stopwords_set)

## 4.Building a function which will convert the question to an embedding aka word vector.

The word vector will then be used to give an answer to user's question by getting the most similar title using the cosine similarity metric.

In [4]:
def question_to_vec(question, embeddings, dim=300):
    """Represent ``question`` as the mean of its known word embeddings.

    Args:
        question: a string; it is tokenized on any run of whitespace.
        embeddings: mapping supporting ``word in embeddings`` and
            ``embeddings[word]`` that yields a length-``dim`` vector.
        dim: size of the representation.

    Returns:
        A ``float32`` array of shape ``(1, dim)``: the mean embedding of the
        words found in ``embeddings``, or all zeros when no word is known
        (including an empty question).
    """
    # Split on any whitespace (not just a single space) so tabs, newlines and
    # repeated spaces in raw user text do not produce unknown tokens.
    known_vectors = [
        embeddings[word] for word in question.split() if word in embeddings
    ]

    # Out-of-vocabulary words are simply absent from the list, so nothing has
    # to be filtered afterwards.
    if not known_vectors:
        return np.zeros((1, dim), dtype=np.float32)

    stacked = np.asarray(known_vectors, dtype=np.float32)
    return np.mean(stacked, axis=0).astype(np.float32).reshape((1, dim))

## 5. Creating a Simple Dialogue Manager Class

This class can be thought of as the brain of our chatbot, as this is where all the heavy lifting takes place.

Primary functions of this class are :

1. **Training of the Dialogue bot** on an English corpus.
2. **Loading of all the necessary models**, such as the TF-IDF vectorizer, the intent classifier, the tag classifier and Google's pre-trained model.
3. **Pre-processing and transformation of the question into a TF-IDF matrix** using the models loaded in the above step.
4. **Classifying the intent (Dialogue/Stack Overflow)** and, in case of a Stack Overflow question, **identifying the tag associated** with it.
5. Lastly, **finding the most similar title's post_id** to return the most appropriate answer to the user's question.

In [5]:
class SimpleDialogueManager(object):
    """Dialogue manager behind the Telegram bot.

    On construction it trains a ChatterBot instance on the English corpus and
    loads the pickled TF-IDF vectorizer, intent classifier, tag classifier and
    the ``word2vec-google-news-300`` vectors. ``generate_answer`` then routes
    each question either to the chit-chat bot or to a Stack Overflow lookup.
    """

    def __init__(self):
        """Train the chit-chat bot and load all models from ``resources/``."""
        print("Loading resources...")

        # Imported here so the heavy ChatterBot dependency is only needed when
        # a manager is actually constructed.
        from chatterbot import ChatBot
        from chatterbot.trainers import ChatterBotCorpusTrainer
        chatbot = ChatBot('NLP Chat bot')
        trainer = ChatterBotCorpusTrainer(chatbot)
        trainer.train('chatterbot.corpus.english')
        self.chitchat_bot = chatbot

        print("Loading Vectorizer object...")
        self.tfidf_vectorizer = self._load_pickle('resources/tfidf.pkl')

        print("Loading Classifier objects...")
        self.intent_recognizer = self._load_pickle('resources/intent_clf.pkl')
        self.tag_classifier = self._load_pickle('resources/tag_clf.pkl')

        print("Loading Word2vec model...")
        # return_path=True yields the location of the model file, which is
        # then parsed as a binary word2vec file.
        path = api.load("word2vec-google-news-300", return_path=True)
        self.model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

        print("Finished Loading Resources")

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path``, closing the file.

        NOTE: pickle executes arbitrary code on load; only use this on
        resource files you produced yourself.
        """
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def get_similar_question(self, question, tag):
        """Find the stored post closest to ``question`` among posts for ``tag``.

        Args:
            question: question text, embedded with ``question_to_vec``.
            tag: name of the pickle (without extension) under
                ``resources/embeddings_folder/`` holding
                ``(post_ids, post_embeddings)``.

        Returns:
            ``post_ids`` indexed by the array of best indices (one entry per
            query row); callers take element ``[0]``.
            NOTE(review): this assumes ``post_ids`` supports array indexing
            (e.g. a NumPy array) — confirm against how the pickles are built.
        """
        embeddings_path = 'resources/embeddings_folder/' + tag + ".pkl"
        post_ids, post_embeddings = self._load_pickle(embeddings_path)

        question_vec = question_to_vec(question, self.model, 300)

        # Rank by cosine distance: the similarity search is meant to be
        # cosine-based, whereas the function's default metric is Euclidean.
        best_post_index = pairwise_distances_argmin(
            question_vec, post_embeddings, metric='cosine'
        )

        return post_ids[best_post_index]

    def generate_answer(self, question):
        """Return a text reply for ``question``.

        The cleaned question is vectorized with TF-IDF and classified. For the
        ``'dialogue'`` intent the chit-chat bot answers; otherwise the tag
        classifier picks a tag and a link to the most similar Stack Overflow
        post is returned.
        """
        prepared_question = text_prepare(question)
        features = self.tfidf_vectorizer.transform([prepared_question])

        intent = self.intent_recognizer.predict(features)[0]

        if intent == 'dialogue':
            # Convert the chit-chat reply to str so both branches return text.
            return str(self.chitchat_bot.get_response(question))

        tag = self.tag_classifier.predict(features)[0]
        # NOTE(review): the raw question (not prepared_question) is embedded
        # here; confirm this matches how the post embeddings were produced.
        post_id = self.get_similar_question(question, tag)[0]
        return "I think it's about %s\nThis thread might help you: https://stackoverflow.com/questions/%s" % (tag, post_id)

## 6. Defining the heart of our Chat Bot : MAIN function

Here, in the main function, we perform the following steps:

1. **Creating objects of both classes**, i.e. the Simple Dialogue Manager and the BotHandler.
2. **Extracting the chat id** from which the updates are coming.
3. **Storing the text of the message** sent by the user, which is the question to answer.
4. Calling the send message function to **send the appropriate answer** to the user.
5. **Continuously listening for new updates** that the user might send.

In [None]:
# Directory the original script switched to before loading 'resources/...'.
_LEGACY_WORKDIR = 'C:\\Users\\nisha\\Desktop'


def _reply_to_update(bot, update):
    """Answer one Telegram update if it carries a text message."""
    if "message" not in update:
        return

    message = update["message"]
    # Chat to reply to.
    chat_id = message["chat"]["id"]

    if "text" not in message:
        return
    text = message["text"]

    # UTF-8 uses one byte per character only for ASCII, so equal lengths mean
    # the text is pure ASCII (no emojis or other special characters).
    if len(text) == len(text.encode()):
        print("Update content: {}".format(update))
        bot.send_message(chat_id, bot.get_answer(text))
    else:
        bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...")


def main():
    """Run the bot: load the models, then poll Telegram and reply forever.

    Configuration comes from the environment:
        TELEGRAM_BOT_TOKEN: bot access token (required).
        BOT_WORKDIR: directory containing ``resources/`` (optional).

    Raises:
        RuntimeError: if ``TELEGRAM_BOT_TOKEN`` is not set.
    """
    # The resource files are opened with relative paths, so switch to the
    # directory that holds them. An explicit BOT_WORKDIR wins; the legacy
    # location is only used when it actually exists on this machine.
    workdir = os.environ.get("BOT_WORKDIR")
    if workdir:
        os.chdir(workdir)
    elif os.path.isdir(_LEGACY_WORKDIR):
        os.chdir(_LEGACY_WORKDIR)

    # Never keep the access token in source: anyone who can read the file
    # could control the bot. A token that was committed should be revoked.
    token = os.environ.get("TELEGRAM_BOT_TOKEN")
    if not token:
        raise RuntimeError(
            "Set the TELEGRAM_BOT_TOKEN environment variable to the bot's access token."
        )

    simple_manager = SimpleDialogueManager()
    bot = BotHandler(token, simple_manager)

    print("Ready to talk!")
    offset = 0
    while True:
        updates = bot.get_updates(offset=offset)

        for update in updates:
            print("\nAn update received.")
            _reply_to_update(bot, update)

            # Advance past this update so it is not delivered again.
            offset = max(offset, update['update_id'] + 1)

        time.sleep(1)

# Start the bot only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()    

Loading resources...
Training ai.yml: [####################] 100%
Training botprofile.yml: [####################] 100%
Training computers.yml: [####################] 100%
Training conversations.yml: [####################] 100%
Training emotion.yml: [####################] 100%
Training food.yml: [####################] 100%
Training gossip.yml: [####################] 100%
Training greetings.yml: [####################] 100%
Training health.yml: [####################] 100%
Training history.yml: [####################] 100%
Training humor.yml: [####################] 100%
Training literature.yml: [####################] 100%
Training money.yml: [####################] 100%
Training movies.yml: [####################] 100%
Training politics.yml: [####################] 100%
Training psychology.yml: [####################] 100%
Training science.yml: [####################] 100%
Training sports.yml: [####################] 100%
Training trivia.yml: [####################] 100%
Loading Vectorizer object.