In [None]:
!pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio===0.8.1 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
!pip install git+https://github.com/PrithivirajDamodaran/Gramformer.git
!pip install language-tool-python
!pip install deepmultilingualpunctuation
!pip install better-profanity
!pip install spacy==2.2.4 
# !pip install -U 'spacy[cuda-autodetect]'
!python -m spacy download en
!pip install Flask-Cors
!pip install gradio
!pip install sentence_transformers

In [None]:
# Note: For gramformer use this spacy version otherwise you will get error for model link

In [2]:
import language_tool_python
import logging
from gramformer import Gramformer
from deepmultilingualpunctuation import PunctuationModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
from better_profanity import profanity
import spacy
import os
import torch
import random
from transformers import T5ForConditionalGeneration, T5Tokenizer
import warnings
import json
from flask import Flask, request, jsonify
from flask_cors import CORS
import gradio as gr
import pandas as pd
import numpy as np
print(spacy.__version__)

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

2.2.4


In [3]:
os.chdir("/content/drive/MyDrive/FAQ_ChatBot")

# CONFIG 
LOGFILE = "logs/main.log"
SERVER = "0.0.0.0"
PORT = 8091
BAD_WORDS_DATA = "DATA/bad-words.txt"

# Probability for T5 model
TOP_P = 0.98

# Model path for loading
MODEL = "ramsrigouthamg/t5_paraphraser"  # "models/T5_MODELS"
COUNT = 6

# add aditional bad words 
file = open(BAD_WORDS_DATA)
custom_badwords = [line.strip() for line in file.readlines()]
file.close()
profanity.add_censor_words(custom_badwords)

# DATA = "DATA/question_embedding.csv"

# df = pd.read_csv(DATA)
# df["embd"] = df["embd"].apply(lambda x: eval(x))

CONF = 0.80
SENTENSE_MODEL = 'sentence-transformers/all-mpnet-base-v2'



# set random seed 
def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

In [None]:
!pip freeze > requirements.txt

In [4]:
# load model for punctuation 
punct_model = PunctuationModel()
nlp = spacy.load("en")
# load model for grammer and puctuation
grammer_model = Gramformer(models=1, use_gpu=False)

# model for grammer correction 
lang_tool = language_tool_python.LanguageTool("en-US")

# shif model to gpu cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load similarity transformer models
similarity_model = SentenceTransformer(SENTENSE_MODEL)
similarity_model = similarity_model.to(device)


Downloading:   0%|          | 0.00/892 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/406 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

[Gramformer] Grammar error correct/highlight model loaded..


Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:17<00:00, 12.9MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmphfnkfu4_.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to /root/.cache/language_tool_python.


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [5]:


# to check profanity
def profanity_detection(sentence):
    """
    objective: this function is used to detect and fillter bad words

    input: 
        sentence: string data for input sentence

    returns:
        censored_text: clean cesored text
        isProfane: bool, if found True else false
    """
    profanity.load_censor_words()
    print("[INFO] Profanity check for sentence: {}".format(sentence))
    censored_text = profanity.censor(sentence)
    isProfane = profanity.contains_profanity(sentence)
    return censored_text, isProfane


# correct sentence grammer and spelling and add punctuations
def correct_sentence(input_sentence):
    """
    objective: this function is used to correct and check the grammatical mistakes,
    spelling mistakes and punctuation.

    input:
        input_sentence: string input data

    returns:
        result: corrected sentence with grammer spelling and punctuation
    """
    print("[INFO] Mistakes in input questions: {}".format(lang_tool.check(input_sentence)))
    corrected_gram = list(grammer_model.correct(input_sentence.lower(), max_candidates=1))
    corrected_punct = punct_model.restore_punctuation(corrected_gram[0])
    result = lang_tool.correct(corrected_punct)
    return result


# function to find similar phrases
def question_search(input_ques, emmbding_df, conf, type=1):
    """
    objective : This function is used to search similar question

    input:
        input_sentence: input question from user
        emmbding_df: a data frame which contain embedding of the same question
        conf: thershold for question similartiy

    return:
        sq: list of similar questions
    """

     # fillter profanity
    censored_text, isProfane = profanity_detection(input_ques)

    if isProfane:
        print("[INFO] Profanity Word Detected...")
        input_ques = censored_text
    else:
        print("[INFO] No Profanity word found...")

    # check speling mistake and grammatical mistake
    input_ques = correct_sentence(input_ques)

    ques = [input_ques]
    # extract embeddings for all all questoin
    sentence_embd = emmbding_df["embd"].to_list()

    # to get the tensor on gpu
    sentence_embd = torch.Tensor(sentence_embd).to('cuda')

    # create embeddings for questions
    question_embd = similarity_model.encode(ques, convert_to_tensor=True, device="cuda")

    # Compute cosine-similarities for each sentence with each other sentence
    cosine_scores = util.cos_sim(sentence_embd, question_embd)

    emmbding_df["score"] = [round(float(score), 2) for score in cosine_scores]  # remove
    result = emmbding_df[emmbding_df["score"] >= conf].reset_index(drop=True)
    result = result.sort_values(by=['score'], ascending=False)

    if type==1:

        # convert all question into list
        sq = result["Question"].to_list()
        
        return sq
    elif type==2:
        sq = result[["Question", "Answer"]]
        return sq

In [6]:
DATA1 = "DATA/quora_embeddings.csv"
df1 = pd.read_csv(DATA1)
df1["embd"] = df1["embd"].apply(lambda x: eval(x))

DATA2 = "DATA/amazon_embeddings.csv"
df2 = pd.read_csv(DATA2)
df2["embd"] = df2["embd"].apply(lambda x: eval(x))

In [7]:
question_search("how to make bomb", df1, conf=0.30, type=1)

[INFO] Profanity check for sentence: how to make bomb
[INFO] No Profanity word found...
[INFO] Mistakes in input questions: [Match({'ruleId': 'UPPERCASE_SENTENCE_START', 'message': 'This sentence does not start with an uppercase letter.', 'replacements': ['How'], 'offsetInContext': 0, 'context': 'how to make bomb', 'offset': 0, 'errorLength': 3, 'category': 'CASING', 'ruleIssueType': 'typographical', 'sentence': 'how to make bomb'})]


['What would happen if a bomb was made of antimatter?',
 'How do I use pepper spray?',
 'How do made a gun?',
 'How will I destroy the humanity?',
 'How do I make my game?',
 'Does shooting a barrel of oil really cause it to explode?',
 "What is the meaning of Marathi word 'Bombil'?",
 "What is the meaning of Marathi word 'Bombil'?",
 'What does it feel like to bomb on stage as a stand-up comedian?',
 'Any ideas of simple project using logic gates?',
 'How do I stop terrorism?',
 'How do I bake a cake?',
 'What happens if I drop a lit match immediately into a 5ltr can of paraffin which is half full?',
 'Could someone with a trillion dollars make a nuclear fusion reactor?',
 'How is Twisted tea made?',
 'How do I make Thai curry?',
 'What is the use of baking soda?',
 'How can I make tomato sauce for a chili recipe?',
 'How oil and vinegar is used to make pickle?',
 'What are the ways to increase the pressure of gas?',
 'How do you make a video game?',
 'Which countries have nuclear wea

In [8]:
question_search("My bulb arrived broken as several others", df2, conf=0.30, type=2)

[INFO] Profanity check for sentence: My bulb arrived broken as several others
[INFO] No Profanity word found...
[INFO] Mistakes in input questions: []


Unnamed: 0,Question,Answer
0,My bulb arrived broken as several others menti...,"Mine was a gift to someone, so I never looked ..."
50,Housing only? No working bulb?,housing and bulb included
81,How can I get replacement bulb?,you can check with the local hardware store/so...
64,can one get replacement bulbs/,"Maggie, Yes we offer the replacement tube, sto..."
69,"does this lamp come with the light bulb?, and ...","Lamp uses a regular, standard base bulb."
...,...,...
76,Just hooked up my new Power Bright 600 watt. W...,there is a beep but the only light is the powe...
29,I lost my screws to the end of my telescope to...,My suggestion is to contact them through their...
47,Can I use this bulb in a goose neck student la...,Yes but it is longer than the lamp fixture. So...
59,When I was adjusting the cover to work consist...,I am not sure. Call the manufacturer. I have n...


In [9]:

def main1(input_sentence):
    out = question_search(input_sentence, df1, conf=0.30, type=1)
    out = pd.DataFrame({"Similar_Questions":out})
    return out


app_inputs = gr.inputs.Textbox(lines=2, placeholder="Enter Question Here...")
interface = gr.Interface(fn=main1, inputs=app_inputs, outputs=gr.Dataframe(type="pandas", col_count=1) ,title="FAQ Quora Chatbot")

if __name__ == '__main__':
    interface.launch(share=True,debug=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://491a25ca-c068-4af1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


In [10]:
def main2(input_sentence):
    out = question_search(input_sentence, df2, conf=0.30, type=2)
    return out


app_inputs = gr.inputs.Textbox(lines=2, placeholder="Enter Question Here...")
interface = gr.Interface(fn=main2, inputs=app_inputs, outputs=gr.Dataframe(type="pandas", col_count=2) ,title="FAQ Amazon Chatbot")

if __name__ == '__main__':
    interface.launch(share=True,debug=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://26ab658f-67b3-4df2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


In [None]:
Punctuations
grammer
spelling mistakes
remove bad words
identify greetings
find out the similar words
check topics in user querys
it can also be merge with Existing chatbot or Rasa chatbot


## Creating Embeddings

In [None]:
df_amz = pd.read_csv("DATA/Amazon_QNA.csv")
print("Total Shape: {}".format(df_amz.shape))
SAMPLE = 10000
df_amz = df_amz.sample(n=SAMPLE)
print("Final Shape: {}".format(df_amz.shape))
df_amz = df_amz.sample(n=SAMPLE)
df_amz["QID"] = list(range(1,SAMPLE+1))
df_amz = df_amz[["QID","Question", "Answer"]].reset_index(drop=True)

# create embeddings of all question
all_question_embd = similarity_model.encode(df_amz["Question"].to_list(), 
                                            convert_to_tensor=True, 
                                            device="cuda",
                                            batch_size=128,
                                            show_progress_bar=True)

# assighn embeddings to final dataframe

b = [list(np.array(i))  for i in all_question_embd.cpu()]
df_amz["embd"] = b
# df_amz["embd"] = list(all_question_embd.cpu().numpy())
# df_amz["embd"] = df_amz["embd"].apply(str) 
df_amz.to_csv("DATA/amazon_embeddings.csv", index=False)


Shape: (10000, 8)


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

In [None]:
df_amz.head(10)

Unnamed: 0,QID,Question,Answer,embd
0,1,Do you know if this will work for my HP G6 - 2...,I it's all about the model number versus the p...,"[0.05150938, 0.04544325, 0.018049337, 0.017407..."
1,2,"If the police pull me over, do they consider i...",It IS illegal! You would get a ticket for driv...,"[-0.027042547, 0.035542715, -0.02083596, -0.02..."
2,3,What is the input power on the battery charger?,It is a 110 charger.,"[0.017510118, -0.06630166, 0.040766235, -0.020..."
3,4,My bulb arrived broken as several others menti...,"Mine was a gift to someone, so I never looked ...","[0.042652078, 0.0063836, -0.014847118, 0.01170..."
4,5,Will it fit bear bows?,I can't say if it will fit all Bear bows but I...,"[0.039244145, 0.041788135, -0.0043693245, -0.0..."
5,6,"Chinese, I presume...?",American made. American company. Good steel.. ...,"[0.052161057, -0.0108938245, -0.0015074158, -0..."
6,7,what are the length and width of the filter?,8 x 8 7/8 inches,"[0.00082931283, -0.08775879, -0.018720118, -0...."
7,8,use with other MP3 players: Does this system w...,My wife uses the Sound Dock I got here with he...,"[0.033765167, -0.015236797, -0.0018112531, -0...."
8,9,I have some old video in VCD format that I wan...,I had trouble with this dvd playing dvdrs on i...,"[-0.060723748, 0.021367408, -0.011743658, -0.0..."
9,10,is it safe to use on eye lids?,I personally use this product on my eye lids a...,"[0.0026188593, 0.007344159, 0.0030409908, -0.0..."


In [None]:
df_quora = pd.read_csv("DATA/question_data.csv")
print("Total Shape: {}".format(df_quora.shape))
SAMPLE = 10000
df_quora = df_quora.sample(n=SAMPLE)
print("Final Shape: {}".format(df_quora.shape))
df_quora = df_quora.sample(n=SAMPLE)
df_quora["QID"] = list(range(1,SAMPLE+1))
df_quora = df_quora[["QID","question1"]].reset_index(drop=True)

# create embeddings of all question
all_question_embd_q = similarity_model.encode(df_quora["question1"].to_list(), 
                                            convert_to_tensor=True, 
                                            device="cuda",
                                            batch_size=128,
                                            show_progress_bar=True)

# assighn embeddings to final dataframe

bq = [list(np.array(i))  for i in all_question_embd_q.cpu()]
df_quora["embd"] = bq
# df_amz["embd"] = list(all_question_embd.cpu().numpy())
# df_amz["embd"] = df_amz["embd"].apply(str) 
df_quora.rename(columns = {'question1':'Question'}, inplace = True)
df_quora.to_csv("DATA/quora_embeddings.csv", index=False)

Total Shape: (404290, 6)
Final Shape: (10000, 6)


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

In [None]:
df_quora.head(10)

Unnamed: 0,QID,question1,embd
0,1,Which laptop is best under 25000 INR?,"[-0.03122546, 0.011354784, -0.017733293, -0.00..."
1,2,How much does a new garage door cost?,"[-0.011178708, -0.012601473, 0.023727156, 0.01..."
2,3,"How do you think outside the ""think outside th...","[0.028986493, 0.009261504, -0.01898976, 0.0254..."
3,4,What are the biggest marketing challenges that...,"[0.08148132, -0.027627472, -0.028240815, -0.05..."
4,5,How do I get a lean body quickly?,"[0.029229898, 0.050710864, 0.03379705, 0.00038..."
5,6,What are some little things that make you happy?,"[-0.047119886, -0.0019513975, -0.0100774085, -..."
6,7,(Cannabis) Which is the best grow guide for an...,"[-0.006068481, -0.03449526, 0.0033497058, -0.0..."
7,8,What type of diets can you follow to lose 5 po...,"[0.064898185, 0.073459595, 0.010997311, 0.0121..."
8,9,What is a moon opposition ascendant?,"[0.07191957, -0.06391938, 0.008843898, -0.0103..."
9,10,What is the value of an Instagram account with...,"[0.0014931855, 0.005142003, 0.0051253703, 0.00..."


In [None]:
print()




In [None]:
How many lumens do these bulbs emit per bulb?

Object `bulb` not found.


In [None]:
how can i make bombs