In [1]:
import spacy
from contractions import contractions_dict
import re
from multiprocessing import Pool
import json
#TAG_MAP=[".",",","-LRB-","-RRB-","``","\"\"","''",",","$","#","AFX","CC","CD","DT","EX","FW","HYPH","IN","JJ","JJR","JJS","LS","MD","NIL","NN","NNP","NNPS","NNS","PDT","POS","PRP","PRP$","RB","RBR","RBS","RP","SP","SYM","TO","UH","VB","VBD","VBG","VBN","VBP","VBZ","WDT","WP","WP$","WRB","ADD","NFP","GW","XX","BES","HVS","_SP",]
TAG_MAP=["AFX","CC","CD","DT","EX","FW","HYPH","IN","JJ","JJR","JJS","LS","MD","NIL","NN","NNP","NNPS","NNS","PDT","POS","PRP","PRP$","RB","RBR","RBS","RP","SP","SYM","TO","UH","VB","VBD","VBG","VBN","VBP","VBZ","WDT","WP","WP$","WRB","ADD","NFP","GW","XX","BES","HVS","_SP",]
TAG_MAP_LOWER=[tag.lower() for tag in TAG_MAP]
class DataPreProcessing():
    """This class will take a list of documents
    as a parameter and return list of processed
    data."""
    nlp = spacy.load('en_core_web_sm')
    noisy_pos_tags = ['-PRON-','-PRON-']
    

    def __init__(self,list_of_doc,lemmatize=True,expand_contraction=True,remove_special_characters=True):
        self()
        self.__list_of_doc=list_of_doc
        self._remove_special_characters=remove_special_characters
        self._lemmatize=lemmatize
        self._expand_contraction=expand_contraction        
    

    def normalized_corpus(self):
        return self.processData()
    
    
    @staticmethod
    def Noise(token):     
        is_noise = False
        if token.is_stop == True:
            is_noise = True
        elif token.pos_ in DataPreProcessing.noisy_pos_tags:
            is_noise = True 
        elif token.string.strip() in DataPreProcessing.noisy_pos_tags:
            is_noise = True
        return is_noise

    def __call__(self):
        add_List=['']
        remove_List=['not','no']
        for w in add_List:
            DataPreProcessing.nlp.vocab[w].is_stop = True

        for w in remove_List:
            DataPreProcessing.nlp.vocab[w].is_stop = False

    # # Expanding Contractions
    @staticmethod
    def expand_contractions(text,regex,contraction_mapping):
        match_list=regex.findall(text)
        if len(match_list)>0:
            for word in match_list:
                text=re.sub(word,contraction_mapping.get(word),text)
        return text

    # # Removing Special Characters
    @staticmethod
    def remove_special_characters(text):
        text = re.sub('[^a-zA-Z0-9\s]', '', text)
        return text
    
    def processData(self):
        corpus=[]
        contraction_mapping={k.lower(): v.lower() for k, v in contractions_dict.items()}
        ####correcting expansions of some keys
        contraction_mapping["they'd've"]="they would have"
        contraction_mapping["couldn't've"]="could not have"
        contraction_mapping["y’all’d"]="you all would"
        contraction_mapping["we’ll’ve"]="we will have"
        contraction_mapping["i'd've"]="I would have"
        contraction_mapping["wouldn’t’ve"]="would not have"
        contraction_mapping["mustn’t’ve"]="must not have"
        contraction_mapping["won’t’ve"]="will not have"
        contraction_mapping["he'd've"]="he would have"
        contraction_mapping["mightn't've"]="might not have"
        contraction_mapping["y’all’re"]="you all are"
        contraction_mapping["oughtn’t’ve"]="ought not have"
        contraction_mapping["it’ll’ve"]="it will have"
        contraction_mapping["who’ll’ve"]="who will have"
        contraction_mapping["hadn’t’ve"]="had not have"
        contraction_mapping["she’d’ve"]="she would have"
        contraction_mapping["oughtn't've"]="ought not have"
        contraction_mapping["there'd've"]="there would have"
        contraction_mapping["y’all’ve"]="you all have"
        contraction_mapping["mightn’t’ve"]="might not have"
        contraction_mapping["shouldn't've"]="should not have"
        contraction_mapping["how'd'y"]="how did you"
        contraction_mapping["i’ll’ve"]="i will have"
        contraction_mapping["y'all've"]="you all have"
        contraction_mapping["mustn’t’ve"]="must not have"
        contraction_mapping["she’ll’ve"]="she will have"
        contraction_mapping["they’ll’ve"]="they will have"
        contraction_mapping["shouldn’t’ve"]="should not have"
        contraction_mapping["oughtn’t’ve"]="ought not have"
        contraction_mapping["shan't've"]="shall not have"
        contraction_mapping["it’ll’ve"]="it will have"
        contraction_mapping["shan’t’ve"]="shall not have"
        contraction_mapping["who’ll’ve"]="who will have"
        contraction_mapping["hadn’t’ve"]="had not have"
        contraction_mapping["needn’t’ve"]="need not have"
        contraction_mapping["mightn’t’ve"]="might not have"
        contraction_mapping["shouldn't've"]="should not have"
        contraction_mapping["how'd'y"]="how did you"
        contraction_mapping["i’d’ve"]="i would have"
        contraction_mapping["we'll've"]="we will have"
        contraction_mapping["he'll've"]="he will have"
        contraction_mapping["wouldn't've"]="would not have"
        contraction_mapping["we’d’ve"]="we would have"
        contraction_mapping["can't've"]="cannot have"
        contraction_mapping["couldn’t’ve"]="could not have"
        contraction_mapping["i'll've"]="i will have"
        contraction_mapping["what’ll’ve"]="what will have"
        contraction_mapping["y’all’d’ve"]="you all would have"
        contraction_mapping["y'all're"]="you all are"
        contraction_mapping["there’d’ve"]="there would have"
        contraction_mapping["he’d’ve"]="he would have"
        contraction_mapping["you'd've"]="you would have"
        contraction_mapping["there’d’ve"]="there would have"
        contraction_mapping["he’ll’ve"]="he will have"
        contraction_mapping["will've"]="will have"
        contraction_mapping["cannot’ve"]="cannot have"
        contraction_mapping["you will've"]="you will have"
        contraction_mapping["he will’ve"]="he will have"
        contraction_mapping["will not've"]="will not have"
        contraction_mapping["he would’ve"]="he would have"
        contraction_mapping["all’ve"]="all have"
        """
        print("***********************************************************************************************************************************")
        with open("contraction.json","w") as f1:
            json.dump(contraction_mapping,f1,ensure_ascii=True)

        with open("contraction.json","r") as f1:
            abc=json.loads(f1.read())
            print(abc.items())
        print("***********************************************************************************************************************************")
        """
        regex = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL)
        for doc in self.__list_of_doc:
            if self._expand_contraction:
                doc=self.expand_contractions(doc.strip().lower(),regex,contraction_mapping)

            if self._remove_special_characters:
                doc=self.remove_special_characters(doc)
                
            document=DataPreProcessing.nlp(doc.lower())
            if not self._lemmatize:
                #cleaned_list = " ".join([(token.string.strip(),token.tag_) for token in document if not self.Noise(token) and len(token.string.strip())>0])
                corpus.append(cleaned_list)
            else:
                #cleaned_list = " ".join([(token.lemma_.strip(),token.tag_) for token in document if not self.Noise(token) and len(token.string.strip())>0])
                cleaned_list = [[token.lemma_.strip(),token.tag_] for token in document if not self.Noise(token) and len(token.string.strip())>0]
                alst=[]
                #cleaned_list_b=" ".join(alst.append(" ".join(tup)) for tup in cleaned_list)
                for tup in cleaned_list:
                    alst.append(" ".join(tup))
                    #print(alst)
                          
                corpus.append(alst)
                
        corpus_mod=[" ".join(lst) for lst in corpus]
        print(corpus)
        return corpus_mod  

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import pandas as pd

  _nan_object_mask = _nan_object_array != _nan_object_array


In [3]:
import nltk
def getVocab(corpus):
    vocab=[]
    for text in corpus:
        bigrm = nltk.bigrams(text.split())
        b=[" ".join(tup) for tup in bigrm if tup[0] not in TAG_MAP]
        
        vocab.extend(b)
    return set(vocab)

In [7]:
def question(corpus):
    #vocabulary=getVocab(corpus)
    #tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),sublinear_tf=True,vocabulary=vocabulary)
    tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(2,3),sublinear_tf=True)
    tfidf_matrix_train = tv.fit_transform(corpus)
    #vocab=tv.get_feature_names()
    #tfidf_matrix_train=tfidf_matrix_train.toarray()
    #print(tv.vocabulary_)
    #print(type(vocab))
    #print(tv)
    #df=pd.DataFrame(tfidf_matrix_train,columns=vocab)
    #print(df)
    #print(tfidf_matrix_train.shape)
    return tv, tfidf_matrix_train

In [8]:
def learning_DB(file,text):
    import os,json
    from lockfile import LockFile
    if os.path.isfile(file):
        lock = LockFile(file)
        with lock:
            with open(file,"r+") as f1:
                aDict=json.loads(f1.read())
                f1.seek(0)
                aDict['questions'].append(text)
                json.dump(aDict,f1,ensure_ascii=True)
                f1.truncate()
    
    else:
        aDict=dict()
        aDict['questions']=[]
        aDict['questions'].append(text)
        with open(file,"w") as f1:
            json.dump(aDict,f1,ensure_ascii=True)
            
            

In [9]:
def reply(test,tv,tfidf_matrix_train):
    #print(tv)
    #print("b")
    test=(test,)
    tfidf_matrix_test = tv.transform(test)
    cosine = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)
    print(cosine)
    #print(tfidf_matrix_test, tfidf_matrix_train)


    minimum_score=0.3
    #cosine = np.delete(cosine, 0)  #not required
    #print(cosine)
    maxa = cosine.max()
    #print(maxa)
    response_index=-99999
    if (maxa >= minimum_score):
        #print("hello")
        #new_max = maxa - 0.01
        alist = np.where(cosine > minimum_score)[1]
        #alist = np.where(cosine > new_max)
        # print ("number of responses with 0.01 from max = " + str(list[0].size))
        #response_index = random.choice(alist[0])
        
        #print(response_index)
        for index in alist:
            print("**************")
            print(index)
            print(target[index])
            print(data[index])
            print("**************")

        blist=np.where(cosine==maxa)[1]
        print("******MAX b********")
        print(blist)
        print("******MAX b********")
        final_list=[]
        for index in alist:
            final_list.append(target[index])
            
        print("replied")
        return final_list
def main(data):
    from os import getpid
    d=DataPreProcessing(data)
    corpus=d.normalized_corpus()
    #print(corpus)
    return corpus
if __name__=="__main__":
    data=['how to resolve error for Server Load','How to resolve error for No file found','job abended or failed  for script','how to execute a script','efgh ',\
      'Hi']
    target=['run top to idntify the process.kill it ','specify the correct directory','check the logs','execute a sh/source','abcd','Hey Let me know how could I help you']

    
    q=[]
    a=[]
    print(len(a))
    with open('train.json', "r") as sentences_file:
        reader = json.load(sentences_file)
        for item in reader['data']:
            if type(item)==dict:
                for qas in item['paragraphs']:
                    for question in qas['qas']:
                        try:
                            a.append(question['answers'][0]['text'])
                            q.append(question['question'])
                        
                        except:
                            pass
                        break
    q.extend(data)
    a.extend(target)
    print(len(a))
    data=q
    target=a

    del a
    del q
    
    
    corpus_mod=main(data)
    tv, tfidf_matrix_train=question(corpus_mod)
    while True:
        question=input("rudra:" )      
        ques=(question,)
        ques=" ".join(main(ques))
        print("Mod question")
        print(ques)
        response=reply(ques,tv,tfidf_matrix_train)
        if response is None:
            print("I am Sorry!! I am not aware of this")
            learning_DB("learning.json",question)
        else:
            print(response)

0
18888


TypeError: 'dict' object is not callable