In [None]:
# 1

class DocumentRanker():
    '''
    Rank the text documents found under a folder against a user query with BM25.

    On construction the query is lower-cased and tokenized with spaCy, every
    ``.txt`` file below ``folderPath`` is read, lower-cased and tokenized, and a
    gensim BM25 model is built over the tokenized documents.

    Attributes set by __init__:
        documentFolderPath    : the folder that is walked for documents
        query                 : spaCy Doc of the lower-cased query
        folderDocumentContent : list of (file name, list of token strings)
        bm25Model             : BM25 model built over the token lists
    '''

    def __init__(self, query, folderPath):
        '''
        query      : raw query string
        folderPath : root folder, walked recursively for ``.txt`` files
        '''
        self.documentFolderPath = folderPath
        print("folderPath  : ", folderPath)
        self.query = self.convertQuery(query)
        # (file name, tokens) for every document that was loaded
        self.folderDocumentContent = self.loadDocuments()
        self.bm25Model = self.createBM25Model()

    def _getNlp(self):
        '''
        Return the spaCy pipeline, loading it on first use and caching it on the
        instance so the (slow) model load happens once instead of once per caller.
        '''
        nlp = getattr(self, "_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser', 'textcat'])
            self._nlp = nlp
        return nlp

    def convertQuery(self, query):
        '''
        Return the spaCy Doc of the lower-cased query string.
        '''
        return self._getNlp()(query.lower())

    def loadDocuments(self):
        '''
        Load every ``.txt`` file below self.documentFolderPath.

        Returns a list of (file name, tokens) tuples where tokens is the list of
        token texts of the lower-cased file content. Only the bare file name is
        stored, not the path.
        '''
        nlp = self._getNlp()
        documentTuple = []

        for root, _dirs, files in os.walk(self.documentFolderPath):
            for file in files:
                # Match the extension, not just any name that happens to end in "txt"
                if not file.endswith('.txt'):
                    continue

                filePath = os.path.abspath(os.path.join(root, file))
                with open(filePath, 'r', encoding="utf8") as f:
                    content = f.read()

                # Tokenize after the file has been closed
                tokenizedContent = [token.text for token in nlp(content.lower())]
                documentTuple.append((file, tokenizedContent))

        return documentTuple

    def getNouns(self):
        '''
        Return the texts of the query tokens tagged NOUN, PROPN or ADJ.
        The list is also printed.
        '''
        nounList = [token.text for token in self.query
                    if token.pos_ in ("NOUN", "PROPN", "ADJ")]
        print("noun list : ", nounList)
        return nounList

    def createBM25Model(self):
        '''
        Return a created BM25 model.
        '''
        # NOTE: gensim.summarization was removed in gensim 4.0, so this needs gensim < 4
        return gensim.summarization.bm25.BM25([document[1] for document in self.folderDocumentContent])

    def rankDocuments(self):
        '''
        Score every loaded document against the query.

        Returns a list of (file name, score) tuples in the order the documents
        were loaded (i.e. NOT sorted by score).
        '''
        query = [token.text for token in self.query]
        scores = self.bm25Model.get_scores(query)
        return [(document[0], score)
                for document, score in zip(self.folderDocumentContent, scores)]

    def returnTopK(self, k=5):
        '''
        Return the k highest scored (file name, score) tuples, best first.
        '''
        return sorted(self.rankDocuments(), key=lambda x: x[1], reverse=True)[:k]

    def returnTopDocuments(self):
        '''
        Return the (file name, score) tuples whose score is strictly more than one
        standard deviation above the mean score of the corpus, best first.

        Returns an empty list when there are no documents. Because the comparison
        is strict, nothing is returned either when all scores are equal.
        '''
        scoreTup = self.rankDocuments()
        if not scoreTup:
            # np.mean / np.std of an empty array only yield NaN and a RuntimeWarning
            return []

        scores = np.array([score for _, score in scoreTup])
        maxThreshold = np.mean(scores) + np.std(scores)
        topScores = [entry for entry in scoreTup if entry[1] > maxThreshold]

        return sorted(topScores, key=lambda x: x[1], reverse=True)

    def returnTopDocumentsData(self):
        '''
        Return the (file name, tokens) tuples of the documents selected by
        returnTopDocuments, in the order the documents were loaded.
        Documents are matched by file name.
        '''
        topDocumentScores = self.returnTopDocuments()
        print("topDocumentScores  : ", topDocumentScores)
        # A set keeps the membership test constant time per document
        topDocumentNames = {name for name, _ in topDocumentScores}

        return [document for document in self.folderDocumentContent
                if document[0] in topDocumentNames]

In [None]:
#2 

class PassageRanker():
    '''
    Split the top ranked documents into paragraphs and keep the paragraphs that
    contain every noun/adjective of the query.

    topDocumentContent is a list of (file name, list of token strings) tuples.
    Paragraphs inside a token list are delimited by the paragraphSeparator token.
    '''

    # Token that marks the end of a paragraph inside a tokenized document
    paragraphSeparator = "--------------------------"

    def __init__(self, query, topDocumentContent):
        '''
        query              : raw query string
        topDocumentContent : list of (file name, tokens) tuples to take passages from
        '''
        self.query = self.convertQuery(query)
        self.topDocumentContent = topDocumentContent

    def convertQuery(self, query):
        '''
        Return the spaCy Doc of the lower-cased query string.
        '''
        nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser', 'textcat'])
        return nlp(query.lower())

    def createBM25Model(self, documentContent):
        '''
        Return a created BM25 model.
        '''
        return gensim.summarization.bm25.BM25(documentContent)

    def getNouns(self):
        '''
        Return the texts of the query tokens tagged NOUN, PROPN or ADJ.
        '''
        return [token.text for token in self.query
                if token.pos_ in ("NOUN", "PROPN", "ADJ")]

    def checkNounMatch(self, paragraph, nounList):
        '''
        Return True when every entry of nounList occurs in the paragraph.

        The paragraph tokens are joined with single spaces and each noun is
        looked up as a plain substring of that text. An empty nounList matches
        every paragraph.
        '''
        # Build the text once; it does not depend on the noun being checked
        paragraphContent = " ".join(paragraph)
        return all(noun in paragraphContent for noun in nounList)

    def returnParagraphList(self, content):
        '''
        Return the paragraphs of a tokenized document that match all query nouns,
        as a list of token lists.

        A paragraph is the run of tokens before each separator token; tokens after
        the last separator are not returned.
        '''
        paragraphList = []
        paragraphStart = 0
        nounList = self.getNouns()

        for i, term in enumerate(content):
            if term != self.paragraphSeparator:
                continue

            paragraph = content[paragraphStart:i]
            if self.checkNounMatch(paragraph, nounList):
                paragraphList.append(paragraph)

            # The next paragraph starts right after the separator
            paragraphStart = i + 1

        return paragraphList

    def returnParagraphScores(self, bm25Model, query):
        '''
        Return the score of a paragraph given a model.
        '''
        return bm25Model.get_scores(query)

    def returnTopPassages(self, k=10):
        '''
        Collect the matching paragraphs of all top documents.

        Returns a list of (paragraph tokens, score) tuples, also stored in
        self.topParagraphScores. BM25 scoring of passages is currently disabled:
        every paragraph gets the score 1 and k is not applied, so ALL matching
        paragraphs are returned in document order.
        '''
        paragraphLoL = []
        for document in self.topDocumentContent:
            paragraphLoL.extend(self.returnParagraphList(document[1]))

        print(len(paragraphLoL))

        paragraphScoreTup = [(paragraph, 1) for paragraph in paragraphLoL]

        print(" length of paragraphScoreTup : ", len(paragraphScoreTup))

        # Stable sort: with the constant score the original order is preserved
        self.topParagraphScores = sorted(paragraphScoreTup, key=lambda x: x[1], reverse=True)

        return self.topParagraphScores

In [None]:
class bertQAModel():
    '''
    Question answering wrapper around a pickled BERT tokenizer and QA model.
    '''

    # Token id used to locate the end of the question segment.
    # NOTE(review): presumably the id of [SEP] in the pickled tokenizer's
    # vocabulary -- confirm against the tokenizer actually shipped in ./models
    sepTokenId = 102

    def __init__(self):

        self.tokenizerModel, self.bertQAModel = self.__initializeModel()

    def __initializeModel(self):
        '''
        Initialize the BERT tokenizer and the QA model. Note that this is currently compatible with
        transformers module, NOT pytorch/tensorflow

        Returns the tuple (tokenizerModel, bertQAModel) unpickled from ./models.
        '''
        # SECURITY: pickle.load executes arbitrary code from the file; only use
        # model files that come from a trusted source.
        with open("./models/bertTokenizer.pkl", "rb") as f:
            tokenizerModel = pickle.load(f)

        with open("./models/bertQAModel.pkl", "rb") as f:
            bertQAModel = pickle.load(f)

        return tokenizerModel, bertQAModel

    def _getQueryNLP(self):
        '''
        Return the spaCy pipeline used for questions, loading it on first use and
        caching it on the instance so predict() does not reload the model per call.
        '''
        queryNLP = getattr(self, "_queryNLP", None)
        if queryNLP is None:
            queryNLP = spacy.load("en_core_web_sm", disable=['ner', 'parser', 'textcat'])
            self._queryNLP = queryNLP
        return queryNLP

    def stringProcess(self, answer):
        '''
        Glue word pieces back together by removing every " ##" from the answer.
        '''
        return answer.replace(" ##", "")

    def preprocessQuery(self, query):
        '''
        Tokenize the query with spaCy, drop the tokens "a", "an" and "the"
        (exact, case-sensitive match) and return the rest joined by single spaces.
        '''
        doc = self._getQueryNLP()(query)
        queryTokens = [token.text for token in doc if token.text not in ["a", "an", "the"]]
        return " ".join(queryTokens)

    def predict(self, text, question):
        '''
        Predict the answer given a passage and a question.

        Returns a tuple (answer, startScore): answer is the text between the
        highest scored start and end positions (empty when the end position lies
        before the start position) and startScore is the start score at the
        chosen start position.

        Raises ValueError when the encoded input does not contain sepTokenId.
        '''
        question = self.preprocessQuery(question)
        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"

        input_ids = self.tokenizerModel.encode(input_text)

        # Segment 0 covers everything up to and including the first separator,
        # segment 1 the rest. Look the separator up once, not once per token.
        firstSepIndex = input_ids.index(self.sepTokenId)
        token_type_ids = [0 if i <= firstSepIndex else 1 for i in range(len(input_ids))]

        # Inference only: no need to record the autograd graph
        with torch.no_grad():
            start_scores, end_scores = self.bertQAModel(
                torch.tensor([input_ids]),
                token_type_ids=torch.tensor([token_type_ids]),
            )

        startScoreInd = int(torch.argmax(start_scores))
        endScoreInd = int(torch.argmax(end_scores))

        all_tokens = self.tokenizerModel.convert_ids_to_tokens(input_ids)
        answer = ' '.join(all_tokens[startScoreInd:endScoreInd + 1])

        startScoreMax = start_scores.detach().numpy().flatten()[startScoreInd]
        return (self.stringProcess(answer), startScoreMax)

if __name__ == "__main__":
    # Nothing runs when the module is executed directly.
    # Disabled example, kept as a usage hint:
    #
    #   obj = bertQAModel()
    #   obj.predict("Narendar Modi said that there is a lack of good employment opportunities in India","What did Narendar Modi say?")
    pass