# In [None]:
import json
import re

# Library for TF-IDF and the Vector Space Model (VSM)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Library for Probabilistic Retrieval
from rank_bm25 import BM25Okapi


# Boolean Retrieval (4α)
def BooleanRetrieval(Query, Records, SearchWords, Data, Index=None):
    """Evaluate a boolean keyword query and print the matching articles.

    The query is evaluated left to right:

    * 'and' / 'or' combine the next keyword with the running result. A
      binary operator stays in effect until another one is read.
    * 'not' complements only the keyword that directly follows it and does
      not replace the pending binary operator, so "a and not b" yields the
      records that contain a but not b.
    * Keywords seen before any 'and' / 'or' are OR-ed into the final result.

    Args:
        Query: Sequence of lower-case tokens (keywords and operators).
        Records: Total number of records; ids 0 .. Records - 1 form the
            universe used when complementing for 'not'.
        SearchWords: Raw query text, used only in the printed messages.
        Data: Sequence of records indexed by record id; each record must
            provide 'Title' and 'Url'.
        Index: Optional mapping of keyword -> list of record ids. When it is
            omitted every keyword is looked up through SearchKeyword().

    Returns:
        Sorted list of matching record ids (empty when nothing matched).
    """
    BinaryOperators = ('and', 'or')
    AllRecords = set(range(Records))

    # Each group is a set of record ids; all groups are unioned at the end.
    Groups = []
    Op = ''
    Negate = False

    for Word in Query:
        # 'not' is unary: remember it for the next keyword only, and keep
        # the pending 'and' / 'or' untouched.
        if Word == 'not':
            Negate = True
            continue
        if Word in BinaryOperators:
            Op = Word
            continue

        # Anything else is a keyword: fetch its posting list.
        if Index is None:
            Matches = set(SearchKeyword(Word))
        else:
            Matches = set(Index.get(Word, []))

        if Negate:
            Matches = AllRecords - Matches
            Negate = False

        if Groups and Op == 'and':
            Groups[-1] &= Matches
        elif Groups and Op == 'or':
            Groups[-1] |= Matches
        else:
            # First keyword, or no binary operator seen yet.
            Groups.append(Matches)

    # Sort so the default ranking really follows record (scrape) order
    # instead of arbitrary set iteration order.
    Found = sorted(set().union(*Groups))

    # Display results (default ranking: by first article scraped)
    if not Found:
        print(f"The Keyword '{SearchWords}' couldn't be found in any of our articles.")
    else:
        print(f"The Keyword '{SearchWords}' was found in the Following article(s): ")
        for c, i in enumerate(Found, start=1):
            print(f"{c}. Title: '{Data[i]['Title']}' | URL: [{Data[i]['Url']}] ")
    return Found

# Searching the Inverted index file to locate the searchword
def SearchKeyword(SearchWord):
    """Return the entry stored for SearchWord in the inverted index file.

    The inverted index is read from 'Data3.json' on every call.

    Args:
        SearchWord: Keyword to look up; it must match an index key exactly.

    Returns:
        The value stored under SearchWord, or an empty list when the keyword
        is not present in the index.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open('Data3.json', 'r', encoding='utf-8') as File:
        InvertedIndexes = json.load(File)

    # Direct key lookup instead of scanning every (word, postings) pair.
    return InvertedIndexes.get(SearchWord, [])

# Checking Query and processing Search words
def CheckQuery(SearchWords):
    """Split a raw query string into lower-case word tokens.

    Boolean operators come out as the ordinary tokens 'and', 'or' and 'not',
    because the text is lower-cased before tokenizing. The former upper-case
    'AND|OR|NOT' regex alternatives could never match and were removed; the
    returned tokens are unchanged.

    Args:
        SearchWords: Raw query text typed by the user.

    Returns:
        List of lower-case tokens in query order; punctuation is dropped.
    """
    return re.findall(r'\w+', SearchWords.lower())
# ===========================================================


# Calculate Term Frequency-Inverse Document Frequency (TF-IDF)
def CalculateTFIDF(Data):
    """Fit a TF-IDF vectorizer on the lower-cased 'Content' of every record.

    Args:
        Data: Iterable of records, each providing a 'Content' string.

    Returns:
        A (matrix, vectorizer) pair: the TF-IDF matrix produced by
        fit_transform for the record contents, and the fitted
        TfidfVectorizer so queries can be projected into the same space.
    """
    # Collect the raw texts first, then normalise their case.
    RawTexts = [Record['Content'] for Record in Data]
    LoweredTexts = [Text.lower() for Text in RawTexts]

    # Learn the vocabulary and weights, and vectorize the corpus in one step.
    Vectorizer = TfidfVectorizer()
    Matrix = Vectorizer.fit_transform(LoweredTexts)

    return Matrix, Vectorizer
# ============================


# Vector Space Model (VSM)
def VSM(Data, SearchWords):
    """Rank every record against the query by TF-IDF cosine similarity.

    The TF-IDF model is rebuilt from Data on each call, the query is
    projected into the same vector space, and all records are printed in
    descending order of similarity.

    Args:
        Data: Sequence of records providing 'Content', 'Title' and 'Url'.
        SearchWords: Query text.

    Returns:
        List of (record index, score) tuples, best match first. Records with
        equal scores keep ascending record-index order.
    """
    # Build the document matrix and vectorize the query with the same model.
    TFArray, Vectorizer = CalculateTFIDF(Data)
    QueryVector = Vectorizer.transform([SearchWords])

    # One similarity score per record.
    Scores = cosine_similarity(QueryVector, TFArray).flatten()

    # A single stable sort is enough: sorted() keeps the original (ascending
    # index) order of equal scores even with reverse=True, so ties are
    # deterministic and no preliminary argsort is needed.
    Results = sorted(enumerate(Scores), key=lambda R: R[1], reverse=True)

    # Display all results accordingly
    for c, (i, Score) in enumerate(Results, start=1):
        print(f"{c}. Title: '{Data[i]['Title']}' | Score: {Score:.4f}")
        print(f"   URL: [{Data[i]['Url']}]")

    return Results
# ==========

# Probabilistic Retrieval
def ProbabilisticRetrieval(Data, SearchWords):
    """Rank every record against the query with BM25 (Okapi) and print them.

    Documents and query are tokenized the same way: lower-cased and split on
    any run of whitespace. Splitting on a single space would leave words
    glued together across newlines/tabs and would produce empty tokens for
    repeated spaces.

    Args:
        Data: Sequence of records providing 'Content', 'Title' and 'Url'.
        SearchWords: Query text.

    Returns:
        List of (record index, BM25 score) tuples, best match first; an
        empty list when Data is empty.
    """
    # Nothing to rank. NOTE(review): BM25Okapi presumably cannot be built
    # over an empty corpus (average document length is undefined) - confirm.
    if not Data:
        return []

    # Tokenize all content
    Contents = [Segment['Content'].lower().split() for Segment in Data]

    # Build the BM25 model over the tokenized corpus
    BM25 = BM25Okapi(Contents)

    # Tokenize the query exactly like the documents
    QueryTokens = SearchWords.lower().split()

    # Compute one score per record
    BMScore = BM25.get_scores(QueryTokens)

    # Pair every score with its record index and sort best-first
    Results = sorted(enumerate(BMScore), key=lambda R: R[1], reverse=True)

    # Display all results accordingly
    for c, (i, Score) in enumerate(Results, start=1):
        print(f"{c}. Title: '{Data[i]['Title']}' | BMScore: {Score:.4f}")
        print(f"   URL: [{Data[i]['Url']}]")

    return Results
# ==========

def main():
    """Run the interactive search prompt.

    Loads the article data from 'Data.json' once, then asks for a retrieval
    algorithm and the keyword(s) and dispatches to the chosen algorithm. An
    unknown algorithm choice prints a message and shows the prompts again.

    Returns:
        0 once a search has been executed.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open('Data.json', 'r', encoding='utf-8') as file:
        Data = json.load(file)

    # Retry with a loop rather than recursion: the data file is read only
    # once and repeated invalid input cannot grow the call stack.
    while True:
        print("...Wikipedia Search Engine...")
        print("=============================")
        print("1. Boolean retrieval")
        print("2. Vector Space Model (VSM)")
        print("3. Probabilistic Retrieval")
        Algorithm = input("Choose algorithm to sort the outcomes: ")
        print("=========================================")
        SearchWords = input("Insert keyword(s): ")
        print("================================")

        # Processing the input
        SearchWords = SearchWords.lower()
        Query = CheckQuery(SearchWords)

        # Using the algorithm the user chose
        if Algorithm == "1":
            BooleanRetrieval(Query, len(Data), SearchWords, Data)
        elif Algorithm == "2":
            VSM(Data, SearchWords)
        elif Algorithm == "3":
            ProbabilisticRetrieval(Data, SearchWords)
        else:
            # Ask again if the input is invalid
            print("No such algorithm")
            continue

        return 0

# Start the interactive prompt only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()