In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install fast-autocomplete


In [None]:
!pip install sentence-transformers

In [None]:
pip install stanfordnlp

In [None]:
pip install editdistance

In [None]:
pip install lupyne

In [None]:
pip install pylucene4

In [None]:
import requests
import random
import editdistance
import stanfordnlp as st
import spacy 
from fast_autocomplete import AutoComplete
from fast_autocomplete import autocomplete_factory
import json

import re
from nltk.corpus import stopwords
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD

from tqdm import tqdm
tqdm.pandas()

In [None]:
st.download('en')

# 1. Importing Files

In [None]:
queries_df = pd.read_csv('../input/ms-marco-queries/msmarco-doctrain-queries.tsv', sep = '\t', names = ['qid','query'])
queries_df = queries_df.set_index('qid')
display(queries_df.head(10))
display(queries_df.tail(10))
print(len(queries_df.index))

In [None]:
print(os.listdir('./')) # This will print the content of current directory
print(os.listdir('../input')) # This will print the content of input directory

In [None]:
# Install and import relevant libraries
!python -m easy_install ../input/compiledlucene/bk/lucene-8.1.1-py3.6-linux-x86_64.egg
!cp -r ../input/compiledlucene/bk/JCC-3.7-py3.6-linux-x86_64.egg /opt/conda/lib/python3.6/site-packages/
import sys
sys.path
sys.path.append('/opt/conda/lib/python3.6/site-packages/JCC-3.7-py3.6-linux-x86_64.egg')
sys.path.append('/opt/conda/lib/python3.6/site-packages/lucene-8.1.1-py3.6-linux-x86_64.egg')

In [None]:
import lucene

# 2. EDA

In [None]:
# Let's first check for any null rows

null_rows = queries_df[queries_df.isnull().any(axis = 1)]
display(null_rows)

# Cool, there aren't any

In [None]:
# Now let's try to find the average length of a query

def return_length(n):
    """Return the length of ``n``; for a query string that is its character count."""
    length = len(n)
    return length

total_len = np.sum(queries_df['query'].apply(return_length))
average_query_length = total_len/len(queries_df)
print(total_len)
print(average_query_length)

# 33 characters seems reasonable.

In [None]:
# Average number of word counts in the query

def number_of_words(n):
    """
    Count the whitespace-separated words in a query string.

    Args:
        n (str) -> The query text

    Returns:
        (int) -> Number of words; 0 for an empty or whitespace-only string
    """
    # split() without an argument collapses runs of whitespace, so repeated,
    # leading or trailing spaces are not counted as empty "words"
    # (n.split(' ') would report 3 words for 'a  b' and 1 word for '').
    return len(n.split())

total_word_count = np.sum(queries_df['query'].apply(number_of_words))
average_word_count = total_word_count/len(queries_df)
print(total_word_count)
print(average_word_count)

# An average word count of 6 seems about right for a query

In [None]:
# Let's now create a corpus of the 2.1M words used in the queries

def build_vocab(sentences, verbose =  True):
    """
    Count how often each word occurs across a collection of sentences.

    Args:
        sentences (iterable of str) -> The texts to split into words
        verbose (bool) -> Whether to show a tqdm progress bar while counting

    Returns:
        vocab (dict) -> Maps each word to its number of occurrences
    """
    vocab = {}
    # Only wrap the input in tqdm when a progress bar is actually wanted.
    iterator = tqdm(sentences) if verbose else sentences
    for sentence in iterator:
        # split() without an argument never yields '' for repeated, leading or
        # trailing spaces, so the empty string does not end up in the vocab.
        for word in sentence.split():
            vocab[word] = vocab.get(word, 0) + 1
    return vocab

sentences = queries_df['query'].values
# print(sentences)
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

# As we can see, there are some tokens that are misspelled. I'll have to handle that later

In [None]:
# Let's now sort the vocab by frequency; that way we can spot (and later remove) the misspelled words

sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

# Let's first see the top 5 most common words in the text, and their counts
print(sorted_vocab[:5])

# Let's now have a look at the last 50 terms in the sorted list (they will most probably be misspellings)
print(sorted_vocab[-50:])


# WOAH!! This is surprising: the last items of the sorted list aren't actually misspellings; many of them are just words ending with a question mark or bracket. I am going to leave them be for now.

# Phase 0. Evaluation Metrics *(& Supporting Functions)*

In [None]:
def query_sampler(df, percentage_of_samples = 0.9, random_state = None):
    """ 
    This function draws a random sample of rows (without replacement) from the original dataset
    
    Args:
        df (dataframe) -> The original dataframe
        percentage_of_samples (float) -> Fraction of rows to keep. Between 0 and 1
        random_state (int or None) -> Optional seed. When given, the same rows are
            drawn on every call; when None, the global `random` module state is used.
        
    Returns:
        Sampled set of queries (the selected rows of df, in sampled order).

    Raises:
        ValueError -> If percentage_of_samples is not between 0 and 1
    
    """
    if not 0 <= percentage_of_samples <= 1:
        raise ValueError("percentage_of_samples must be between 0 and 1")
    l = len(df)
    number_of_samples = int(l*percentage_of_samples)
    print("Number of instances being sampled is", number_of_samples)
    # A private Random instance keeps a seeded call from disturbing the global RNG state.
    rng = random if random_state is None else random.Random(random_state)
    sampled_positions = rng.sample(range(l), number_of_samples)
    return df.iloc[sampled_positions]


In [None]:
def break_the_query(query_string):
    """
    This function breaks the query down on character level, simulating a user typing it one character at a time.
    
    Args:
        query_string(str) -> Original query string
        
    Returns:
        list_of_strings(iterable) -> Every non-empty prefix of the query, shortest first,
            ending with the complete query. An empty query gives an empty list.
    """
    # The upper bound is len + 1 so that no prefix is dropped: the previous
    # range(1, l-1) silently skipped the two longest prefixes.
    return [query_string[:end] for end in range(1, len(query_string) + 1)]
    

In [None]:
def train_test_split(df, train_percentage = 0.9, random_state = None):
    """ 
    This function randomly splits the dataframe into a training part and a testing part
    
    Args:
        df (dataframe) -> The original dataframe
        train_percentage (float) -> Represents the percentage of training examples. Between 0 and 1
        random_state (int or None) -> Optional seed. When given, the same split is
            produced on every call; when None, the global `random` module state is used.
        
    Returns:
        Training and testing data (training rows in sampled order, testing rows in original order)
    
    """
    l = len(df)
    number_of_training_samples = int(l*train_percentage)
    print("Number of instances being sampled for training data is", number_of_training_samples)
    rng = random if random_state is None else random.Random(random_state)
    train_List = rng.sample(range(l), number_of_training_samples)
    # Membership tests against a set are O(1); testing against the list made
    # this split quadratic in the number of rows.
    train_positions = set(train_List)
    test_List = [i for i in range(l) if i not in train_positions]
    return(df.iloc[train_List], df.iloc[test_List])


#### Distance Scores

In [None]:
def distance_score(str1, str2):
    """
    Compute the Levenshtein (edit) distance between two strings by delegating to the 'editdistance' library.
    
    Args:
        str1(string) -> First string
        str2(string) -> Second string
        
    Returns:
        The value returned by editdistance.eval for the two strings
    """
    return editdistance.eval(str1, str2)

def best_distance_on_query(model, query_string):
    """
    This function returns the best distance score from all the suggestions made for a query.
    
    Args:
        model (object) -> Trained model object exposing getAutoSuggestions(query_string)
        query_string (string) -> The query string
        
    Returns:
        Best (smallest) Levenshtein distance score of all the suggestions made by the model.
        When the model offers no suggestions, the fallback value len(query_string) * 5 is returned;
        the result is never larger than that fallback.
    """
    query_suggestions = model.getAutoSuggestions(query_string)
    least_distance = len(query_string) * 5
    # Trie.getAutoSuggestions signals "nothing to suggest" with the integers 0 / -1
    # instead of a list; iterating over those would raise a TypeError.
    if isinstance(query_suggestions, int):
        return least_distance
    for query_suggestion in query_suggestions:
        least_distance = min(least_distance, distance_score(query_suggestion, query_string))
    return least_distance

#### Relevance Scores

I will evaluate Relevance score by evaluating the cosine distance between word embeddings of the query suggestions and the original query.

In [None]:
def preprocess(raw_text):
    """
    Reduce a raw text to its distinct, lower-cased, non-stopword words.

    Args:
        raw_text (string) -> The text to clean

    Returns:
        preprocessed_words (list) -> Unique words in order of first appearance
    """
    # keep only letters; every other character becomes a separator
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))

    # dict.fromkeys drops duplicates while keeping first-occurrence order.
    # list(set(...)) returned the words in an arbitrary order that can change
    # from run to run, and that text is what gets handed to the sentence encoder.
    preprocessed_words = list(dict.fromkeys(w for w in words if w not in stopword_set))

    return preprocessed_words

def bert_cosine_distance_on_sentences(vector_1, s2):
    """
    Cosine distance between an already-encoded sentence and a raw sentence.

    Args:
        vector_1 -> Embedding of the first sentence (as produced by model_5.encode)
        s2 (string) -> Second sentence; it is cleaned with preprocess() and then encoded with model_5

    Returns:
        cosine -> scipy cosine distance between the two embeddings
    """
    # Rebuild the second sentence from its cleaned words, separated by single spaces.
    cleaned_sentence = ' '.join(preprocess(s2))
    vector_2 = model_5.encode(cleaned_sentence)
    return scipy.spatial.distance.cosine(vector_1, vector_2)

def evaluate_bert_cosine_distance_on_query_suggestions(query_string, query_suggestions):
    """
    Compute the BERT cosine distance between the actual query and each suggestion made by the model.
    
    Args:
        query_string (string) -> The original query string; encoded once with model_5
        query_suggestions (list) -> List of strings containing the suggestions.
        
    Returns:
        cosine_distance (numpy array) -> One cosine distance per suggestion, in the order of query_suggestions
    
    """
    original_query_vector = model_5.encode(query_string)
    distances = [
        bert_cosine_distance_on_sentences(original_query_vector, query_suggestions[position])
        for position in range(len(query_suggestions))
    ]
    return np.array(distances)
    

In [None]:
model_5 = SentenceTransformer('bert-base-nli-mean-tokens')

# Phase 1. AutoSuggest Methods

## Part 1: Dictionary Based Approach

#### Preparing Data
Ideally the dictionary-based approach (without analyzers) shouldn't include words in the build dictionary, but I am gonna try it out as well. 
So, the data would be prepared for TRIE in two batches:
1. Simple queries data.
2. Simple queries data, along with all the words used in that queries data.

In [None]:
# This cell contains the point 1 of the two data forms described above.

#First I am gonna create a sample of ~36000 queries
queries_df_1 = query_sampler(queries_df, percentage_of_samples = 0.1)

# Next up, I'll remove the qid from the index
queries_df_1.reset_index(drop = True, inplace = True)

# Next I'll divide the queries data into train and test set. With 95% training data
train_queries_df_1, test_queries_df_1 = train_test_split(queries_df_1, train_percentage = 0.95)
train_queries_data_1 = train_queries_df_1['query'].values
test_queries_data_1 = test_queries_df_1['query'].values
print(train_queries_data_1)
print(len(train_queries_data_1))
#print(queries_data_1)
#print(len(queries_data_1))

In [None]:
# This cell contains data created by definition of point 2 of the two data forms described above.

# First I am gonna create a sample of ~36000 queries, dropping the qid index
queries_df_2 = query_sampler(queries_df, percentage_of_samples = 0.1).reset_index(drop = True)

# Next up I am gonna add all the words in these queries separately as well
sentences = queries_df_2['query'].values
tokens = []
for sentence in tqdm(sentences):
    for word in sentence.split(' '):
        # Special characters are stripped from the words before putting them into the vocab.
        # Numerics are kept because many search queries might require numbers like WC20 or XL5 etc
        token = ''.join(character for character in word if character.isalnum())
        # A word made only of special characters leaves nothing behind; skip it
        # so the empty string is not added as a query.
        if token:
            tokens.append(token)

# Sorted so that the appended rows come out in the same order on every run.
vocab = sorted(set(tokens))

# Append all words in one concat: DataFrame.append inside a loop copies the
# whole frame on every iteration and no longer exists in pandas 2.x.
queries_df_2 = pd.concat([queries_df_2, pd.DataFrame({'query': vocab})], ignore_index=True)
    
display(queries_df_2.tail(10))
display(len(queries_df_2))

# Next I'll divide the queries data into train and test set. With 95% training data
train_queries_df_2, test_queries_df_2 = train_test_split(queries_df_2, train_percentage = 0.95)
train_queries_data_2 = train_queries_df_2['query'].values
test_queries_data_2 = test_queries_df_2['query'].values


#### Model Creation: TRIE

In [None]:
class TrieNode(): 
    """One node of the trie, i.e. one character position within the stored keys."""

    def __init__(self): 
        # Maps the next character to the child TrieNode reached through it.
        self.children = {} 
        # True when a stored key ends exactly at this node.
        self.last = False
  

class Trie(): 
    """
    Character-level prefix tree used for dictionary-based auto-suggestion.

    Keys are inserted character by character; the suggestions for a prefix are
    all stored keys that start with it, visited depth-first in insertion order.
    """

    def __init__(self): 
        # Initialising the trie structure. 
        self.root = TrieNode() 
        # Suggestions collected by the most recent lookup.
        self.word_list = []
        # Number of suggestions collected so far by a limited lookup.
        self.count = 0
  
    def formTrie(self, keys): 
        """Insert every key of the iterable `keys`, extending the existing structure as required."""
        for key in keys: 
            self.insert(key)
  
    def insert(self, key): 
        """Insert `key` if it is not stored yet; if its path already exists, just mark its end node."""
        node = self.root 
        for a in key: 
            child = node.children.get(a)
            if child is None:
                child = node.children[a] = TrieNode()
            node = child
        node.last = True
  
    def _find_node(self, key):
        """Return the node reached by following `key` from the root, or None if the path breaks off."""
        node = self.root
        for a in key:
            node = node.children.get(a)
            if node is None:
                return None
        return node

    def search(self, key): 
        """Return True when `key` is stored as a complete key (full match), otherwise False."""
        node = self._find_node(key)
        return node is not None and node.last
  
    def suggestionsRec(self, node, word): 
        """Append to self.word_list every stored key in the subtree of `node`; `word` is the text leading to `node`."""
        if node.last: 
            self.word_list.append(word) 
  
        for a,n in node.children.items(): 
            self.suggestionsRec(n, word + a)
            
    def limitedSuggestionsRec(self, node, word, no_of_suggestions): 
        """
        Like suggestionsRec, but stops once `no_of_suggestions` keys have been collected.

        self.count tracks how many keys were collected; the caller resets it to 0 first.
        """
        if self.count >= no_of_suggestions:
            return
        if node.last:
            self.count = self.count + 1
            self.word_list.append(word)

        for a,n in node.children.items():
            # Stop descending as soon as the limit is reached.
            if self.count >= no_of_suggestions:
                break
            # Recurse into the *limited* traversal so the limit also applies
            # below the first node (it used to fall back to suggestionsRec).
            self.limitedSuggestionsRec(n, word + a, no_of_suggestions)

    def _lookup(self, key, no_of_suggestions):
        """
        Shared lookup behind printAutoSuggestions / getAutoSuggestions.

        Fills self.word_list with the suggestions for `key` and returns a status:
            0  -> no stored key starts with `key`
            -1 -> `key` itself is stored and nothing extends it
            1  -> suggestions were collected into self.word_list
        """
        # Start from a fresh list on every lookup; otherwise the suggestions of
        # earlier queries leak into the results of later ones.
        self.word_list = []
        self.count = 0

        node = self._find_node(key)
        if node is None:
            return 0
        if node.last and not node.children:
            return -1

        prefix = ''.join(key)
        if no_of_suggestions > 0:
            self.limitedSuggestionsRec(node, prefix, no_of_suggestions)
        else:
            self.suggestionsRec(node, prefix)
        return 1
    
    def printAutoSuggestions(self, key, no_of_suggestions = 0): 
        """
        Print all stored keys whose prefix is `key`, one per line.

        Args:
            key (string) -> The prefix typed so far
            no_of_suggestions (int) -> Maximum number of suggestions; 0 (default) means no limit

        Returns:
            0 if no stored key has this prefix, -1 if `key` is stored and has no
            extensions, 1 if suggestions were printed.
        """
        status = self._lookup(key, no_of_suggestions)
        if status == 1:
            for s in self.word_list: 
                print(s) 
        return status
    
    def getAutoSuggestions(self, key, no_of_suggestions = 0): 
        """
        Return all stored keys whose prefix is `key`.

        Args:
            key (string) -> The prefix typed so far
            no_of_suggestions (int) -> Maximum number of suggestions; 0 (default) means no limit

        Returns:
            The list of suggestions, or the integer 0 if no stored key has this
            prefix, or -1 if `key` is stored and has no extensions.
        """
        status = self._lookup(key, no_of_suggestions)
        if status != 1:
            return status
        return self.word_list

#### Model Training
We are gonna create two TRIE structures (one for each kind of data)

In [None]:
model_1 = Trie()
model_1.formTrie(train_queries_data_1)

In [None]:
model_2 = Trie()
model_2.formTrie(train_queries_data_2)

#### Model Testing
We are gonna test each of the two models trained above using 3 metrics of evaluation:
1. Intuition Based (Will analyze the relevance of the search suggestions manually)
2. Distance Based (Will produce the overall score of test set using Levenshtein Distance)
3. Relevance Based (Will use semantic textual similarity to determine relevance of each of the suggestions).

##### Model 1:

1. **Intuition Based**

In [None]:
# I am first gonna define the three strings I'll be using for Intuition based testing
string1 = 'What building method might use a balloon frame?'
string2 = 'What is causing rash on arms'
string3 = 'What causes your glands on the top of your throat to swell'

# Next up, I am gonna create a list of sub strings from these queries
list_string1 = break_the_query(string1)
list_string2 = break_the_query(string2)
list_string3 = break_the_query(string3)

# For now only the complete first query is looked up and its suggestions printed.
# (A stray top-level `break` used to follow this block; `break` outside a loop is
# a SyntaxError, so the whole cell could not run.)
comp = model_1.printAutoSuggestions(string1) 
if comp == -1: 
    print("No other strings found with this prefix\n") 
elif comp == 0: 
    print("No string found with this prefix\n") 


2. **Distance Scoring**

3. **Relevance Scoring**

## Part 2: Analyzer Based Approach

#### Data Preparation
I am gonna prepare the data with the following steps:
1. Tokenize the data
2. Create a Dictionary of Words, and their contexts.

In [None]:
#First I am gonna create a sample of ~36000 queries
queries_df_3 = query_sampler(queries_df, percentage_of_samples = 0.1)

# Next up, I'll remove the qid from the index
queries_df_3.reset_index(drop = True, inplace = True)
display(queries_df_3.head(10))

In [None]:
# The problem with this data is that it cannot be used for NER tasks as it is, that is, it is not capitalized. And therefore, will not be able to identify NER tags properly.
# To resolve this problem, we will perform manual True Casing by using POS tagger of StanfordNLP, and then the results thus produced would be fed into SpaCy NER model.

# Initialize the StanfordNLP pipeline, and instantiate the spacy english models
stf_nlp = st.Pipeline(processors='tokenize,mwt,pos')
spacy_nlp = spacy.load('en_core_web_sm')

In [None]:
class Custom_Analyzer:
    """
    This class analyses the query data (true-casing + NER) and builds a word dictionary in the form
    this notebook feeds to the fast-autocomplete library.
    
    Every entry of the dictionary has the shape
        {word_or_phrase: [context_dict, display_value, count]}
    where context_dict may hold "priorone" (the preceding word), "priortwo" and "type" (the NER label).
    
    Args:
        df (DataFrame) -> Contains all the queries in a 'query' column
        
    Returns:
        words (dictionary) -> Built by get_words_dictionary()
    
    """
    def __init__(self, df):
        self.words = {}
        self.df = df

    @staticmethod
    def _word_spans(text):
        '''
        Returns one [start_char, end_char, word_number] triple per space-separated word of `text`.
        end_char is inclusive, word_number is 1-based. Words are taken from text.split(' '),
        so the numbering lines up with the word list used in get_words_dictionary.
        '''
        spans = []
        start = 0
        for word_number, word in enumerate(text.split(' '), start = 1):
            end = start + len(word) - 1
            spans.append([start, end, word_number])
            start = end + 2          # skip the separating space
        return spans

    @staticmethod
    def _entities_to_word_positions(ner_values, spans):
        '''
        Converts the character offsets of recognized entities into 1-based word numbers.
        A position that does not fall on a word boundary is reported as 0.
        '''
        first_word_at = {start: word_number for start, _, word_number in spans}
        last_word_at = {end + 1: word_number for _, end, word_number in spans}
        return [
            [text, first_word_at.get(start_char, 0), last_word_at.get(end_char, 0), label]
            for text, start_char, end_char, label in ner_values
        ]
        
    def get_entities(self, query_string):
        '''
        This function takes in the query string and returns all the Named Entities present in the caseless string being passed to it.
        It relies on the module-level `stf_nlp` (StanfordNLP pipeline) and `spacy_nlp` (spaCy model) objects.
        
        Args:
            query_string (string) -> A single query of the user
            
        Returns:
            ner_word_values (list) -> One [entity_text, first_word, last_word, label] entry per identified entity.
                first_word / last_word are 1-based word numbers (not characters); 0 means the position could not be mapped.
        
        '''
        # Here, I'll first perform Truecasing using StanfordNLP, and then use NER model from Spacy
        doc = stf_nlp(query_string)
        # NOTE(review): "NNS" looks like a treebank (xpos) tag rather than a upos value - confirm whether it can ever match.
        truecased_list = [w.text.capitalize() if w.upos in ["PROPN","NNS"] else w.text for sent in doc.sentences for w in sent.words]
        truecased_query = ' '.join(str(word) for word in truecased_list)
        doc = spacy_nlp(truecased_query)
        ner_values = [[ent.text, ent.start_char, ent.end_char, ent.label_] for ent in doc.ents]
        
        # Map the character positions of the recognized entities to word positions.
        # The final word is part of the mapping, so entities that end on the last
        # word of the query get a proper end position instead of 0.
        # NOTE(review): the offsets come from truecased_query but are matched against query_string;
        # they only agree when the tokenizer leaves the spacing unchanged - confirm.
        spans = self._word_spans(query_string)
        return self._entities_to_word_positions(ner_values, spans)

    def add_new_word_to_dictionary(self, current_word, prior1_word = None, prior2_word = None, ner_context = None):
        '''
        This function adds a new word/phrase to the words dictionary (overwriting any existing entry for it)
        
        Args:
            current_word (string) -> A token (word/phrase) to be added to the dictionary
            ner_context (string) -> Describes the type of entity, if there is one
            prior1_word (string) -> The word immediately prior to the current one
            prior2_word (string) -> The word before the prior1_word
        
        '''
        context = {}
        if prior1_word is not None:
            context["priorone"] = prior1_word
        if prior2_word is not None:
            context["priortwo"] = prior2_word
        if ner_context is not None:
            context["type"] = ner_context
        # [context, display value, count]; a word is counted once when it is first added
        self.words[current_word] = [context, current_word, 1]
    
    def check_whether_in_dictionary(self, current_word, prior1_word = None, prior2_word = None, ner_context = None):
        '''
        This function checks whether a word, along with its context, exists in the dictionary.
        Whenever it answers 2, the count of the word is increased by one as a side effect.
        
        Args:
            current_word (string) -> The string to be checked for within the words dictionary
            ner_context (string) -> Describes the type of entity, if there is one; context is ignored for entities
            prior1_word (string) -> The word immediately prior to the current one
            prior2_word (string) -> The word before the prior1_word            
        Returns:
            is_in (int) -> 0 for not in dictionary, 1 for in dictionary but different context, 2 for in dictionary same context
        
        '''
        if current_word not in self.words:
            return 0

        # Ignore context for recognized entities
        if ner_context is not None:
            self.words[current_word][2] += 1
            return 2

        # But if the word is not a recognized entity, then the context is considered.
        existing_context = self.words[current_word][0]
        new_word_context = {}
        if prior1_word is not None:
            new_word_context["priorone"] = prior1_word
        if prior2_word is not None:
            new_word_context["priortwo"] = prior2_word

        # Only the context keys present on both sides are compared
        same = all(
            new_word_context[key] == existing_context[key]
            for key in existing_context
            if key in new_word_context
        )
        if same:
            self.words[current_word][2] += 1           # Increasing the count of the word by one
            return 2
        return 1

    def _register_word(self, current_word, prior1_word, ner_context, query_words, j):
        '''
        Adds (or counts) one word of a query. `j` is the index of the word inside `query_words`;
        it is needed to extend a word into a phrase when the word alone is already taken by another context.
        '''
        # Recognized entities are added as they are, without looking at the context
        if ner_context is not None:
            if self.check_whether_in_dictionary(current_word, ner_context = ner_context) != 2:
                self.add_new_word_to_dictionary(current_word, prior1_word, ner_context = ner_context)
            return

        # Ordinary words are checked together with the word preceding them
        word_check = self.check_whether_in_dictionary(current_word, prior1_word)
        if word_check == 2:
            return
        if word_check == 0:
            self.add_new_word_to_dictionary(current_word, prior1_word)
            return

        # Same word, different context: keep appending the following words until the
        # phrase is new (add it) or already known in this context (it has been counted).
        phrase = current_word
        for following_word in query_words[j + 1:]:
            phrase = phrase + ' ' + following_word
            word_check = self.check_whether_in_dictionary(phrase, prior1_word)
            if word_check == 2:
                return
            if word_check == 0:
                self.add_new_word_to_dictionary(phrase, prior1_word)
                return

    def _add_query(self, query):
        '''Adds all words (and recognized entities) of a single query to the dictionary.'''
        ner_word_values = self.get_entities(query_string = query)
        query_words = query.split(' ')

        # Entities keyed by the 1-based number of their first word; entities whose
        # word positions could not be determined are handled as ordinary words.
        entities_by_start = {}
        for _, first_word, last_word, label in ner_word_values:
            if first_word >= 1 and last_word >= first_word:
                entities_by_start.setdefault(first_word, (last_word, label))

        j = 0
        while j < len(query_words):
            prior1_word = query_words[j-1] if j >= 1 else None
            entity = entities_by_start.get(j + 1)
            if entity is not None:
                # The whole entity is taken as one "word"
                j_end, ner_context = entity
                current_word = ' '.join(query_words[j:j_end])
                next_j = j_end        # jump past the words that belong to the entity
            else:
                ner_context = None
                current_word = query_words[j]
                next_j = j + 1
            self._register_word(current_word, prior1_word, ner_context, query_words, j)
            j = next_j
    
    def get_words_dictionary(self, context_size = 1):
        """
        This function creates a dictionary of words in the format that is fed into the fast-autocomplete model.

        Args:
            context_size (int) -> Intended size of the context window. Currently unused:
                only the immediately preceding word is recorded as context.
            
        Returns:
            words (dictionary) -> {word_or_phrase: [context_dict, display_value, count]}
        """
        # Iterate over the values so the result does not depend on the index labels of the DataFrame
        for query in self.df['query'].values:
            self._add_query(query)
        return self.words
    

In [None]:
# test file

'''
string1 = 'What building method might use a balloon frame?'
string2 = 'What is causing rash on arms'
string3 = 'What causes your glands on the top of your throat to swell'
query_list = ['What building method might use a balloon frame?','What is causing rash on arms', 'what does jon snow know', 'where is duke university in columbia','what causes the glands on top of your throat to swell']
query_df_111 = pd.DataFrame(query_list, columns = ['query'])
display(query_df_111.head(5))


data_generator = Custom_Analyzer(query_df_111)
words = data_generator.get_words_dictionary()
print(words)


# Will convert the words dictionary into the appropriate .json file
words_json = json.dumps(words)
with open('temp_words.json', 'w') as json_file:
    json.dump(words, json_file)

    
content_files = {
    'words': {
        'filepath': './temp_words.json',
        'compress': True  # means compress the graph data in memory
    }
}
'''

In [None]:
# Here, I'll create the data, and convert it into the required format

#First I am gonna create a sample of ~36000 queries
#queries_df_3 = query_sampler(queries_df, percentage_of_samples = 0.1)

# Next up, I'll remove the qid from the index
#queries_df_3.reset_index(drop = True, inplace = True)

# Next, I'll parse the data into dictionary format
data_generator = Custom_Analyzer(queries_df_3)
words = data_generator.get_words_dictionary()
print(words)


# Will convert the words dictionary into the appropriate .json file
# words_json = json.dumps(words)
with open('ten_percent_words.json', 'w') as json_file:
    json.dump(words, json_file)

    
content_files = {
    'words': {
        'filepath': './ten_percent_words.json',
        'compress': True  # means compress the graph data in memory
    }
}

#### Model Creation

In [None]:
class Analyzer_Suggestor:
    '''
    Thin wrapper around a fast-autocomplete model.

    One of the two builder methods has to be called first; it stores the model
    on `self.model`, which getAutoSuggestions then queries.
    '''
    def __init__(self):
        '''Nothing is set up here; `self.model` only exists after a builder method has run.'''
    
    def general_autocomplete(self, words):
        '''Build the model directly from a words dictionary, store it and return it.'''
        model = AutoComplete(words = words)
        self.model = model
        return model
    
    def factory_autocomplete(self, content_files):
        '''Build the model through autocomplete_factory from a content_files description, store it and return it.'''
        model = autocomplete_factory(content_files=content_files)
        self.model = model
        return model
    
    def getAutoSuggestions(self, query_string):
        '''Return whatever the stored model's search() gives for `query_string`.'''
        suggestions = self.model.search(query_string)
        return suggestions

In [None]:
analyzer_object = Analyzer_Suggestor()
model = analyzer_object.factory_autocomplete(content_files)

#### Model Testing

1. **Intuition Based**

In [None]:
# The three strings used for the intuition based testing
string1 = 'What building method might use a balloon frame?'
string2 = 'What is causing rash on arms'
string3 = 'What causes your glands on the top of your throat to swell'

# The list of sub strings made from each of these queries
list_string1 = break_the_query(string1)
list_string2 = break_the_query(string2)
list_string3 = break_the_query(string3)

# Print the typed text and the model's results after each new character, one query after the other
for partial_queries in (list_string1, list_string2, list_string3):
    for partial_query_text in partial_queries:
        comp = model.search(partial_query_text)
        print(partial_query_text)
        print(comp)


2. **Distance Score**

3. **Relevance Score**

## Part 3: Content Based Approach