# Final Notebook

This notebook is your search engine. 

To test your work, we will run each cell. Your code will therefore have to fit the expected structure.



## Initialisation

- Install libraries (if you use Colab and needed),
- Import the modules,
- Declare global variable


In [None]:
! pip install nltk
! pip install py7zr
! pip install ttable
! pip install sentence_transformers

In [None]:
import nltk
import re
import pickle
import math
import py7zr
import os
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from nltk.corpus import stopwords
from math import log
from tt import BooleanExpression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sentence_transformers import SentenceTransformer, util


# Fetch the NLTK resources, then create the lemmatizer and the stop-word set
# that are stored in the module-level names `lemmatizer` and `stops`.
nltk.download('stopwords')
# NOTE(review): 'all' downloads every NLTK package, which presumably already
# covers 'stopwords' — confirm that the full download is really required.
nltk.download('all')
lemmatizer = nltk.stem.WordNetLemmatizer()
stops = set(stopwords.words('english'))

On Google Colab, use this:

In [None]:
# Mount Google Drive under /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Project root on the mounted drive, and the folder that holds the data files.
MAIN_PATH = '/content/drive/MyDrive/TP Centrale'
DATA_PATH = '/content/drive/MyDrive/TP Centrale/data'


And in VS Code use this :

In [None]:
# MAIN_PATH = ''
# DATA_PATH = '/data'

In [None]:
# Locations (inside DATA_PATH) of the pickled inverted index and of the pickled embeddings.
INVINDEX_PATH = os.path.join(DATA_PATH, "inverted_index.pickle")
EMBEDDING_PATH = os.path.join(DATA_PATH, "embeddings.pkl")

## Extracting the data

In [None]:
def extract_data(filepath):
    """Extract the Stack Exchange 7z archive into the data folder.

    The archive is read from ``MAIN_PATH/datascience.stackexchange.com.7z``
    and extracted into ``MAIN_PATH/data``. Both ``MAIN_PATH`` and
    ``DATA_PATH`` are created when they do not exist yet.

    Args:
        filepath: Currently unused; kept so existing callers keep working.

    Returns:
        None
    """
    # The original second guard re-tested MAIN_PATH, so DATA_PATH was never
    # created. `makedirs(..., exist_ok=True)` creates each folder if needed.
    os.makedirs(MAIN_PATH, exist_ok=True)
    os.makedirs(DATA_PATH, exist_ok=True)

    archive_path = os.path.join(MAIN_PATH, 'datascience.stackexchange.com.7z')
    # The context manager closes the archive even if the extraction fails.
    with py7zr.SevenZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=os.path.join(MAIN_PATH, 'data'))
    return

In [None]:
# Load Posts.xml from DATA_PATH into the `posts` DataFrame.
posts = pd.read_xml(os.path.join(DATA_PATH, 'Posts.xml'), parser="etree", encoding="utf8")

## Indexation data

def index_data():
    """Placeholder for the indexing step; does nothing and returns None."""
    # TODO
    return None

In [None]:
def extract_words(text:str)->list:
  """Transform a text into a list of lower-cased, lemmatized tokens.

  The text is lower-cased and tokenized with NLTK; trailing punctuation is
  stripped from every token and the remainder is lemmatized. Tokens that are
  empty after stripping (pure punctuation such as "." or ")") are dropped so
  that the empty string never becomes an index term.

  Args:
    text: raw text to tokenize.

  Returns:
    The list of cleaned tokens, in order of appearance.
  """
  # Same character set as before (. ! ? , ; : \ ( ) " '), written with a
  # properly escaped backslash instead of the invalid "\(" / "\)" escapes.
  trailing_chars = ".!?,;:\\()\"'"
  tokens = []
  for raw_token in nltk.tokenize.word_tokenize(text.lower()):
    token = lemmatizer.lemmatize(raw_token.rstrip(trailing_chars))
    if token:
      tokens.append(token)
  return tokens


def remove_tags(text: str) -> str:
    """Strip HTML tags from `text` and normalize its whitespace.

    Every tag is replaced by a space, runs of whitespace are collapsed into a
    single space, and leading/trailing whitespace is removed.
    """
    without_tags = re.sub(r'<.*?>', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_tags)
    return single_spaced.strip()


def filter_stop_words(words:list[str]) -> list[str]:
  """Return the words of `words` that are not in the `stops` set, order preserved."""
  return [word for word in words if word not in stops]


def clean_post(text:str):
  """Clean a post body into a space-separated string of lemmas.

  HTML tags are removed, the text is lower-cased and tokenized, every token is
  lemmatized, and lemmas that are stop words or found in `string.punctuation`
  are discarded. Each kept lemma is followed by a single space.
  """
  tokens = nltk.word_tokenize(remove_tags(text).lower())
  kept = []
  for token in tokens:
    lemma = lemmatizer.lemmatize(token)
    if lemma in stops or lemma in string.punctuation:
      continue
    kept.append(str(lemma) + ' ')
  return ''.join(kept)


def inverted_index_data():
    """Placeholder for the inverted-index step; does nothing and returns None."""
    # TODO
    return None

In [None]:
# Work on an explicit copy: adding columns to a bare slice of `posts` triggers
# pandas' SettingWithCopyWarning and may silently write to a temporary object.
clean_posts = posts[['Id','Body']].copy()
# Token list of each post: tags removed, tokenized/lemmatized, stop words filtered out.
clean_posts['Words'] = clean_posts['Body'].fillna('').apply(remove_tags).apply(extract_words).apply(filter_stop_words)
# Number of tokens per post, stored so that OBM25 can read it quickly.
clean_posts['len'] = clean_posts['Words'].apply(len)
# Cleaned text of each post, as a single string.
posts['Cleaned_post'] = posts['Body'].fillna('').apply(clean_post)


In [None]:
def create_inverted_index(posts:pd.DataFrame)->dict:
  """Build the full inverted index of already pre-processed posts.

  Args:
    posts: DataFrame with an 'Id' column and a 'Words' column holding the
      token list of each post.

  Returns:
    A dict of the form
    ``{word: {'df': int, 'inv_ind': [(doc_id, tf), ...]}}`` where ``df`` is
    the number of posts containing the word and ``tf`` is the number of
    occurrences of the word in the post divided by the post length.
    Posts with an empty token list contribute nothing.
  """
  from collections import Counter  # local import keeps this cell self-contained

  full_ind = {}
  # Iterating the two columns together avoids label-based lookups, which
  # misbehave when the DataFrame index contains duplicate labels.
  for doc_id, words in zip(posts['Id'], posts['Words']):
    nb_words = len(words)
    # One pass per document: Counter yields each distinct word once, in order
    # of first appearance, replacing the `seen` list and `words.count` calls
    # that made the original quadratic in the document length.
    for word, count in Counter(words).items():
      tf = count / nb_words
      entry = full_ind.get(word)
      if entry is None:
        full_ind[word] = {'df': 1, 'inv_ind': [(doc_id, tf)]}
      else:
        entry['df'] += 1
        entry['inv_ind'].append((doc_id, tf))
  return full_ind

In [None]:
# full_index = create_inverted_index(posts)

In [None]:
# Save and Load your Index(es) in Pickle format
def save_index(savepath, inverted_index):
    """Pickle `inverted_index` into the file located at `savepath`."""
    with open(savepath, "wb") as handle:
        pickle.dump(inverted_index, handle)


def load_index(savepath):
    """Read the pickle file at `savepath` and return the object it contains."""
    with open(savepath, "rb") as handle:
        return pickle.load(handle)

In [None]:
# Load the previously saved inverted index from INVINDEX_PATH.
full_index = load_index(INVINDEX_PATH)

# Search Methods

## Naive Search and Improvements

La fonction à appeler est:
```python
search_naive(query: str, inverted_index: dict, top: int =5)
```

In [None]:
# Naive search
def word_in_index(word: str, word_list_index: list)->pd.Series:
  """
    Flag which entries of a word list are equal to `word`.
    Inputs : a word (str) & a list of words
    Output : boolean pandas Series named "Comparison", True where the entry
             equals the word; an empty float64 Series for an empty list
  """
  if word_list_index == []:
    return pd.Series(dtype='float64')
  entries = pd.DataFrame(word_list_index)[0]
  matches = entries == word
  return matches.rename("Comparison")


def count_common_words(query: str, word_serie: pd.Series)->pd.Series:
  """
  Count, for each entry of `word_serie`, how many query tokens it is equal to.
  The query is tokenized with extract_words and every token is compared to the
  entries with word_in_index; the per-token results are added together.
  Inputs : the query (str) & the collection of words to compare against
  Output : the element-wise sum of the comparisons (the integer 0 when the
           query yields no token)
  """
  total = 0
  for q_word in extract_words(query):
    total = total + word_in_index(q_word, word_serie)
  return total


def rank_top_query(query:str, df:pd.DataFrame, top: int = 5)->list:
  """Rank posts by the number of words they share with the query (naive scan).

  Args:
    query: free-text query.
    df: DataFrame with an 'Id' column and a 'Words' column (token list per post).
    top: maximum number of results to return.

  Returns:
    A list of ``[nb_common_words, post_id]`` pairs, best match first.
  """
  ranking = []
  # Iterate the columns positionally: `df['Id'][line]` is a label lookup and
  # fails on any DataFrame whose index is not 0..n-1 (e.g. a filtered subset).
  for post_id, word_ser in zip(df['Id'], df['Words']):
    common = count_common_words(query, word_ser)
    # count_common_words returns the plain integer 0 when the query has no
    # token; calling sum() on that integer would raise a TypeError.
    if isinstance(common, pd.Series):
      nb_comm_words = int(common.sum())
    else:
      nb_comm_words = int(common)
    ranking.append([nb_comm_words, post_id])
  ranking.sort(reverse=True)
  return ranking[0:top]

In [None]:
# Naive but using the inverted index
def search_naive(query: str, inverted_index: dict, top: int=5):
    """Rank posts by the summed term frequency of the query words.

    Args:
        query: free-text query, tokenized with extract_words.
        inverted_index: ``{word: {'df': int, 'inv_ind': [(doc_id, tf), ...]}}``.
        top: maximum number of results to return.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by decreasing score.
    """
    scores = {}
    for word in extract_words(query):
        entry = inverted_index.get(word)
        if entry is None:
            # Words absent from the index do not contribute to any score.
            continue
        for post_id, tf in entry["inv_ind"]:
            scores[post_id] = scores.get(post_id, 0) + tf
    # Highest score first: the original ascending sort returned the `top`
    # *least* relevant posts.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[0:top]

## Boolean Search

La fonction à appeler est :
```python 
search_boolean(query: str, inverted_index: dict, booleanOperator: set)
```

In [None]:
# Boolean Search
# Build an id-only index {word: [doc_id, ...]} from full_index by keeping only
# the first element of each posting tuple.
inverted_index_simple = {}
for word in full_index:
  l=[]
  tuple_list = full_index[word]['inv_ind']
  for elt in tuple_list:
    (doc_id,_)=elt # each posting is a 2-tuple; only the document id is kept
    l.append(doc_id)
  inverted_index_simple[word]=l


In [None]:
# The query is turned into a conjunction of its terms: A1 AND A2 AND A3 ...
# Converts the free-text query into a list of boolean-expression tokens.
def transformation_query_to_boolean(query: str):
    """Insert an 'AND' operator between the whitespace-separated query terms.

    Args:
        query: free-text query.

    Returns:
        The list ``[t1, 'AND', t2, 'AND', ...]``; an empty list when the query
        contains no term (the original raised IndexError in that case).
    """
    boolean_query = []
    for token in query.split():
        if boolean_query:
            boolean_query.append('AND')
        boolean_query.append(token)
    return boolean_query


# Upper-case operator names; query tokens are upper-cased before being tested against this set.
BooleanOperator = {"AND", "OR", "NOT"}

def transformation_query_to_postfixe(query: str):
    """Parse `query` as a `BooleanExpression` and return its `postfix_tokens`."""
    return BooleanExpression(query).postfix_tokens

# Merge two posting lists according to the operator
def merge_and_postings_list(posting_term1: list, posting_term2: list)->list:
    """Intersect two sorted posting lists with a two-pointer walk (AND)."""
    common = []
    size1, size2 = len(posting_term1), len(posting_term2)
    pos1 = pos2 = 0
    while pos1 < size1 and pos2 < size2:
        left = posting_term1[pos1]
        right = posting_term2[pos2]
        if left == right:
            # Present in both lists: keep it and advance both cursors.
            common.append(left)
            pos1 += 1
            pos2 += 1
        elif left < right:
            pos1 += 1
        else:
            pos2 += 1
    return common

def merge_or_postings_list(posting_term1: list, posting_term2: list)->list:
    """Merge two sorted posting lists into their sorted union (OR)."""
    union = []
    size1, size2 = len(posting_term1), len(posting_term2)
    pos1 = pos2 = 0
    while pos1 < size1 and pos2 < size2:
        left = posting_term1[pos1]
        right = posting_term2[pos2]
        if left == right:
            # Shared document: emit it once and advance both cursors.
            union.append(left)
            pos1 += 1
            pos2 += 1
        elif left < right:
            union.append(left)
            pos1 += 1
        else:
            union.append(right)
            pos2 += 1
    # At most one of the two lists still has unread entries; append them.
    if pos1 < size1:
        union = union + posting_term1[pos1:]
    if pos2 < size2:
        union = union + posting_term2[pos2:]
    return union

def merge_and_not_postings_list(posting_term1: list, posting_term2: list)->list:
    """Return the documents of the first sorted list absent from the second.

    Implements ``term1 AND NOT term2`` with a two-pointer walk.

    Args:
        posting_term1: sorted posting list of the documents to keep.
        posting_term2: sorted posting list of the documents to exclude.

    Returns:
        The sorted list of ids found in `posting_term1` but not in `posting_term2`.
    """
    result=[]
    n = len(posting_term1)
    m = len(posting_term2)
    i = 0
    j = 0
    while i < n and j < m:
        if posting_term1[i] == posting_term2[j]:
            # Excluded document: skip it in both lists.
            i = i+1
            j = j+1
        elif posting_term1[i] < posting_term2[j]:
            result.append(posting_term1[i])
            i = i+1
        else:
            j = j+1
    # Once the exclusion list is exhausted, every remaining document of the
    # first list is kept (the original silently dropped this tail).
    if i < n:
        result = result + posting_term1[i:]
    return result

# Dispatch the merge to the function matching the operator
def boolean_operator_processing_with_inverted_index(BoolOperator: str, posting_term1: list, posting_term2: list)->list:
    """Apply a boolean operator to two posting lists.

    Returns a one-element list wrapping the merged posting list for "AND",
    "OR" and "NOT" (the latter meaning term1 AND NOT term2), and an empty list
    for any other operator.
    """
    if BoolOperator == "AND":
        return [merge_and_postings_list(posting_term1, posting_term2)]
    if BoolOperator == "OR":
        return [merge_or_postings_list(posting_term1, posting_term2)]
    if BoolOperator == "NOT":
        return [merge_and_not_postings_list(posting_term1, posting_term2)]
    return []


In [None]:
def search_boolean(query: str, inverted_index_simple: dict, booleanOperator=BooleanOperator):
    """Evaluate a boolean query written in postfix notation.

    The query is tokenized with extract_words. Each token whose upper-cased
    form is in `booleanOperator` is an operator applied to the two operands on
    top of the stack ("A B NOT" means A AND NOT B); every other token pushes
    its posting list.

    Args:
        query: postfix boolean query, e.g. "neural network AND".
        inverted_index_simple: ``{word: [doc_id, ...]}`` with sorted posting lists.
        booleanOperator: set of upper-case operator names.

    Returns:
        The posting list of matching document ids. Terms missing from the
        index match no document; an empty query returns an empty list. When
        several operands remain on the stack (terms with no operator between
        them) they are combined with an implicit AND.

    Raises:
        ValueError: when an operator does not have two operands available.
    """
    evaluation_stack = []
    for term in extract_words(query):
        operator = term.upper()
        if operator not in booleanOperator:
            # Unknown terms get an empty posting list instead of a KeyError.
            evaluation_stack.append(inverted_index_simple.get(term, []))
            continue
        if len(evaluation_stack) < 2:
            raise ValueError(
                "Malformed postfix query: operator '%s' needs two operands" % operator)
        right_operand = evaluation_stack.pop()
        left_operand = evaluation_stack.pop()
        eval_prop = boolean_operator_processing_with_inverted_index(
            operator, left_operand, right_operand)
        evaluation_stack.append(eval_prop[0])

    if not evaluation_stack:
        return []
    # A well-formed postfix query leaves exactly one operand. Extra operands
    # are intersected rather than discarded, so a plain list of terms behaves
    # like "t1 AND t2 AND ...".
    result = evaluation_stack.pop()
    while evaluation_stack:
        result = boolean_operator_processing_with_inverted_index(
            "AND", evaluation_stack.pop(), result)[0]
    return result

## Probabilistic Search (OBM25)

La fonction à appeler est :
```python 
search_OBM25(query: str, inverted_index: dict, simple_index: pd.DataFrame, top: int)
```

In [None]:
# Probabilistic Search Okapi BM25

def search_OBM25(query: str, inverted_index: dict =full_index,
                 simple_index: pd.DataFrame =clean_posts, top: int =5):
  """Rank documents with the Okapi BM25 probabilistic model.

  Args:
    query: free-text query; it is lower-cased, tokenized and lemmatized so
      that its terms match the lower-cased terms stored in the index.
    inverted_index: ``{word: {'df': int, 'inv_ind': [(doc_id, tf), ...]}}``.
    simple_index: DataFrame with an 'Id' column and a 'len' column (number of
      tokens of each document). It defines the corpus size and the average
      document length.
    top: maximum number of results to return.

  Returns:
    A list of ``(doc_id, RSV)`` tuples sorted by decreasing score.
  """
  # BM25 constants
  k1 = 1.2
  k3 = 1000
  b = 0.75

  # Corpus statistics come from `simple_index` (not from the global `posts`)
  # so that the function is consistent when called on a subset.
  N = len(simple_index)
  if N == 0:
    return []
  m = np.mean(simple_index['len'])  # average document length
  # Build the id -> length lookup once instead of scanning the DataFrame for
  # every posting; the first occurrence of an id wins.
  doc_lengths = {}
  for doc_id, length in zip(simple_index['Id'], simple_index['len']):
    doc_lengths.setdefault(doc_id, length)

  # Query processing: relative frequency of each lemma in the query
  query_tmt = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(query.lower())]
  query_ind = {word: query_tmt.count(word) / len(query_tmt) for word in query_tmt}

  # CORE: sum a1 * a2 * a3 over the query terms for each document
  RSV = {}
  for word, tf_j_q in query_ind.items():
    entry = inverted_index.get(word)
    if entry is None:
      continue
    df_j = entry['df']
    # IDF factor. The original `(N-df_j+0.5)/df_j+0.5` added 0.5 to the
    # quotient instead of to the denominator.
    idf_ratio = (N - df_j + 0.5) / (df_j + 0.5)
    if idf_ratio <= 0:
      # Only possible when the index covers more documents than
      # `simple_index`; the logarithm is undefined, so the term is skipped.
      continue
    a3 = math.log(idf_ratio)
    a2 = (k3 + 1) * tf_j_q / (k3 + tf_j_q)
    for doc_id, tf_j_d in entry['inv_ind']:
      # Documents missing from `simple_index` fall back to the average length.
      L = doc_lengths.get(doc_id, m)
      a1 = (k1 + 1) * tf_j_d / (k1 * ((1 - b) + b * L / m) + tf_j_d)
      RSV[doc_id] = RSV.get(doc_id, 0) + a1 * a2 * a3

  RSV = sorted(RSV.items(), key=lambda x: x[1], reverse=True)
  return RSV[0:top]

## MIB

La fonction à appeler est :
```python
search_MIB(query: str, inverted_index: dict, top: int)
```

In [None]:


def search_MIB(query: str, inverted_index: dict =full_index, data=posts, top: int =5):
  """Rank documents with an IDF-weighted score of the query terms.

  For every query term found in the index, each document of its posting list
  receives ``log(N / df) * (1 + tf)``; the contributions are summed per document.

  Args:
    query: free-text query; it is lower-cased, tokenized and lemmatized so
      that its terms match the lower-cased terms stored in the index.
    inverted_index: ``{word: {'df': int, 'inv_ind': [(doc_id, tf), ...]}}``.
    data: collection of posts; only its length (number of posts) is used.
    top: maximum number of results to return.

  Returns:
    A list of ``(doc_id, score)`` tuples sorted by decreasing score.
  """
  N = len(data)  # number of posts
  Docs_id = dict()
  for token in nltk.word_tokenize(query.lower()):
    entry = inverted_index.get(lemmatizer.lemmatize(token))
    if entry is None:
      continue
    # The IDF does not depend on the document: compute it once per term.
    idf = np.log(N / entry['df'])
    for doc_id, tf in entry['inv_ind']:
      Docs_id[doc_id] = Docs_id.get(doc_id, 0) + idf * (1 + tf)
  sort_orders = sorted(Docs_id.items(), key=lambda x: x[1], reverse=True)
  return sort_orders[0:top]


## TF-IDF

La fonction à appeler est :
```python
search_tfidf(query : str, data=posts, vectors=vectors, vectorizer=vectorizer)
```

In [None]:
# Fit a TF-IDF vectorizer on the cleaned posts, then vectorize those same posts.
vectorizer = TfidfVectorizer()
vectorizer.fit(posts.Cleaned_post.values)
vectors = vectorizer.transform(posts.Cleaned_post.values)

In [None]:
def clean_query(query: str)->str:
    """Return the query converted to lower case."""
    lowered = query.lower()
    return lowered

def vectorize_query(query : str, vectorizer=vectorizer):
    """Turn a query string into its vector representation.

    Args:
        query (str): query string; it is lower-cased by clean_query first.
        vectorizer (optional): fitted vectorizer. Defaults to vectorizer.

    Returns:
        The result of ``vectorizer.transform`` on the one-element list
        containing the cleaned query.
    """
    cleaned = clean_query(query)
    return vectorizer.transform([cleaned])


def search_tfidf(query : str,
                 data=posts,
                 vectors=vectors,
                 vectorizer=vectorizer,
                 top: int = None) -> list:
    """Rank documents by the dot product of their TF-IDF vector with the query.

    Args:
        query: free-text query.
        data: DataFrame whose rows are aligned with the rows of `vectors` and
            which has an 'Id' column.
        vectors: sparse TF-IDF matrix of the documents.
        vectorizer: fitted vectorizer used to vectorize the query.
        top: optional maximum number of results; ``None`` (default) returns
            every document with a non-zero score, as before.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by decreasing score.
    """
    query_vectorized = vectorize_query(query, vectorizer)
    # Convert once to COO and read rows AND values from the same object, rather
    # than pairing COO rows with the `.data` array of the product itself.
    results = (vectors @ query_vectorized.transpose()).tocoo()
    # Positional lookup of the ids, done once instead of one `iloc` per row.
    doc_ids = data['Id'].to_numpy()
    res_dict = {doc_ids[row]: score for row, score in zip(results.row, results.data)}
    ranking = sorted(res_dict.items(), key=lambda x: x[1], reverse=True)
    return ranking if top is None else ranking[0:top]


## Semantic Similarity

La fonction à appeler est :
```python
search_semantic(query: str, data=posts, embeddings=embeddings, top: int =10)
```

In [None]:
# Name of the sentence-transformers model, and the model instance built from that name.
sentence_transformer_model = 'multi-qa-mpnet-base-dot-v1'
MODEL_ST = SentenceTransformer(sentence_transformer_model)

In [None]:
# Takes a long time to run (~15 min)
# embeddings = MODEL_ST.encode(posts.Cleaned_post.values, normalize_embeddings=True)

In [None]:
# Save/Load the embeddings into pickle format
def save_embedding(savepath, embeddings):
    """Pickle `embeddings` into the file located at `savepath`."""
    with open(savepath, 'wb') as handle:
        pickle.dump(embeddings, handle)

def load_embedding(savepath):
    """Read the pickle file at `savepath` and return the object it contains."""
    with open(savepath, "rb") as handle:
        stored = pickle.load(handle)
    return stored

In [None]:
# Load the previously saved embeddings from EMBEDDING_PATH.
embeddings = load_embedding(EMBEDDING_PATH)

In [None]:
def encode_query(query : str) ->  np.ndarray:
    """Encode the query with the module-level `MODEL_ST` model."""
    return MODEL_ST.encode(query)


def similarity(query: str, embeddings=embeddings):
    """Return the dot-product scores between the encoded query and `embeddings`.

    The first row of ``util.dot_score`` is moved to the CPU and converted to a
    plain Python list.
    """
    scores = util.dot_score(encode_query(query), embeddings)
    return scores[0].cpu().tolist()


def ordre_en_fonction_similarité(matrix_similarity)->list[tuple]:
    """Return ``(position, score)`` pairs sorted by decreasing score.

    Equal scores keep their original relative order.
    """
    indexed_scores = []
    for position in range(len(matrix_similarity)):
        indexed_scores.append((position, matrix_similarity[position]))
    indexed_scores.sort(key=lambda pair: pair[1], reverse=True)
    return indexed_scores


def search_semantic(query: str, data=posts, embeddings=embeddings, top: int =10)->list[tuple]:
    """Rank documents by semantic similarity between their embedding and the query.

    Args:
        query: free-text query.
        data: DataFrame whose rows are aligned with `embeddings` and which has
            an 'Id' column.
        embeddings: document embeddings, one per row of `data`.
        top: maximum number of results to return.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by decreasing score, where
        the scores are divided by the best score when that score is positive.
        An empty list is returned when there is nothing to rank.
    """
    sim_mat = similarity(query, embeddings)
    sorted_indexes = ordre_en_fonction_similarité(sim_mat)
    if not sorted_indexes:
        return []
    max_score = sorted_indexes[0][1]
    # Normalizing by a zero maximum would divide by zero, and a negative
    # maximum would invert the order; raw scores are kept in those cases.
    scale = max_score if max_score > 0 else 1.0
    doc_ids = data.Id
    # Slice first: only the `top` best documents need an id lookup.
    return [(doc_ids.iloc[j], score / scale) for (j, score) in sorted_indexes[0:top]]


## Clustering

In [None]:
# Vectorize document using TF-IDF
# Vectorize document using TF-IDF
vectorizer_lda = TfidfVectorizer()

# Fit and Transform the documents
train_data = vectorizer_lda.fit_transform(posts.Cleaned_post.values)

num_topics = 200
# A fixed random_state makes the topics (and the 'Topic' column derived from
# them) reproducible across "Restart & Run All" executions.
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(train_data)

In [None]:
def topic_find(text):
  """Return the index of the most probable LDA topic for `text`.

  The text is vectorized with `vectorizer_lda`, passed through `lda_model`, and
  the position of the highest topic probability is returned.
  """
  topic_probabilities = lda_model.transform(vectorizer_lda.transform([text]))
  return topic_probabilities.argmax()


In [None]:
# Dominant topic of each cleaned post.
posts['Topic'] = posts['Cleaned_post'].fillna('').apply(topic_find)
# Token list of each post, rebuilt from the cleaned text and stored in posts['Words'].
posts['Words'] = posts['Cleaned_post'].fillna('').apply(remove_tags).apply(extract_words).apply(filter_stop_words)


In [None]:
def get_topic_query(text, vectorizer=vectorizer_lda, lda_model=lda_model) -> int:
  """Return the index of the most probable topic of `text`.

  The text is vectorized with `vectorizer`, passed through `lda_model`, and the
  position of the highest topic probability is returned.
  """
  vectorized_text = vectorizer.transform([text])
  return lda_model.transform(vectorized_text).argmax()


In [None]:
# Compare OBM25 on the full index with OBM25 restricted to the query's topic.
query = 'draw neural networks'
topic_query = get_topic_query(query)
# Posts whose dominant topic is the same as the query's.
topic_k = posts.loc[posts['Topic'] == topic_query]

# Inverted index built from those posts only.
full_ind_k = create_inverted_index(topic_k)

test_cluster_without = search_OBM25(query, full_index)
test_cluster_with = search_OBM25(query, full_ind_k)

In [None]:
def search(query):
    """Placeholder search entry point; ignores `query` and returns None."""
    # TODO
    return None

# MERGED SEARCH METHOD

In [None]:
def nlp_search_algorithm(query,
                         ponderation = 1/6*np.ones((6)),
                         data=posts,
                         inverted_index=full_index,
                         simple_index=clean_posts,
                         # topic_documents=topic_documents,
                         vectors=vectors,
                         vectorizer=vectorizer,
                         vectorizer_lda=vectorizer_lda,
                         lda_model=lda_model,
                         embeddings=embeddings,
                         top_n=10
                         )->list:
    """Merge the rankings of all the search models into a single ranking.

    Args:
        query: free-text query.
        ponderation: six weights, in this order: boolean, naive, OBM25, MIB,
            TF-IDF, semantic.
        data: DataFrame of the posts.
        inverted_index: full inverted index (with tf) of the corpus.
        simple_index: DataFrame with 'Id' and 'len' columns, used by OBM25.
        vectors, vectorizer: TF-IDF matrix and its fitted vectorizer.
        vectorizer_lda, lda_model: accepted for compatibility; currently unused.
        embeddings: document embeddings for the semantic search.
        top_n: number of results to return.

    Returns:
        The `top_n` best ``(doc_id, score)`` tuples, by decreasing merged score.
    """
    # Weights
    coefs = ponderation

    # Collect the ranking list of every model for the query
    rankings = []
    tip_top = min(top_n, data.shape[0])

    # Boolean model. The id-only posting lists are derived from the
    # `inverted_index` argument (for the query terms only), so the boolean
    # result covers the same corpus as the other models instead of always
    # using the global index. Terms absent from the index get an empty list.
    query_postings = {
        term: [doc_id for doc_id, _ in inverted_index.get(term, {}).get('inv_ind', [])]
        for term in extract_words(query)
    }
    try:
        r_boolean = search_boolean(query, query_postings)
    except (IndexError, ValueError):
        # The boolean model is only a bonus: a query it cannot evaluate
        # (empty or malformed) simply contributes nothing.
        r_boolean = []
    rankings.append(r_boolean)

    # Process as many relevant documents as possible
    tip_top = max(tip_top, len(r_boolean))

    r_naive = search_naive(query, inverted_index, tip_top)
    rankings.append(r_naive)

    r_obm25 = search_OBM25(query, inverted_index , simple_index , tip_top)
    rankings.append(r_obm25)

    r_MIB = search_MIB(query, inverted_index, data, tip_top)
    rankings.append(r_MIB)

    r_tfidf = search_tfidf(query, data, vectors, vectorizer)
    rankings.append(r_tfidf)

    r_semantic = search_semantic(query, data, embeddings, tip_top)
    rankings.append(r_semantic)

    ranking_dict = {}

    # Every model except the boolean one: weighted score discounted by the rank
    for i in range(1, len(rankings)):
        for j, (doc_id, score_j) in enumerate(rankings[i]):
            contribution = coefs[i] * score_j / log(j + 2)
            ranking_dict[doc_id] = ranking_dict.get(doc_id, 0) + contribution

    # Boolean model: every document it returned gets the same bonus, which
    # penalizes the documents where the terms are absent. The original loop
    # reused the last `doc_id` of the previous loop instead of reading the ids
    # of the boolean result.
    N = len(r_boolean)
    for doc_id in r_boolean:
        ranking_dict[doc_id] = ranking_dict.get(doc_id, 0) + coefs[0] / N

    matching_posts = sorted(ranking_dict.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return matching_posts

## Metadonnées

In [None]:
def metadata_score( ranking_list : list[tuple], c_score:float) -> list[tuple] :
  """Refine a ranking with the 'Score' metadata of the posts.

  Starting from the result list of a search function, the 'Score' column of
  each post is used to adjust the relevance, with a weight `c_score`.

  Args:
    ranking_list: list of ``(doc_id, score)`` tuples returned by a search function.
    c_score: weight given to the metadata bonus.

  Returns:
    The list of ``(doc_id, new_score)`` tuples sorted by decreasing score; an
    empty list when `ranking_list` is empty.
  """
  if not ranking_list:
    return []

  meta_list = [] #list[tuple] [(doc_id, metadata score)]
  for doc_id, _ in ranking_list:
    # The original `posts.loc['Id' == id]['Score'].value` compared a string
    # with the built-in `id` and used a non-existent attribute.
    matches = posts.loc[posts['Id'] == doc_id, 'Score']
    # A post without a matching row gets a neutral metadata score of 0.
    meta_list.append((doc_id, matches.iloc[0] if len(matches) else 0))
  # Order the documents by metadata score
  meta_ranking_list = sorted(meta_list, key=lambda x: x[1], reverse=True)

  # A dictionary gives direct access to the scores by doc_id
  new_ranking = {}
  for cpl in ranking_list:
    new_ranking[cpl[0]] = cpl[1] #new_ranking[doc_id] = initial score
  # Scale the weight with the initial score of the post that has the highest metadata score
  c2 = c_score * new_ranking[meta_ranking_list[0][0]]
  N = len(meta_ranking_list)

  # The bonus decreases linearly with the position in the metadata ranking
  for j in range(N) :
    new_ranking[meta_ranking_list[j][0]] += c2 *(N-j)/N

  return sorted(new_ranking.items(), key=lambda x: x[1], reverse=True)

## Ranking

In [None]:
def rank_search(results, top=5):
    """Return the `top` best ``(key, score)`` pairs of the `results` dict, best first."""
    pairs = list(results.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:top]

## Visualising Results

In [None]:
def visualize_output(query, results):
    """Print the query followed by the id, title and start of the body of each result.

    Args:
        query: the query that produced the results.
        results: iterable of ``(post_id, score)`` tuples.

    Returns:
        None
    """
    print("La requête :")
    print(query)
    print("Résultats :")
    for post_id, score in results:
      # Filter the DataFrame once per result instead of once per column.
      rows = posts[posts["Id"] == post_id]
      if rows.empty:
        # Unknown id: report it instead of failing with an IndexError.
        print("Post Id : ", post_id, "(not found)")
        continue
      print("Post Id : ", rows["Id"].values[0])
      print("Title : ", rows["Title"].values[0])
      print("First sentence :")
      body = rows["Body"].values[0]
      # A missing Body is not a string and cannot be sliced.
      first_sentence = body[0:150] if isinstance(body, str) else ""
      print(first_sentence)
    return

In [None]:
# Display the results stored in test_cluster_without, then those in test_cluster_with.
print('Without clustering')
visualize_output(query, test_cluster_without)

print('With clustering')
visualize_output(query, test_cluster_with)

## Querying

In [None]:
def make_query(natural_query):
    """Placeholder query builder; ignores `natural_query` and returns None."""
    # TODO
    return None

## Scoring

In [None]:
# Pas sûr de garder cette partie

# Testing

In [None]:
# Read the relevancy judgments from the Excel file
# /!\ adjust the file path if needed
df_relevancy = pd.read_excel(os.path.join(DATA_PATH, "evaluation_search_engine_post_queries_ranking_EI_CS.xlsx"))
# Missing values are replaced by 0.
df_relevancy = df_relevancy.fillna(0)
df_relevancy

In [None]:
# Two-way mapping between a query number and its "Query N : ..." label.
test_queries = {1:'Query 1 : mesure performance for multiclassification model',
                2:'Query 2 : draw neural network',
                3:'Query 3 : neural network layers',
                4:'Query 4 : how sklearn working',
                5:'Query 5 : treat categorical data',
                'Query 1 : mesure performance for multiclassification model': 1,
                'Query 2 : draw neural network': 2,
                'Query 3 : neural network layers': 3,
                'Query 4 : how sklearn working': 4,
                'Query 5 : treat categorical data': 5}

def calc_dcg(query_results: list[int], rank: int =5, query_number: int =1)->float:
  """Compute the Discounted Cumulative Gain of a ranking.

  Args:
    query_results: ranked list of post ids.
    rank: number of leading results taken into account.
    query_number: number of the test query whose relevancy column is used.

  Returns:
    The sum of ``relevance / log2(position + 1)`` over the first `rank`
    results (positions start at 1). Results that are missing from
    `df_relevancy` count as relevance 0, and a ranking shorter than `rank` is
    scored on the results it has.
  """
  relevancy_column = df_relevancy[test_queries[query_number]]
  dcg = 0
  # Slicing copes with rankings shorter than `rank` (the original raised IndexError).
  for k, post_id in enumerate(query_results[:rank]):
    relevance = relevancy_column[df_relevancy["PostId"] == post_id]
    if relevance.empty:
      # Post without a relevancy judgment: contributes nothing.
      continue
    dcg += relevance.iloc[0] / log(k + 2, 2)
  return dcg


def calc_dcg_ideal(rank: int =5, query_number: int =1)->float:
  """Compute the ideal DCG of a test query.

  The relevancy values of the query are sorted in decreasing order and the
  first `rank` of them are summed with the ``1 / log2(position + 1)`` discount.
  When fewer than `rank` values exist, all of them are used.

  Args:
    rank: number of leading results taken into account.
    query_number: number of the test query whose relevancy column is used.

  Returns:
    The ideal DCG at `rank`.
  """
  perfect_ranking = sorted(df_relevancy[test_queries[query_number]], reverse=True)
  dcg_ideal = 0
  # Slicing avoids an IndexError when `rank` exceeds the number of judged posts.
  for k, relevance in enumerate(perfect_ranking[:rank]):
    dcg_ideal += relevance / log(k+2, 2)
  return dcg_ideal


def calculate_ndcg(query_results: list[int], rank: int =5, query_number: int =1)->float:
  """Compute the normalized DCG of a ranking (DCG divided by the ideal DCG).

  Args:
    query_results: ranked list of post ids.
    rank: number of leading results taken into account.
    query_number: number of the test query whose relevancy column is used.

  Returns:
    The nDCG at `rank`; 0.0 when the ideal DCG is 0 (no relevant post for the
    query), instead of raising ZeroDivisionError.
  """
  ideal = calc_dcg_ideal(rank, query_number)
  if ideal == 0:
    return 0.0
  return calc_dcg(query_results, rank, query_number) / ideal


TESTs

In [None]:
# Restrict the corpus to the posts that have a relevancy judgment.
subset_docs = set(df_relevancy["PostId"])
# Explicit copy: adding a column to a filtered view of `clean_posts` triggers
# pandas' SettingWithCopyWarning and may not modify the intended object.
subset_posts = clean_posts[clean_posts['Id'].isin(subset_docs)].copy()
subset_posts['Cleaned_post'] = subset_posts['Body'].fillna('').apply(clean_post)
# Rebuild every search structure on the subset.
subset_invind = create_inverted_index(subset_posts)
subset_vectorizer = TfidfVectorizer()
subset_vectorizer.fit(subset_posts.Cleaned_post.values)
subset_vectors = subset_vectorizer.transform(subset_posts.Cleaned_post.values)
subset_embeddings = MODEL_ST.encode(subset_posts.Cleaned_post.values, normalize_embeddings=True)

# First tests, done by hand: score the judged post ids sorted in decreasing
# order (defaults: rank=5, query_number=1).
print(calc_dcg(sorted(list(subset_docs), reverse=True)))
print(calc_dcg_ideal())
print(calculate_ndcg(sorted(list(subset_docs), reverse=True)))
# ideal ranking found by hand for the first test query
ideal_ranking = [13490, 15989, 6107, 12321, 22, 14899, 5706, 15135, 12851, 694, 9302, 9443]
print(calculate_ndcg(ideal_ranking))


In [None]:
# Test for our previous search models
query_1 = 'mesure performance for multiclassification model'
query_2 = 'draw neural network'
naive_result = search_naive(query_2, subset_invind, 10)
bm25_result = search_OBM25(query_2, subset_invind, subset_posts, top=10)
# search_MIB's third positional parameter is `data`: passing 10 there made
# `len(data)` fail. The subset is passed as data and 10 as `top`.
mib_result = search_MIB(query_2, subset_invind, subset_posts, top=10)
tfidf_result = search_tfidf(query_2, subset_posts, subset_vectors, subset_vectorizer)
semantic_result = search_semantic(query_2, subset_posts, subset_embeddings, 10)

# Keep only the post ids of each ranking
naive_ranking = [r[0] for r in naive_result]
bm25_ranking  = [r[0] for r in bm25_result]
mib_ranking   = [r[0] for r in mib_result]
tfidf_ranking = [r[0] for r in tfidf_result]
semantic_ranking = [r[0] for r in semantic_result]

print(calculate_ndcg(naive_ranking, query_number=2))
print(calculate_ndcg(bm25_ranking, query_number=2))
print(calculate_ndcg(mib_ranking, query_number=2))
print(calculate_ndcg(tfidf_ranking, query_number=2))
print(calculate_ndcg(semantic_ranking, query_number=2))


In [None]:
# Run the merged search on a sample query and display its results.
visualize_output('machine learning basic tutorial best', nlp_search_algorithm('machine learning basic tutorial best'))

## Find a better ponderation

In [None]:
def rand_pond(nb_models=6):
  """Draw `nb_models` random weights and scale them so that they sum to 1."""
  draws = []
  for _ in range(nb_models):
    draws.append(np.random.random())
  total = sum(draws)
  return [draw/total for draw in draws]

def get_imax(liste):
  """Return the index of the first largest element of `liste` (0 for an empty list)."""
  best = 0
  for position, value in enumerate(liste):
    if liste[best] < value:
      best = position
  return best

def find_ponderation(query_number=2, nb_reps=10, nb_models=6):
  """Random search of the model weights that maximize the nDCG of a test query.

  Args:
    query_number: number of the test query to evaluate.
    nb_reps: number of random weight vectors to try.
    nb_models: number of weights in each vector.

  Returns:
    The weight vector (list of floats summing to 1) with the best nDCG.
  """
  # test_queries stores labels such as "Query 2 : draw neural network";
  # only the text after the first ':' is the actual query.
  query = test_queries[query_number].split(':', 1)[-1].strip()
  to_test = [rand_pond(nb_models) for _ in range(nb_reps)]
  ndcg_list = []
  for pond in to_test:
    # Every structure comes from the judged subset, including `simple_index`,
    # so that the corpus statistics match the index.
    results = nlp_search_algorithm(query, ponderation=pond, data=subset_posts, inverted_index=subset_invind,
                     simple_index=subset_posts,
                     vectors=subset_vectors,
                     vectorizer=subset_vectorizer,
                     embeddings=subset_embeddings)
    # calculate_ndcg expects post ids, and `query_number` must be passed by
    # keyword: as second positional argument it was interpreted as `rank`.
    ranked_ids = [doc_id for doc_id, _ in results]
    ndcg_list.append(calculate_ndcg(ranked_ids, query_number=query_number))
  i_max = get_imax(ndcg_list)
  return to_test[i_max]


# Run the random weight search with its default arguments and display the best weights found.
find_ponderation()