In [4]:
from indexing import IndexType, InvertedIndex, BasicInvertedIndex, Indexer
from document_preprocessor import RegexTokenizer, Doc2QueryAugmenter
from relevance import map_score, ndcg_score, run_relevance_tests
from ranker import *
from tqdm.auto import tqdm
import json
from collections import Counter, defaultdict
import gzip
import os
import seaborn as sns                                                                                                                                                                                                            
import matplotlib.pyplot as plt                                                                                                                                                                                                  
import csv
from importlib import reload
import l2r
from l2r import L2RRanker, L2RFeatureExtractor
import openai
import os
from dotenv import load_dotenv
from openai import OpenAI
from Document_retriver import L2RRetriever
from RAG import RAGSystem
from vector_ranker import *

In [5]:
# Load the stopword list: one entry per line, trimmed and lower-cased so that
# lookups against tokens are case-insensitive.
stopwords = set()
with open('stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords.update(entry.strip().lower() for entry in stopword_file)

print('Loaded %d stopwords' % len(stopwords))

Loaded 543 stopwords


In [6]:
# Single RegexTokenizer instance (constructed with no arguments) that is passed
# as `document_preprocessor` to the indexers, rankers and feature extractor below.
Tokenizer = RegexTokenizer()

In [7]:

# Indexing configuration read by the index-building cells below.
index_type = IndexType.BasicInvertedIndex  # NOTE(review): the "index found" branch always instantiates BasicInvertedIndex(), so changing this only affects freshly built indexes
dataset_path = 'cleaned_output.jsonl'  # Path to the JSONL corpus (records are read for their 'ID', 'text' and 'URL' keys)
minimum_word_frequency = 0  # Set to control minimum token frequency, 0 to ignore
text_key = "text"  # The JSON key containing the text content
doc_augment_dict = None  # Placeholder; rebound to the doc2query mapping read from queries.csv before the index is built



In [8]:
# Build two lookups keyed by document ID from the JSONL corpus:
#   doc_text: ID -> article text
#   doc_dict: ID -> source URL
doc_text, doc_dict = {}, {}
with open(dataset_path, 'r', encoding='utf-8') as corpus_file:
    # Each line of the file is one standalone JSON record.
    for record in map(json.loads, corpus_file):
        record_id = record['ID']
        doc_text[record_id] = record['text']
        doc_dict[record_id] = record['URL']

In [9]:
# Doc2query setup: `models` holds the single model name handed to the augmenter,
# and `prefix` is the instruction text passed as `prefix_prompt` when generating queries.
models = 'google/flan-t5-base'
augmenter = Doc2QueryAugmenter(models)
prefix = "Generate a query for the following text: "


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# Smoke-test the augmenter on the first document only (note the [:1] slice).
# NOTE(review): `documents` therefore holds a single text, and the same name is
# later passed to RAGSystem(...) — confirm that is intended.
documents = list(doc_text.values())[:1]
for doc in tqdm(documents):
    # `query` is rebound on every iteration, so only the last document's
    # queries survive the loop (and it stays undefined if `documents` is empty).
    query = augmenter.get_queries(doc, n_queries=10, prefix_prompt=prefix)

100%|██████████| 1/1 [00:04<00:00,  4.93s/it]


In [11]:
# Display the queries generated for the sample document in the previous cell.
query

['Which item is listed on the list of ingredients and ingredients?',
 'Where are the names of the products discussed?',
 "What are the names of the ingredients in Healthline's evidence-based skin care ingredient dictionary?",
 "What products are included in Healthline's evidence-based skin care ingredient dictionary?",
 'What is the title of the text?',
 'Is it a good idea to buy a dictionary of skin care ingredients?',
 'What ingredients are considered to be evidence based?',
 "What are the names of Healthline's ingredients in the glossary?",
 "Which website is Healthline's Evidence-Based Skin Care Ingredients Dictionary?",
 'The BD is a medical review. What are the ingredients of this product?']

In [12]:
# # For every document in the JSONL file, generate queries and save them to a CSV file as (doc id, title, query) rows.
# start_line = 2281
# with open(dataset_path, 'r', encoding='utf-8') as file:
#     for current_line_number, line in enumerate(file, start=0):
#         if current_line_number < start_line:
#             continue
#         doc = json.loads(line)
#         doc_id = doc['ID']
#         doc_title = doc['Title']
#         doc_text = doc['text']
#         query = augmenter.get_queries(doc_text, n_queries=10, prefix_prompt=prefix)
#         with open('queries.csv', 'a', newline='') as file:
#             writer = csv.writer(file)
#             for q in query:
#                 writer.writerow([doc_id, doc_title, q])


In [13]:
DOC2QUERY_PATH = 'queries.csv'

# Map each document ID (int) to the list of doc2query-generated queries for it.
# `defaultdict(list)` is equivalent to the lambda form but remains picklable.
doc_augment_dict = defaultdict(list)

# The csv module expects files opened with newline='' so that quoted fields
# containing line breaks are parsed as part of a single row.
with open(DOC2QUERY_PATH, 'r', encoding='utf-8', newline='') as file:
    dataset = csv.reader(file)
    # The first row is treated as a header and discarded (no-op on an empty file).
    next(dataset, None)
    # No hard-coded `total`: the row count changes whenever queries.csv is regenerated.
    for row in tqdm(dataset, desc='Loading doc2query queries'):
        if not row:
            # csv.reader yields [] for blank lines; skip them instead of raising IndexError.
            continue
        # Columns are (doc id, title, query); only the id and the query are kept.
        doc_augment_dict[int(row[0])].append(row[2])

100%|██████████| 123649/123649 [00:00<00:00, 551736.15it/s]


In [14]:
index_directory_name = 'index'

# Reuse a previously saved body-text index when its directory exists;
# otherwise build it from the dataset and persist it for later runs.
if not os.path.exists(index_directory_name):
    print("Index not found. Creating and saving index...")
    index = Indexer.create_index(
        index_type=index_type,
        dataset_path=dataset_path,
        document_preprocessor=Tokenizer,
        stopwords=stopwords,
        minimum_word_frequency=minimum_word_frequency,
        text_key=text_key,
        doc_augment_dict=doc_augment_dict,
    )
    index.save(index_directory_name)
else:
    print("Index found. Loading existing index...")
    index = BasicInvertedIndex()
    index.load(index_directory_name)

Index found. Loading existing index...
Index loaded from index


In [15]:
# Write the in-memory index back to the 'index' directory.
# NOTE(review): the preceding cell already either loads from or saves to this
# same directory, so this extra save looks redundant — confirm before removing.
index_directory_name = 'index'
index.save(index_directory_name)

Index saved to index


In [16]:
# Fetch the metadata the index stores for the document passed as 0
# (the argument is 0, not 1) and display it as the cell output.
doc_meta = index.get_doc_metadata(0)
doc_meta

{'unique_tokens': 1163, 'length': 5938, 'stored_length': 3576}

In [17]:
# BM25 scorer over the body-text index with explicit b / k1 / k3 parameters.
bm25_scorer = BM25(
    index=index,
    parameters={'b': 0.75, 'k1': 1.2, 'k3': 8}
)

# Initialize Ranker with BM25 scorer
# NOTE(review): `doc_dict` maps document ID -> URL (the article text lives in
# `doc_text`); confirm that URLs are what Ranker expects for `raw_text_dict`.
ranker = Ranker(
    index=index,
    document_preprocessor=Tokenizer,
    stopwords=stopwords,
    scorer=bm25_scorer,
    raw_text_dict=doc_dict
)



In [18]:
# # Run a query through the Ranker
# query = "Effective treatments for skin conditions"

# ranked_documents = ranker.query(query)

# # Display ranked documents and their BM25 scores
# for doc_id, score in ranked_documents[:50]:
#     print(f"Document ID: {doc_id}, BM25 Score: {score}")
#     print(doc_dict[doc_id])
#     print()

In [19]:
# results_bm25 = run_relevance_tests('ranked_documents.csv', ranker)

# # Print the results
# print(f"Mean Average Precision (MAP): {results_bm25['MAP']}")
# print(f"Normalized Discounted Cumulative Gain (NDCG): {results_bm25['NDCG']}")

In [20]:
# Build and persist a second inverted index over the documents' titles.
#
# The JSON key and the output directory use cell-local names so that this cell
# does not overwrite the shared `text_key` / `index_directory_name` settings
# used for the body-text index; clobbering them would make a re-run of the
# body-index cells index titles or save into the wrong directory.
title_text_key = "Title"  # The JSON key containing each document's title
title_index_directory_name = 'Titleindex'

# NOTE(review): the doc2query mapping is passed here exactly as for the body
# index — confirm that augmenting a title-only index with it is intended.
Title_index = Indexer.create_index(
    index_type=index_type,
    dataset_path=dataset_path,
    document_preprocessor=Tokenizer,
    stopwords=stopwords,
    minimum_word_frequency=minimum_word_frequency,
    text_key=title_text_key,
    doc_augment_dict=doc_augment_dict
)
Title_index.save(title_index_directory_name)

Indexing documents: 12368it [00:00, 17845.01it/s]


Index saved to Titleindex


In [21]:

# Sentence-transformers checkpoint used for the bi-encoder (any other
# sentence-transformers model name can be substituted); encoding runs on CPU.
bi_encoder_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
biencoder_model = SentenceTransformer(bi_encoder_model_name, device='cpu')

# Row i of the embedding matrix belongs to row_to_docid[i]: the texts are
# looked up in the same order as the IDs, so the two lists stay aligned.
row_to_docid = list(doc_text)
documents_list = [doc_text[doc_id] for doc_id in row_to_docid]

# Encode every document once and hold the embeddings as a numpy array.
encoded_docs = np.array(
    biencoder_model.encode(documents_list, convert_to_tensor=False)
)

# Ranker backed by the precomputed document embeddings.
vector_ranker = VectorRanker(
    bi_encoder_model_name=bi_encoder_model_name,
    encoded_docs=encoded_docs,
    row_to_docid=row_to_docid,
)

In [30]:
# Sanity check: display the VectorRanker object built in the previous cell.
vector_ranker

<vector_ranker.VectorRanker at 0x3a45653d0>

In [31]:
# Assuming you have the necessary components like document_index, title_index, etc.
feature_extractor = L2RFeatureExtractor(
    document_index=index,  # The inverted index of document contents
    title_index=Title_index,        # The inverted index of document titles
    document_preprocessor=Tokenizer,  # Instance of the tokenizer class
    stopwords=stopwords                 # Set of stopwords
)
l2r_ranker = L2RRanker(
    document_index=index,
    title_index=Title_index,
    document_preprocessor=Tokenizer,
    stopwords=stopwords,
    ranker=vector_ranker,
    feature_extractor=feature_extractor
)

# Train the L2R ranker with relevance data
l2r_ranker.train('ranked_documents.csv')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 622, number of used features: 9


In [32]:
# Run one sample query through the trained L2R ranker.
query = "Effective treatments for skin conditions"
ranked_documents = l2r_ranker.query(query)

# Show the ten best hits: ID and score on one line, the document's URL on the
# next, followed by a blank separator line.
for doc_id, score in ranked_documents[:10]:
    print(f"Document ID: {doc_id}, Score: {score}")
    print(doc_dict[doc_id], end="\n\n")

Document ID: 148, Score: 0.32154217072155816
https://www.healthline.com/health/custom-skin-care

Document ID: 7222, Score: 0.32154217072155816
https://www.healthline.com/health/beauty-skin-care/what-does-your-skin-really-need-how-to-achieve-truly-healthy-skin#keep-it-simple

Document ID: 4349, Score: 0.29457445751171457
https://www.healthline.com/health/beauty-skin-care/the-ultimate-summer-skin-care-routine-in-8-simple-steps

Document ID: 1125, Score: 0.29457445751171457
https://www.healthline.com/health/beauty-skin-care/what-does-your-skin-really-need-how-to-achieve-truly-healthy-skin#sun-protection

Document ID: 1211, Score: 0.29457445751171457
https://www.healthline.com/health/beauty-skin-care/what-does-your-skin-really-need-how-to-achieve-truly-healthy-skin#hydration

Document ID: 986, Score: 0.29457445751171457
https://www.healthline.com/health/beauty-skin-care/what-does-your-skin-really-need-how-to-achieve-truly-healthy-skin#know-your-skin-type

Document ID: 1557, Score: 0.198021

In [34]:
# Score the trained L2R ranker against the relevance judgments file.
# NOTE(review): 'ranked_documents.csv' is the same file given to
# l2r_ranker.train(); unless one of the two calls splits the data internally,
# these metrics are computed on the training judgments — confirm whether a
# held-out set should be used instead.
results = run_relevance_tests('ranked_documents.csv', l2r_ranker)
# Print the results
print(f"Mean Average Precision (MAP): {results['MAP']}")
print(f"Normalized Discounted Cumulative Gain (NDCG): {results['NDCG']}")


Mean Average Precision (MAP): 0.03901098901098901
Normalized Discounted Cumulative Gain (NDCG): 0.07966750266109551


In [21]:
# #plot the results
# from matplotlib import pyplot as plt
# plt.figure(figsize=(10, 5))
# plt.bar(['BM25', 'L2R'], [results_bm25['MAP'], results['MAP']])
# plt.ylabel('Mean Average Precision (MAP)')
# plt.title('BM25 vs L2R')
# plt.yticks([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
# plt.show()


In [22]:
# plt.figure(figsize=(10, 5))
# plt.bar(['BM25', 'L2R'], [results_bm25['NDCG'], results['NDCG']])
# plt.ylabel('Normalized Discounted Cumulative Gain (NDCG)')
# plt.title('BM25 vs L2R')
# plt.yticks([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
# plt.show()


In [23]:
# Build the retriever from the trained L2R ranker and the ID -> text mapping.
l2r_retriever = L2RRetriever(l2r_ranker, doc_text)
query = "condoms"
# NOTE(review): the first return value is named `results_bm25` although it
# comes from the L2R retriever, and the commented-out plotting cells use that
# same name for BM25 metrics — consider renaming to avoid the collision.
results_bm25, ranked_doc = l2r_retriever.retrieve(query, top_k=2)
# Display the (doc_id, score) pairs returned alongside the retrieved results.
ranked_doc

[(4129, np.float64(0.23464614739350026)),
 (5664, np.float64(0.23464614739350026)),
 (4052, np.float64(0.2065058708902234)),
 (7191, np.float64(0.19162648007714966)),
 (11671, np.float64(0.19162648007714966)),
 (11542, np.float64(0.15860758248297863)),
 (8388, np.float64(0.15254568674778193)),
 (682, np.float64(0.1351106020217465)),
 (2898, np.float64(0.12875195617926338)),
 (10267, np.float64(0.10993917284483876)),
 (9256, np.float64(0.09484816611055788)),
 (8360, np.float64(0.0934706593433244)),
 (9633, np.float64(0.08075272735637218)),
 (9164, np.float64(0.07412859738081685)),
 (1877, np.float64(0.06829763546753892)),
 (8954, np.float64(0.06829763546753892)),
 (6795, np.float64(0.06829763546753892)),
 (7325, np.float64(0.06829763546753892)),
 (10299, np.float64(0.06829763546753892)),
 (4586, np.float64(0.06829763546753892)),
 (8144, np.float64(0.06829763546753892)),
 (8054, np.float64(0.06829763546753892)),
 (5743, np.float64(0.06829763546753892)),
 (10133, np.float64(0.068297635467

OPENAI

In [25]:
# Interactive RAG chatbot: each question is answered by the RAG system and
# followed by the top-ranked source documents (ID, score, URL).
#
# NOTE(review): `documents` was bound earlier to a one-element list (the text
# of the first document only); confirm that this is what RAGSystem should
# receive rather than the full corpus.
rag = RAGSystem(documents, l2r_retriever)
print("Welcome to the RAG Chatbot! Type 'exit' to quit.\n")

# Keep answering until the user types 'exit', as the banner promises; the
# original cell read a single input and sent even 'exit' to the RAG system.
while True:
    query = input("You:").strip()
    if query.lower() == 'exit':
        break
    if not query:
        # Nothing was asked; prompt again instead of issuing an empty query.
        continue
    answer, ranked_doc = rag.get_answer(query, top_k=2)
    print(f"Chatbot: {answer}\n")
    # List up to ten of the ranked documents behind the answer.
    for doc_id, score in ranked_doc[:10]:
        print(f"Document ID: {doc_id}, Score: {score}")
        print(doc_dict[doc_id])


Welcome to the RAG Chatbot! Type 'exit' to quit.

Chatbot: Condoms are a barrier method of birth control that help prevent the transmission of sexually transmitted infections (STIs) and reduce the chance of unplanned pregnancy. Male condoms are made of latex or other materials and are about 82% effective at preventing pregnancies, while female condoms are around 79% effective. They are highly effective at preventing the transmission of HIV and other STIs when used correctly during each sexual encounter. It's important to use water-based lubricants with condoms, as oil-based lubricants can weaken latex and cause breakage.

Document ID: 4129, Score: 0.23464614739350026
https://www.healthline.com/health/hiv/risks-sex-without-condoms/
Document ID: 5664, Score: 0.23464614739350026
https://www.healthline.com/health/birth-control/types-of-birth-control?utm_source=ReadNext
Document ID: 4052, Score: 0.2065058708902234
https://www.healthline.com/health/healthy-sex/lube-shopping-guide-types
Docum

ModuleNotFoundError: No module named 'pyspellchecker'