<a href="https://colab.research.google.com/github/T-Sunm/Text-Retrieval/blob/main/Text_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install datasets
! pip install tqdm



In [None]:
from datasets import load_dataset
# Load the MS MARCO dataset, configuration "v1.1"; splits are accessed by name below.
ds = load_dataset(path="microsoft/ms_marco", name="v1.1")

In [None]:
from tqdm import tqdm

# Text Retrieval Traditional

## Xây dựng danh sách câu truy vấn và tài liệu:

In [None]:
# Work on the test split.
subset = ds['test']

# Containers for the queries, their metadata, and the flattened passage corpus.
queries_infos = []
queries = []
corpus = []

for sample in tqdm(subset, desc=  "Processing get data in dataset"):
    # Only queries of type 'entity' are kept.
    if sample['query_type'] != 'entity':
        continue

    passages = sample['passages']
    passage_texts = passages["passage_text"]

    # Every sample's passages are appended, in order, to one shared corpus, so a
    # passage's global index is its position within the sample plus the corpus
    # length before this sample is added.
    offset = len(corpus)
    relevant_docs = [
        offset + position
        for position, selected in enumerate(passages["is_selected"])
        if selected == 1
    ]

    # Drop samples that have no selected (relevant) passage; their passages are
    # not added to the corpus either.
    if not relevant_docs:
        continue

    queries.append(sample['query'])
    queries_infos.append({
        'query_id': sample['query_id'],
        'query': sample['query'],
        'relevant_docs': relevant_docs,
    })

    # Append all of this sample's passages to the corpus.
    corpus.extend(passage_texts)


Processing get data in dataset: 100%|██████████| 9650/9650 [00:01<00:00, 6928.77it/s]


In [None]:
# Peek at the first five passages to sanity-check the flattened corpus.
corpus[:5]

['SUBPHYLUM CHELICERATA, CLASS ARACHNIDA. Spiders. This group contains many familiar organisms, including the spiders, mites, scorpions and ticks. Examine the large spider on the right. Again, notice that there are two body regions, a cephalothorax and an abdomen. On the cephalothorax are two to four pairs of simple eyes.',
 'The class Arachnida includes a diverse group of arthropods: spiders, scorpions, ticks, mites, harvestmen, and their cousins. Scientists describe over 75,000 species of arachnids, the majority of them spiders. Most arachnids are carnivorous, typically preying on insects, and terrestrial, living on land.',
 'Spiders belong to the phylum Arthropoda, along with insects and crustaceans. The order of spiders, Araneae—together with scorpions, harvestmen, and the large order of mites and ticks—make up the class of Arachnida. Spiders differ from other arachnids in having the body divided into cephalothorax and abdomen.',
 'The class Araneae contains the spiders, a large an

## Xây dựng hàm chuẩn hóa văn bản:

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the NLTK stopword lists that remove_stopword() reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Lowercase

In [None]:
def lowercase(text: str):
    """Return a copy of ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

## Punctuation removal

In [None]:
def punctual_removal(text: str):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

## Tokenizer

In [None]:
def tokenizer(text: str):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split(sep=None)
    return tokens

## Remove stopword

In [None]:
def remove_stopword(vocab_lst:list):
    """Return the tokens of ``vocab_lst`` that are not English stopwords.

    Args:
        vocab_lst: list of string tokens to filter.

    Returns:
        A new list with the same order as ``vocab_lst``, minus every token found
        in NLTK's English stopword list.
    """
    # A set gives constant-time membership tests; the original scanned the whole
    # stopword list once per token.
    stopword_set = set(stopwords.words('english'))
    return [vocab for vocab in vocab_lst if vocab not in stopword_set]

## Stemming

In [None]:
def stemming(vocab_lst: list):
    """Return a list holding the Porter stem of each token in ``vocab_lst``."""
    # A fresh PorterStemmer is created on every call; only its stem method is needed.
    stem = PorterStemmer().stem
    return list(map(stem, vocab_lst))

In [None]:
def preprocessing_text(text):
    """Normalise ``text`` and return its list of stemmed, stopword-free tokens.

    The steps run in this order: lowercase, strip punctuation, tokenize,
    remove stopwords, stem.
    """
    normalized = punctual_removal(lowercase(text))
    tokens = tokenizer(normalized)
    return stemming(remove_stopword(tokens))

# Smoke-test the full preprocessing pipeline on the first passage of the corpus.
test = preprocessing_text(corpus[0])
print(test)

['subphylum', 'chelicerata', 'class', 'arachnida', 'spider', 'group', 'contain', 'mani', 'familiar', 'organ', 'includ', 'spider', 'mite', 'scorpion', 'tick', 'examin', 'larg', 'spider', 'right', 'notic', 'two', 'bodi', 'region', 'cephalothorax', 'abdomen', 'cephalothorax', 'two', 'four', 'pair', 'simpl', 'eye']


## Xây dựng bộ từ vựng (dictionary)

In [None]:
def create_dict(corpus):
    """Build a vocabulary mapping each preprocessed term to a unique integer index.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        corpus: iterable of raw text documents.

    Returns:
        dict mapping term -> index.
    """
    term_to_index = {}
    for document in tqdm(corpus, desc="Preprocessing text and create dictionary..."):
        for term in preprocessing_text(document):
            # The current size of the mapping is the next free index; setdefault
            # only stores it when the term has not been seen before.
            term_to_index.setdefault(term, len(term_to_index))

    return term_to_index



# Build the term -> index vocabulary from every passage in the corpus.
dictionary = create_dict(corpus)



Preprocessing text and create dictionary...: 100%|██████████| 7303/7303 [00:12<00:00, 567.27it/s]


## Xây dựng ma trận document-term:
Ta sẽ sử dụng cấu trúc dữ liệu dict: key là từ muốn tìm, value là index. Khi tìm được từ, ta dùng index đó để cập nhật vào vị trí tương ứng trong mảng.

In [None]:
def vectorize(text, dictionary):
    """Turn ``text`` into a term-count vector over ``dictionary``.

    Args:
        text: raw text to preprocess and count.
        dictionary: mapping term -> position in the output vector.

    Returns:
        A NumPy array of length ``len(dictionary)`` where each position holds the
        number of occurrences of the corresponding term; terms that are not in
        ``dictionary`` are ignored.
    """
    terms = preprocessing_text(text)
    counts = np.zeros(len(dictionary))
    # -1 is the "not in vocabulary" sentinel.
    slots = (dictionary.get(term, -1) for term in terms)
    for slot in slots:
        if slot == -1:
            continue
        counts[slot] += 1
    return counts

def create_doc_terms(corpus, dictionary):
    """Vectorize every document of ``corpus`` against ``dictionary``.

    Returns:
        A list with one term-count vector per document, in corpus order.
    """
    return [
        vectorize(document, dictionary)
        for document in tqdm(corpus, desc = 'create docterms...')
    ]

# Document-term representation: one term-count vector per corpus passage.
docterms = create_doc_terms(corpus, dictionary)




create docterms...: 100%|██████████| 7303/7303 [00:14<00:00, 519.39it/s]


In [None]:
# Stack the per-document vectors into one array to inspect its shape
# (number of documents x vocabulary size).
print(np.array(docterms).shape)

(7303, 22111)


## Xây dựng hàm tính độ tương đồng giữa hai vector:

In [None]:
def compute_cosine_similarity(query, docterms):
    """Compute the cosine similarity between ``query`` and every row of ``docterms``.

    Args:
        query: 1-D vector of length ``d``.
        docterms: 2-D array (or sequence of equal-length 1-D vectors) with one
            document vector of length ``d`` per row.

    Returns:
        1-D float array with one similarity score per document. A score of 0.0
        is returned wherever the query or the document has zero norm, instead of
        the NaN (and runtime warning) a plain division would produce.
    """
    query = np.asarray(query, dtype=float)
    docterms = np.asarray(docterms, dtype=float)

    numerator = np.sum(query * docterms, axis=1)
    denominator = np.linalg.norm(query) * np.linalg.norm(docterms, axis=1)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # their initial value of 0.0 so all-zero vectors never yield NaN scores that
    # would corrupt a later sort.
    similarities = np.zeros_like(numerator, dtype=float)
    np.divide(numerator, denominator, out=similarities, where=denominator != 0)
    return similarities


In [None]:
# Rank the corpus against a sample query using bag-of-words cosine similarity.
query_lst = 'what is the official language in Fiji'
top_k = 10
vector_q = vectorize(query_lst,dictionary)

# The result and the loop variable are named `top_ranked` / `score` so that this
# cell never overwrites the `ranking()` and `similarity()` functions defined in
# the sentence-transformer section when cells are re-run out of order.
top_ranked = sorted(enumerate(compute_cosine_similarity(vector_q, docterms)), key=lambda x: x[1], reverse=True)[:top_k]

for top, (index, score) in enumerate(top_ranked, start= 1):
    print(f"Top {top} \n",corpus[index])


Top 1 
 The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.
Top 2 
 The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.
Top 3 
 The official languages. Fiji’s 1997 Constitution established Fijian as one of the official languages of the country. Fijian is an Austronesian language, a grouping that includes thousands of other languages spanning the globe. The language is of the Malayo-Polynesian family, not too different from Hawaiian and Maori.
Top 4 
 Of all the languages of Russia, Russian is the only official language. 

In [None]:
! pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1


In [None]:
pip install --upgrade torch torchvision torchaudio

  pid, fd = os.forkpty()


Collecting torch
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Downloading torchvision-0.19.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting torchaudio
  Downloading torchaudio-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Downloading nvidia_cublas_cu12-12

# Text Retrieval (Sentence transformer)

In [None]:
import torch
from sentence_transformers import SentenceTransformer

In [None]:
# Name of the pretrained sentence-transformer checkpoint to load.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

# Encode every passage once up front; the embeddings are returned as a tensor
# and reused for each query.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

In [None]:
from sentence_transformers import util

def similarity(query_embeddings, corpus_embeddings):
    """Return the first row of ``util.cos_sim(query_embeddings, corpus_embeddings)``.

    NOTE(review): presumably that row holds the cosine score of the (single)
    query against every corpus embedding - confirm against util.cos_sim.
    """
    score_matrix = util.cos_sim(query_embeddings, corpus_embeddings)
    first_row = score_matrix[0]
    return first_row

In [None]:
def ranking(query, corpus_embeddings, top_k = 10):
    """Return the best-scoring corpus entries for ``query``.

    Args:
        query: text to encode with the module-level ``model``.
        corpus_embeddings: embeddings of the corpus, as passed to ``similarity``.
        top_k: maximum number of results to return (default 10).

    Returns:
        The result of ``torch.topk`` over the similarity scores (exposes
        ``.values`` and ``.indices``). At most ``top_k`` results are returned;
        fewer when there are fewer scores than ``top_k``.
    """
    query_embeddings = model.encode(
        query,
        convert_to_tensor = True
    )

    cos_scores = similarity(query_embeddings, corpus_embeddings)

    # torch.topk raises when k exceeds the number of scores, so cap k at the
    # number of available scores.
    k = min(top_k, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=k)

    return top_results

In [None]:
# Rank the corpus against a sample query using sentence-transformer embeddings.
query = 'what is the official language in Fiji'
ranks = ranking(query, corpus_embeddings)

# Move the results off the compute device and convert them to NumPy arrays.
indicies = ranks.indices.cpu().numpy()
similarities = ranks.values.cpu().numpy()

# The loop variable is `score`, not `similarity`: the latter would overwrite the
# `similarity()` function that `ranking()` calls, breaking any later call.
for top, (index, score) in enumerate(zip(indicies, similarities), start= 1):
    print(f"Top {top} \n",corpus[index])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Top 1 
 The official languages. Fiji’s 1997 Constitution established Fijian as one of the official languages of the country. Fijian is an Austronesian language, a grouping that includes thousands of other languages spanning the globe. The language is of the Malayo-Polynesian family, not too different from Hawaiian and Maori.
Top 2 
 Fiji has three official languages under the 1997 constitution (and not revoked by the 2013 Constitution): English, Fijian and Hindi. Fijian is spoken either as a first or second language by indigenous Fijians who make up around 54% of the population.
Top 3 
 The Republic of the Fiji Islands citizens speak British English. Fijian and Fiji-Hindi is the second language. Other major language that is taught in elementary/primary schools and high schools are Urdu and French. Urdu and French is never considered to be a benefit to the people of Fiji.
Top 4 
 The Republic of the Fiji Islands citizens speak British English. Fijian and Fiji-Hindi is the second languag