## Import Dependencies 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

from src.utils.construct_positional_indexes import construct_positional_indexes
from src.utils.load_index import load_index
from src.models.TextPreprocessor import PersianTextPreprocessor
from src.enums import *

In [3]:
manager = construct_positional_indexes()
manager.save_index("manager.pickle")

100%|██████████| 1572/1572 [00:08<00:00, 180.68it/s]
100%|██████████| 1572/1572 [00:09<00:00, 161.49it/s]


In [5]:
text_preprocessor = PersianTextPreprocessor()
manager = load_index("manager.pickle", text_preprocessor)

In [6]:
manager.corpus_index.get_posting_list("با")[0].get_positions(Fields.TEXT)

[278, 313, 330, 477]

In [7]:
word = "آقو"

In [8]:
manager.correct_word(word)

'آقوس'

In [9]:
manager.search("نیمه شعبان")

[5619,
 3666,
 5236,
 6208,
 7123,
 6177,
 3665,
 6791,
 3360,
 6475,
 4438,
 7101,
 6626,
 6548,
 4335]

In [10]:
manager.search("نیمه شعبان")

[5619,
 3666,
 5236,
 6208,
 7123,
 6177,
 3665,
 6791,
 3360,
 6475,
 4438,
 7101,
 6626,
 6548,
 4335]

In [5]:
manager.search("نیمه شعبان")

[5619,
 3666,
 5236,
 6208,
 7123,
 6177,
 3665,
 6791,
 3360,
 6475,
 4438,
 7101,
 6626,
 6548,
 4335]

In [14]:
manager.add_document_to_indexes("./data/Add.xml", 7157)

In [11]:
manager.search({Fields.TITLE: '"آقا علیرضا"'})

[7157]

In [12]:
manager.delete_document_from_indexes("./data/Add.xml", 7157)

In [15]:
print(manager.corpus_index.index["آقا"].posting_list[-1])


        Document ID: 7157
        Title Positions: [0]
        Text Positions: [0]
        


In [16]:
manager.corpus_index.index["آقا"].doc_frequency

{<Fields.TEXT: 'text'>: 62, <Fields.TITLE: 'title'>: 1}

In [17]:
from src.metrics import (
    f_measure,
    precision,
    r_precision,
    recall,
    ndcg_at_k,
    average_precision,
    evaluate_search_engine,
)

In [22]:
evaluate_search_engine(manager, Methods.LTN_LNN)

r_precision: 0.6105804602529432
ndcg: 0.6787020126128037
f_measure: 0.6279770638289564
map: 0.7569273771354816


In [None]:
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [3]:
from typing import Optional, List

from src.enums import Fields
from src.models.Document import Document
from src.types import DocID
from src.utils.read_document import read_documents_json
from src.models.TextPreprocessor import PersianTextPreprocessor, EnglishTextPreprocessor
from src.models.Manager import Manager

import numpy as np
from collections import Counter

In [4]:
text_preprocessor = EnglishTextPreprocessor(lemmatize=True)

In [5]:
train_documents = read_documents_json('./MIR_Phase2/data/train.json', text_preprocessor)

In [6]:
val_documents = read_documents_json('./MIR_Phase2/data/validation.json', text_preprocessor)

In [7]:
manager = Manager(train_documents, [Fields.BODY, Fields.TITLE], text_preprocessor)

100%|██████████| 24000/24000 [00:08<00:00, 2999.91it/s]


In [8]:
def cosine_normalize(mat: np.ndarray):
    return mat / np.linalg.norm(mat, axis=1)[:, np.newaxis]

In [9]:
fields = [Fields.TITLE, Fields.BODY]
num_terms = len(manager.corpus_index.index)
num_train_docs = len(manager.documents)
num_val_docs = len(val_documents)

In [10]:
train_matrix = np.zeros((num_train_docs, num_terms), dtype=np.float32)
val_matrix = np.zeros((num_val_docs, num_terms), dtype=np.float32)

In [11]:
for index, (token, token_item) in enumerate(manager.corpus_index.index.items()):
    for field in fields:
        df = token_item.doc_frequency[field]
        if df == 0:
            continue
        idf = np.log10(num_train_docs/df)
        for posting_list_item in token_item.posting_list:
            doc_id = posting_list_item.doc_id
            tf = posting_list_item.get_tf(field)
            train_matrix[doc_id, index] += tf * idf

In [12]:
normalized_train_matrix = cosine_normalize(train_matrix)

In [13]:
val_documents_counts = {}
for field in fields:
    val_documents_counts[field] = []
    for doc in val_documents:
        val_documents_counts[field].append(Counter(doc.get_tokens(field)))

In [14]:
for token_index, (token, token_item) in enumerate(manager.corpus_index.index.items()):
    for field in fields:
        df = token_item.doc_frequency[field]
        if df == 0:
            continue
        idf = np.log10(num_train_docs/df)
        for doc_index, count in enumerate(val_documents_counts[field]):
            if token in count:
                val_matrix[doc_index, token_index] += idf * count[token]

In [18]:
normalized_val_matrix = cosine_normalize(val_matrix).T

In [None]:
# similarities =  normalized_train_matrix @ normalized_val_matrix.T

In [19]:
import numba as nb
from numba import njit

@njit(nb.float32[:,:](nb.float32[:,:], nb.float32[:,:]), fastmath=True)
def calc_similarities(normalized_train_matrix, normalized_val_matrix):
    sims = np.dot(normalized_train_matrix, normalized_val_matrix)
    return sims

  sims = np.dot(normalized_train_matrix, normalized_val_matrix)


In [20]:
res = calc_similarities(normalized_train_matrix, normalized_val_matrix)

In [21]:
res.shape

(24000, 3000)

In [22]:
x = np.argmax(res, axis=0)

In [24]:
x[0]

13659

In [62]:
documents[13659].category

4

In [26]:
for doc_id, train_doc in manager.documents.items():
    for field in fields:
        for token in train_doc.get_tokens(field):
            print(manager.corpus_index.get_token_item(token).term_frequency[field])
        break
    break

1
69
10
25
2500
1
120


In [None]:
def run_knn(doc: Document, k:int = 1, distance: str = 'cosine'):
    

In [16]:
x = manager.corpus_index.get_posting_list("god")[0]

In [17]:
x.body_tf

1

In [18]:
x.doc_id

8

In [19]:
x.title_positions

[]