In [6]:
from google.colab import drive
drive.mount('/content/drive')

address = "MIRNews1401"
import sys
sys.path.append('/content/drive/My Drive/{}'.format(address))
%cd /content/drive/My\ Drive/$address

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1uNRlfx2_FJDSXd8hfzc-wkVrmSePMe3-/MIRNews1401


In [45]:
# Required installations
# -------------------------------------------
!pip install hazm
!pip install sentence_transformers
# -------------------------------------------

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [46]:
# Required imports
# -------------------------------------------
import json
import math
from tqdm import tqdm
from hazm import *
import numpy as np
import pandas as pd
from collections import Counter
import numexpr as ne
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from gensim.models.fasttext import FastText
from sentence_transformers import util, SentenceTransformer
# -------------------------------------------

In [47]:
# Required Variable Initializations (Loads, Downloads, ...)
# -------------------------------------------
stop_words = stopwords_list()
normalizer = Normalizer()
lemmatizer = Lemmatizer()
news_details = np.load("./data/news_details.npy", allow_pickle=True)
frequency_matrix = np.load("./data/boolean_frequency_matrix.npy")
unique_token_list = np.load("./data/boolean_unique_token_list.npy")
boolean_matrix = frequency_matrix.copy()
boolean_matrix[boolean_matrix > 0] = 1
transformer_model = SentenceTransformer('m3hrdadfi/bert-fa-base-uncased-wikinli-mean-tokens')
transformer_doc_embeddings = np.load("./data/transformer_doc_embeddings.npy")
tfidf_vectorizer = TfidfVectorizer(smooth_idf=False, norm="l2")
tfidf_vocabulary = json.load(open('./data/tfidf_vocabulary.json', mode = 'r'))
tfidf_idf = np.load("./data/tfidf_idf.npy")
tfidf_vectorizer.idf_ = tfidf_idf
tfidf_vectorizer.vocabulary_ = tfidf_vocabulary
tfidf_embeddings = sparse.load_npz("./data/tfidf_embeddings.npz")
ft_model = FastText.load("./data/fasttext.model")
features = tfidf_vectorizer.get_feature_names_out()
idfs = tfidf_vectorizer.idf_
embedding_size = 100
fast_weighted_embeddings = np.load("./data/fast_weighted_embeddings.npy")
frequency_matrix_df = pd.DataFrame(data = frequency_matrix, 
                                   index = unique_token_list, 
                                   columns = list(range(frequency_matrix.shape[1])))
boolean_matrix_df = pd.DataFrame(data = boolean_matrix, 
                                 index = unique_token_list, 
                                 columns = list(range(frequency_matrix.shape[1])))
# -------------------------------------------



In [48]:
# Required Functions (Variables in the previous block must be initialized first)
# -------------------------------------------
def preprocess_text(document):
        document = normalizer.normalize(document)
        
        tokens = word_tokenize(document)
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [word for word in tokens if len(word) > 2]

        preprocessed_text = ' '.join(tokens)

        return preprocessed_text



def boolean_retrieval(boolean_matrix_df, frequency_matrix_df, k, query_string, not_raw_tokens=[]):

    # get all query tokens
    query_raw_tokens = word_tokenize(query_string)

    # remove stop words
    query_tokens = [lemmatizer.lemmatize(token) for token in query_raw_tokens if token not in stop_words]

    # remove stop words from not_tokens
    not_tokens = [lemmatizer.lemmatize(token) for token in not_raw_tokens if token not in stop_words]

    # get all unique tokens which are already in our readmes 
    unique_tokens = boolean_matrix_df.index.values.tolist()

    # available tokens in query
    available_query_tokens = [token for token in query_tokens if token in unique_tokens]

    # only choose specific rows of df which correspond to query tokens
    summary_df = boolean_matrix_df.loc[available_query_tokens]
    summary_frequency_df = frequency_matrix_df.loc[available_query_tokens]

    # available tokens in not_tokens
    available_not_tokens = [token for token in not_tokens if token in unique_tokens]

    # df consisting of only available_not_tokens
    summary_not_df = boolean_matrix_df.loc[available_not_tokens]
    arr = summary_not_df.values
    summary_not_df = pd.DataFrame(ne.evaluate('1 - arr'), columns=summary_not_df.columns, index=summary_not_df.index)
    summary_not_frequency_df = frequency_matrix_df.loc[available_not_tokens]
    arr = summary_not_frequency_df.values
    summary_not_frequency_df = pd.DataFrame(ne.evaluate('0 - arr'), columns=summary_not_frequency_df.columns, index=summary_not_df.index)

    # specify boolean rating
    rating_1 = summary_df.sum().tolist()
    rating_2 = summary_not_df.sum().tolist()
    boolean_rating = [x + y for x, y in zip(rating_1, rating_2)]

    # specify frequency rating
    rating_1 = summary_frequency_df.sum().tolist()
    rating_2 = summary_not_frequency_df.sum().tolist()
    frequency_rating = [x + y for x, y in zip(rating_1, rating_2)]

    # sort indices based on ratings
    raw_indices = list(range(len(boolean_matrix_df.columns)))
    list_of_tuples = [(bool_item, freq_item) for bool_item, freq_item in zip(boolean_rating, frequency_rating)]
    zipped = zip(list_of_tuples, raw_indices)
    sorted_indices = [item for _, item in sorted(zipped, key=lambda pair: pair[0], reverse=True)]
    
    return sorted_indices[:k]


def query_expansion_embedding(query_emb, rel_list, non_rel_list):
    a = 1
    b = 0.8
    c = 0.1
    resault = a * query_emb + b * np.sum(rel_list)/len(rel_list) - c * np.sum(non_rel_list)/len(non_rel_list)
    return resault

def boolean_search(text, K):
    my_indices = boolean_retrieval(boolean_matrix_df, frequency_matrix_df, K, text)
    return my_indices

def transformer_search(text, K, expansion=False):
    query = preprocess_text(text)
    transformer_query_embedding = transformer_model.encode(query)
    transformer_list_sim = util.cos_sim(transformer_query_embedding, transformer_doc_embeddings)
    if expansion:
        sorted_list_sim = np.argsort(np.array(transformer_list_sim[0]))
        most_similar_emb = [transformer_doc_embeddings[i] for i in sorted_list_sim[::-1][:10]]
        least_similar_emb = [transformer_doc_embeddings[i] for i in sorted_list_sim[:][:10]]
        expanded = query_expansion_embedding(transformer_query_embedding, most_similar_emb, least_similar_emb)
        transformer_list_sim = util.cos_sim(expanded, transformer_doc_embeddings)
    
    transformer_most_similar_doc_indices = np.argsort(np.array(transformer_list_sim[0]))[::-1][:K]
    return transformer_most_similar_doc_indices

def tfidf_search(text, K, expansion=False):
    query = [preprocess_text(text)]
    tfidf_query_vec = tfidf_vectorizer.transform(query)
    tfidf_similarities = cosine_similarity(tfidf_embeddings, tfidf_query_vec).flatten()
    if expansion:
        sorted_sim = np.argsort(tfidf_similarities, axis=0)
        most_similar_emb = [tfidf_embeddings[i] for i in sorted_sim[:-11:-1]]
        least_similar_emb = [tfidf_embeddings[i] for i in sorted_sim[:10]]
        expanded = query_expansion_embedding(tfidf_query_vec, most_similar_emb, least_similar_emb)
        tfidf_similarities = cosine_similarity(tfidf_embeddings, expanded).flatten()

    tfidf_most_similar_doc_indices = np.argsort(tfidf_similarities, axis=0)[:-K-1:-1]
    return tfidf_most_similar_doc_indices

def query_expansion_embedding(query_emb, rel_list, non_rel_list):
    a = 1
    b = 0.8
    c = 0.1
    resault = a * query_emb + b * np.sum(rel_list)/len(rel_list) - c * np.sum(non_rel_list)/len(non_rel_list)
    return resault

def boolean_search(text, K):
    my_indices = boolean_retrieval(boolean_matrix_df, frequency_matrix_df, K, text)
    return my_indices

def transformer_search(text, K, expansion=False):
    query = preprocess_text(text)
    transformer_query_embedding = transformer_model.encode(query)
    transformer_list_sim = util.cos_sim(transformer_query_embedding, transformer_doc_embeddings)
    if expansion:
        sorted_list_sim = np.argsort(np.array(transformer_list_sim[0]))
        most_similar_emb = [transformer_doc_embeddings[i] for i in sorted_list_sim[::-1][:10]]
        least_similar_emb = [transformer_doc_embeddings[i] for i in sorted_list_sim[:][:10]]
        expanded = query_expansion_embedding(transformer_query_embedding, most_similar_emb, least_similar_emb)
        transformer_list_sim = util.cos_sim(expanded, transformer_doc_embeddings)
    
    transformer_most_similar_doc_indices = np.argsort(np.array(transformer_list_sim[0]))[::-1][:K]
    return transformer_most_similar_doc_indices

def tfidf_search(text, K, expansion=False):
    query = [preprocess_text(text)]
    tfidf_query_vec = tfidf_vectorizer.transform(query)
    tfidf_similarities = cosine_similarity(tfidf_embeddings, tfidf_query_vec).flatten()
    if expansion:
        sorted_sim = np.argsort(tfidf_similarities, axis=0)
        most_similar_emb = [tfidf_embeddings[i] for i in sorted_sim[:-11:-1]]
        least_similar_emb = [tfidf_embeddings[i] for i in sorted_sim[:10]]
        expanded = query_expansion_embedding(tfidf_query_vec, most_similar_emb, least_similar_emb)
        tfidf_similarities = cosine_similarity(tfidf_embeddings, expanded).flatten()

    tfidf_most_similar_doc_indices = np.argsort(tfidf_similarities, axis=0)[:-K-1:-1]
    return tfidf_most_similar_doc_indices

def ft_weighted_search(text, K, expansion):
    changed_text = preprocess_text(text)
    init_emb = np.zeros(embedding_size)
    for word in word_tokenize(changed_text):
        try:
            emb = ft_model.wv[word]
            resault = np.where(features == word)[0]
            if resault.size > 0:
                init_emb += idfs[resault[0]] * emb / np.linalg.norm(emb)
        except:
            pass
    ft_similarities = cosine_similarity(fast_weighted_embeddings, init_emb.reshape(1, -1)).flatten()
    if expansion:
        sorted_sim = np.argsort(ft_similarities, axis=0)
        most_similar_emb = [fast_weighted_embeddings[i] for i in sorted_sim[:-11:-1]]
        least_similar_emb = [fast_weighted_embeddings[i] for i in sorted_sim[:10]]
        expanded = query_expansion_embedding(init_emb, most_similar_emb, least_similar_emb)
        ft_similarities = cosine_similarity(fast_weighted_embeddings, expanded.reshape(1, -1)).flatten()

    ft_most_similar_doc_indices = np.argsort(ft_similarities, axis=0)[:-K-1:-1]
    return ft_most_similar_doc_indices


def query_search(query_text, output_num, search_type, expansion=False):
    result_indices = []
    if search_type == 'boolean':
        if expansion:
            pass
        result_indices = boolean_search(query_text, output_num)
    elif search_type == 'transformer':
        result_indices = transformer_search(query_text, output_num, expansion)
    elif search_type == 'tfidf':
        result_indices = tfidf_search(query_text, output_num, expansion)
    elif search_type == 'fasttext':
        result_indices = ft_weighted_search(query_text, output_num, expansion)
    
    result = [news_details[i] for i in result_indices]
    return result
# -------------------------------------------

In [49]:
# Examples
# -------------------------------------------
query = 'تیم فوتبال پرسپولیس'
print([i['url'] for i in query_search(query, 10, 'fasttext', True)])
print([i['url'] for i in query_search(query, 10, 'fasttext', False)])
print([i['title'] for i in query_search(query, 10, 'fasttext', False)])
# -------------------------------------------

['https://www.yjc.news/fa/news/8177512/تاریخ-مسابقات-لیگ-برتر-و-اردوی-تیم-ملی-فوتبال-اعلام-شد', 'https://www.yjc.news/fa/news/8162628/سمیعی-باشگاه-پانتولیکوس-یونان-از-استقلال-شکایت-کرده-است#comments', 'https://www.yjc.news/fa/news/8170441/رکورد-شکنی-فرزانه-فصیحی-در-ماده-۱۰۰-متر-بانوان', 'https://www.yjc.news/fa/news/8177591/نمایندگان-اسکیت-ایران-در-مسابقات-کاپ-آزاد-ایتالیا-طلایی-شدند', 'https://www.yjc.news/fa/news/8172584/لیگ-ملت\u200cهای-والیبال-۲۰۲۲-ایتالیا-میزبان-مرحله-پایانی-شد', 'http://www.yjc.news/fa/news/8175327/صلاح-در-لیورپول-ماندنی-شد&via=yjcagency', 'https://www.yjc.news/fa/news/8171357/نتایج-دوندگان-کشورمان-در-مسابقات-دو-و-میدانی-جام-کازانوف', 'https://www.yjc.news/fa/news/8172660/برگزاری-فینال-لیگ-دسته-یک-کشتی-همزمان-با-تولد-آقا-تختی', 'https://www.yjc.news/fa/news/8188227/شکسته-شدن-۳-رکورد-ملی-در-رقابت\u200cهای-دو-و-میدانی-قهرمانی-باشگاه\u200cهای-کشور', 'https://www.yjc.news/fa/news/8172592/اعزام-اسنوکرباز\u200cهای-جوان-ایران-به-مسابقات-جهانی']
['https://www.yjc.news/fa