In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from collections import defaultdict
from typing import Iterable, Optional

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import numpy as np

from utils import DBClients

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import csr_matrix

In [4]:
def gen_search_index_keys(stems: Iterable[str]) -> list[str]:
    return [f"devstop:index:{stem}" for stem in stems]

def get_stem_from_key(key: str) -> str:
    return key.split(":")[2]

In [5]:
def index_lookup(stems: Iterable[str]) -> dict[str, set[str]]:
    keys = gen_search_index_keys(stems)
    print(keys)
    redis_client = DBClients.redis_client()
    index_lkp = {}
    for key in keys:
        result = redis_client.smembers(key)
        if result:
            index_lkp[get_stem_from_key(key)] = {doc_id.decode() for doc_id in result}
    return index_lkp

In [6]:
def gen_freq_keys(lookup: dict[str, set[str]]) -> list[str]:
    return [f"devstop:freq:{stem}:{doc}" for stem, docs in lookup.items() for doc in docs]
    
def get_stem_doc_from_key(key: str) -> tuple[str, str]:
    split_key = key.split(":")
    return split_key[2], split_key[3]

In [7]:
def frequency_lookup(lookup: dict[str, set[str]]) -> dict[str, str]:
    keys = gen_freq_keys(lookup)
    # print(keys)
    redis_client = DBClients.redis_client()
    results = redis_client.mget(keys)
    
    # woi --> Words of interests
    docs_with_woi = defaultdict(str)
    for key, result in zip(keys, results):
        stem, doc = get_stem_doc_from_key(key)
        docs_with_woi[doc] += " " + " ".join([stem] * int(result))
    return docs_with_woi

In [8]:
def get_query_stems(query: str) -> list[str]:
    stpwords = stopwords.words("english")
    tokens = word_tokenize(query)
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(tok) for tok in tokens if tok not in stpwords]

In [9]:
def gen_tfidf_matrix(docs_with_woi: dict[str, str], stemmed_query: str) -> csr_matrix:
    corpus = list(docs_with_woi.values()) + [stemmed_query]
    tfidf_vec = TfidfVectorizer()
    doc_term_matrix = tfidf_vec.fit_transform(corpus)
    return doc_term_matrix

In [10]:
def query_processor(query: str, num_results: int = 10) -> list[tuple[str, float]]:
    stems = get_query_stems(query)
    index_lkp = index_lookup(set(stems))
    docs_with_woi = frequency_lookup(index_lkp)
    tfidf_matrix = gen_tfidf_matrix(docs_with_woi, " ".join(stems))
    cosine_sim = cosine_similarity(tfidf_matrix[0:-1], tfidf_matrix[-1]).flatten()
    sorted_docs = list(sorted(zip(docs_with_woi.keys(), cosine_sim), key=lambda x: x[1], reverse=True))
    return sorted_docs[:min(num_results, len(sorted_docs))]

In [11]:
def get_document(doc_id: str | list[str]) -> list[dict]:
    assert doc_id, "doc_id cannot be empty"
    if isinstance(doc_id, str):
        doc_id = [doc_id]
    
    reddit_docs = [id.split("-")[1] for id in doc_id if id.startswith("reddit")]
    so_docs = [int(id.split("-")[1]) for id in doc_id if id.startswith("so")]
    
    mongo_client = DBClients.mongo_client()
    reddit_collection = mongo_client["devstop"]["reddit_submissions"]
    so_collection = mongo_client["devstop"]["so_questions"]
    
    reddit_results = reddit_collection.find({"_id": {"$in": reddit_docs}}) if reddit_docs else []
    so_results = so_collection.find({"_id": {"$in": so_docs}}) if so_docs else []
    
    all_results = []
    for doc in reddit_results:
        all_results.append({"doc_id": f"reddit-{doc['_id']}", "title": doc["title"], "url": f"https://reddit.com{doc['permalink']}"})
    for doc in so_results:
        all_results.append({"doc_id": f"so-{doc['_id']}", "title": doc["title"], "url": doc["link"]})
    
    return list(sorted(all_results, key=lambda x: doc_id.index(x["doc_id"])))

In [14]:
top_hits = query_processor("function variable looping")
top_hits

['devstop:index:variable', 'devstop:index:function', 'devstop:index:looping']


[('reddit-t3_gdzu6e', 1.0000000000000002),
 ('so-6967463', 1.0000000000000002),
 ('so-13694034', 1.0000000000000002),
 ('so-9044084', 1.0000000000000002),
 ('so-3162271', 1.0000000000000002),
 ('so-41707229', 0.981144197450551),
 ('so-2081836', 0.9751160439809308),
 ('so-26666919', 0.9751160439809308),
 ('so-5518435', 0.9741339462621095),
 ('so-869885', 0.9723304330440237)]

In [17]:
get_document([hit[0] for hit in top_hits])

[{'doc_id': 'reddit-t3_gdzu6e',
  'title': "Holy heck I'm addicted.",
  'url': 'https://reddit.com/r/learnpython/comments/gdzu6e/holy_heck_im_addicted/'},
 {'doc_id': 'so-6967463',
  'title': 'Iterating over a numpy array',
  'url': 'https://stackoverflow.com/questions/6967463/iterating-over-a-numpy-array'},
 {'doc_id': 'so-13694034',
  'title': 'Is a Python list guaranteed to have its elements stay in the order they are inserted in?',
  'url': 'https://stackoverflow.com/questions/13694034/is-a-python-list-guaranteed-to-have-its-elements-stay-in-the-order-they-are-inse'},
 {'doc_id': 'so-9044084',
  'title': 'Efficient date range overlap calculation?',
  'url': 'https://stackoverflow.com/questions/9044084/efficient-date-range-overlap-calculation'},
 {'doc_id': 'so-3162271',
  'title': 'Get loop count inside a for-loop',
  'url': 'https://stackoverflow.com/questions/3162271/get-loop-count-inside-a-for-loop'},
 {'doc_id': 'so-41707229',
  'title': 'tqdm printing to newline',
  'url': 'ht