### Search engine text version

In [1]:
import pandas as pd
import pickle
import numpy as np
import scipy.sparse as sparse
import nltk
import emoji
import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm, trange

In [2]:
class SearchEngine:
    """Vector-space search over a CSV document database.

    Three ranking modes are offered, all returning ``(row, score)`` tuples:

    * ``normal_query`` - query vector of raw term counts against
      ``normal_matrix``.
    * ``idf_query`` - query vector weighted by ``idf_map`` against
      ``idf_matrix``.
    * ``svd_query`` - query scored against a rank-``k`` truncation of a
      stored ``(u, s, v)`` decomposition.

    The term/document matrices are indexed ``[term, document]``: the query
    vector built by ``veccount_data`` has one row per vocabulary entry and is
    transposed before being multiplied with the matrix.
    """

    def __init__(self, db_filename: str, map_filename: str, normal_matrix: str, idf_map: str = None, idf_matrix: str = None, svd: str = None):
        """Load every artefact from disk.

        Args:
            db_filename: CSV file with one row per document.
            map_filename: Pickle holding the token -> row-index vocabulary.
            normal_matrix: ``.npz`` sparse matrix used by ``normal_query``.
            idf_map: Optional pickle, indexable by vocabulary row index,
                holding the per-term weight.
            idf_matrix: Optional ``.npz`` sparse matrix used by ``idf_query``.
                IDF support is enabled only when both IDF files are given.
            svd: Optional pickle holding a ``(u, s, v)`` triple.
        """
        self.db = pd.read_csv(db_filename)
        # SECURITY: pickle.load can execute arbitrary code. Only point the
        # engine at pickle files that come from a trusted source.
        with open(map_filename, 'rb') as f:
            self.vocab_map = pickle.load(f)
        if idf_map is None or idf_matrix is None:
            self.idf_map = None
            self.idf_matrix = None
        else:
            with open(idf_map, 'rb') as f:
                self.idf_map = pickle.load(f)
            self.idf_matrix = sparse.load_npz(idf_matrix)
        self.normal_matrix = sparse.load_npz(normal_matrix)
        # Per-k cache of document norms in the truncated space; always defined
        # so that attribute access never depends on which files were loaded.
        self.svd_cache = {}
        if svd is None:
            self.svd = None
            self.max_k = -1
        else:
            with open(svd, 'rb') as f:
                self.svd = pickle.load(f)
            self.max_k = self.svd[1].shape[0]

    def normal_query(self, q: str, size: int = 10):
        """Rank documents for ``q`` using raw term counts.

        Returns:
            Up to ``size`` ``(row, score)`` tuples, best match first.
        """
        query_vec = self.veccount_data([self._preprocess_text(q)])
        scores = (query_vec.T @ self.normal_matrix).toarray()[0]
        return self.scores_to_query(scores, size)

    def idf_query(self, q: str, size: int = 10):
        """Rank documents for ``q`` using IDF-weighted terms.

        Raises:
            AttributeError: If the IDF files were not supplied at construction.
        """
        if self.idf_map is None:
            raise AttributeError('Could not find IDF dataset')
        query_vec = self.veccount_data([self._preprocess_text(q)], idf_map=self.idf_map)
        scores = (query_vec.T @ self.idf_matrix).toarray()[0]
        return self.scores_to_query(scores, size)

    def svd_query(self, q: str, size: int = 10, k: int = 100, use_idf=False):
        """Rank documents for ``q`` against a rank-``k`` SVD approximation.

        Args:
            q: Raw query text.
            size: Number of results to return.
            k: Number of singular components to keep (``1 <= k <= max_k``).
            use_idf: Weight the query vector with ``idf_map``.

        Raises:
            AttributeError: If the SVD file was not loaded, or ``use_idf`` is
                set but no IDF map was loaded.
            ValueError: If ``k`` is outside ``1..max_k``.
        """
        if self.svd is None:
            raise AttributeError('Could not find SVD dataset')
        if self.max_k < k:
            raise ValueError('K value is larger than maximal allowed')
        if k < 1:
            raise ValueError('K value must be a positive integer')
        if use_idf and self.idf_map is None:
            # Previously this silently fell back to raw counts.
            raise AttributeError('Could not find IDF dataset')

        content = self._preprocess_text(q)
        query_vec = self.veccount_data([content], idf_map=self.idf_map if use_idf else None)

        u, s, v = self.svd
        s_k = np.asarray(s[:k], dtype=float).reshape(-1)
        v_k = np.asarray(v[:k, :])
        # (q^T U_k) diag(s_k) V_k, evaluated left to right so the k x N scaled
        # matrix is not rebuilt for every query.
        projected = np.asarray(query_vec.T @ u[:, :k]).reshape(-1) * s_k
        scores = np.asarray(projected @ v_k, dtype=float).reshape(-1)

        if k not in self.svd_cache:
            self.svd_cache[k] = np.linalg.norm(s_k.reshape(-1, 1) * v_k, axis=0)
        norms = self.svd_cache[k]
        # Documents with a zero norm would yield NaN, and argsort places NaN
        # last, i.e. at the top of the ranking. Give them a score of 0 instead.
        scores = np.divide(scores, norms, out=np.zeros_like(scores), where=norms > 0)
        return self.scores_to_query(scores, size)

    def scores_to_query(self, scores, size):
        """Return the ``size`` best-scoring database rows, best first.

        Args:
            scores: One score per row of ``self.db``.
            size: Maximum number of results; non-positive values give ``[]``.

        Returns:
            List of ``(row, score)`` tuples where ``row`` is a pandas Series.
        """
        # ``top[-0:]`` is the whole array, so size == 0 must be handled apart.
        if size <= 0:
            return []
        scores = np.asarray(scores)
        best = np.argsort(scores)[::-1][:size]
        hits = self.db.iloc[best]
        return [
            (row, score)
            for (_, row), score in zip(hits.iterrows(), scores[best])
        ]

    @staticmethod
    def _preprocess_text(document, drop_numbers=False, drop_punctuations=True, drop_stopwords=True,
                        stem=True, stop_words=None, drop_potential_links=False, transform_emoji=True,
                        drop_nonascii=True):
        """Turn raw text into a list of tokens.

        Steps, each controlled by its flag and applied in this order: emoji to
        text names, punctuation removal, digit removal, non-ASCII removal,
        tokenisation, Porter stemming, stop-word removal, link removal.

        Returns:
            The list of processed tokens.
        """
        if transform_emoji:
            document = emoji.demojize(document)
        if drop_punctuations:
            document = document.translate(str.maketrans('', '', string.punctuation))
        if drop_numbers:
            document = document.translate(str.maketrans('', '', string.digits))
        if drop_nonascii:
            document = document.encode('ascii', errors='ignore').decode()
        tokens = word_tokenize(document)
        if stem:
            porter = PorterStemmer()
            tokens = [porter.stem(token) for token in tokens]
        if drop_stopwords:
            # NOTE(review): stop words are matched against *stemmed* tokens.
            # The order is kept as-is because queries presumably must be
            # processed the same way as the indexed corpus - confirm before
            # changing.
            if stop_words is None:
                stop_words = stopwords.words('english')
            blocked = frozenset(stop_words)  # O(1) membership per token
            tokens = [token for token in tokens if token not in blocked]
        if drop_potential_links:
            tokens = [token for token in tokens if 'http' not in token]
        return tokens

    def veccount_data(self, data, normalize=True, idf_map=None):
        """Vectorise tokenised documents into a sparse term/document matrix.

        Args:
            data: Sequence of token lists, one per document.
            normalize: Scale every column to unit Euclidean length.
            idf_map: Optional per-term weights indexed by vocabulary row
                index; every occurrence counts as 1 when omitted.

        Returns:
            CSC matrix of shape ``(len(vocab_map), len(data))``. A document
            with no in-vocabulary token yields an all-zero column.
        """
        vocab = self.vocab_map
        shape = (len(vocab), len(data))
        row_parts, col_parts, val_parts = [], [], []
        for col, doc in enumerate(data):
            weights = {}
            for token in doc:
                if token in vocab:
                    idx = vocab[token]
                    step = 1.0 if idf_map is None else idf_map[idx]
                    weights[idx] = weights.get(idx, 0.0) + step
            if not weights:
                continue
            rows = np.fromiter(weights.keys(), dtype=np.intp, count=len(weights))
            vals = np.fromiter(weights.values(), dtype=float, count=len(weights))
            if normalize:
                norm = np.linalg.norm(vals)
                # A zero norm used to turn the whole column into NaN.
                if norm > 0:
                    vals = vals / norm
            keep = vals != 0
            row_parts.append(rows[keep])
            val_parts.append(vals[keep])
            col_parts.append(np.full(int(keep.sum()), col, dtype=np.intp))
        if not val_parts:
            return sparse.csc_matrix(shape, dtype=float)
        triplets = (
            np.concatenate(val_parts),
            (np.concatenate(row_parts), np.concatenate(col_parts)),
        )
        return sparse.csc_matrix(triplets, shape=shape, dtype=float)

In [3]:
# Build the engine. Positional arguments map, in order, to: document CSV,
# vocabulary pickle, count matrix, IDF map pickle, IDF matrix, SVD pickle.
# Note that the SVD pickle path has no 'OPT/' prefix, unlike the other files.
engine = SearchEngine('OPT/smalldb.csv', 'OPT/map.pickle', 'OPT/count.npz', 'OPT/idf_map.pickle', 'OPT/idf.npz', 'svd.pickle')

In [4]:
# SVD query: size=5 results, k=500 singular components, use_idf=True.
vw = engine.svd_query('how to code', 5, 500, True)

In [5]:
# Display the (row, score) tuples returned by the query above.
vw

[(Unnamed: 0                                                20825
  title         PSA: To new programmers or to those new to pos...
  subreddit                                           learnpython
  url           https://www.reddit.com/r/learnpython/comments/...
  body          I've had some spare time to parse this subredd...
  Name: 20825, dtype: object,
  0.0603345054005201),
 (Unnamed: 0                                                49414
  title               An explanation of Morse Code with examples.
  subreddit                                           raidsecrets
  url           https://www.reddit.com/r/raidsecrets/comments/...
  body          The topic of morse code has exploded over the ...
  Name: 49414, dtype: object,
  0.05924570766698998),
 (Unnamed: 0                                                48161
  title                                   LET US CHANGE OUR NAMES
  subreddit                                               spotify
  url           https://www.reddit.

In [36]:
# Display `vw` again. The execution count (36) and the different ranking below
# suggest `vw` was reassigned by cells that are not shown here.
vw

[(Unnamed: 0                                                49414
  title               An explanation of Morse Code with examples.
  subreddit                                           raidsecrets
  url           https://www.reddit.com/r/raidsecrets/comments/...
  body          The topic of morse code has exploded over the ...
  Name: 49414, dtype: object,
  0.05439759497882125),
 (Unnamed: 0                                                20825
  title         PSA: To new programmers or to those new to pos...
  subreddit                                           learnpython
  url           https://www.reddit.com/r/learnpython/comments/...
  body          I've had some spare time to parse this subredd...
  Name: 20825, dtype: object,
  0.04627861711400796),
 (Unnamed: 0                                                22981
  title         Just a simple piece of advice as I code more e...
  subreddit                                      learnprogramming
  url           https://www.reddit