In [None]:
# Install the third-party packages imported by the cells below
# (whoosh, pytrec_eval, wget, gensim, nltk, beautifulsoup4).
# gensim is upgraded to the latest release; nltk is upgraded into the user site.
!pip install whoosh
!pip install pytrec_eval
!pip install wget
!pip install --upgrade gensim
!pip install --user -U nltk
!pip install beautifulsoup4

In [None]:
import os

import wget

# Files fetched into the current working directory: (source URL, local filename).
_DATASET_DOWNLOADS = (
    ("https://github.com/MIE451-1513-2019/course-datasets/raw/master/government.zip",
     "government.zip"),
    ("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz",
     "GoogleNews-vectors-negative300.bin.gz"),
)

# Skip anything already on disk so re-running this cell does not download the
# archives again (wget would otherwise save a second, renamed copy next to the
# existing file).
for _url, _filename in _DATASET_DOWNLOADS:
    if not os.path.exists(_filename):
        wget.download(_url, _filename)

In [None]:
# imports
# Put all your imports here
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import numpy as np
from whoosh import fields
from whoosh.analysis import StemmingAnalyzer

from gensim.models import KeyedVectors
from whoosh.analysis.tokenizers import Tokenizer
from whoosh.analysis.acore import Token
from whoosh.scoring import WeightingModel, BaseScorer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer
import string

from gensim.models import Word2Vec
from whoosh.fields import FieldType
from whoosh.analysis.analyzers import Analyzer
from whoosh import analysis
from whoosh import query
from whoosh import formats
from whoosh.formats import Format, Existence
from whoosh.system import emptybytes
from whoosh.analysis.filters import STOP_WORDS
from whoosh.searching import Results
from whoosh.scoring import BM25F
from whoosh.lang import *

import bs4

In [None]:
def tokens(value, analyzer, kwargs):
    """Build a token stream for ``value`` and pass it through ``unstopped``.

    Args:
        value: either a tuple/list (handed to ``entoken``) or any other object
            (handed to ``analyzer``).
        analyzer: callable invoked as ``analyzer(value, **kwargs)`` when
            ``value`` is not a tuple or list.
        kwargs: dict of keyword arguments forwarded to whichever callable
            produces the stream.

    Returns:
        Whatever ``unstopped`` returns for the produced stream.
    """
    is_sequence = isinstance(value, (tuple, list))
    token_stream = entoken(value, **kwargs) if is_sequence else analyzer(value, **kwargs)
    return unstopped(token_stream)

In [None]:
class customTokenizer(Tokenizer):
  """Whoosh tokenizer built on an NLTK regular-expression tokenizer.

  Processing order for each input string: regex tokenization, removal of
  punctuation-only tokens, lowercasing, optional removal of tokens containing
  digits or underscores, optional WordNet lemmatization, and stop-word removal.

  Args:
    remove_nonalpha: if True, drop every token that contains a numeric
      character or an underscore.
    keep_special: if True, tokens containing a '.' (e.g. 'U.S.') keep their
      original case; all other tokens are lowercased. If False, every token is
      lowercased.
    lemmatize: if True, pass every token through ``WordNetLemmatizer``.
    stop_words_list: container of words to discard (tested with ``in``).
  """

  def __init__(self, remove_nonalpha=False, keep_special=False, lemmatize=False, stop_words_list=STOP_WORDS):
    self.lemmatize = lemmatize
    self.remove_nonalpha = remove_nonalpha
    self.keep_special = keep_special
    self.stop_words_list = stop_words_list

  @staticmethod
  def _is_punctuation(word):
    """Return True when every character of ``word`` is ASCII punctuation."""
    # Membership is tested per character: ``word in string.punctuation`` is a
    # substring test, which lets runs such as '...' or '.,' slip through.
    return all(char in string.punctuation for char in word)

  def __call__(self, value, positions=False, stem=None, lemmatize=None, chars=False, keeporiginal=False,
               removestops=False, start_pos=0, start_char=0, tokenize=True,
               mode='', **kwargs):
    """Yield one whoosh ``Token`` per surviving word of ``value``.

    Only ``value``, ``positions``, ``chars``, ``removestops``, ``start_pos``,
    ``mode`` and ``**kwargs`` are used; the remaining parameters are accepted
    and ignored. Stop words are always removed, regardless of ``removestops``.
    The same ``Token`` instance is re-used and mutated for every yielded word.
    """
    t = Token(positions, chars, removestops=removestops, mode=mode,
              **kwargs)

    # Tokenize the text. Regex alternatives, in priority order: runs of
    # digits/periods/commas; a capital followed by capitals/periods (keeps
    # abbreviations such as 'U.S.' together); runs of word characters; any
    # other single non-space character (so 'a-b' becomes 'a', '-', 'b').
    tokenizer = RegexpTokenizer(r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S')
    words = [w for w in tokenizer.tokenize(value) if not self._is_punctuation(w)]

    if self.keep_special:
      # Tokens containing a period keep their case; everything else is lowercased.
      words = [w if '.' in w else w.lower() for w in words]
    else:
      words = [w.lower() for w in words]

    # Drop whole tokens that contain a numeric character or an underscore.
    if self.remove_nonalpha:
      words = [w for w in words
               if not any(char.isnumeric() or char == '_' for char in w)]

    if self.lemmatize:
      lemmatizer = WordNetLemmatizer()
      words = [lemmatizer.lemmatize(w) for w in words]

    # Remove stop words (after lowercasing/lemmatization).
    words = [w for w in words if w not in self.stop_words_list]

    for w in words:
      t.text = w
      t.boost = 1.0
      t.pos = start_pos
      start_pos += 1
      yield t


In [None]:
class Word2VecFilter(Filter):
  """Whoosh filter that attaches a word embedding to every token.

  Each token's ``text`` is looked up in the module-level ``word2vec`` object
  and the result is stored on ``t.embedding``.
  NOTE(review): ``word2vec`` is not defined in this part of the notebook; it
  is presumably a gensim ``KeyedVectors`` loaded elsewhere -- confirm it is
  defined before this filter is first used.
  """

  def __call__(self, tokens):
    """Yield every token from ``tokens`` with its ``embedding`` attribute set."""
    for t in tokens:
      try:
        t.embedding = word2vec[t.text]
      except KeyError:
        # Out-of-vocabulary word: mark it with an empty array. Only KeyError
        # is handled so that real problems (e.g. ``word2vec`` not loaded)
        # surface instead of silently producing empty embeddings everywhere.
        t.embedding = np.array([])

      yield t

In [None]:
def cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: 1-D array-like.
        b: 1-D array-like.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm (including empty vectors), for which the similarity is undefined.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> nan, which would make score sorting unreliable.
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
# Computing document centroids
def compute_centroid(word_embeddings, schema, dim=300):
    """Average the unit-normalised embeddings of a document.

    Args:
        word_embeddings: iterable of 1-D arrays. Zero-length arrays and
            zero-norm vectors are skipped.
        schema: unused; kept so existing callers keep working.
        dim: length of the returned vector; must equal the length of the
            non-empty embeddings. Defaults to 300.

    Returns:
        Array of shape ``(dim,)``: the mean of ``w / |w|`` over the usable
        embeddings, or the zero vector when there are none.
    """
    centroid = np.zeros((dim,))
    count = 0
    for w in word_embeddings:
        if len(w) == 0:
            continue
        norm = np.linalg.norm(w)
        if norm == 0:
            # A zero vector has no direction; dividing would inject nan.
            continue
        centroid += w / norm
        count += 1

    if count == 0:
        # No usable embedding: return zeros rather than dividing by zero.
        return centroid
    return centroid / count

In [None]:
# custom Searcher object for ranking documents based on cosine distance between 
# their centroids and queries
class Searcher(object):
  """Ranks documents by cosine similarity between query vectors and document centroids.

  On construction a reader is opened on ``index`` and one centroid per
  document is computed and cached in ``self.centroids`` (keyed by the first
  element of each item yielded by the reader's ``iter_docs()``).
  """

  def __init__(self, index):
    self.index = index
    self.ixreader = index.reader()
    self.is_closed = False
    self._doccount = self.ixreader.doc_count_all()
    # Cache for PostingCategorizer objects (supports fields without columns)
    self._field_caches = {}
    self.centroids = self.compute_document_centroids()
    # Copy attributes/methods from wrapped reader
    for name in ("stored_fields", "all_stored_fields", "has_vector",
                 "vector", "vector_as", "lexicon", "field_terms",
                 "frequency", "doc_frequency", "term_info",
                 "doc_field_length", "corrector", "iter_docs"):
      setattr(self, name, getattr(self.ixreader, name))

  def compute_document_centroids(self):
    """Return a dict mapping each document key to its centroid vector.

    For every item from ``iter_docs()`` the stored ``'file_content'`` value is
    run through the schema field's ``process_text`` and the result is handed
    to ``compute_centroid``.
    NOTE(review): ``process_text`` presumably yields one embedding per token --
    confirm against the 'file_content' field type.
    """
    centroids = {}
    for doc in self.ixreader.iter_docs():
      text = doc[1]['file_content']
      word_embeddings = self.index.schema['file_content'].process_text(text)
      centroids[doc[0]] = compute_centroid(word_embeddings, self.index.schema)

    return centroids

  def search(self, q, top_n = 20, **kwargs):
    """Score every document against the query and return the best ``top_n``.

    Args:
      q: a single ``Term`` or an iterable of term-like objects whose ``text``
        is a space-separated list of floats (one vector per term).
      top_n: maximum number of (score, docnum) pairs passed to ``Results``.
      **kwargs: accepted and ignored.

    Returns:
      ``Results(self, q, ranked[:top_n])`` where ``ranked`` is sorted by
      descending score. A document's score is the mean cosine similarity
      between its centroid and each query vector. When no query term yields
      a vector the result list is empty.
    """
    terms = [q] if isinstance(q, query.terms.Term) else q

    vecs = []
    for term in terms:
      try:
        vec = np.fromstring(term.text, dtype=float, sep=' ')
      except (AttributeError, TypeError, ValueError):
        # Not a term carrying a parseable vector: ignore it.
        continue
      if vec.size:
        vecs.append(vec)

    if not vecs:
      # Nothing to compare against; avoid dividing by zero below.
      return Results(self, q, [])

    scores = []
    for docnum, centroid in self.centroids.items():
      # average similarity between each query vector and the document centroid
      total = sum(cosine_sim(vec, centroid) for vec in vecs)
      scores.append((total / len(vecs), docnum))

    sorted_scores = sorted(scores, key=lambda pair: pair[0], reverse=True)

    # Pass the caller's query object through untouched.
    return Results(self, q, sorted_scores[:top_n])
