# Stop Words Test
Gensim includes a tokenizer, but it mostly handles lowercasing and de-accenting. We're just working with English, so de-accenting isn't a concern.
Tokenizing will require a stopword list, but we don't have a good one for our domain (generic default lists tend to be a poor fit for domain-specific text).
So, we'll use some scikit-learn tools to create a stopword list.
However, we want to build the stopword list from the tokenized words, so we'll tokenize once, generate the stopword list from that output, and then tokenize again with the stopwords removed.

In [None]:
import gensim
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer

# Module-level accumulator; starts empty (not referenced by the cells visible here).
stoplists = []

def get_stops(documents, max_df=0.75, min_df=1):
    """
    Derive a corpus-specific stop word set from document frequencies.

    Every document is run through NLTK's Treebank tokenizer and re-joined
    with single spaces, then a CountVectorizer is fitted on the result.

    documents: iterable of raw text strings
    max_df, min_df: forwarded unchanged to CountVectorizer as its
        document-frequency cut-offs
    Returns the fitted vectorizer's ``stop_words_`` attribute. Per the
    scikit-learn docs these are the terms ignored because of the max_df /
    min_df cut-offs. The vectorizer lowercases and strips accents, so the
    returned words are in that normalized form.
    """
    treebank = TreebankWordTokenizer()

    # Treebank tokenization is applied per document, so this step does not
    # depend on which other documents are in the set.
    normalized = [
        " ".join(words) for words in map(treebank.tokenize, documents)
    ]

    # The vectorizer does its own lowercasing / accent stripping and applies
    # the document-frequency thresholds while fitting.
    counter = CountVectorizer(
        min_df=min_df,
        max_df=max_df,
        stop_words=None,
        lowercase=True,
        strip_accents="unicode",
    )
    return counter.fit(normalized).stop_words_


def tokenize_docs(documents, tokens_only=False, stops=frozenset()):
    """
    Tokenize each document with the Treebank tokenizer and drop stop words.

    documents: iterable of raw text strings
    tokens_only: when true, yield plain lists of tokens; otherwise yield
        gensim TaggedDocument objects tagged with the document's position
        in ``documents``
    stops: collection of stop words (pass a set for fast lookups). A token
        is dropped when it, or its lower-cased form, is in ``stops``. The
        lower-cased check is needed because get_stops() returns lower-cased
        words while Treebank tokens keep their original case.

    This is a generator: one item is yielded per input document, in order.
    Surviving tokens are yielded with their original casing.
    """
    tokenizer = TreebankWordTokenizer()

    for i, doc in enumerate(documents):
        tokens = [
            token
            for token in tokenizer.tokenize(doc)
            # exact match keeps the old behavior; lower() catches "The" vs "the"
            if token not in stops and token.lower() not in stops
        ]
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])


In [None]:
from pathlib import Path

# Directory of sample corpora shipped inside the installed gensim package.
test_dat_dir = Path(gensim.__path__[0])/"test"/"test_data"


def _read_documents(path):
    """Return the non-blank lines of the file at *path*, one document per line."""
    text = path.read_text(encoding="iso-8859-1")
    # split("\n") leaves an empty string after a trailing newline; dropping
    # blank lines keeps an empty "document" out of the corpus.
    return [line for line in text.split("\n") if line.strip()]


train_corpus = _read_documents(test_dat_dir/"lee_background.cor")
test_corpus = _read_documents(test_dat_dir/"lee.cor")

In [None]:
# Build the stop word set from both corpora together, using a 0.4
# document-frequency ceiling instead of get_stops' default of 0.75.
stops = get_stops([*train_corpus, *test_corpus], max_df=0.4)
# Cell output: how many stop words were found.
len(stops)

In [None]:
# Display the full stop word collection for manual inspection.
stops

In [None]:
# Replace the raw-text corpora with their stop-word-filtered tokenizations:
# training documents become TaggedDocument objects, test documents become
# plain token lists.
# NOTE(review): the names are rebound, so this cell is presumably not safe to
# re-run without first re-running the cell that loads the raw text.
train_corpus = [*tokenize_docs(train_corpus, stops=stops)]
test_corpus = [*tokenize_docs(test_corpus, tokens_only=True, stops=stops)]

In [None]:
# Create an untrained Doc2Vec model with 50-dimensional vectors and 40 epochs.
# min_count=2: per the gensim docs, words occurring fewer than 2 times are ignored.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [None]:
# Build the model's vocabulary from the tagged training documents (must run before training).
model.build_vocab(train_corpus)

In [None]:
# Sanity check: report the stored 'count' attribute for one word in the vocabulary.
# NOTE(review): presumably raises KeyError if 'penalty' was filtered out of the vocabulary — confirm.
print(f"Word 'penalty' appeared {model.wv.get_vecattr('penalty', 'count')} times in the training data")

In [None]:
# Train on the tagged corpus, reusing the model's own document count and epoch setting.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Infer a vector for a hand-written, pre-tokenized sentence and display it.
# NOTE(review): these tokens are passed as-is, without the stop word filtering applied to the corpora.
model.infer_vector(["only", "you", "can", "prevent", "forest", "fires"])

In [None]:
# Self-similarity check: re-infer a vector for every training document and
# record the position at which the document's own tag shows up in the
# similarity ranking over all document vectors (0 = ranked most similar).
# NOTE: `doc_id` and `sims` keep their last-iteration values after the loop;
# the report cell further down reads them.
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    # sims is a list of (tag, similarity) pairs; find this document's own tag.
    ranked_tags = [tag for tag, _similarity in sims]
    rank = ranked_tags.index(doc_id)
    ranks.append(rank)

    # Keep the runner-up (tag, similarity) pair for each document.
    second_ranks.append(sims[1])

In [None]:
import collections

# Histogram of self-similarity ranks: key = position at which a document's
# own tag appeared in its ranking, value = number of documents with that position.
counter = collections.Counter(ranks)
# Cell output: display the histogram.
counter

In [None]:
# Report for the last document processed by the ranking loop: `doc_id` and
# `sims` are the values left over from that loop's final iteration.
source_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): «{}»\n'.format(doc_id, source_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Positions in the similarity ranking to show, from best to worst match.
checkpoints = (
    ('MOST', 0),
    ('SECOND-MOST', 1),
    ('MEDIAN', len(sims)//2),
    ('LEAST', len(sims) - 1),
)
for label, index in checkpoints:
    match = sims[index]  # (tag, similarity) pair
    print(u'%s %s: «%s»\n' % (label, match, ' '.join(train_corpus[match[0]].words)))