In [1]:
import sys
sys.path += [
    './lib/',
    './lib/field_mappers'
]

In [2]:
from pymongo import MongoClient
import spacy

from extract_biographies import extract_biographies
from generate_sample_collection import copy_sample
from extract_first_sentence import FirstSentenceExtractor
from vectorizer import Vectorizer
from pageview_aggregator import PageviewAggregator
from query import Searcher

In [3]:
SPACY_MODEL = 'en_vectors_web_lg'

In [4]:
db = MongoClient().who
# clean up test db
db.bios.remove({})
db.sample_bios.remove({})

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


{'n': 10, 'ok': 1.0}

In [5]:
extract_biographies(collection_name='bios', limit=100)

inserting batch 0


In [6]:
db.bios.count_documents({})

100

In [7]:
copy_sample('bios', 'sample_bios', 10)
db.sample_bios.count_documents({})

10

In [8]:
FirstSentenceExtractor().extract('sample_bios')

insertation count: 0


In [9]:
next(db.sample_bios.find({})).keys()

dict_keys(['_id', 'name', 'bio', 'first_sentence'])

In [10]:
next(db.sample_bios.find({}))['first_sentence']

'Arthur Schopenhauer  was a German philosopher.'

In [11]:
nlp = spacy.load(SPACY_MODEL)

In [12]:
Vectorizer(nlp).vectorize_text(
    collection_name='sample_bios',
    text_field_name='first_sentence',
    vector_field_name='fresh_new_vectorization'
)

insertation count: 0


In [13]:
pva = PageviewAggregator()
# do but once:
# pva.aggregate('./data/monthly_view_totals/filtered')
pva.load_totals()
pva.insert(collection_name='sample_bios')

pageview total insertation count: 0


In [14]:
next(db.sample_bios.find({}))['fresh_new_vectorization'][:9]

[0.06783337891101837,
 0.2521723508834839,
 -0.018867749720811844,
 0.002851255238056183,
 0.027646128088235855,
 -0.012560253962874413,
 -0.0007327478379011154,
 -0.007898379117250443,
 -0.08137384057044983]

In [15]:
next(db.sample_bios.find({})).keys()

dict_keys(['_id', 'name', 'bio', 'first_sentence', 'fresh_new_vectorization', 'views'])

In [16]:
Searcher(
    nlp,
    collection_name='sample_bios',
    vector_field_name='fresh_new_vectorization'
).query('electromagnetism scientist', limit=10)

[{'name': 'Arthur Schopenhauer', 'rank': 0.6466393476914676},
 {'name': 'Arne Kaijser', 'rank': 0.6916283737936731},
 {'name': 'Alfred Korzybski', 'rank': 0.7026181300665632},
 {'name': 'Alfred Hitchcock', 'rank': 0.7310305964276884},
 {'name': 'Alain de Lille', 'rank': 0.7839900624569994},
 {'name': 'Alfonso Cuarón', 'rank': 0.7976252461859169},
 {'name': 'Alexander II of Epirus', 'rank': 0.8012006927770886},
 {'name': 'Alaric II', 'rank': 0.801876421889219},
 {'name': 'Akira Kurosawa', 'rank': 0.8038576091940953},
 {'name': 'Anaximander', 'rank': 0.9475158542590767}]