In [162]:
# Setup Notebook to load Django code
# From project root, run: jupyter-lab

import os
import sys
from pathlib import Path

# Resolve to an absolute path so project imports keep working even if the
# kernel's working directory changes later in the session.
django_project_dir = Path('../').resolve()
# Guard the insert so re-running this cell does not stack duplicate entries.
if str(django_project_dir) not in sys.path:
    sys.path.insert(0, str(django_project_dir))
# setdefault: a DJANGO_SETTINGS_MODULE already present in the environment wins.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ratom_api.settings.local")

import django
django.setup()

In [267]:
# Rebuild the search index from scratch and time it. NOTE: this is destructive;
# the existing index is deleted and recreated before the Message objects are
# re-indexed (see the log output below).
! time python ../manage.py search_index -f --rebuild --parallel

Deleting index '<elasticsearch_dsl.index.Index object at 0x7f935a7e0f50>'
2019-12-03 08:37:24,844 elasticsearch        INFO     DELETE http://localhost:9200/message [status:200 request:0.040s]
Creating index '<elasticsearch_dsl.index.Index object at 0x7f935a7e0f50>'
2019-12-03 08:37:24,991 elasticsearch        INFO     PUT http://localhost:9200/message [status:200 request:0.146s]
Indexing 1328 'Message' objects (parallel)
2019-12-03 08:37:25,986 elasticsearch        INFO     POST http://localhost:9200/_bulk?refresh=true [status:200 request:0.452s]
2019-12-03 08:37:26,776 elasticsearch        INFO     POST http://localhost:9200/_bulk?refresh=true [status:200 request:0.397s]
2019-12-03 08:37:27,119 elasticsearch        INFO     POST http://localhost:9200/_bulk?refresh=true [status:200 request:0.196s]
2.04user 0.14system 0:03.09elapsed 70%CPU (0avgtext+0avgdata 86624maxresident)k
0inputs+8outputs (0major+20682minor)pagefaults 0swaps


In [268]:
import pandas as pd

from elasticsearch_dsl import Q, FacetedSearch, TermsFacet, DateHistogramFacet

from ratom.documents import MessageDocument

# Match
What does a message document look like right now?

In [269]:
# Build a blank MessageDocument and dump its raw instance attributes
md = MessageDocument()
vars(md)

{'meta': {},
 '_d_': {'_related_instance_to_ignore': None},
 '_prepared_fields': [('collection',
   ObjectField(),
   functools.partial(<bound method ObjectField.get_value_from_instance of ObjectField()>, field_value_to_ignore=None)),
  ('labels',
   KeywordField(),
   <bound method MessageDocument.prepare_labels of MessageDocument()>),
  ('msg_from',
   TextField(),
   functools.partial(<bound method DEDField.get_value_from_instance of TextField()>, field_value_to_ignore=None)),
  ('msg_subject',
   TextField(),
   functools.partial(<bound method DEDField.get_value_from_instance of TextField()>, field_value_to_ignore=None)),
  ('msg_body',
   TextField(),
   functools.partial(<bound method DEDField.get_value_from_instance of TextField()>, field_value_to_ignore=None)),
  ('directory',
   TextField(),
   functools.partial(<bound method DEDField.get_value_from_instance of TextField()>, field_value_to_ignore=None)),
  ('sent_date',
   DateField(),
   functools.partial(<bound method DEDFie

# Facet

In [270]:
class MessageSearch(FacetedSearch):
    """Faceted search over indexed messages.

    Searches the subject, body and directory fields, and buckets the
    results by label and by the month the message was sent.
    """

    doc_types = [MessageDocument]
    fields = ["msg_subject", "msg_body", "directory"]
    facets = {
        "labels": TermsFacet(field="labels"),
        "Message Sent": DateHistogramFacet(field="sent_date", interval="month"),
    }

In [271]:
# Pass the free-text query to the faceted search; one request returns both
# the matching hits and the facet buckets declared on the class.
search = MessageSearch(query="pipeline")
response = search.execute()

2019-12-03 08:37:32,329 elasticsearch        INFO     GET http://localhost:9200/_all/_search [status:200 request:0.041s]


In [272]:
# Report how many documents matched, then each returned hit with its score
total = response.hits.total
print('total hits', total.relation, total.value)
for doc in response:
    print(doc.meta.score, doc.msg_subject)

# Each facet entry is a (bucket value, document count, selected flag) tuple
for month, n_docs, is_selected in response.facets['Message Sent']:
    marker = ' (SELECTED):' if is_selected else ':'
    print(month, marker, n_docs)

total hits eq 46
6.2172685 Scaled-back Alaskan pipeline proposal in works
5.500802 Organization Changes
5.1720777 Northern Natural Gas
4.3007917 Williams Energy News Live -- today's video newscast
3.9822066 Williams Energy News Live -- today's video newscast
3.758673 Shedding light on power prices
3.65606 Williams Energy News Live -- today's video newscast
3.5589008 Tabled, canceled project numbers climbing
3.2581546 GREAT ROTATION OPPORTUNITY IN EBS ORIGINATION AMERICAS
3.2581546 Enron Update
2001-09-01 00:00:00 : 5
2001-10-01 00:00:00 : 9
2001-11-01 00:00:00 : 8
2001-12-01 00:00:00 : 5
2002-01-01 00:00:00 : 17
2002-02-01 00:00:00 : 2


# Add more facets, profit

In [273]:
class MessageDateSearch(FacetedSearch):
    """Faceted search over message subject and body with four facets.

    Results are bucketed by NER label, month sent, folder and sender.
    """

    doc_types = [MessageDocument]
    fields = ["msg_subject", "msg_body"]
    facets = {
        "NER Tags": TermsFacet(field="labels"),
        "Message Sent": DateHistogramFacet(field="sent_date", interval="month"),
        "Folder": TermsFacet(field="directory"),
        "Sender": TermsFacet(field="msg_from"),
    }

In [274]:
# Same "pipeline" query as before, now against the four-facet search class
mds = MessageDateSearch(query="pipeline")
resp = mds.execute()

2019-12-03 08:37:35,758 elasticsearch        INFO     GET http://localhost:9200/_all/_search [status:200 request:0.053s]


In [275]:
# Peek at the raw facet data: each facet name maps to a list of
# (value, count, selected) tuples.
vars(resp.facets)

{'_d_': {'NER Tags': [('CARDINAL', 46, False),
   ('DATE', 46, False),
   ('ORG', 46, False),
   ('PERSON', 44, False),
   ('GPE', 40, False),
   ('MONEY', 35, False),
   ('NORP', 26, False),
   ('LOC', 23, False),
   ('PERCENT', 23, False),
   ('PRODUCT', 21, False)],
  'Message Sent': [(datetime.datetime(2001, 9, 1, 0, 0), 5, False),
   (datetime.datetime(2001, 10, 1, 0, 0), 9, False),
   (datetime.datetime(2001, 11, 1, 0, 0), 8, False),
   (datetime.datetime(2001, 12, 1, 0, 0), 5, False),
   (datetime.datetime(2002, 1, 1, 0, 0), 17, False),
   (datetime.datetime(2002, 2, 1, 0, 0), 2, False)],
  'Folder': [('/Top of Personal Folders/saibi-e/ESAIBI (Non-Privileged)/Saibi, Eric/Inbox',
    22,
    False),
   ('/Top of Personal Folders/saibi-e/ExMerge - Saibi, Eric/Inbox', 21, False),
   ('/Top of Personal Folders/saibi-e/Eric_Saibi_Jan2002/Saibi, Eric/Deleted Items',
    1,
    False),
   ('/Top of Personal Folders/saibi-e/Eric_Saibi_Jan2002/Saibi, Eric/Inbox',
    1,
    False),
   ('

In [283]:
# Report the hit count and scored hits for the multi-facet search.
# The total is read from `resp` (this search), not from the earlier `response`
# object, so the summary always matches the hits and facets printed below.
total = resp.hits.total
print('total hits', total.relation, total.value)
for hit in resp:
    print(hit.meta.score, hit.msg_subject)

# Walk every facet, printing its buckets under a header with the facet name
for facet in resp.facets:
    print(f"::::{facet}:::")
    for (tag, count, selected) in resp.facets[facet]:
        print(tag, ' (SELECTED):' if selected else ':', count)

total hits eq 46
6.2172685 Scaled-back Alaskan pipeline proposal in works
5.500802 Organization Changes
5.1720777 Northern Natural Gas
4.3007917 Williams Energy News Live -- today's video newscast
3.9822066 Williams Energy News Live -- today's video newscast
3.758673 Shedding light on power prices
3.65606 Williams Energy News Live -- today's video newscast
3.5589008 Tabled, canceled project numbers climbing
3.2581546 GREAT ROTATION OPPORTUNITY IN EBS ORIGINATION AMERICAS
3.2581546 Enron Update
::::NER Tags:::
CARDINAL : 46
DATE : 46
ORG : 46
PERSON : 44
GPE : 40
MONEY : 35
NORP : 26
LOC : 23
PERCENT : 23
PRODUCT : 21
::::Message Sent:::
2001-09-01 00:00:00 : 5
2001-10-01 00:00:00 : 9
2001-11-01 00:00:00 : 8
2001-12-01 00:00:00 : 5
2002-01-01 00:00:00 : 17
2002-02-01 00:00:00 : 2
::::Folder:::
/Top of Personal Folders/saibi-e/ESAIBI (Non-Privileged)/Saibi, Eric/Inbox : 22
/Top of Personal Folders/saibi-e/ExMerge - Saibi, Eric/Inbox : 21
/Top of Personal Folders/saibi-e/Eric_Saibi_Jan200

In [284]:
# The total carries the hit count ('value') plus a 'relation' flag ('eq' here)
resp.hits.total

{'value': 46, 'relation': 'eq'}

In [285]:
# Keep the first (highest-scoring) hit for closer inspection
h1 = resp.hits[0]

In [286]:
# The hit's repr shows only its index name and document id
h1

MessageDocument(index='message', id='102')

In [288]:
# Dump the hit's metadata and stored field values
vars(h1)

{'meta': {'index': 'message', 'id': '102', 'score': 6.2172685, 'highl...},
 '_d_': {'_related_instance_to_ignore': None,
  'collection': {'title': 'eric_saibi', 'accession_date': datetime.datetime(...},
  'labels': ['PERSON',
   'PRODUCT',
   'PERCENT',
   'CARDINAL',
   'ORG',
   'NORP',
   'MONEY',
   'FAC',
   'GPE',
   'LOC',
   'DATE'],
  'msg_from': '"Energy Insight Editor EI_editor@PLATTS.COM@ENRON" <IMCEANOTES-Energy+20Insight+20Editor+20+3CEI+5Feditor+40PLATTS+2ECOM+3E+40ENRON@ENRON.com>',
  'directory': '/Top of Personal Folders/saibi-e/ExMerge - Saibi, Eric/Inbox',
  'msg_subject': 'Scaled-back Alaskan pipeline proposal in works',
  'msg_body': 'Body-Type: plain-text\r\n\r\nEnergy Insight \r\n\t\r\n\r\n\r\n                        \t                       Updated: Jan. 3, 2002                               [IMAGE]Scaled-back Alaskan pipeline proposal in works  With prospects for a gas pipeline from Alaska\'s North Slope looking shakier by the month, Foothills Pipe Lines is ab

In [123]:
# Stored fields are also reachable as plain attributes on the hit.
# NOTE(review): the saved output for this cell is stale. Its execution count
# (123) predates the cells above, and it shows a different folder than the
# `directory` value in h1's attribute dump. Re-run after Restart & Run All.
h1.directory

'/Top of Personal Folders/symes-k/kate symes 6-27-02/Notes Folders/Discussion threads'