In [1]:
# Setup Notebook to load Django code
# From project root, run: jupyter-lab

import os
import sys
from pathlib import Path

django_project_dir = Path('../')
sys.path.insert(0, str(django_project_dir))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ratom_api.settings.local")

import django
django.setup()

In [2]:
import pandas as pd

from elasticsearch_dsl import Q, FacetedSearch, TermsFacet, DateHistogramFacet

from ratom.documents import MessageDocument

# Start elasticsearch

In [3]:
! docker-compose up -d elasticsearch kibana

Starting kibana ... 
Starting elasticsearch ... 
[2Bting kibana        ... [32mdone[0m[2A[2K

In [4]:
! docker-compose ps

    Name                  Command             State             Ports           
--------------------------------------------------------------------------------
elasticsearch   /usr/local/bin/docker-entr    Up      0.0.0.0:9200->9200/tcp,   
                ...                                   0.0.0.0:9300->9300/tcp    
kibana          /usr/local/bin/dumb-init -    Up      0.0.0.0:5601->5601/tcp    
                ...                                                             


elasticsearch should be running at: http://localhost:9200/

In [None]:
# If it doesn't start.. look at logs:

! docker-compose logs elasticsearch

## Re-build index

https://django-elasticsearch-dsl.readthedocs.io/en/latest/quickstart.html#populate

In [1]:
! time python ../manage.py search_index -f --rebuild --parallel

the JSON object must be str, bytes or bytearray, not dict

real	0m1.589s
user	0m0.866s
sys	0m0.321s


# Match

In [4]:
# https://django-elasticsearch-dsl.readthedocs.io/en/latest/quickstart.html#search

search = MessageDocument.search().filter("match", msg_subject="help")
search

<django_elasticsearch_dsl.search.Search at 0x11f88fc50>

## Count

In [5]:
search.count()

2019-12-02 09:27:43,365 elasticsearch        INFO     GET http://localhost:9200/message/_count [status:200 request:0.014s]


90

In [6]:
response = search.execute()

2019-12-02 09:27:44,405 elasticsearch        INFO     GET http://localhost:9200/message/_search [status:200 request:0.014s]


In [7]:
# how is this different from search.count()?
response.hits.total

{'value': 90, 'relation': 'eq'}

In [8]:
response.took

10

## Response

In [9]:
# simple using response
for msg in response[:10]:
    print(msg.msg_subject)

"Help Millions" - Pledge Today!
Help with statistical analysis
Thanks for your help!!
FW: HELP!!! I'VE FAINTED AND I CAN'T COME TO!!!!!
"Help Millions" - Pledge Today!
Help on cluster analysis
Re: Info help.
Help on DPC LNG Options
Fwd: Please, help
Help with the Tiger Team


In [10]:
# using pandas
results_df = pd.DataFrame((d.to_dict() for d in search[:10]))
results_df

2019-12-02 09:27:52,557 elasticsearch        INFO     GET http://localhost:9200/message/_search [status:200 request:0.005s]


Unnamed: 0,collection,msg_from,msg_subject,msg_body,directory
0,"{'title': 'vkaminski', 'accession_date': 2019-...","""Zulie Flores""","""Help Millions"" - Pledge Today!","date: Fri, 4 Aug 2000 16:08:00 -0700 (PDT) Fri...",/Top of Personal Folders/test/Vincent_Kaminski...
1,"{'title': 'vkaminski', 'accession_date': 2019-...","""Vince J Kaminski""",Help with statistical analysis,"Date: Thu, 20 Apr 2000 08:05:00 -0700 (PDT),Th...",/Top of Personal Folders/test/Vincent_Kaminski...
2,"{'title': 'vkaminski', 'accession_date': 2019-...","""Bridget D'Silva""",Thanks for your help!!,"Date: Fri, 11 Feb 2000 15:05:00 -0800 (PST),Fr...",/Top of Personal Folders/test/Vincent_Kaminski...
3,"{'title': 'dana_davis', 'accession_date': 2019...",Davis,FW: HELP!!! I'VE FAINTED AND I CAN'T COME TO!!!!!,"Date: Mon, 8 Oct 2001 21:57:25 -0700 (PDT),Mon...",/Top of Personal Folders/davis-d/DDAVIS (Non-P...
4,"{'title': 'vkaminski', 'accession_date': 2019-...","""Zulie Flores""","""Help Millions"" - Pledge Today!","date: Fri, 4 Aug 2000 16:08:00 -0700 (PDT) Fri...",/Top of Personal Folders/kaminski-v/Vincent_Ka...
5,"{'title': 'vkaminski', 'accession_date': 2019-...","""Lance Cunningham""",Help on cluster analysis,"date: Thu, 22 Mar 2001 14:45:00 -0800 (PST) Th...",/Top of Personal Folders/kaminski-v/Vincent_Ka...
6,"{'title': 'vkaminski', 'accession_date': 2019-...","""Vince J Kaminski""",Re: Info help.,"date: Tue, 15 Aug 2000 17:51:00 -0700 (PDT) Tu...",/Top of Personal Folders/kaminski-v/Vincent_Ka...
7,"{'title': 'vkaminski', 'accession_date': 2019-...","""Vince J Kaminski""",Help on DPC LNG Options,"date: Mon, 12 Mar 2001 17:07:00 -0800 (PST) Mo...",/Top of Personal Folders/kaminski-v/Vincent_Ka...
8,"{'title': 'vkaminski', 'accession_date': 2019-...",VKaminski@aol.com,"Fwd: Please, help","date: Mon, 9 Apr 2001 23:12:00 -0700 (PDT) Mon...",/Top of Personal Folders/kaminski-v/Vincent_Ka...
9,"{'title': 'vkaminski', 'accession_date': 2019-...","""Vince J Kaminski""",Help with the Tiger Team,"Date: Thu, 11 Jan 2001 09:51:00 -0800 (PST),Th...",/Top of Personal Folders/kaminski-v/Vincent_Ka...


## Multi-match

https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#queries

In [11]:
q = Q("match", msg_subject='help') & Q("match", collection__title="kate_symes")
q

Bool(must=[Match(msg_subject='help'), Match(collection__title='kate_symes')])

In [12]:
search = MessageDocument.search().query(q)
response = search.execute()
search.count()

2019-12-02 09:27:55,560 elasticsearch        INFO     GET http://localhost:9200/message/_search [status:200 request:0.013s]


9

In [13]:
results_df = pd.DataFrame((d.to_dict() for d in search[:10]))
results_df

2019-12-02 09:27:55,916 elasticsearch        INFO     GET http://localhost:9200/message/_search [status:200 request:0.007s]


Unnamed: 0,collection,labels,msg_from,msg_subject,msg_body,directory
0,"{'title': 'kate_symes', 'accession_date': 2019...","[FAC, NORP, MONEY, ORG, CARDINAL, LOC, DATE, P...","""Kate Symes""",Re: Help!,Body-Type: plain-text\r\n\r\nWhatever she's ma...,/Top of Personal Folders/symes-k/kate symes 6-...
1,"{'title': 'kate_symes', 'accession_date': 2019...","[FAC, NORP, MONEY, ORG, CARDINAL, LOC, DATE, P...","""Kate Symes""",Re: Help!,Body-Type: plain-text\r\n\r\nWhatever she's ma...,/Top of Personal Folders/symes-k/kate symes 6-...
2,"{'title': 'kate_symes', 'accession_date': 2019...","[FAC, NORP, MONEY, ORG, CARDINAL, LOC, DATE, P...","""Kate Symes""",Re: Help!,Body-Type: plain-text\r\n\r\nWhatever she's ma...,/Top of Personal Folders/symes-k/kate symes 6-...
3,"{'title': 'kate_symes', 'accession_date': 2019...","[MONEY, ORG, CARDINAL, PERSON, GPE]","""Mark Confer""",Can You Help,"Body-Type: plain-text\r\n\r\nKate, could you h...",/Top of Personal Folders/symes-k/kate symes 6-...
4,"{'title': 'kate_symes', 'accession_date': 2019...","[MONEY, ORG, CARDINAL, PERSON, GPE]","""Mark Confer""",Can You Help,"Body-Type: plain-text\r\n\r\nKate, could you h...",/Top of Personal Folders/symes-k/kate symes 6-...
5,"{'title': 'kate_symes', 'accession_date': 2019...","[MONEY, ORG, CARDINAL, PERSON, GPE]","""Mark Confer""",Can You Help,"Body-Type: plain-text\r\n\r\nKate, could you h...",/Top of Personal Folders/symes-k/kate symes 6-...
6,"{'title': 'kate_symes', 'accession_date': 2019...","[ORG, CARDINAL, LOC, PERSON, GPE]","""Mary Jane Symes"" <marys@tvapdx.com>",Help Save the Arctic Refuge,Body-Type: plain-text\r\n\r\nI just signed thi...,/Top of Personal Folders/symes-k/kate symes 6-...
7,"{'title': 'kate_symes', 'accession_date': 2019...","[ORG, CARDINAL, LOC, PERSON, GPE]","""Mary Jane Symes"" <marys@tvapdx.com>",Help Save the Arctic Refuge,Body-Type: plain-text\r\n\r\nI just signed thi...,/Top of Personal Folders/symes-k/kate symes 6-...
8,"{'title': 'kate_symes', 'accession_date': 2019...","[ORG, CARDINAL, LOC, PERSON, GPE]","""Mary Jane Symes"" <marys@tvapdx.com>",Help Save the Arctic Refuge,Body-Type: plain-text\r\n\r\nI just signed thi...,/Top of Personal Folders/symes-k/kate symes 6-...


# Facet

In [14]:
class MessageSearch(FacetedSearch):
    doc_types = [MessageDocument, ]
    fields = ['msg_subject', 'msg_body']

    facets = {
        'labels': TermsFacet(field='labels'),
    }

In [15]:
# how do you combine with query?
search = MessageSearch("help", {"labels": "PERSON"})
response = search.execute()

2019-12-02 09:27:58,626 elasticsearch        INFO     GET http://localhost:9200/_all/_search [status:200 request:0.019s]


In [16]:
# access hits and other attributes as usual
total = response.hits.total
print('total hits', total.relation, total.value)
for hit in response:
    print(hit.meta.score, hit.msg_subject)

for (tag, count, selected) in response.facets.labels:
    print(tag, ' (SELECTED):' if selected else ':', count)

total hits eq 3083
9.111702 Re: Help!
9.111702 Re: Help!
9.111702 Re: Help!
8.221744 Can You Help
8.221744 Can You Help
8.221744 Can You Help
6.878141 Re: Help - Missing Profile Books!
6.878141 Help Save the Arctic Refuge
6.878141 Help Save the Arctic Refuge
6.878141 Help Save the Arctic Refuge
CARDINAL : 3368
ORG : 3368
PERSON  (SELECTED): 3083
DATE : 3021
GPE : 2115
TIME : 2047
ORDINAL : 856
WORK_OF_ART : 651
LOC : 610
NORP : 595
