In [1]:
import nltk
import wikipedia

In [2]:
def named_entity_recognition(text):
    """Return the last named entity NLTK detects in ``text``.

    The text is tokenized, POS-tagged and passed through ``nltk.ne_chunk``.
    Entity chunks are recognised as ``nltk.tree.Tree`` nodes in the chunked
    result; any other node is skipped.

    Parameters
    ----------
    text : str
        Raw sentence to scan.

    Returns
    -------
    str or bool
        Every word of the last entity chunk joined with single spaces
        (e.g. ``"New York"`` rather than only ``"New"``), or ``False`` when
        ``text`` is empty or contains no entity chunk.
    """
    # Nothing to analyse; skip the tokenizer/tagger entirely.
    if not text:
        return False

    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    chunked = nltk.ne_chunk(tagged)

    final = False

    for node in chunked:
        if isinstance(node, nltk.tree.Tree):
            # Leaves of a chunk are (word, tag) pairs; keep every word so
            # multi-word entities are not cut down to their first token.
            # Later chunks overwrite earlier ones: the last entity wins.
            final = " ".join(word for word, _tag in node.leaves())

    return final

In [3]:
# Try the entity extractor on a sample request; `d` holds the returned value
# (an entity string, or False when nothing is detected).
# NOTE(review): "Cahrles" looks like a misspelling of "Charles" — confirm
# whether the typo is a deliberate robustness probe.
d = named_entity_recognition("Can you give me some info about Cahrles")

In [4]:
# List the article titles Wikipedia suggests for the query "Bulgaria".
wikipedia.search("Bulgaria")

['Bulgaria',
 'Demographics of Bulgaria',
 'Sofia',
 'Bulgaria Air',
 "People's Republic of Bulgaria",
 'Balkan Wars',
 'Volga Bulgaria',
 'List of Bulgarian consorts',
 'Georgi Markov',
 'Provinces of Bulgaria']

In [5]:
# Fetch the "Bulgaria" page object; the following cells read its
# `categories` and `content` attributes.
bg = wikipedia.page("Bulgaria")

In [7]:
# Collapse the page's category names into one space-separated string.
" ".join(bg.categories)

'All articles containing potentially dated statements All articles needing additional references All articles with dead external links All articles with unsourced statements All articles with vague or ambiguous time Articles containing Bulgarian-language text Articles containing potentially dated statements from 2012 Articles including recorded pronunciations (English) Articles needing additional references from March 2018 Articles with Bulgarian-language external links Articles with Curlie links Articles with dead external links from December 2017 Articles with hAudio microformats Articles with permanently dead external links Articles with unsourced statements from October 2011 Balkan countries Bulgaria CS1 Bulgarian-language sources (bg) CS1 French-language sources (fr) CS1 German-language sources (de) Coordinates on Wikidata Former empires Good articles Member states of NATO Member states of the Council of Europe Member states of the European Union Member states of the Union for the

In [37]:
# Length of the article text once truncated to its first 1000 characters.
len(bg.content[:1000])

1000

In [38]:
# Full length of the article text, in characters.
len(bg.content)

52454

In [104]:
from elasticsearch import Elasticsearch
from datetime import datetime

In [116]:
# Create a client with default connection settings and (re)build a scratch
# index, so this cell can be re-run from a clean state.
es = Elasticsearch()
INDEX_NAME = "test"

# Drop any index left over from a previous run. `index` is passed by keyword
# because newer client releases no longer accept it positionally.
if es.indices.exists(index=INDEX_NAME):
    print("deleting '%s' index..." % (INDEX_NAME))
    res = es.indices.delete(index = INDEX_NAME)
    print(" response: '%s'" % (res))

request_body = {
    "settings" : {
        "number_of_shards" : 1,
        "analysis": {
            # Token filters that the "english" analyzer below chains together.
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                },
                "english_keywords": {
                    # Words listed here are protected from stemming.
                    "type": "keyword_marker",
                    "keywords": ["example"]
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                },
                "english_possessive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english"
                }
            },
            "analyzer": {
                "english": {
                    "tokenizer":  "standard",
                    "filter": [
                        "english_possessive_stemmer",
                        "lowercase",
                        "english_stop",
                        "english_keywords",
                        "english_stemmer"
                    ]
                }
            }
        }
    },
    "mappings" : {
        "blog" : {
            "properties" : {
                "title": {"type" : "text"},
                # Reference the analyzer defined above; without this the
                # analysis settings are never used by any field.
                "text": {"type" : "text", "analyzer": "english"},
                "date": {"type": "date"}
            }
        }
    },
}

print("creating '%s' index..." % (INDEX_NAME))
res = es.indices.create(index = INDEX_NAME, body = request_body)
print(" response: '%s'" % (res))

deleting 'test' index...
 response: '{'acknowledged': True}'
creating 'test' index...
 response: '{'acknowledged': True, 'shards_acknowledged': True, 'index': 'test'}'


In [117]:
# Seed the index with five sample "blog" documents (ids 0 through 4), each
# stamped with the time at which it was built.
for i in range(5):
    doc = dict(title='victor', text='Hello World', date=datetime.now())
    res = es.index(
        index=INDEX_NAME,
        doc_type="blog",
        id=i,
        body=doc,
    )

In [118]:
# Read back the document stored under id 4 (the last id written by the
# indexing loop) to confirm it was saved.
es.get(index=INDEX_NAME, doc_type="blog", id=4)

{'_id': '4',
 '_index': 'test',
 '_source': {'date': '2018-03-17T14:41:05.070610',
  'text': 'Hello World',
  'title': 'victor'},
 '_type': 'blog',
 '_version': 1,
 'found': True}

In [119]:
# Refresh the index so the documents just written are visible to searches.
es.indices.refresh(index=INDEX_NAME)

{'_shards': {'failed': 0, 'successful': 1, 'total': 2}}

In [120]:
# Ask for documents whose "text" field resembles the phrase "Hello", then
# print the hit count followed by each raw hit.
body = {
    "query": {
        "more_like_this": dict(
            fields=["text"],
            like="Hello",
            min_term_freq=1,
            max_query_terms=12,
        )
    }
}
res = es.search(index=INDEX_NAME, body=body)

hits = res['hits']
print("Got %d Hits:" % hits['total'])
for hit in hits['hits']:
    print(hit)

Got 5 Hits:
{'_index': 'test', '_type': 'blog', '_id': '0', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-17T14:41:05.007952'}}
{'_index': 'test', '_type': 'blog', '_id': '1', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-17T14:41:05.029934'}}
{'_index': 'test', '_type': 'blog', '_id': '2', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-17T14:41:05.044289'}}
{'_index': 'test', '_type': 'blog', '_id': '3', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-17T14:41:05.057697'}}
{'_index': 'test', '_type': 'blog', '_id': '4', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-17T14:41:05.070610'}}
