In [13]:
import nltk
import wikipedia

In [14]:
def named_entity_recognition(text):
    """Return the last named entity NLTK detects in ``text``.

    The text is tokenized, POS-tagged and passed through ``nltk.ne_chunk``.
    Every chunk that comes back as a ``Tree`` (rather than a plain
    ``(word, tag)`` tuple) is treated as a named entity.

    Args:
        text: The sentence or question to scan.

    Returns:
        The entity's tokens joined by single spaces, so a multi-word name is
        returned whole instead of being cut down to its first word. If several
        entities occur, the last one in the text wins. ``False`` is returned
        when no entity chunk is found.
    """
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(tagged)

    # Sentinel kept as False so existing truthiness checks keep working.
    final = False

    for chunk in chunks:
        # isinstance (not an exact type comparison) so Tree subclasses count too.
        if isinstance(chunk, nltk.tree.Tree):
            final = " ".join(word for word, _tag in chunk.leaves())

    return final

In [15]:
# Try the extractor on a sample question and keep the result in `d`.
# NOTE(review): "Cahrles" looks like a misspelling of "Charles" - possibly a
# deliberate probe of how the tagger copes with typos; confirm before changing.
d = named_entity_recognition("Can you give me some info about Cahrles")

In [16]:
# List the Wikipedia article titles that match the query string.
wikipedia.search("Bulgaria")

['Bulgaria',
 'Demographics of Bulgaria',
 'Sofia',
 'Bulgaria Air',
 "People's Republic of Bulgaria",
 'Balkan Wars',
 'Volga Bulgaria',
 'List of Bulgarian consorts',
 'Georgi Markov',
 'Provinces of Bulgaria']

In [17]:
# Fetch the "Bulgaria" page object; later cells read its text via `bg.content`.
bg = wikipedia.page("Bulgaria")

In [18]:
from gensim.summarization.summarizer import summarize

In [19]:
# Extractive summary of the full article text.
# NOTE(review): ratio=0.01 presumably keeps about 1% of the source sentences -
# confirm against the gensim version in use.
summarize(bg.content, ratio=0.01)

'Thracians, one of the three primary ancestral groups of modern Bulgarians, began appearing in the region during the Iron Age. In the late 6th century BC, the Persians conquered most of present-day Bulgaria.\nIt was superseded by the subsequent Treaty of Berlin, signed on 13 July, which provided for a much smaller state comprising Moesia and the region of Sofia, leaving large populations of Bulgarians outside the new country.\nFrom a largely agricultural country with a predominantly rural population in 1948, by the 1980s Bulgaria had transformed into an industrial economy with scientific and technological research at the top of its budgetary expenditure priorities.\nNine historical and natural objects have been inscribed in the list of UNESCO World Heritage Sites: the Madara Rider, the Thracian tombs in Sveshtari and Kazanlak, the Boyana Church, the Rila Monastery, the Rock-hewn Churches of Ivanovo, Pirin National Park, Sreburna Nature Reserve and the ancient city of Nesebar.'

In [20]:
# Sanity check: length of the first 1*10**3 (= 1000) characters of the article.
len(bg.content[:1*10**3])

1000

In [21]:
# Total length of the article text, in characters.
len(bg.content)

52454

In [22]:
# Ask for ten random article titles; `a` is indexed by position in the next cell.
a = wikipedia.random(pages=10)

In [23]:
# Show the full text of the fourth random title. The titles are random, so the
# displayed article differs from run to run.
wikipedia.page(a[3]).content

'J. Gordon Edwards (1919–2004) was an entomologist, mountain climber, author, and park ranger. Edwards was professor, and later emeritus professor of Biology, San Jose State University.\n\n\n== DDT and environmental issues ==\nEdwards was prominent as a supporter of the use of DDT and critic of Rachel Carson. He was active as a member of, or consultant for, a wide range of lobby groups opposed to environmental regulation, including the American Council on Science and Health. According to Edwards, he was also active as a member of several environmental groups, such as the Sierra Club (which published one of his books,) and the Audubon Society. Edwards was a fellow of the California Academy of Sciences. He published his ideas in 21st Century Science and Technology, a publication of the Lyndon LaRouche Movement. He was co-author, with Steven Milloy of 100 things you should know about DDT. Edwards last work, titled DDT: A Case Study in Scientific Fraud was published in 2004 after his death

In [24]:
from elasticsearch import Elasticsearch
from datetime import datetime

In [25]:
# Connect with the client's default settings and (re)create a scratch index.
es = Elasticsearch()
INDEX_NAME = "test"

# Start from a clean slate: drop the index if a previous run left it behind.
# `index=` is passed by keyword, like every other call in this cell; newer
# elasticsearch-py clients no longer accept it positionally.
if es.indices.exists(index=INDEX_NAME):
    print("deleting '%s' index..." % (INDEX_NAME))
    res = es.indices.delete(index = INDEX_NAME)
    print(" response: '%s'" % (res))

# Index settings plus the mapping for the "blog" document type.
# NOTE(review): the custom "english" analyzer defined below is not referenced
# by any field in "mappings" (no field sets "analyzer"), so it looks unused -
# confirm whether "title"/"text" were meant to use it.
request_body = {
    "settings" : {
        "number_of_shards" : 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                },
                "english_keywords": {
                    "type": "keyword_marker",
                    "keywords": ["example"]
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                },
                "english_possessive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english"
                }
            },
            "analyzer": {
                "english": {
                    "tokenizer":  "standard",
                    "filter": [
                        "english_possessive_stemmer",
                        "lowercase",
                        "english_stop",
                        "english_keywords",
                        "english_stemmer"
                    ]
                }
            }
        }
    },
    "mappings" : {
        "blog" : {
            "properties" : {
                "title": {"type" : "text"},
                "text": {"type" : "text"},
                "date": {"type": "date"}
            }
        }
    },
}

print("creating '%s' index..." % (INDEX_NAME))
res = es.indices.create(index = INDEX_NAME, body = request_body)
print(" response: '%s'" % (res))

deleting 'test' index...
 response: '{'acknowledged': True}'
creating 'test' index...
 response: '{'acknowledged': True, 'shards_acknowledged': True, 'index': 'test'}'


In [26]:
# Seed the index with five sample "blog" documents, using ids 0 through 4.
for i in range(5):
    # The dict is rebuilt on every pass so each document gets its own timestamp.
    doc = {'title': 'victor', 'text': 'Hello World', "date": datetime.now()}
    res = es.index(
        index=INDEX_NAME,
        doc_type="blog",
        id=i,
        body=doc,
    )

In [27]:
# Read back the "blog" document with id 4 to confirm it was stored.
es.get(index=INDEX_NAME, doc_type="blog", id=4, )

{'_id': '4',
 '_index': 'test',
 '_source': {'date': '2018-03-18T02:03:42.188049',
  'text': 'Hello World',
  'title': 'victor'},
 '_type': 'blog',
 '_version': 1,
 'found': True}

In [28]:
# Refresh the index before querying it - presumably so the documents indexed
# above are visible to the search that follows.
es.indices.refresh(index=INDEX_NAME)

{'_shards': {'failed': 0, 'successful': 1, 'total': 2}}

In [32]:
# more_like_this query: find documents whose "text" field resembles "Hello".
# NOTE(review): more_like_this also applies a min_doc_freq threshold that is
# not set here (its documented default is 5, which happens to equal the number
# of sample documents) - confirm if results disappear with fewer documents.
body = {
    "query": {
        "more_like_this" : {
            "fields" : ["text"],
            "like" : "Hello",
            "min_term_freq" : 1,
            "max_query_terms" : 12
        }
    }
}
res = es.search(index=INDEX_NAME, body=body)

# Only the list of matching hits is printed; the rest of the response
# envelope in `res` is not shown.
print(res["hits"]["hits"])

[{'_index': 'test', '_type': 'blog', '_id': '0', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-18T02:03:42.138694'}}, {'_index': 'test', '_type': 'blog', '_id': '1', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-18T02:03:42.153419'}}, {'_index': 'test', '_type': 'blog', '_id': '2', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-18T02:03:42.164823'}}, {'_index': 'test', '_type': 'blog', '_id': '3', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-18T02:03:42.176975'}}, {'_index': 'test', '_type': 'blog', '_id': '4', '_score': 0.087011375, '_source': {'title': 'victor', 'text': 'Hello World', 'date': '2018-03-18T02:03:42.188049'}}]


In [144]:
# Fetch document 2 by id. Keyword arguments are used because the positional
# order of `doc_type` and `id` differs between elasticsearch-py major
# versions; this also matches the keyword style of the earlier es.get call.
es.get(index=INDEX_NAME, doc_type="blog", id=2)

{'_id': '2',
 '_index': 'test',
 '_source': {'date': '2018-03-17T18:15:45.231983',
  'text': 'Hello World',
  'title': 'victor'},
 '_type': 'blog',
 '_version': 1,
 'found': True}