In [3]:
import os
import json
from elasticsearch import Elasticsearch
import spacy
from spacy.tokens import Doc

In [None]:
# English spaCy pipeline; the entity-extraction helpers below call it as `nlp(text)`
nlp = spacy.load(name="en_core_web_sm")

In [None]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook. ES_PASSWORD is mandatory (a missing value raises
# KeyError here, before any request is made); the other settings default to a
# local single-node cluster.
# NOTE(review): this cell used to embed the password in clear text - treat that
# password as leaked and rotate it.
es = Elasticsearch(
    [
        {
            "host": os.environ.get("ES_HOST", "localhost"),
            "port": int(os.environ.get("ES_PORT", "9200")),
            "scheme": os.environ.get("ES_SCHEME", "http"),
        }
    ],
    basic_auth=(os.environ.get("ES_USERNAME", "elastic"), os.environ["ES_PASSWORD"]),
)

# Name of the index this notebook creates and writes to
index_name = "index"

In [None]:
# Connectivity check: show whether the cluster answered the ping
cluster_reachable = es.ping()
print(cluster_reachable)

In [None]:
# Index configuration: custom analysis chain ("settings") and the field
# mappings ("mappings") for the documents written by this notebook.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "filter": {
                # Emits the 1..20 character prefixes of each token
                "autocomplete_filter": {
                    "type": "edge_ngram",
                    "min_gram": 1,
                    "max_gram": 20
                },
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                }
            },
            "analyzer": {
                # Index-time analyzer for `title`.
                # NOTE(review): the stop and stemmer filters run AFTER edge_ngram,
                # so they operate on prefixes rather than whole words (a prefix
                # such as "in" or "to" is dropped as a stop word). Confirm this
                # order is intended; it is kept unchanged here.
                "autocomplete": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "autocomplete_filter",
                        "english_stop",
                        "english_stemmer"
                    ]
                },
                # Analyzer for `content`: lowercase, drop stop words, stem
                "content_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop",
                        "english_stemmer"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "autocomplete",
                "search_analyzer": "standard"
            },
            "content": {
                "type": "text",
                "analyzer": "content_analyzer"
            },
            "authors": {
                "type": "nested",
                "properties": {
                    "first_name": {"type": "text"},
                    "last_name": {"type": "text"},
                    "email": {"type": "text"}
                }
            },
            "date": {"type": "date"},
            "geopoint": {"type": "geo_point"},
            # Key spelled exactly like the field the indexing functions write
            # ('temporalExpressions'); the previous 'temporal_expressions' key
            # never matched any indexed field, so the keyword type was not applied.
            "temporalExpressions": {"type": "keyword"},
            "georeferences": {"type": "keyword"}
        }
    }
}

# Create the index from the configuration above (the original call passed an
# undefined name, `configurations`, and raised NameError).
# HTTP 400 is tolerated so that re-running the cell when the index already
# exists does not raise. Be aware that a 400 caused by an invalid configuration
# is tolerated in the same way - inspect the returned response when in doubt.
es.options(ignore_status=400).indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

In [None]:
# Place-name extraction based on spaCy named entities
def extract_georeferences(text):
    """Return the text of every entity labelled ``GPE`` found in ``text``.

    ``text`` is processed with the module-level ``nlp`` pipeline. Matches are
    returned in the order the pipeline reports them; repeated mentions are kept.
    """
    return [entity.text for entity in nlp(text).ents if entity.label_ == "GPE"]

In [None]:
# Date-expression extraction based on spaCy named entities
def extract_temporal_expressions(text):
    """Return the text of every entity labelled ``DATE`` found in ``text``.

    ``text`` is processed with the module-level ``nlp`` pipeline. Matches are
    returned in the order the pipeline reports them; repeated mentions are kept.
    """
    return [entity.text for entity in nlp(text).ents if entity.label_ == "DATE"]


In [None]:
def index_single_document(es, index_name, title, content, authors, date, geopoint, temporal_expressions, georeferences):
    """Assemble one document and index it into Elasticsearch.

    NOTE(review): a later cell defines ``index_single_document`` again; the
    later definition is the one in effect after Run All. Keep only one.

    Args:
        es: Elasticsearch client; only ``es.index(index=..., document=...)`` is used.
        index_name: Name of the target index.
        title: Stored unchanged under ``title``.
        content: Passed through ``process_text`` and stored under ``content``.
        authors: Stored unchanged under ``authors``.
        date: Stored under ``date``. When falsy, the value returned by
            ``extract_document_date(temporal_expressions)`` is used instead.
        geopoint: Stored under ``geopoint``. When falsy, the value returned by
            ``extract_document_geopoint(georeferences)`` is used instead.
        temporal_expressions: Stored under ``temporalExpressions``.
        georeferences: Stored under ``georeferences``.

    Returns:
        The value returned by ``es.index`` (previously discarded), so callers
        can inspect the outcome of the request.

    ``process_text``, ``extract_document_date`` and ``extract_document_geopoint``
    are not defined in the cells shown alongside this one - presumably they are
    provided elsewhere; they must exist in the kernel before this is called.
    """
    if not date:
        date = extract_document_date(temporal_expressions)

    if not geopoint:
        geopoint = extract_document_geopoint(georeferences)

    document = {
        'title': title,
        'content': process_text(content),
        'authors': authors,
        'date': date,
        'geopoint': geopoint,
        'temporalExpressions': temporal_expressions,
        'georeferences': georeferences
    }

    # `document=` is the 8.x client's keyword for the payload; `body=` is deprecated.
    return es.index(index=index_name, document=document)

In [None]:
def index_single_document(es, index_name, title, content, authors, date, geopoint, temporal_expressions, georeferences):
    """Assemble one document and index it into Elasticsearch.

    NOTE(review): this cell re-defines ``index_single_document``, which an
    earlier cell already defines with the same signature; this later definition
    is the one in effect after Run All. Keep only one of the two cells.

    Args:
        es: Elasticsearch client; only ``es.index(index=..., document=...)`` is used.
        index_name: Name of the target index.
        title: Stored unchanged under ``title``.
        content: Passed through ``process_text`` and stored under ``content``.
        authors: Stored unchanged under ``authors``.
        date: Stored under ``date``. When falsy, the value returned by
            ``extract_document_date(temporal_expressions)`` is used instead.
        geopoint: Stored under ``geopoint``. When falsy, the value returned by
            ``extract_document_geopoint(georeferences)`` is used instead.
        temporal_expressions: Stored under ``temporalExpressions``.
        georeferences: Stored under ``georeferences``.

    Returns:
        The value returned by ``es.index`` (previously discarded), so callers
        can inspect the outcome of the request.

    ``process_text``, ``extract_document_date`` and ``extract_document_geopoint``
    are not defined in the cells shown alongside this one - presumably they are
    provided elsewhere; they must exist in the kernel before this is called.
    """
    if not date:
        date = extract_document_date(temporal_expressions)

    if not geopoint:
        geopoint = extract_document_geopoint(georeferences)

    document = {
        'title': title,
        'content': process_text(content),
        'authors': authors,
        'date': date,
        'geopoint': geopoint,
        'temporalExpressions': temporal_expressions,
        'georeferences': georeferences
    }

    # `document=` is the 8.x client's keyword for the payload; `body=` is deprecated.
    return es.index(index=index_name, document=document)


In [None]:
def extract_document_info(document_content):
    """Run every field extractor on ``document_content`` and bundle the results.

    Returns a 7-tuple ``(title, content, authors, date, geopoint,
    temporal_expressions, georeferences)``; the extractors are called in that
    order, each with ``document_content`` as its only argument.

    ``extract_title``, ``extract_content``, ``extract_authors``, ``extract_date``
    and ``extract_geopoint`` are placeholders: supply your actual
    implementations for them.
    """
    return (
        extract_title(document_content),
        extract_content(document_content),
        extract_authors(document_content),
        extract_date(document_content),
        extract_geopoint(document_content),
        extract_temporal_expressions(document_content),
        extract_georeferences(document_content),
    )


In [None]:
# Inside your document processing loop
def index_document(title, content, authors, date, geopoint, temporal_expressions, georeferences):
    if not date:
        date = extract_approximate_date(temporal_expressions)

    if not geopoint:
        geopoint = extract_approximate_geopoint(georeferences)

    document = {
        'title': title,
        'content': process_text(content),
        'authors': authors,
        'date': date,
        'geopoint': geopoint,
        'temporalExpressions': temporal_expressions,
        'georeferences': georeferences
    }
