In [None]:
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from elasticsearch import Elasticsearch
import spacy
import os
from datetime import datetime

In [None]:
# Connection settings are read from the environment so that host and credentials
# can be changed without editing the notebook. The fallbacks reproduce the
# original local-development values, so behavior is unchanged when no variable
# is set.
# NOTE(review): the fallback username/password are still committed here — rotate
# them and drop the defaults once ES_USERNAME / ES_PASSWORD are set everywhere.
es = Elasticsearch(
    [{
        'host': os.environ.get('ES_HOST', 'localhost'),
        'port': int(os.environ.get('ES_PORT', '9200')),
        'scheme': os.environ.get('ES_SCHEME', 'http'),
    }],
    http_auth=(
        os.environ.get('ES_USERNAME', 'osama'),
        os.environ.get('ES_PASSWORD', 'osama123'),
    ),
)

In [None]:
# Mapping and analysis settings for the news index.
index_mapping = {
    "mappings": {
        "properties": {
            # Titles are indexed as edge n-grams (prefix matching) but searched
            # with the standard analyzer so queries are not n-grammed again.
            "title": {
                "type": "text",
                "analyzer": "title_analyzer",
                "search_analyzer": "standard"
            },
            "content": {
                "type": "text",
                "analyzer": "content_analyzer"
            },
            # Each author is an object with first_name / last_name.
            "authors": {
                "type": "nested",
                "properties": {
                    "first_name": {"type": "text"},
                    "last_name": {"type": "text"}
                }
            },
            "date": {
                "type": "date"
            },
            "geopoint": {"type": "geo_point"},
            "temporal_expressions": {"type": "text"},
            "georeferences": {"type": "text"}
        },
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "title_analyzer": {
                    "type": "custom",
                    "tokenizer": "autocomplete_tokenizer",
                    "filter": ["lowercase"]
                },
                # Strips HTML, lowercases, removes stop words and stems.
                "content_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": ["html_strip"],
                    "filter": ["lowercase", "stop", "length", "porter_stem"]
                }
            },
            "tokenizer": {
                # Emits prefixes of 2 to 10 characters made of letters/digits.
                "autocomplete_tokenizer": {
                    "type": "edge_ngram",
                    "min_gram": 2,
                    "max_gram": 10,
                    "token_chars": ["letter", "digit"]
                }
            }
        }
    }
}

index_name = "reuter_news_index"

# Creating an index that already exists raises an error, which would break every
# re-run of this cell; only create it when it is missing.
# NOTE: an existing index keeps its current mapping — delete it manually if the
# mapping above has changed.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_mapping)

In [None]:
# Nominatim client used to resolve place names to coordinates
geolocator = Nominatim(user_agent="geo_app")

# English spaCy pipeline used for entity extraction further down
nlp = spacy.load("en_core_web_sm")

In [None]:
from datetime import datetime

def extract_date(reuters_tag):
    """Return the article's ``<date>`` as an Elasticsearch-compatible timestamp.

    Args:
        reuters_tag: Parsed article element; only its ``find`` method is used.

    Returns:
        The date formatted as ``YYYY-MM-DDTHH:MM:SS.ffffffZ``, or ``None`` when
        the article has no date element or its text does not match the
        expected ``DD-Mon-YYYY HH:MM:SS.ff`` layout.
    """
    date_tag = reuters_tag.find('date')
    if not date_tag:
        return None

    # Surrounding whitespace/newlines would otherwise make strptime fail with
    # "unconverted data remains".
    date_text = date_tag.get_text().strip()

    try:
        parsed_date = datetime.strptime(date_text, "%d-%b-%Y %H:%M:%S.%f")
    except ValueError:
        # One malformed date should not abort the whole import; the caller
        # receives None, exactly as for a missing date element.
        print(f"Skipping unparseable date: {date_text!r}")
        return None

    # Format the datetime object as a string compatible with Elasticsearch date fields
    return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")


In [None]:
def extract_authors(reuters_tag):
    """Extract the author's first and last name from the ``<author>`` element.

    Reuters bylines commonly look like ``"By Jane Doe, Reuters"``. Taking the
    first and last whitespace-separated words of that raw text would yield
    ``("By", "Reuters")``, so the leading "By" and everything after the first
    comma (the agency suffix) are dropped before the name is split.

    Args:
        reuters_tag: Parsed article element; only its ``find`` method is used.

    Returns:
        A ``(first_name, last_name)`` tuple, or ``None`` when there is no
        author element or fewer than two name words remain after cleaning.
    """
    author_tag = reuters_tag.find('author')
    if not author_tag:
        return None

    byline = author_tag.get_text().strip()

    # Keep only the part before the first comma (drops ", Reuters" and similar).
    name_part = byline.split(',', 1)[0]

    # split() with no argument also handles tabs, newlines and repeated spaces.
    name_words = name_part.split()
    if name_words and name_words[0].lower() == 'by':
        name_words = name_words[1:]

    # First word is taken as the first name, last word as the last name;
    # middle names/initials are ignored.
    if len(name_words) >= 2:
        return (name_words[0], name_words[-1])

    return None


In [None]:
def extract_article_title(reuters_tag):
    """Return the text of the article's ``<title>`` element, or ``None`` if absent."""
    found = reuters_tag.find('title')
    if found:
        return found.get_text()
    return None

In [None]:
def extract_content(reuters_tag):
    """Return the text of the article's ``<text>`` element, or ``None`` if absent."""
    found = reuters_tag.find('text')
    if found:
        return found.get_text()
    return None

In [None]:
def extract_georeferences(reuters_tag):
    """Return the text of every ``<d>`` entry under ``<places>``.

    Returns ``None`` when the article has no ``<places>`` element; otherwise a
    list (possibly empty) with one string per ``<d>`` child.
    """
    places_tag = reuters_tag.find('places')
    if not places_tag:
        return None

    place_names = []
    for entry in places_tag.find_all('d'):
        place_names.append(entry.get_text())
    return place_names

In [None]:
def extract_temporal_expressions(text):
    """Return the text of every DATE entity spaCy finds in ``text``.

    Args:
        text: Article text, or ``None``/empty when the article had no content.

    Returns:
        A list of entity strings labelled ``DATE``; empty when ``text`` is
        empty or ``None``.
    """
    # Articles without a <text> element arrive here as None, which the spaCy
    # pipeline cannot process; there is nothing to extract in that case.
    if not text:
        return []

    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == 'DATE']

In [None]:
# Results of earlier lookups in this kernel session, keyed by place name.
# A value of None records that the geocoder found no match for that name.
_GEOCODE_CACHE = {}


def extract_geopoints(georeferences, geocoder=None):
    """Resolve place names to latitude/longitude pairs.

    Each distinct place name is looked up once and then served from an
    in-memory cache, since the same place names recur across many articles and
    every lookup is a network round trip. The cache is keyed by place name only
    and is shared by all geocoders passed to this function.

    Args:
        georeferences: Iterable of place-name strings; ``None`` or empty yields
            an empty result.
        geocoder: Optional object with a ``geocode(name)`` method. Defaults to
            the module-level ``geolocator`` rather than building a new client
            on every call.

    Returns:
        A list of ``{'latitude': ..., 'longitude': ...}`` dicts, one per place
        name that could be resolved, in input order. Names with no match or
        whose lookup failed are omitted.
    """
    if not georeferences:
        return []

    active_geocoder = geocoder if geocoder is not None else geolocator
    geopoints = []

    for place_name in georeferences:
        if place_name not in _GEOCODE_CACHE:
            try:
                location = active_geocoder.geocode(place_name)
            except Exception as exc:
                # Best effort: a timeout or service error for one place should
                # skip that place, not abort the whole import. The failure is
                # not cached, so a later call retries the lookup.
                print(f"Geocoding failed for {place_name!r}: {exc}")
                continue
            _GEOCODE_CACHE[place_name] = (
                {'latitude': location.latitude, 'longitude': location.longitude}
                if location else None
            )

        cached_point = _GEOCODE_CACHE[place_name]
        if cached_point is not None:
            # Hand out a copy so callers cannot mutate the cached entry.
            geopoints.append(dict(cached_point))

    return geopoints

In [None]:
def _normalize_authors(authors):
    """Coerce a ``(first_name, last_name)`` pair into the nested-author object shape."""
    if isinstance(authors, dict):
        return authors
    is_name_pair = (
        isinstance(authors, (tuple, list))
        and len(authors) == 2
        and all(isinstance(part, str) for part in authors)
    )
    if is_name_pair:
        first_name, last_name = authors
        return {'first_name': first_name, 'last_name': last_name}
    # Anything else (e.g. a list of author objects) is passed through untouched.
    return authors


def index_document(title, content, authors, date, geopoints, temporal_expressions,
                   georeferences, index='reuter_news_index', es_client=None):
    """Index one article, skipping it when a required field is missing.

    A document is indexed only when ``title``, ``content`` and ``geopoints`` are
    all non-empty. Errors raised while building or sending the document are
    reported with ``print`` and not re-raised.

    Args:
        title: Article title.
        content: Article body text.
        authors: ``(first_name, last_name)`` pair, an author dict, or ``None``
            to omit the field. A pair is converted to
            ``{'first_name': ..., 'last_name': ...}`` to match the nested
            ``authors`` mapping.
        date: Date string for the ``date`` field (may be ``None``).
        geopoints: List of ``{'latitude': ..., 'longitude': ...}`` dicts.
        temporal_expressions: List of temporal expression strings.
        georeferences: List of place-name strings.
        index: Target index name; defaults to the index used by this notebook.
        es_client: Object with an ``index(index=..., body=...)`` method.
            Defaults to the module-level ``es`` client.
    """
    try:
        # Ensure that required fields have valid values before indexing
        if title and content and geopoints:
            document = {
                'title': title,
                'content': content,
                'date': date,
                'geopoint': [
                    {'lat': point['latitude'], 'lon': point['longitude']}
                    for point in geopoints
                ],
                # Key must match the mapped field name `temporal_expressions`;
                # the previous camelCase key landed in an unmapped field.
                'temporal_expressions': temporal_expressions,
                'georeferences': georeferences,
            }

            # Include authors field only if authors is not None
            if authors is not None:
                document['authors'] = _normalize_authors(authors)

            client = es_client if es_client is not None else es
            client.index(index=index, body=document)
            print(f"Document indexed successfully: {title}")
        else:
            print("Skipping document due to missing required fields.")
    except Exception as e:
        print(f"Error indexing document: {e}")


In [None]:
def read_gsm_file(file_path):
    """Parse one SGML file and index every article in it that has an author.

    For each ``<reuters>`` element the author is checked first; articles
    without one are skipped before any entity extraction or geocoding is done,
    because those steps are expensive (geocoding is a network call per place).

    Args:
        file_path: Path of the ``.sgm`` file to read.
    """
    # errors='replace': these SGML dumps are not guaranteed to be valid UTF-8,
    # and one stray byte should not abort the whole file.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    for reuters_tag in soup.find_all('reuters'):
        authors = extract_authors(reuters_tag)
        if authors is None:
            print("Skipping document due to missing authors.")
            continue

        date = extract_date(reuters_tag)
        title = extract_article_title(reuters_tag)
        file_content = extract_content(reuters_tag)
        georeferences = extract_georeferences(reuters_tag)

        # An article without a <text> element has no content to analyse.
        if file_content:
            temporal_expressions = extract_temporal_expressions(file_content)
        else:
            temporal_expressions = []

        geopoints = extract_geopoints(georeferences) if georeferences else None

        index_document(title, file_content, authors, date, geopoints,
                       temporal_expressions, georeferences)

In [None]:
def process_sgm_folder(folder_path):
    """Run ``read_gsm_file`` on every ``.sgm`` file directly inside ``folder_path``."""
    sgm_names = (entry for entry in os.listdir(folder_path) if entry.endswith(".sgm"))
    for sgm_name in sgm_names:
        read_gsm_file(os.path.join(folder_path, sgm_name))

In [None]:
# Entry point: index every .sgm file found in the local data directory.
data_folder = "./data/"
process_sgm_folder(folder_path=data_folder)