# Import libraries needed


In [None]:
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from elasticsearch import Elasticsearch
import spacy
import os
from datetime import datetime
import re

# Define Elasticsearch and mapping


In [None]:
# Connect to the Elasticsearch node. Host, port, scheme and credentials can be
# overridden through environment variables so that secrets do not have to be
# edited into the notebook; the defaults reproduce the original local setup.
es = Elasticsearch(
    [{
        'host': os.environ.get('ES_HOST', 'localhost'),
        'port': int(os.environ.get('ES_PORT', '9200')),
        'scheme': os.environ.get('ES_SCHEME', 'http'),
    }],
    http_auth=(
        os.environ.get('ES_USER', 'osama'),
        os.environ.get('ES_PASSWORD', 'osama123'),
    ),
)

In [None]:
# Index definition passed to es.indices.create: field mappings plus the custom
# analyzers those mappings refer to.
index_mapping = {
    'mappings': {
        'properties': {
            # Titles are indexed with the edge n-gram based "title_analyzer"
            # but queried with the standard analyzer.
            "title": {
                "type": "text",
                "analyzer": "title_analyzer",
                "search_analyzer": "standard"
            },
            # Article body, analyzed with "content_analyzer" defined below.
            'content': {
                'type': 'text',
                'analyzer': 'content_analyzer'
            },
            # Authors are stored as nested objects with separate name parts.
            "authors": {
                "type": "nested",
                "properties": {
                    "first_name": {
                        "type": "text"
                    },
                    "last_name": {
                        "type": "text"
                    }
                }
            },
            "date": {
                "type": "date"
            },
            "geopoint": {
                "type": "geo_point"
            },
            "temporal_expressions": {
                "type": "text"
            },
            "georeferences": {
                "type": "text"
            }
        },
    },
    "settings": {
        "analysis": {
            "analyzer": {
                # Lowercased edge n-grams produced by "autocomplete_tokenizer".
                "title_analyzer": {
                    "type": "custom",
                    "tokenizer": "autocomplete_tokenizer",
                    "filter": [
                        "lowercase"
                    ]
                },
                # Strips HTML, tokenizes with the standard tokenizer, then
                # applies the listed token filters in order.
                # NOTE(review): "length" is referenced without custom bounds -
                # presumably it filters nothing with default settings; confirm.
                "content_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": [
                        "html_strip"
                    ],
                    "filter": [
                        "lowercase",
                        "stop",
                        "length",
                        "porter_stem"
                    ]
                }
            },
            "tokenizer": {
                # Edge n-grams of 2 to 10 characters built from letters and digits.
                "autocomplete_tokenizer": {
                    "type": "edge_ngram",
                    "min_gram": 2,
                    "max_gram": 10,
                    "token_chars": [
                        "letter",
                        "digit"
                    ]
                }
            }
        }
    }
}

index_name = "reuter_news_index"

# Create the index only when it does not exist yet, so that re-running this
# cell does not fail because the index was already created by an earlier run.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_mapping)

In [None]:
# Load the small English spaCy pipeline once at module level; it is reused by
# extract_temporal_expressions to find DATE entities in the article text.
nlp = spacy.load("en_core_web_sm")

# Data Extracting for all fields


* Use the `extract_date` function to extract the date of each article and convert it to a format that Elasticsearch accepts.

In [107]:
def extract_date(reuters_tag):
    """
    Extract the article date and convert it to an Elasticsearch-friendly string.

    Parameters:
    - reuters_tag: A BeautifulSoup Tag object representing a 'reuters' element with the date information.

    Return:
    - The date formatted as "%Y-%m-%dT%H:%M:%S.%fZ" if the 'date' tag is present and parseable;
      otherwise None (missing tag, or text that does not match "%d-%b-%Y %H:%M:%S.%f").
    """
    date_tag = reuters_tag.find('date')
    if not date_tag:
        return None

    # Surrounding whitespace/newlines in the tag text would make strptime fail.
    date_text = date_tag.get_text().strip()

    try:
        parsed_date = datetime.strptime(date_text, "%d-%b-%Y %H:%M:%S.%f")
    except ValueError:
        # A malformed date should not abort the whole import; the document is
        # simply handled as having no date.
        return None

    # The source timestamp carries no time zone; it is emitted with a 'Z' suffix.
    return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

* Use `extract_first_last_names` function to split the author name into the first and last names from the `author tag` in the data.

* Use the `extract_authors` function to extract the author's first and last names from the data.

In [None]:
def extract_first_last_names(author_tag):
    """
    Split the author byline of an article into first and last name.

    The byline is expected to look like "By Jane Doe, Reuters". The name is the
    text following the word "by" (case-insensitive) up to the first comma or,
    when there is no comma, up to the end of that line.

    Parameters:
    - author_tag: Tag-like object exposing get_text(), or None when the article has no author.

    Return:
    - A (first_name, last_name) tuple. last_name is None when the name has a
      single word; both values are None when no name can be extracted.
    """
    author_info = author_tag.get_text() if author_tag else None
    if not author_info:
        return None, None

    # \b keeps "by" from matching inside another word (e.g. "Derby");
    # "(?:,|$)" with MULTILINE also accepts bylines without a trailing comma.
    match = re.search(
        r'\bby\s+(.*?)\s*(?:,|$)',
        author_info,
        re.IGNORECASE | re.MULTILINE,
    )
    if not match:
        return None, None

    names = match.group(1).split()
    if not names:
        return None, None

    first_name = names[0]
    # Middle names are ignored: only the first and the last word are kept.
    last_name = names[-1] if len(names) > 1 else None
    return first_name, last_name

def extract_authors(reuters_tag):
    """Return the (first_name, last_name) pair parsed from the article's 'author' tag."""
    return extract_first_last_names(reuters_tag.find('author'))

* Use `extract_article_title` function to extract the article's title from the data `title` tag. 

In [None]:
def extract_article_title(reuters_tag):
    """Return the text of the article's 'title' tag, or None when there is no such tag."""
    heading = reuters_tag.find('title')
    if not heading:
        return None
    return heading.get_text()

* Use `clean_text` function to remove unnecessary spaces and some special characters, like `\n`.

* Use the `extract_content` function to extract the news content from the `text` tag in the data.

In [None]:
def clean_text(raw_text):
    """
    Normalize raw article text.

    Control characters (including newlines and tabs) are replaced by spaces,
    every run of whitespace is collapsed into a single space, and leading and
    trailing whitespace is removed.

    Parameters:
    - raw_text: The text to clean (str).

    Return:
    - The cleaned text.
    """
    # Replace control characters with a space so adjacent words stay separated.
    without_controls = re.sub(r'[\x00-\x1F\x7F-\x9F]+', ' ', raw_text)

    # Collapse the extra spaces left by indentation and removed line breaks.
    collapsed = re.sub(r'\s+', ' ', without_controls)

    return collapsed.strip()

def extract_content(reuters_tag):
    """Return the cleaned text of the article's 'text' tag, or None when it is missing or empty."""
    body_tag = reuters_tag.find('text')
    if not body_tag:
        return None

    body = body_tag.get_text()
    if not body:
        return None

    # Normalize the raw text with clean_text before handing it back.
    return clean_text(body)

* Use `extract_georeferences` to extract the place names of an article. Its result is then passed to `extract_geopoints`, which uses the `geocode` method from the `geopy` library to get the latitude and longitude of each place.

In [None]:
def extract_georeferences(reuters_tag):
    """Return the text of every 'd' entry inside the 'places' tag, or None when there is no 'places' tag."""
    container = reuters_tag.find('places')
    if not container:
        return None

    place_names = []
    for entry in container.find_all('d'):
        place_names.append(entry.get_text())
    return place_names

* Use `extract_temporal_expressions` to extract temporal expressions from the content of news using `spacy` library.

In [None]:
def extract_temporal_expressions(text):
    """
    Extract temporal expressions from a text using the spaCy pipeline `nlp`.

    Parameters:
    - text: The article text, or None/empty when the article has no content.

    Return:
    - A list with the text of every entity labelled 'DATE'; an empty list when
      there is no text to analyze.
    """
    # Articles without a 'text' tag reach this function with None, which the
    # spaCy pipeline cannot process.
    if not text:
        return []

    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == 'DATE']

* Use `extract_geopoints` to convert place names to coordinates: each place is looked up with the `geocode` method from the `geopy` library, which returns its latitude and longitude.

In [None]:
def extract_geopoints(georeferences):
    """
    Geocode place names into coordinates with geopy's Nominatim geocoder.

    Parameters:
    - georeferences: An iterable of place names; None or empty yields no points.

    Return:
    - A list of {'latitude': ..., 'longitude': ...} dicts, one per place that
      could be resolved. Places that cannot be resolved, or whose lookup fails,
      are skipped.
    """
    # Nothing to look up: avoid creating a geocoder at all.
    if not georeferences:
        return []

    # Imported locally; geopy is already a dependency of this file.
    from geopy.exc import GeopyError

    geolocator = Nominatim(user_agent="geo_app")
    geopoints = []

    for place_name in georeferences:
        try:
            location = geolocator.geocode(place_name)
        except GeopyError as error:
            # A timeout or service error for one place should not abort the
            # processing of the remaining places and documents.
            print(f"Geocoding failed for '{place_name}': {error}")
            continue

        if location:
            geopoints.append({'latitude': location.latitude, 'longitude': location.longitude})
    return geopoints

# Data indexing

- Excluding documents that do not contain a `title` or `content`


In [None]:
def index_document(title, content, first_name,last_name, date, geopoints, temporal_expressions, georeferences):
    """
    Index one article into the 'reuter_news_index' Elasticsearch index.

    Documents without a title or without content are skipped. Errors raised
    while building or sending the document are reported with print and not
    propagated.

    Parameters:
    - title, content: Required text fields of the article.
    - first_name, last_name: Author name parts (may be None).
    - date: Date string for the 'date' field (may be None).
    - geopoints: Iterable of {'latitude', 'longitude'} dicts, or None.
    - temporal_expressions: Temporal expressions found in the content.
    - georeferences: Place names of the article (may be None).
    """
    if not (title and content):
        print("Skipping document due to missing required fields.")
        return

    try:
        document = {
            'title': title,
            'content': content,
            'date': date,
            'authors': [{"first_name": first_name, "last_name": last_name}],
            # Articles without places arrive with geopoints=None; treat that
            # as "no points" instead of failing on iteration.
            'geopoint': [
                {'lat': point['latitude'], 'lon': point['longitude']}
                for point in (geopoints or [])
            ],
            # Key must match the 'temporal_expressions' field of the index mapping.
            'temporal_expressions': temporal_expressions,
            'georeferences': georeferences,
        }

        es.index(index='reuter_news_index', body=document)
    except Exception as e:
        print(f"Error indexing document: {e}")
    else:
        print(f"Document indexed successfully: {title}")

In [None]:
def read_sgm_file(file_path):
    """
    Parse one .sgm file and index every 'reuters' element it contains.

    For each article the date, author, title, content, place names, temporal
    expressions and geopoints are extracted and passed to index_document.

    Parameters:
    - file_path: Path of the .sgm file to read.
    """
    # errors='replace' keeps a stray non-UTF-8 byte from aborting the whole
    # file; such bytes are read as U+FFFD.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    for reuters_tag in soup.find_all('reuters'):
        date = extract_date(reuters_tag)
        first_name, last_name = extract_authors(reuters_tag)
        title = extract_article_title(reuters_tag)
        file_content = extract_content(reuters_tag)
        georeferences = extract_georeferences(reuters_tag)

        # Articles without text have no content to run the NLP pipeline on.
        if file_content:
            temporal_expressions = extract_temporal_expressions(file_content)
        else:
            temporal_expressions = []

        # Use an empty list (not None) when there are no places, so the
        # indexing step can always iterate over the geopoints.
        geopoints = extract_geopoints(georeferences) if georeferences else []

        index_document(title, file_content, first_name,last_name, date, geopoints, temporal_expressions, georeferences)

In [None]:
def process_sgm_folder(folder_path):
    """Run read_sgm_file on every entry of folder_path whose name ends with '.sgm'."""
    sgm_names = (name for name in os.listdir(folder_path) if name.endswith(".sgm"))
    for sgm_name in sgm_names:
        read_sgm_file(os.path.join(folder_path, sgm_name))

# Example of use


In [None]:
# Folder (relative to the working directory) whose '.sgm' files are parsed and
# indexed by process_sgm_folder.
data_folder = "./dummy data/"
process_sgm_folder(data_folder)