In [1]:
import re
from elasticsearch import Elasticsearch
import spacy
from datetime import datetime
import xml.etree.ElementTree as ET
from opencage.geocoder import OpenCageGeocode

### Load spaCy model and Connect to Elasticsearch

In [2]:
# Load the spaCy models for English
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1500000 

In [3]:
es = Elasticsearch([{"host": "localhost", "port": 9200, "scheme": "http"}], basic_auth=('elastic', 'CbIxwM6z85Dm6fKtAJte'))
index_name = "reuters"
api_key = '8be579e383db4dc2a7f4895ebf6923d0' 

In [4]:
print(es.ping())

True


### Create index 

In [22]:
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' deleted.")
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "analyzer": {
                "autocomplete": {
                    "tokenizer": "autocomplete",
                    "filter": ["lowercase"]
                },
                "custom_content_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "stop",
                        "custom_stemmer"
                    ]
                }
            },
            "tokenizer": {
                "autocomplete": {
                    "type": "edge_ngram",
                    "min_gram": 3,
                    "max_gram": 20,
                    "token_chars": ["letter", "digit"]
                }
            },
            "filter": {
                "custom_stemmer": {
                    "type": "stemmer",
                    "name": "english"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "autocomplete",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "content": {
                "type": "text",
                "analyzer": "custom_content_analyzer"
            },
            "authors": {
                "type": "nested",
                "properties": {
                    "first_name": {"type": "text"},
                    "last_name": {"type": "text"}
                }
            },
            "date": {
                    "type": "date",
                     "format": "yyyy-MM-dd HH:mm:ss"
                     },
            "georeferences":{
                    "type": "nested",
                    "properties": {
                    "expression": {
                        "type": "text",
                        "fields": {
                        "keyword": {
                            "type": "keyword"
                        }
                    }
                    }
                    }}, 
            "geopoint": {"type": "geo_point"},  
             
            "temporal_expressions": {
                    "type": "nested",
                    "properties": {
                    "expression": {
                        "type": "text"
                    }
                    }
                }
        }
    }
}

es.indices.create(index=index_name, ignore=400, body=index_settings)
print(f"Index '{index_name}' created successfully.")


  es.indices.create(index=index_name, ignore=400, body=index_settings)


Index 'reuters' created successfully.


### Extract Temporal expressions and Georeferences

In [None]:
def extract_temporal_expressions(text):
    """
    Extracts temporal expressions from the given text using spaCy.

    Args:
        text (str): Input text.

    Returns:
        list: A list of dictionaries, where each dictionary contains an "expression" key representing a temporal expression.
    """
    # Process the text using spaCy
    doc = nlp(text)
    
    # Extract entities labeled as "DATE"
    temporal_expressions = [{"expression": ent.text} for ent in doc.ents if ent.label_ == "DATE"]

    return temporal_expressions 

In [5]:
def extract_georeferences(text):
    """
    Extracts georeferences from the given text using spaCy.

    Args:
        text (str): Input text.

    Returns:
        list: A list of dictionaries, where each dictionary contains an "expression" key representing a georeference.
    """
    # Process the text using spaCy
    doc = nlp(text)

    # Use a set to automatically remove duplicate georeferences
    georeferences_set = set()

    # Extract entities labeled as "GPE"
    for ent in doc.ents:
        if ent.label_ == "GPE":
            georeferences_set.add(ent.text)

    # Convert the set to a list of dictionaries
    georeferences_list = [{"expression": geo} for geo in georeferences_set]

    return georeferences_list

### Process Text and Format Date

In [6]:
def process_text(text):
    #converting text to lowercase and removing leading/trailing whitespaces
    processed_text = text.lower().strip()
    return processed_text 

In [None]:
def format_date(date_string):
    parsed_date = datetime.strptime(date_string, '%d-%b-%Y %H:%M:%S.%f')
    formatted_date = parsed_date.strftime('%Y-%m-%d %H:%M:%S')
    return formatted_date


def format_date_to_iso8601(date_string):
    try:
        # Parse the date string
        parsed_date = datetime.strptime(date_string, "%B %d")
        
        # Convert it to ISO 8601 format
        iso8601_date = parsed_date.strftime("%Y-%m-%dT%H:%M:%S")
        
        return iso8601_date
    except ValueError:
        # Handle cases where parsing is not successful
        return None

def extract_and_clean_month_day(expression):
    # Regular expression to match 'Month day' format without unwanted words
    month_day_pattern = re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}\b')

    # Extract 'Month day' format from the expression
    month_day_extraction = re.search(month_day_pattern, expression).group() if re.search(month_day_pattern, expression) else None

    # Remove unwanted words from the extraction
    cleaned_month_day_extraction = re.sub(r'\b(?:the\s+week\s+ended|the\s+week|weekly|late this year|the season|last year|recent weeks)\s+', '', month_day_extraction) if month_day_extraction else None

    return cleaned_month_day_extraction

### Extract geopoints and Approximate temporal expressions and Approximate geopoint

In [None]:
def extract_geopoints(api_key, address):
    """
    Extracts geopoints (latitude and longitude) from a given address using an external geocoding service.

    Args:
        api_key (str): API key for the geocoding service.
        address (str): The address for which geopoints are to be extracted.

    Returns:
        dict or None: A dictionary representing the extracted geopoint (latitude and longitude), or None if no geopoint could be extracted.
    """
    # Create an instance of the geocoding service with the provided API key
    geocoder = OpenCageGeocode(api_key)

    # Make the geocoding request
    results = geocoder.geocode(address)

    # Check if results are available and not empty
    if results and len(results):
        # Extract latitude and longitude from the first result
        lat = results[0]['geometry']['lat']
        lon = results[0]['geometry']['lng']
        geopoint = {"lat": lat, "lon": lon}
        return geopoint

    # Return None if no geopoint could be extracted
    return None


In [7]:
def approximate_temporal_expressions(temporal_expressions):
    """
    Approximates a temporal expression based on a list of temporal expressions.

    Args:
        temporal_expressions (list): A list of temporal expression dictionaries, where each dictionary contains an "expression" key representing a temporal expression.

    Returns:
        str or None: An ISO 8601 formatted date string representing the first cleaned 'Month day' expression, or None if no valid expression is found.
    """
    # Extract and clean 'Month day' format from each expression
    cleaned_extractions = [extract_and_clean_month_day(entry["expression"]) for entry in temporal_expressions]

    # Find the first cleaned extraction that is not None
    first_cleaned_extraction = next((extraction for extraction in cleaned_extractions if extraction), None)

    # If a valid cleaned extraction is found, convert it to ISO 8601 format
    if first_cleaned_extraction:
        iso8601_date = format_date_to_iso8601(first_cleaned_extraction)
        return iso8601_date

    # Return None if no valid expression is found
    return None


In [None]:
def approximate_geopoint(georeferences, api_key):
    """
    Approximates a geopoint based on a given set of georeferences and an API key.

    Args:
        georeferences (list): A list of georeference dictionaries, where each dictionary contains an "expression" key representing a location.
        api_key (str): API key for the geopoint extraction service.

    Returns:
        dict or None: A dictionary representing the extracted geopoint (latitude and longitude), or None if no valid geopoint could be approximated.
    """
    # Extract the location expression from the georeferences
    location = georeferences[0]["expression"] if georeferences else None

    # If no location is provided, set geopoint to None
    if location is None:
        geopoint = None
    else:
        # Attempt to extract geopoint using the provided API key
        geopoint = extract_geopoints(api_key, location)

    return geopoint


### Extract Authors and Location

In [None]:
def extract_authors(reuters_data):
    """
    Extracts author information from Reuters data.

    Args:
        reuters_data (str): Input Reuters data containing author information.

    Returns:
        list: A list of dictionaries, where each dictionary contains "First Name" and "Last Name" keys representing author properties.
    """
    # Initialize a list to store author objects
    authors = []

    # Use a more descriptive regular expression to match author data
    author_pattern = re.compile(r'<AUTHOR>(.*?)<\/AUTHOR>', re.DOTALL)

    # Iterate through matches of the author pattern in the Reuters data
    for author_match in re.finditer(author_pattern, reuters_data):
        author_data = author_match.group(1).strip()

        # Use a more robust method to split combined names
        names_match = re.search(r'by\s+([^,]+),\s*([^<]+)\s*$', author_data, re.IGNORECASE)
        if names_match:
            full_name = names_match.group(1).strip()
            first_name, _, last_name = full_name.rpartition(' ')
        else:
            first_name = last_name = ''

        # Create a dictionary for each author and add it to the list
        author_dict = {'First Name': first_name, 'Last Name': last_name}
        authors.append(author_dict)

    return authors


In [None]:
def extract_location(dateline_text):
    """
    Extracts location information from dateline text.

    Args:
        dateline_text (str): Input dateline text.

    Returns:
        str: Extracted location value.
    """
    try:
        # Create an XML element from the dateline text
        dateline_root = ET.fromstring(f"<DATELINE>{dateline_text}</DATELINE>")
        
        # Extract the location value (first part before the first comma) from the text
        location_value = dateline_root.text.split(',')[0].strip() if dateline_root.text is not None else ''
        
        return location_value
    except Exception as e:
        print(f"Error extracting location: {e}")
        return ''

### Indexes all documents from an SGML file into Elasticsearch.

In [10]:
def index_single_document(es, index_name, title, content, authors, date, geopoint, temporal_expressions, georeferences):
       
    document = {
        'title': title,
        'content': process_text(content),
        'authors':authors,
        'date': date,
        'geopoint': geopoint,
        'temporalExpressions': temporal_expressions,
        'georeferences': georeferences
    }

    es.index(index=index_name, body=document)

In [14]:
def index_all_documents(es, index_name, document_path, api_key):
    """
    Indexes all documents from an SGML file into Elasticsearch.

    Args:
        es: Elasticsearch client instance.
        index_name (str): Name of the Elasticsearch index.
        document_path (str): Path to the SGML document file.
        api_key (str): API key for geopoint extraction service.

    Returns:
        None
    """
    try:
        with open(document_path, "r", encoding="utf-8") as file:
            lines = file.readlines()

        document_content = ''.join(lines)

        # Use regular expressions to find all <REUTERS> elements
        reuters_pattern = re.compile(r'<REUTERS.*?>(.*?)<\/REUTERS>', re.DOTALL)
        reuters_matches = re.finditer(reuters_pattern, document_content)

        # Get the total number of <REUTERS> elements found
        num_reuters_elements = len(list(reuters_matches))
        print(f"Number of <REUTERS> elements found: {num_reuters_elements}")

        # Add a counter to limit the processing to 200 elements
        counter = 0

        for reuters_match in re.finditer(reuters_pattern, document_content):
            # Increment the counter
            counter += 1

            # Extract relevant information from each <REUTERS> element
            reuters_data = reuters_match.group(1).strip()

            title_match = re.search(r'<TITLE>(.*?)<\/TITLE>', reuters_data)
            title = title_match.group(1).strip() if title_match else ''

            body_match = re.search(r'<BODY>(.*?)<\/BODY>', reuters_data, re.DOTALL)
            content = body_match.group(1).strip() if body_match else ''

            authors = extract_authors(reuters_data)

            temporal_expressions = extract_temporal_expressions(content)

            date_match = re.search(r'<DATE>(.*?)<\/DATE>', reuters_data)
            date = date_match.group(1).strip() if date_match else None
            date = format_date(date)

            if not date:
                date = approximate_temporal_expressions(temporal_expressions)

            date_match = re.search(r'<DATELINE>(.*?)<\/DATELINE>', reuters_data)
            date_line = date_match.group(1).strip() if date_match else ''
            location_value = extract_location(date_line)
            georeferences = extract_georeferences(content)

            if not location_value and georeferences:
                geopoint = approximate_geopoint(georeferences, api_key)
            else:
                geopoint = extract_geopoints(api_key, location_value)

            # Index the document into Elasticsearch
            index_single_document(es, index_name, title, content, authors, date, geopoint, temporal_expressions, georeferences)

            # Limit the processing to 200 elements (for testing or other reasons)
            if counter == 200:
                break

        # Refresh the index after indexing all documents
        es.indices.refresh(index=index_name)

    except Exception as e:
        print(f"Error processing SGML: {e}")

In [15]:
collection_path = r"C:\Users\hp\OneDrive\Desktop\projectIR\reut2-009.sgm" 
index_all_documents(es, index_name, collection_path,api_key)

Number of <REUTERS> elements found: 1000
