# Import libraries needed


In [7]:
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from elasticsearch import Elasticsearch
import spacy
import os
from datetime import datetime
import re

# Define Elasticsearch and  index mapping


In [2]:
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200, 'scheme': 'http'}],basic_auth=('osama', 'osama123'))
index_name = "testing_reuter_news_index"


NameError: name 'Elasticsearch' is not defined

In [4]:
    # Nested.

index_mapping = {
    'mappings': {
        'properties': {
            "title": {
                "type": "text",
                "analyzer": "title_analyzer",
                "search_analyzer": "standard"
            },
            'content': {
                'type': 'text',
                'analyzer': 'content_analyzer'
            },
            "authors": {
                "type": "nested",
                "properties": {
                    "first_name": {
                        "type": "text"
                    },
                    "last_name": {
                        "type": "text"
                    }
                }
            },
            "date": {
                "type": "date"
            },
            "geopoint": {
                "type": "geo_point"
            },
            "temporal_expressions": {
                "type": "text"
            },
            "georeferences": {
                "type": "text"
            }
        },
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "title_analyzer": {
                    "type": "custom",
                    "tokenizer": "autocomplete_tokenizer",
                    "filter": [
                        "lowercase"
                    ]
                },
                "content_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": [
                        "html_strip"
                    ],
                    "filter": [
                        "lowercase",
                        "stop",
                        "length",
                        "porter_stem"
                    ]
                }
            },
            "tokenizer": {
                "autocomplete_tokenizer": {
                    "type": "edge_ngram", # example => ex , am , pl , le
                    "min_gram": 2,
                    "max_gram": 10,
                    "token_chars": [
                        "letter",
                        "digit"
                    ]
                }
            }
        }
    }
}

# index_name = "testing_reuter_news_index"
# es.indices.create(index=index_name, body=index_mapping)

In [None]:
# Load spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Data Extracting for all fields


* Use the `extract date` function to extract dates from data and make sure it is applicable with Elasticsearch.

In [None]:
def extract_date(reuters_tag):
    """
    Parameters:
    - reuters_tag: A BeautifulSoup Tag object representing a 'reuters' element with the date information.

    Return:
    - A formatted date string in Elasticsearch-friendly format if the 'date' tag is present; otherwise, returns None.
    """
    date_tag = reuters_tag.find('date')
    
    if date_tag:
        date_text = date_tag.get_text()
        parsed_date = datetime.strptime(date_text, "%d-%b-%Y %H:%M:%S.%f")
        elasticsearch_date = parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

        return elasticsearch_date

    return None

* Use `extract_first_last_names` function to split the author name into the first and last names from the `author tag` in the data.

* Use `extract_author` function to extract the author's name from the data.

In [None]:
def extract_first_last_names(author_tag):
    # Extract author information from the given tag
    author_info = author_tag.get_text() if author_tag else None
    
    # Check if author_info is not None before proceeding
    if author_info:
        # Define a case-insensitive regular expression pattern to extract the full author's name
        pattern = re.compile(r'by (.*?),', re.IGNORECASE)

        # Use the regular expression to find the match
        match = re.search(pattern, author_info)

        # Extract the full author's name
        full_name = match.group(1) if match else None

        # Split the full name into first and last names
        if full_name:
            names = full_name.split()
            first_name = names[0] if names else None
            last_name = names[-1] if len(names) > 1 else None
            return first_name, last_name
    return None, None

def extract_authors(reuters_tag):
    author_tag = reuters_tag.find('author')
    return extract_first_last_names(author_tag)

* Use `extract_article_title` function to extract the article's title from the data `title` tag. 

In [None]:
def extract_article_title(reuters_tag):
    title_tag = reuters_tag.find('title')
    return title_tag.get_text() if title_tag else None

* Use `clean_text` function to remove unnecessary spaces and some special characters, like `\n`.

* Use `extract_content` function to extract the content of news content from the data `text` tag.

In [None]:
def clean_text(raw_text):
    # Remove control characters and extra spaces
    cleaned_text = re.sub(r'[\x00-\x1F\x7F-\x9F]+', ' ', raw_text)

    # Remove leading and trailing whitespaces
    cleaned_text = cleaned_text.strip()

    return cleaned_text

def extract_content(reuters_tag):
    text_tag = reuters_tag.find('text')
    raw_content = text_tag.get_text() if text_tag else None

    # Use clean_text function to clean the extracted content
    cleaned_content = clean_text(raw_content) if raw_content else None

    return cleaned_content

* Use `extract_georeferences` to extract places and then send result of this function to `extract_geopoints` to get the location of place using

* `geocode` method from `geopy` library to get latitude and longitude of this place.

In [None]:
def extract_georeferences(reuters_tag):
    places_tag = reuters_tag.find('places')
    return [place.get_text() for place in places_tag.find_all('d')] if places_tag else None

* Use `extract_temporal_expressions` to extract temporal expressions from the content of news using `spacy` library.
* Analysis the text, the ent read text and get entites of text and return text.lebl == 'DATE' to return temporal expressions.

In [None]:
def extract_temporal_expressions(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == 'DATE']

* Use `extract_geopoints` to convert place name to coordinates by get the location of place using `geocode` method from `geopy` library 
* to return latitude and longitude of this place.

In [None]:
def extract_geopoints(georeferences):
    geolocator = Nominatim(user_agent="geo_app")
    geopoints = []
    
    for place_name in georeferences:
        location = geolocator.geocode(place_name)
        if location:
            geopoints.append({'latitude': location.latitude, 'longitude': location.longitude})
    return geopoints

# Data indexing

- Excluding documents that do not contain a `title` or `content`


In [None]:
def index_document(title, content, first_name, last_name, date, georeferences, temporal_expressions):
    try:
        if title and content:
            document = {
                'title': title,
                'content': content,
                'date': date,
                'authors': [{"first_name": first_name, "last_name": last_name}],
                'geopoint': [{'lat': point['latitude'], 'lon': point['longitude']} for point in extract_geopoints(georeferences)],
                'temporal_expressions': [{'expression': expr} for expr in temporal_expressions],
                'georeferences': [{'reference': ref} for ref in georeferences]
            }
            
            # print(document)
            es.index(index='testing_reuter_news_index', body=document)
            print(f"Document indexed successfully: {title}")
        else:
            print("Skipping document due to missing required fields.")
    except Exception as e:
        print(f"Error indexing document: {e}")


In [None]:
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
    soup = BeautifulSoup(content, 'html.parser')
    reuters_tags = soup.find_all('reuters')
    
    for reuters_tag in reuters_tags:
        date = extract_date(reuters_tag)
        first_name, last_name = extract_authors(reuters_tag)
        title = extract_article_title(reuters_tag)
        file_content = extract_content(reuters_tag)
        georeferences = extract_georeferences(reuters_tag)
        temporal_expressions = extract_temporal_expressions(file_content)
        
        if georeferences:
            geopoints = extract_geopoints(georeferences)
        else:
            geopoints = None
            
        # Call index_document with the adjusted structure
        index_document(title, file_content, first_name, last_name, date, georeferences, temporal_expressions)


In [None]:
def read_folder(folder_path):
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".sgm"):
            file_path = os.path.join(folder_path, filename)
            read_file(file_path)

# Example of use


In [None]:
data_folder = "./testing data/"
read_folder(data_folder)

Error indexing document: BadRequestError(400, 'document_parsing_exception', "[1:3204] failed to parse field [temporal_expressions] of type [text] in document with id 'xd2VpowB-Py2aUC-deyt'. Preview of field's value: '{expression=the week}'")
Error indexing document: BadRequestError(400, 'document_parsing_exception', "[1:766] failed to parse field [georeferences] of type [text] in document with id 'xt2VpowB-Py2aUC-euwb'. Preview of field's value: '{reference=usa}'")
Error indexing document: BadRequestError(400, 'document_parsing_exception', "[1:659] failed to parse field [georeferences] of type [text] in document with id 'x92VpowB-Py2aUC-fuw0'. Preview of field's value: '{reference=usa}'")
Error indexing document: BadRequestError(400, 'document_parsing_exception', "[1:3245] failed to parse field [temporal_expressions] of type [text] in document with id 'yN2VpowB-Py2aUC-huwg'. Preview of field's value: '{expression=this week}'")
Error indexing document: BadRequestError(400, 'document_par

KeyboardInterrupt: 

In [8]:
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200, 'scheme': 'http'}],basic_auth=('osama', 'osama123'))
index_name = "testing_reuter_news_index4"


In [9]:
def autocomplete_service(query_prefix):
    body = {
        "suggest": {
            "title-suggest": {
                "prefix": query_prefix,
                "completion": {
                    "field": "title",
                    "size": 10
                }
            }
        }
    }

    result = es.search(index=index_name, body=body)
    suggestions = result["suggest"]["title-suggest"][0]["options"]
    return [suggestion["_source"]["title"] for suggestion in suggestions]

# Example Usage:
query_prefix = "doc"
suggestions = autocomplete_service(query_prefix)
print("Autocomplete Suggestions:", suggestions)


BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'Field [title] is not a completion suggest field')

# test


In [20]:
# In the first phase, insert at least two characters, and the output should be documents that contain these words.
# for example, Ame = expected output: America with another word based on my data.

# Note : size by default 10, and we should edit the min_gram to 3 in mappings and this to `Autocomplete`
# to handle misspelled words probelm use `fuzziness`.

* In the first phase, insert at least two characters, and the output should be documents that contain these words.
* for example, Ame = expected output: America with another word based on my data.

*  Note : size by default 10, and we should edit the min_gram to 3 in mappings and this to `Autocomplete`
* to handle misspelled words probelm use `fuzziness`.

In [1]:
# Autocomplete Query
autocomplete_query = {
    "size": 10, # but is by default be 10 lines.
    "query": {
        "match": {
            "title": {
                "query": "Ame",
                "analyzer": "title_analyzer"
            }
        }
    }
}

# Execute the query
result = es.search(index="testing_reuter_news_index4", body=autocomplete_query)

# Extract and print the suggestions
suggestions = [hit["_source"]["title"] for hit in result["hits"]["hits"]]

suggestions

NameError: name 'es' is not defined

In [80]:
from datetime import datetime
from elasticsearch import Elasticsearch

# Sample user input
user_input = "SWAP DEALERS UNVEIL"

# Sample date range
start_date = "1987-03-04T09:30:50.970000"
end_date = "1987-03-05T09:30:50.970000"

# Sample geopoint coordinates
user_latitude = 6.3110548
user_longitude = 20.5447525

# Query for Relevant Documents
relevant_documents_query = {
    "query": {
        "bool": {
            "must": [
                {
                    "multi_match": {
                        "query": user_input,
                        "fields": ["title^2", "content"]
                    }
                },
                {
                    "range": {
                        "date": {
                            "gte": start_date,
                            "lte": end_date
                        }
                    }
                },
                {
                    "geo_distance": {
                        "distance": '10km',
                        "geopoint": {
                            "lat": user_latitude,
                            "lon": user_longitude
                        }
                    }
                }
            ]
        }
    },
    "sort": [
        {"date": {"order": "desc"}},
        {
            "_geo_distance": {
                "geopoint": {
                    "lat": user_latitude,
                    "lon": user_longitude
                },
                "order": "asc",
                "unit": "km"
            }
        }
    ]
}


# Execute the query
result = es.search(index="testing_reuter_news_index4", body=relevant_documents_query)

# Extract and print the relevant documents
for hit in result["hits"]["hits"]:
    print(f"Title: {hit['_source']['title']}, Date: {hit['_source']['date']}, Geopoint: {hit['_source']['geopoint']}")
# Extract and print the relevant documents with title and date
counter = 0

# for hit in result["hits"]["hits"]:
#     title = hit["_source"]["title"]
#     date = hit["_source"]["date"]
#     counter+=1
#     print(f"Title {counter}: {title}, Date: {date}")

Title: JAGUAR SEES STRONG GROWTH IN NEW MODEL SALES, Date: 1987-03-05T09:07:54.170000Z, Geopoint: [{'lat': 6.3110548, 'lon': 20.5447525}]
Title: SWAP DEALERS UNVEIL STANDARD CONTRACT, Date: 1987-03-04T09:30:50.970000Z, Geopoint: [{'lat': 6.3110548, 'lon': 20.5447525}, {'lat': 39.7837304, 'lon': -100.445882}]


In [62]:

# test = {
#     "query": {
#         "multi_match": {
#             "query": "the private exporters",
#             "fields": ["title^2", "content"]
#         }
#     },
#     "sort": [
#         {"date": {"order": "desc"}}
#     ]
# }

counter = 0

# for hit in result["hits"]["hits"]:
#     title = hit["_source"]["title"]
#     date = hit["_source"]["date"]
#     counter+=1
#     print(f"Title {counter}: {title}, Date: {date}")

ObjectApiResponse({'testing_reuter_news_index4': {'mappings': {'properties': {'authors': {'type': 'nested', 'properties': {'first_name': {'type': 'text'}, 'last_name': {'type': 'text'}}}, 'content': {'type': 'text', 'analyzer': 'content_analyzer'}, 'date': {'type': 'date'}, 'geopoint': {'type': 'geo_point'}, 'georeferences': {'type': 'nested', 'properties': {'reference': {'type': 'text'}}}, 'temporal_expressions': {'type': 'nested', 'properties': {'temporal': {'type': 'text'}}}, 'title': {'type': 'text', 'analyzer': 'title_analyzer', 'search_analyzer': 'standard'}}}}})

In [70]:
# Get the mapping for a specific field (e.g., "date")
field_mapping = es.indices.get_field_mapping(index=index_name, fields="geo_distance")

# Print the mapping for the specified field
print(field_mapping)


{'testing_reuter_news_index4': {'mappings': {}}}


In [None]:
    # "query": {
    #     "bool": {
    #         "must": [
    #             {
    #                 "multi_match": {
    #                     "query": "user_input",
    #                     "fields": ["title^2", "content"]
    #                 }
    #             },
    #             {
    #                 "range": {
    #                     "date": {
    #                         "gte": "start_date",
    #                         "lte": "end_date"
    #                     }
    #                 }
    #             },
    #             {
    #                 "geo_distance": {
    #                     "distance": "distance",
    #                     "geopoint": {
    #                         "lat": "latitude",
    #                         "lon": "longitude"
    #                     }
    #                 }
    #             }
    #         ]
    #     }
    # },
    # "sort": [
    #     {"date": {"order": "desc"}}
    # ]

In [91]:

# Query for Top-10 Mentioned Georeferences
top_georeferences_query = {
    "query": {
        "bool": {
            "must": [
                {
                    "multi_match": {
                        "query": user_input,
                        "fields": ["title^2", "content"]
                    }
                },
                {
                    "range": {
                        "date": {
                            "gte": start_date,
                            "lte": end_date
                        }
                    }
                },
                {
                    "geo_distance": {
                        "distance": '10km',
                        "geopoint": {
                            "lat": user_latitude,
                            "lon": user_longitude
                        }
                    }
                }
            ]
        }
    },
    "aggs": {
        "top_georeferences": {
            "terms": {
                "field": "georeferences.reference.keyword",  # Assuming georeferences is a nested field
                "size": 10  # Retrieve the top 10 georeferences
            }
        }
    }
}

# Execute the query
result = es.search(index='testing_reuter_news_index4', body=top_georeferences_query)

# Extract and print the top 10 mentioned georeferences
top_georeferences = result["aggregations"]["top_georeferences"]["buckets"]

for georeference in top_georeferences:
    print(f"Georeference: {georeference['key']}, Count: {georeference['doc_count']}")


In [90]:
simple_query = {
    "query": {
        "bool": {
            "must": [
                {
                    "multi_match": {
                        "query": user_input,
                        "fields": ["title^2", "content"]
                    }
                },
                {
                    "range": {
                        "date": {
                            "gte": start_date,
                            "lte": end_date
                        }
                    }
                },
                {
                    "geo_distance": {
                        "distance": '10km',
                        "geopoint": {
                            "lat": user_latitude,
                            "lon": user_longitude
                        }
                    }
                }
            ]
        }
    }
}
result = es.search(index='testing_reuter_news_index4', body=simple_query)

for hit in result["hits"]["hits"]:
    print(f"Title: {hit['_source']['title']}, Date: {hit['_source']['date']}, Geopoint: {hit['_source']['geopoint']}")

Title: SWAP DEALERS UNVEIL STANDARD CONTRACT, Date: 1987-03-04T09:30:50.970000Z, Geopoint: [{'lat': 6.3110548, 'lon': 20.5447525}, {'lat': 39.7837304, 'lon': -100.445882}]
Title: JAGUAR SEES STRONG GROWTH IN NEW MODEL SALES, Date: 1987-03-05T09:07:54.170000Z, Geopoint: [{'lat': 6.3110548, 'lon': 20.5447525}]


In [97]:
from elasticsearch import Elasticsearch

# Replace with your Elasticsearch cluster URL
# es = Elasticsearch(['http://your_elasticsearch_url:9200'])

# Index name
index_name = "testing_reuter_news_index4"

# User input (partial title)
user_input = "Ame"

# Autocomplete Query with Fuzzy Matching
autocomplete_query = {
    "query": {
        "match": {
            "title": {
                "query": user_input,
                "fuzziness": "AUTO",  # Enable fuzzy matching
                "prefix_length": 2,   # Consider a prefix length of 2 characters
                "max_expansions": 10  # Limit the number of expansions for efficiency
            }
        }
    }
}

# Execute the query
result = es.search(index=index_name, body=autocomplete_query)

# Extract and print the suggested titles
suggested_titles = [hit["_source"]["title"] for hit in result["hits"]["hits"]]
print("Suggested Titles:", suggested_titles)


Suggested Titles: ['AMERICAN BRANDS INC <AMB> FILES WITH SEC', 'AMERICAN DYNAMICS <AMDC> TO SELL 51 PCT STAKE', 'AMERICAN CITY <AMBJ> SETS INITIAL PREFERRED DIV', 'MOST AMERICAN AGGREGATES <AMAG> STOCK ACQUIRED', 'AMEX HAS RECORD SEAT SALE', 'AMERTEK INC <ATEKF> YEAR NET', 'AMERTEK INC <ATEKF> 1ST QTR NET', 'AMERON INC <AMN> QTLY DIVIDEND', 'AMEX STOCK PRICES CLOSE LOWER', 'ANGLO AMERICAN <AIVJ.J> YEAR TO DEC 31']


: 

In [21]:
from datetime import datetime

# Replace with your Elasticsearch cluster URL

# User query parameters
user_query = "ARGENTINE"
temporal_expression = "February"
georeference = "argentina"

# Query for Relevant Documents
relevant_documents_query = {
    "query": {
        "bool": {
            "must": [
                {
                    "multi_match": {
                        "query": user_query,
                        "fields": ["title^2", "content"]
                    }
                },
                {
                    "range": {
                        "date": {
                            "gte": datetime.now().isoformat(),
                            "time_zone": "+00:00"  # Adjust time zone based on your data
                        }
                    }
                },
                {
                    "match": {
                        "georeferences.reference": georeference
                    }
                }
            ]
        }
    },
    "sort": [
        {"_score": {"order": "desc"}},  # Sort by relevance score
        {"date": {"order": "desc"}}  # Sort by date in descending order
    ]
}

# Execute the query
result = es.search(index="testing_reuter_news_index4", body=relevant_documents_query)

# Extract and print the relevant documents
for hit in result["hits"]["hits"]:
    print(hit["_source"]["title"], hit["_source"]["date"])


# Step 1

In [None]:
from flask import Flask, render_template,request
from elasticsearch import Elasticsearch

def initialize_elasticsearch():
    return Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}], basic_auth=('osama', 'osama123'))

# Initialize Elasticsearch connection
es = initialize_elasticsearch()

app = Flask(__name__)


@app.route("/", methods=["GET"])
def home():
    return render_template("index.html")

@app.route("/autocomplete", methods=["GET"])
def autocomplete():
    query = request.args.get("query")
    results = es.search(index="testing_reuter_news_index4", body={"query": {"match_phrase_prefix": {"title": query}}})

    suggestions = [hit["_source"]["title"] for hit in results["hits"]["hits"]]

    return {"suggestions": suggestions}
if __name__ == "__main__":
    # Initialize Elasticsearch connection
    es = initialize_elasticsearch()

    # Run the Flask application
    app.run()
