# In [None]:
import spacy
import requests
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import openai  # Import OpenAI library

import os

# Load NLP Models
nlp = spacy.load("en_core_web_trf")  # Better than 'en_core_web_sm'
nlp_ner = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")  # For better entity similarity matching

# SECURITY: API credentials must never be hard-coded in source. They are read
# from the environment instead; export GOOGLE_API_KEY and OPENAI_API_KEY before
# running. Any key that was previously committed in this file should be
# treated as compromised and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = OPENAI_API_KEY

# Cache for geocoded locations: maps a location string to its (lat, lon) tuple
geo_cache = {}

"""In summary, the "extract_entities" function combines the strengths of two different NER approaches 
(spaCy and a transformer-based model) to extract geographic entities from a given text. 
By using both methods, the function aims to improve the coverage and accuracy of the entity extraction process.
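The final output is a list of unique entities found in the input text.
Note: this describes the disabled legacy implementation below; the active "extract_entities"
defined later in this file uses the OpenAI chat API instead."""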

"""def extract_entities(text):
    ""Extract Named Entities using both spaCy and Transformer-based NER models. "
    entities = set()

    # spaCy Named Entity Recognition (NER)
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LOC"]:  # Extract only geographic entities
            entities.add(ent.text.strip())

    # Transformer-based NER (aggregated output)
    transformer_entities = nlp_ner(text)
    for ent in transformer_entities:
        entities.add(ent['word'].strip())

    return list(entities)"""

import openai
import json
import faiss
import pickle
import re
import helper

def extract_entities(text, examples=None):
    """
    Extract named entities from the given text using OpenAI's language model and similar examples.

    Args:
        text: The text to extract entities from; sent as the user message.
        examples: Optional iterable of ``(example_text, example_entities)``
            pairs that are rendered into the system prompt as few-shot
            examples. ``None`` (the default) or an empty iterable sends the
            prompt without examples.

    Returns:
        A list of the unique entities reported by the model, in the order the
        model listed them. An empty list is returned when the reply contains
        no parseable JSON object or no ``"entities"`` list.
    """
    # Convert examples to a formatted string
    examples_text = "\n".join(
        f"Text: {ex[0]}\nExtracted Entities: {ex[1]}" for ex in (examples or [])
    )

    # Construct system prompt
    system_prompt = f"""
    You are tasked with extracting named entities from a given text. Named entities can be organizations, locations, dates, etc.
    The output should be in the following JSON format, between 3 backticks:
    ```
    {{"entities": ["ENTITY_A", "ENTITY_B"]}}
    ```
    Consider the following examples:
    {examples_text}
    """

    # Call OpenAI API
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text}
        ]
    )

    # Extract response text
    response_text = response['choices'][0]['message']['content']

    # The prompt asks for a bare ``` fence, but models frequently tag it as
    # ```json; accept both. If there is no fence at all, try the whole reply.
    match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
    payload = match.group(1) if match else response_text.strip()

    try:
        parsed = json.loads(payload)
    except (TypeError, ValueError):
        # Malformed or missing JSON is treated the same as "no entities found".
        return []

    entities = parsed.get("entities", []) if isinstance(parsed, dict) else []
    if not isinstance(entities, list):
        return []

    # De-duplicate while keeping the model's ordering (set() order is arbitrary).
    return list(dict.fromkeys(entities))


def main():
    """Process the test data, find similar training examples, and extract entities.

    Reads ``test_data.json`` and ``train_data.json`` (each indexed by a
    ``'text'`` key, the training data also by ``'entities'``), looks up the
    nearest training examples for every test text in the FAISS index stored in
    ``training_data_vectors.faiss``, and passes them to ``extract_entities``
    as few-shot examples. The per-text entity lists are pickled to
    ``predicted_entities_with_similarity.pkl``.
    """
    # Load data; context managers make sure the file handles are closed.
    with open("test_data.json", "r", encoding="utf-8") as test_file:
        test_data = json.load(test_file)
    with open("train_data.json", "r", encoding="utf-8") as train_file:
        train_data = json.load(train_file)

    # Load FAISS index
    index = faiss.read_index("training_data_vectors.faiss")

    train_texts = train_data['text']
    train_entities = train_data['entities']

    # Store predicted entities
    predicted_entities_with_similarity = []

    for text in test_data['text']:
        # Generate embedding
        # NOTE(review): assumes helper.get_embeddings_oai returns an array
        # with a .reshape method holding one vector for the one input text.
        embedding = helper.get_embeddings_oai([text])

        # Ensure 2D shape for FAISS search
        _, neighbour_ids = index.search(embedding.reshape(1, -1), 10)

        # FAISS pads the result with -1 when fewer than k neighbours exist;
        # skip those so they do not silently select the last training row.
        similar_examples = [
            (train_texts[i], train_entities[i])
            for i in neighbour_ids[0]
            if i >= 0
        ]

        # Extract entities and store results
        predicted_entities_with_similarity.append(
            extract_entities(text, similar_examples)
        )

    # Save extracted entities
    with open("predicted_entities_with_similarity.pkl", "wb") as f:
        pickle.dump(predicted_entities_with_similarity, f)





"""def semantic_match(name1, name2, threshold=0.85):
    "" Use Sentence Transformers for semantic similarity matching "
    similarity = util.pytorch_cos_sim(semantic_model.encode(name1, convert_to_tensor=True),
                                      semantic_model.encode(name2, convert_to_tensor=True))
    return similarity.item() >= threshold"""

import openai
import torch
from sentence_transformers import SentenceTransformer, util

# Load the Sentence Transformer model
# NOTE(review): `semantic_model` is already bound to a SentenceTransformer for
# the same model name earlier in this file; this line rebinds the name to a
# second instance. Confirm whether the reload is intentional.
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Set your OpenAI API key
# NOTE(review): repeats the `openai.api_key` assignment made earlier in this
# file, using the same OPENAI_API_KEY value.
openai.api_key = OPENAI_API_KEY

def get_openai_embedding(text):
    """Request an embedding for ``text`` from OpenAI and return it.

    Uses the ``text-embedding-ada-002`` model and returns the ``'embedding'``
    field of the first item in the response's ``'data'`` list.
    """
    api_result = openai.Embedding.create(
        input=text,
        model="text-embedding-ada-002",
    )
    first_item = api_result['data'][0]
    return first_item['embedding']

def semantic_match(name1, name2, threshold=0.85):
    """Decide whether two names are semantically similar.

    Two cosine similarities are computed, one from the local Sentence
    Transformer model and one from OpenAI embeddings, and their mean is
    compared against ``threshold``.

    Returns:
        True when the mean similarity is at least ``threshold``; False
        otherwise, and also False (after printing the error) if anything
        raises along the way.
    """
    pair = (name1, name2)
    try:
        # Local model: encode both names, then score them.
        st_first, st_second = [
            semantic_model.encode(name, convert_to_tensor=True) for name in pair
        ]
        score_local = util.pytorch_cos_sim(st_first, st_second).item()

        # OpenAI: fetch both raw vectors first, then convert and score them.
        oai_first, oai_second = [get_openai_embedding(name) for name in pair]
        score_remote = util.pytorch_cos_sim(
            torch.tensor(oai_first),
            torch.tensor(oai_second),
        ).item()

        # Mean of the two model scores decides the match.
        mean_score = (score_local + score_remote) / 2
        return mean_score >= threshold
    except Exception as err:
        print(f"Error in semantic_match: {err}")
        return False

# Example usage
"""if __name__ == "__main__":
    name1 = "John Doe"
    name2 = "Jonathan Doe"
    
    if semantic_match(name1, name2):
        print(f"{name1} and {name2} are semantically similar.")
    else:
        print(f"{name1} and {name2} are not semantically similar.")"""






"""def geocode_location(location):
    " Get geographical coordinates using Google Maps API with caching. "
    if location in geo_cache:
        return geo_cache[location]

    url = f"https://maps.googleapis.com/maps/api/geocode/json?address={location}&key={GOOGLE_API_KEY}"
    try:
        response = requests.get(url)
        data = response.json()
        if data["status"] == "OK":
            lat = data["results"][0]["geometry"]["location"]["lat"]
            lon = data["results"][0]["geometry"]["location"]["lng"]
            geo_cache[location] = (lat, lon)
            return lat, lon
    except Exception as e:
        print(f"Error fetching geolocation for {location}: {e}")
    return None"""


import requests
import logging
import openai

# Initialize the cache
#geo_cache = {}

# GOOGLE_API_KEY is configured once, earlier in this file. It used to be
# rebound here to OPENAI_API_KEY, which sent the OpenAI secret to Google as
# the `key` parameter of every geocoding request and replaced the real Google
# key. The name is intentionally NOT reassigned here.

# Set up logging
logging.basicConfig(level=logging.INFO)

def geocode_location(location):
    """Get geographical coordinates using Google Maps API with caching and OpenAI integration.

    Args:
        location: Free-form address or place name to geocode.

    Returns:
        A ``(lat, lon)`` tuple taken from the first geocoding result, or
        ``None`` when the request fails or the API status is not ``"OK"``.
        Successful lookups are stored in ``geo_cache`` and served from there
        on later calls. A non-OK status is logged and also passed to
        ``handle_openai_error``.
    """
    if location in geo_cache:
        logging.info(f"Cache hit for location: {location}")
        return geo_cache[location]

    try:
        # Pass the address through `params` so requests URL-encodes it;
        # interpolating it into the URL breaks on spaces, '&', '#', etc.
        # The timeout keeps a stalled connection from hanging a worker thread.
        response = requests.get(
            "https://maps.googleapis.com/maps/api/geocode/json",
            params={"address": location, "key": GOOGLE_API_KEY},
            timeout=10,
        )
        response.raise_for_status()  # Raise an error for bad responses
        data = response.json()

        if data["status"] == "OK":
            point = data["results"][0]["geometry"]["location"]
            lat, lon = point["lat"], point["lng"]
            geo_cache[location] = (lat, lon)
            logging.info(f"Coordinates for {location}: ({lat}, {lon})")
            return lat, lon

        logging.error(f"Error in response: {data['status']}")
        handle_openai_error(data['status'])
    except requests.exceptions.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts.
        logging.error(f"Request error for {location}: {e}")
    except Exception as e:
        # Unexpected payload shape (missing keys, empty results, bad JSON).
        logging.error(f"Unexpected error fetching geolocation for {location}: {e}")

    return None

def handle_openai_error(status):
    """Ask OpenAI to analyze a geocoding error status and log its reply.

    Any exception raised while contacting OpenAI or reading the reply is
    logged as an error instead of being propagated.
    """
    try:
        openai.api_key = OPENAI_API_KEY
        chat_messages = [
            {"role": "user", "content": f"Analyze the geocoding error: {status}"}
        ]
        completion = openai.ChatCompletion.create(model="gpt-4", messages=chat_messages)
        analysis = completion['choices'][0]['message']['content']
        logging.info(f"OpenAI response: {analysis}")
    except Exception as err:
        logging.error(f"OpenAI error: {err}")

# Example usage
"""if __name__ == "__main__":
    location = "1600 Amphitheatre Parkway, Mountain View, CA"
    coordinates = geocode_location(location)
    if coordinates:
        print(f"Coordinates for {location}: {coordinates}")
    else:
        print(f"Could not retrieve coordinates for {location}.")"""





def generate_context(entity):
    """Ask the OpenAI chat API for a brief description of ``entity``.

    Returns the text of the first choice in the reply, or ``None`` (after
    printing the error) when the request or the reply handling raises.
    """
    prompt = f"Provide a brief description of {entity}."
    try:
        completion = openai.ChatCompletion.create(
            messages=[{"role": "user", "content": prompt}],
            model="gpt-3.5-turbo",  # or any other model you prefer
        )
        first_choice = completion['choices'][0]
        return first_choice['message']['content']
    except Exception as err:
        print(f"Error generating context for {entity}: {err}")
        return None

"""def process_text(text):
    " Process input text to extract, normalize, and geocode geospatial entities. "
    extracted_entities = extract_entities(text)
    geo_entities = {}

    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(geocode_location, entity): entity for entity in extracted_entities}
        for future in futures:
            entity = futures[future]
            coordinates = future.result()
            if coordinates:
                # Check for duplicate variations using semantic similarity
                matched = None
                for existing in geo_entities:
                    if semantic_match(existing, entity):
                        matched = existing
                        break
                
                if matched:
                    continue  # Skip redundant entity
                geo_entities[entity] = {
                    "coordinates": coordinates,
                    "context": generate_context(entity)  # Generate context for the entity
                }
    
    return geo_entities"""



import logging
from concurrent.futures import ThreadPoolExecutor

# Assuming these functions are defined elsewhere in your code
# from your_module import extract_entities, geocode_location, semantic_match, generate_context

# Set up logging
# NOTE(review): the same basicConfig call already appears earlier in this
# file; this repeat looks like a leftover from a separate notebook cell.
logging.basicConfig(level=logging.INFO)

def process_text(text, examples=None):
    """Process input text to extract, normalize, and geocode geospatial entities.

    Args:
        text: Input text to analyze.
        examples: Optional few-shot examples forwarded to ``extract_entities``
            as ``(text, entities)`` pairs. Defaults to no examples.

    Returns:
        A dict mapping each retained entity to
        ``{"coordinates": ..., "context": ...}``. Entities that could not be
        geocoded are left out, as are entities that ``semantic_match`` judges
        to duplicate one already retained.
    """
    # extract_entities takes the examples as a second positional argument;
    # calling it with the text alone raises TypeError.
    extracted_entities = extract_entities(text, examples if examples is not None else [])
    geo_entities = {}

    with ThreadPoolExecutor() as executor:
        # Geocode all entities concurrently; results are consumed in
        # submission order below.
        futures = {
            executor.submit(geocode_location, entity): entity
            for entity in extracted_entities
        }
        for future, entity in futures.items():
            try:
                coordinates = future.result()
                if not coordinates:
                    continue

                # Check for duplicate variations using semantic similarity
                matched = next(
                    (known for known in geo_entities if semantic_match(known, entity)),
                    None,
                )
                if matched:
                    logging.info(f"Duplicate entity found: {entity} is similar to {matched}. Skipping.")
                    continue  # Skip redundant entity

                # Generate context for the entity
                geo_entities[entity] = {
                    "coordinates": coordinates,
                    "context": generate_context(entity),
                }
                logging.info(f"Processed entity: {entity} with coordinates: {coordinates}")
            except Exception as e:
                # One failing entity must not abort the remaining ones.
                logging.error(f"Error processing entity {entity}: {e}")

    return geo_entities

# Example usage
"""if __name__ == "__main__":
    text = "Visit the Eiffel Tower in Paris and the Statue of Liberty in New York."
    result = process_text(text)
    print(result)"""

if __name__ == "__main__":
    # Evaluation inputs and their expected geographic entities.
    texts = [
        "The wildfire in California has spread rapidly. Evacuations are in place for Los Angeles County.",
        "New York and Washington DC are experiencing heavy snowfall.",
        "A major earthquake struck Tokyo, Japan early this morning."
    ]
    ground_truths = [
        ["California", "Los Angeles County"],
        ["New York", "Washington DC"],
        ["Tokyo", "Japan"]
    ]

    main()

    # Sample inputs for the single-call demos. These names were used without
    # being defined; the values come from the disabled example snippets
    # earlier in this file.
    location = "1600 Amphitheatre Parkway, Mountain View, CA"
    name1 = "John Doe"
    name2 = "Jonathan Doe"

    coordinates = geocode_location(location)
    if coordinates:
        print(f"Coordinates for {location}: {coordinates}")
    else:
        print(f"Could not retrieve coordinates for {location}.")

    if semantic_match(name1, name2):
        print(f"{name1} and {name2} are semantically similar.")
    else:
        print(f"{name1} and {name2} are not semantically similar.")

    # evaluate_model is not defined in this file; run the evaluation only if
    # something else has provided it, rather than failing with NameError.
    evaluate = globals().get("evaluate_model")
    if evaluate is None:
        print("evaluate_model is not defined; skipping evaluation.")
    else:
        # The original passed the undefined name `test_texts`; the list
        # defined above is `texts`.
        precision, recall, f1, accuracy = evaluate(texts, ground_truths)
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"Accuracy: {accuracy:.4f}")

    sample_text = "The wildfire in Chennai has spread rapidly. Evacuations are in place for Karnataka state."
    # The original passed the undefined name `text` instead of `sample_text`.
    results = process_text(sample_text)
    print("Geospatial Entities and Coordinates:", results)