# This notebook extracts dataframes of entities from the documents

In [2]:
import pandas as pd
import json
import csv


In [5]:
# Location of the entity table, relative to the notebook's working directory.
ent_path = '../03_Output/00_GPT KGs/Entities.csv'
# The file is parsed as semicolon-separated.
entities = pd.read_csv(ent_path, sep=';')

In [6]:
# Location of the relation table, relative to the notebook's working directory.
rel_path = '../03_Output/00_GPT KGs/Relations.csv'
# The file is parsed as semicolon-separated.
relations = pd.read_csv(rel_path, sep=';')

In [3]:
import pandas as pd
import re

def extract_entities_from_documents(documents):
    """
    Extract entity mentions from documents into a DataFrame.

    Every regex match becomes one row with the columns
    entity, doc_id, character_index, large_extract.

    Parameters:
    documents (iterable of tuples): Pairs of (doc_id, document_text).

    Returns:
    pd.DataFrame: One row per mention. `entity` is the matched text exactly as
        it appears in the document, `character_index` is the match start offset,
        and `large_extract` is the slice of the document from 50 characters
        before the match start up to 50 characters after the match start
        (clipped to the document bounds). The four columns are always present,
        even when no mention was found.
    """
    # Define a simple pattern for entity extraction (for demonstration purposes)
    # This should be replaced with a more sophisticated entity recognition approach
    entity_pattern = re.compile(r'\b(Sustainable Energy Academy|energy|academy|module)\b', re.IGNORECASE)
    columns = ['entity', 'doc_id', 'character_index', 'large_extract']

    data = []

    for doc_id, text in documents:
        for match in entity_pattern.finditer(text):
            char_index = match.start()
            extract_start = max(char_index - 50, 0)
            extract_end = min(char_index + 50, len(text))

            data.append({
                'entity': match.group(),
                'doc_id': doc_id,
                'character_index': char_index,
                'large_extract': text[extract_start:extract_end]
            })

    # Pinning the columns keeps the schema stable when `data` is empty; without
    # it the result would have no columns and column lookups downstream
    # (e.g. groupby('entity')) would raise KeyError.
    return pd.DataFrame(data, columns=columns)

# Example usage: each document is a (doc_id, text) pair.
documents = [
    ('SEH1', "The Sustainable Energy Academy offers various modules on energy efficiency."),
    ('2', "One of the key focuses of the academy is sustainable energy solutions."),
]

# One row per entity mention found in the sample documents.
df = extract_entities_from_documents(documents=documents)
print(df)


                       entity  doc_id  character_index  \
0  Sustainable Energy Academy       1                4   
1                      energy       1               57   
2                     academy       2               30   
3                      energy       2               53   

                                       large_extract  
0  The Sustainable Energy Academy offers various ...  
1  tainable Energy Academy offers various modules...  
2  One of the key focuses of the academy is susta...  
3   of the key focuses of the academy is sustaina...  


In [None]:
import pandas as pd
import re
import json

# Define a simple pattern for entity extraction (this can be replaced with a more sophisticated entity recognition approach)
entity_pattern = re.compile(r'\b(Sustainable Energy Academy|energy|academy|module)\b', re.IGNORECASE)

# NOTE: this redefines extract_entities_from_documents from the earlier cell;
# once this cell has run, this version is the one in effect.
def extract_entities_from_documents(documents):
    """
    Extract entity mentions from documents using the module-level `entity_pattern`.

    Parameters:
    documents (iterable of tuples): Pairs of (doc_id, document_text).

    Returns:
    pd.DataFrame: One row per match with the columns entity (matched text as it
        appears in the document), doc_id, character_index (match start offset)
        and large_extract (document slice from 50 characters before the match
        start up to 50 characters after the match start, clipped to the
        document bounds). The columns are always present, even with no matches.
    """
    columns = ['entity', 'doc_id', 'character_index', 'large_extract']
    data = []

    for doc_id, text in documents:
        for match in entity_pattern.finditer(text):
            char_index = match.start()
            extract_start = max(char_index - 50, 0)
            extract_end = min(char_index + 50, len(text))

            data.append({
                'entity': match.group(),
                'doc_id': doc_id,
                'character_index': char_index,
                'large_extract': text[extract_start:extract_end]
            })

    # Pinning the columns keeps the schema stable when `data` is empty; without
    # it the result would have no columns and column lookups downstream
    # (e.g. groupby('entity')) would raise KeyError.
    return pd.DataFrame(data, columns=columns)

def get_sub_extracts(df, context_chars=10, extract_lead=50):
    """
    Cut a short context window around each entity mention out of its large extract.

    Parameters:
    df (pd.DataFrame): Needs the columns entity, doc_id and large_extract. When a
        character_index column is also present it is used to pick the right
        occurrence of the entity inside large_extract.
    context_chars (int): Characters kept on each side of the mention (default 10).
    extract_lead (int): Number of characters that large_extract keeps before the
        mention. The default of 50 matches the window used by
        extract_entities_from_documents in this notebook.

    Returns:
    pd.DataFrame: Columns entity, doc_id, sub_extract; one row per input row whose
        entity text occurs in its large_extract. The columns are always present,
        even when the result is empty.
    """
    columns = ['entity', 'doc_id', 'sub_extract']
    has_position = 'character_index' in df.columns
    sub_extracts = []

    for _, row in df.iterrows():
        entity = row['entity']
        large_extract = row['large_extract']

        # Case-insensitive literal search for the entity text.
        mention_pattern = re.compile(re.escape(entity), re.IGNORECASE)
        occurrences = list(mention_pattern.finditer(large_extract))
        if not occurrences:
            continue

        position = row['character_index'] if has_position else None
        if position is not None and pd.notna(position):
            # The large extract can contain the entity text more than once
            # (e.g. "energy" also occurs inside "Sustainable Energy Academy").
            # The mention this row describes sits `extract_lead` characters into
            # the extract, or at `character_index` when the extract was clipped
            # at the start of the document, so take the closest occurrence.
            expected_offset = min(int(position), extract_lead)
            match = min(occurrences, key=lambda m: abs(m.start() - expected_offset))
        else:
            # No position information: fall back to the first occurrence.
            match = occurrences[0]

        start = max(match.start() - context_chars, 0)
        end = min(match.end() + context_chars, len(large_extract))
        sub_extracts.append({
            'entity': entity,
            'doc_id': row['doc_id'],
            'sub_extract': large_extract[start:end]
        })

    # Explicit columns keep the schema stable when nothing was found.
    return pd.DataFrame(sub_extracts, columns=columns)

def synthesize_long_description(sub_extracts_df):
    """
    Build one long description per entity by joining its sub-extracts with spaces.

    Parameters:
    sub_extracts_df (pd.DataFrame): Needs the columns entity and sub_extract.

    Returns:
    pd.DataFrame: Columns entity and long_description, one row per distinct entity.
    """
    per_entity = sub_extracts_df.groupby('entity')['sub_extract']
    description_table = per_entity.apply(lambda parts: ' '.join(parts)).reset_index()
    description_table.columns = ['entity', 'long_description']
    return description_table

def extract_relations(sub_extracts_df):
    """
    Placeholder relation extractor.

    Emits one dummy 'related_to' relation per row of `sub_extracts_df`; real
    extraction logic still has to be implemented for the project's criteria.

    Parameters:
    sub_extracts_df (pd.DataFrame): Needs an entity column.

    Returns:
    pd.DataFrame: One row per input row with the keys entity, related_entity
        and relation.
    """
    # Dummy relation extraction for demonstration
    dummy_relations = [
        {
            'entity': record['entity'],
            'related_entity': 'Dummy Related Entity',
            'relation': 'related_to'
        }
        for _, record in sub_extracts_df.iterrows()
    ]
    return pd.DataFrame(dummy_relations)

def add_metadata_to_entities(df, long_descriptions, thumbnail_links):
    """
    Assemble one metadata record per distinct entity.

    Parameters:
    df (pd.DataFrame): Entity mentions; needs the columns entity and doc_id.
    long_descriptions (pd.DataFrame): Needs the columns entity and long_description.
    thumbnail_links (dict): Maps entity text to a thumbnail URL; entities that
        are not in the dict get an empty string.

    Returns:
    list of dict: Each record holds entity, document_ids (unique doc ids for
        that entity) and thumbnail_link, plus long_description when
        `long_descriptions` has a row for the entity.
    """
    records = []

    for name, mentions in df.groupby('entity'):
        record = {
            'entity': name,
            'document_ids': mentions['doc_id'].unique().tolist(),
            'thumbnail_link': thumbnail_links.get(name, ''),
        }

        # Only attach a description when one exists; the first matching row wins.
        matching = long_descriptions[long_descriptions['entity'] == name]
        if len(matching) > 0:
            record['long_description'] = matching['long_description'].values[0]

        records.append(record)

    return records

def export_entities_metadata(entities_metadata, csv_path, json_path):
    """
    Write the entity metadata records to a CSV file and to a JSON file.

    Parameters:
    entities_metadata (list of dict): Records to export.
    csv_path (str): Destination of the CSV export (written without the index).
    json_path (str): Destination of the JSON export (indented by 4 spaces).
    """
    pd.DataFrame(entities_metadata).to_csv(csv_path, index=False)

    with open(json_path, 'w') as handle:
        json.dump(entities_metadata, handle, indent=4)

# Example usage: run the whole pipeline on two sample (doc_id, text) documents.
documents = [
    (1, "The Sustainable Energy Academy offers various modules on energy efficiency."),
    (2, "One of the key focuses of the academy is sustainable energy solutions."),
]

# Thumbnail URL per entity, used in the final metadata step.
thumbnail_links = {
    'Sustainable Energy Academy': 'http://example.com/thumbnail1.jpg',
    'energy': 'http://example.com/thumbnail2.jpg',
    'academy': 'http://example.com/thumbnail3.jpg',
    'module': 'http://example.com/thumbnail4.jpg'
}

# Step 1: Extract entities from documents
entities_df = extract_entities_from_documents(documents)

# Step 2: Get sub-extracts
sub_extracts_df = get_sub_extracts(entities_df)

# Step 3: Synthesize long descriptions
long_descriptions = synthesize_long_description(sub_extracts_df)

# Step 4: Extract relations
relations_df = extract_relations(sub_extracts_df)

# Step 5: Add metadata and export
entities_metadata = add_metadata_to_entities(
    entities_df,
    long_descriptions,
    thumbnail_links,
)
export_entities_metadata(
    entities_metadata,
    csv_path='entities_metadata.csv',
    json_path='entities_metadata.json',
)


In [9]:
def update_csv_and_create_json(csv_file_path, updated_csv_file_path, json_file_path, synonyms_dict,
                               input_delimiter=',', output_delimiter=';'):
    """
    Normalise the 'Object' column of a CSV file and export the result as CSV and JSON.

    Each 'Object' value is lower-cased and, if the lower-cased value is a key of
    `synonyms_dict`, replaced by the mapped value. All other columns are copied
    unchanged.

    Parameters:
    csv_file_path (str): Input CSV; must have an 'Object' column.
    updated_csv_file_path (str): Path of the rewritten CSV.
    json_file_path (str): Path of the JSON export (list of row dicts, indent 4).
    synonyms_dict (dict): Maps lower-case object names to their replacement.
    input_delimiter (str): Field delimiter of the input file (default ',').
        NOTE(review): another cell of this notebook reads the relations file
        with ';' - confirm which delimiter the file really uses.
    output_delimiter (str): Field delimiter of the rewritten CSV (default ';').

    Raises:
    KeyError: If a row has no 'Object' column.
    """
    updated_entities = []

    # newline='' on both files is what the csv module expects, so that line
    # breaks inside quoted fields are read and written unchanged.
    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csvfile, \
         open(updated_csv_file_path, mode='w', encoding='utf-8', newline='') as updated_csvfile:

        reader = csv.DictReader(csvfile, delimiter=input_delimiter)
        writer = csv.DictWriter(updated_csvfile, fieldnames=reader.fieldnames, delimiter=output_delimiter)
        writer.writeheader()

        for row in reader:
            original = row['Object']
            # DictReader fills the missing trailing fields of a short row with
            # None; leave those untouched instead of failing on None.lower().
            if original is not None:
                entity = original.lower()
                row['Object'] = synonyms_dict.get(entity, entity)

            writer.writerow(row)
            updated_entities.append(row)

    with open(json_file_path, mode='w', encoding='utf-8') as jsonfile:
        json.dump(updated_entities, jsonfile, indent=4)



In [10]:
# Example usage: rewrite the relations file with synonyms replaced.
# NOTE(review): synonyms_dict is not defined in any cell visible here -
# presumably it comes from another cell or session; confirm before running.

updated_csv_file_path = '../03_Output/00_GPT KGs/Relations_replaced.csv'
json_file_path = '../03_Output/00_GPT KGs/Relations_replaced.json'

update_csv_and_create_json(
    csv_file_path=rel_path,
    updated_csv_file_path=updated_csv_file_path,
    json_file_path=json_file_path,
    synonyms_dict=synonyms_dict,
)

print("CSV updated and JSON file created successfully.")


CSV updated and JSON file created successfully.
