# In [7]:
import pandas as pd
import json
import spacy
import re
from typing import List, Dict

def load_domain_knowledge(file_path: str) -> Dict:
    """Load the domain-knowledge data from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content. The rest of this file iterates it as a
        mapping of category name to a list of keyword strings.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON interchange is UTF-8. Without an explicit encoding, open() uses the
    # locale's preferred encoding, which can mis-decode non-ASCII keywords on
    # platforms where that is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def load_spacy_model() -> spacy.language.Language:
    """Load and return the spaCy pipeline named ``en_core_web_sm``."""
    model_name = "en_core_web_sm"
    pipeline = spacy.load(model_name)
    return pipeline

def dictionary_lookup(text: str, domain_dict: Dict) -> List[str]:
    """Return the domain keywords that occur in *text* as whole terms.

    Matching is case-insensitive: both the text and each keyword are
    lower-cased before comparison. A keyword only matches when it is not
    directly preceded or followed by a word character, so ``git`` does not
    match inside ``github``.

    Args:
        text: The text to search.
        domain_dict: Mapping of category name to an iterable of keyword
            strings.

    Returns:
        The matching keywords in their original casing, in the iteration
        order of ``domain_dict``. A keyword listed under several categories
        is returned once per category.
    """
    # Lower-case the text once instead of once per keyword.
    lowered_text = text.lower()

    found_keywords = []
    for keywords in domain_dict.values():
        for keyword in keywords:
            # Lookarounds instead of \b: \b needs a word character on one
            # side, so it never matches keywords that start or end with
            # punctuation (e.g. "C++", ".NET"). For keywords with word
            # characters at both ends the two forms are equivalent.
            pattern = r'(?<!\w)' + re.escape(keyword.lower()) + r'(?!\w)'
            if re.search(pattern, lowered_text):
                found_keywords.append(keyword)
    return found_keywords

# Named Entity Recognition (NER)
def extract_entities_ner(text: str, nlp_model: spacy.language.Language) -> List[str]:
    """Run *nlp_model* on *text* and return the text of each entity in ``doc.ents``.

    Entities are returned in the order the model reports them; duplicates
    are kept.
    """
    parsed = nlp_model(text)
    entity_texts = []
    for span in parsed.ents:
        entity_texts.append(span.text)
    return entity_texts

# Combine results from dictionary lookup and NER
def combine_extractions(dictionary_entities: List[str], ner_entities: List[str]) -> List[str]:
    """Merge both entity lists into one list without duplicates.

    Args:
        dictionary_entities: Entities found by the dictionary lookup.
        ner_entities: Entities found by named entity recognition.

    Returns:
        Each distinct entity once, in first-seen order (dictionary entities
        first, then NER entities).
    """
    # dict.fromkeys de-duplicates while keeping insertion order. A plain set
    # would give an arbitrary order that changes between interpreter runs
    # because string hashes are randomised.
    return list(dict.fromkeys(dictionary_entities + ner_entities))

def extract_entities_from_csv(csv_file: str, domain_dict: Dict, nlp_model: spacy.language.Language) -> List[Dict]:
    """Extract entities from every description in a CSV file.

    Each value of the ``Description`` column is run through
    ``dictionary_lookup`` and ``extract_entities_ner``; the two results are
    merged with ``combine_extractions``.

    Args:
        csv_file: Path of the CSV file; it must contain a ``Description``
            column.
        domain_dict: Domain knowledge passed to ``dictionary_lookup``.
        nlp_model: Model passed to ``extract_entities_ner``.

    Returns:
        One dict per non-missing description, with the keys
        ``'Description'`` (the text) and ``'Extracted_Entities'`` (the
        merged entity list).

    Raises:
        KeyError: If the CSV has no ``Description`` column.
    """
    df = pd.read_csv(csv_file)

    # Empty CSV cells are read as NaN (a float), which has no .lower() and
    # would crash the dictionary lookup. Skip missing descriptions and make
    # sure every remaining value is a string.
    descriptions = df['Description'].dropna().astype(str)

    extracted_entities = []
    for text in descriptions:
        dictionary_entities = dictionary_lookup(text, domain_dict)
        ner_entities = extract_entities_ner(text, nlp_model)
        extracted_entities.append({
            'Description': text,
            'Extracted_Entities': combine_extractions(dictionary_entities, ner_entities),
        })

    return extracted_entities

def format_final_extracted_entities(extracted_entities: List[Dict], domain_dict: Dict) -> Dict:
    """Group the extracted entities by domain category.

    An entity belongs to a category when its lower-cased form equals the
    lower-cased form of one of that category's keywords. Entities that match
    no category are dropped.

    Args:
        extracted_entities: Dicts that each hold a list of entity strings
            under the ``'Extracted_Entities'`` key.
        domain_dict: Mapping of category name to an iterable of keyword
            strings.

    Returns:
        A dict with one key per category in ``domain_dict``. Each value lists
        the matching entities in first-seen order, without exact duplicates.
        Entities that differ only in casing are kept as separate items.
    """
    # Build the lower-cased keyword sets once per category, instead of
    # rebuilding a list for every (entity, category) pair.
    keyword_index = {
        category: {keyword.lower() for keyword in keywords}
        for category, keywords in domain_dict.items()
    }
    final_entities = {category: [] for category in domain_dict}

    for entry in extracted_entities:
        for entity in entry['Extracted_Entities']:
            entity_key = entity.lower()
            for category, lowered_keywords in keyword_index.items():
                if entity_key in lowered_keywords and entity not in final_entities[category]:
                    final_entities[category].append(entity)

    return final_entities

def save_to_json(data: Dict, output_file: str):
    """Write *data* to *output_file* as JSON indented by four spaces.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(output_file, mode='w') as out_handle:
        json.dump(data, fp=out_handle, indent=4)

# Input and output locations used by the pipeline below.
domain_knowledge_file = 'input.json'
csv_file_path = 'input.csv'
output_json_file = 'final_extracted_entities_output.json'

# Load the domain keywords and the spaCy model.
domain_dict = load_domain_knowledge(domain_knowledge_file)
nlp_model = load_spacy_model()

# Extract entities for every CSV description, then group them by category.
extracted_entities = extract_entities_from_csv(
    csv_file=csv_file_path,
    domain_dict=domain_dict,
    nlp_model=nlp_model,
)
final_extracted_entities = format_final_extracted_entities(
    extracted_entities=extracted_entities,
    domain_dict=domain_dict,
)

# Write the grouped entities to the output JSON file.
save_to_json(data=final_extracted_entities, output_file=output_json_file)


# Output: Final extracted entities have been saved to final_extracted_entities_output.json
