In [6]:
import spacy

# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Passage to scan for pronouns that could be replaced by an antecedent
text = "considered the exposure scenario of halide solar cell panels in operation, failing due to harsh environmental conditions. This may create a lead leakage in the nearby water bodies, an ecotoxicology problem. Effectively, the relatively high-water solubility of lead halide salts comprising perovskite may decompose to PbI2, hydroiodic acid, and methylamine. The degradation products could be evaluated with regard to ecotoxicity (e.g. Zebrafsh, Daphnia and C. elegans media)"

# Run the pipeline over the whole passage
doc = nlp(text)

# Handle one sentence at a time
for sent in doc.sents:
    # Lower-cased surface forms of this sentence's named entities and noun chunks;
    # these are the only strings accepted as antecedents below
    entities = set(ent.text.lower() for ent in sent.ents)
    noun_chunks = set(chunk.text.lower() for chunk in sent.noun_chunks)

    # Rebuild the sentence token by token, swapping pronouns where a match exists
    resolved_text = []
    for token in sent:
        replacement = token.text
        if token.text.lower() in ('he', 'she', 'it', 'they'):
            # First syntactic ancestor whose text equals an entity or noun chunk;
            # None when no ancestor qualifies
            antecedent = next(
                (
                    ancestor.text
                    for ancestor in token.ancestors
                    if ancestor.text.lower() in entities
                    or ancestor.text.lower() in noun_chunks
                ),
                None,
            )
            if antecedent:
                replacement = antecedent
        resolved_text.append(replacement)

    # Emit the (possibly rewritten) tokens separated by single spaces
    print(' '.join(resolved_text))


considered the exposure scenario of halide solar cell panels in operation , failing due to harsh environmental conditions .
This may create a lead leakage in the nearby water bodies , an ecotoxicology problem .
Effectively , the relatively high - water solubility of lead halide salts comprising perovskite may decompose to PbI2 , hydroiodic acid , and methylamine .
The degradation products could be evaluated with regard to ecotoxicity ( e.g. Zebrafsh , Daphnia and C. elegans media )


In [3]:
import spacy
def extract_entities(sents, nlp_pipeline=None):
    """Extract a subject phrase and an object phrase from text via dependency labels.

    The text is parsed and its tokens are scanned left to right. Tokens
    labelled ``compound`` are remembered as a prefix and tokens whose label
    ends in ``mod`` as a modifier. Whenever a token's label contains ``subj``
    (or ``obj``), the current modifier, prefix and the token itself are
    joined into the subject (or object) phrase. Later matches overwrite
    earlier ones, so the result holds the last subject-like and the last
    object-like phrase seen in the whole input.

    Note: the remembered prefix/modifier are cleared only after a subject is
    emitted, so a modifier seen earlier (even in a previous sentence of the
    same input) can be attached to a later subject or object.

    Args:
        sents: Text to analyse; passed unchanged to the pipeline.
        nlp_pipeline: Optional callable that takes the text and returns an
            iterable of tokens exposing ``.text`` and ``.dep_``. When omitted,
            the module-level ``nlp`` object is used, as before.

    Returns:
        A two-element list ``[subject_phrase, object_phrase]``; an element is
        the empty string when no matching token was found.
    """
    # Fall back to the notebook-level pipeline only when none is supplied, so
    # existing calls keep working while the function can also run standalone.
    parse = nlp if nlp_pipeline is None else nlp_pipeline

    enti_one = ""  # latest subject phrase
    enti_two = ""  # latest object phrase

    dep_prev_token = ""  # dependency tag of the previous non-punctuation token
    txt_prev_token = ""  # text of the previous non-punctuation token

    prefix = ""    # pending compound word(s)
    modifier = ""  # pending modifier word(s)

    for tokn in parse(sents):
        # Punctuation is skipped entirely and does not become the "previous" token.
        if tokn.dep_ == "punct":
            continue

        # Compound word: remember it, merged with a directly preceding compound.
        if tokn.dep_ == "compound":
            prefix = tokn.text
            if dep_prev_token == "compound":
                prefix = txt_prev_token + " " + tokn.text

        # Modifier (label ends in "mod"): remember it, merged with a directly
        # preceding compound.
        if tokn.dep_.endswith("mod"):
            modifier = tokn.text
            if dep_prev_token == "compound":
                modifier = txt_prev_token + " " + tokn.text

        # Substring membership instead of ``find(...) == True``: the latter only
        # matched when the substring started at index 1, so a bare "subj"/"obj"
        # label was silently ignored.
        if "subj" in tokn.dep_:
            # Joining only the non-empty parts avoids a double space inside the phrase.
            enti_one = " ".join(part for part in (modifier, prefix, tokn.text) if part)
            prefix = ""
            modifier = ""

        if "obj" in tokn.dep_:
            enti_two = " ".join(part for part in (modifier, prefix, tokn.text) if part)

        # Remember this token for the compound checks of the next iteration.
        dep_prev_token = tokn.dep_
        txt_prev_token = tokn.text

    return [enti_one.strip(), enti_two.strip()]

In [5]:
# Run the extractor on the multi-sentence perovskite passage. As the last
# expression of the cell, the returned two-element list is shown as the output.
extract_entities("considered the exposure scenario of halide solar cell panels in operation, failing due to harsh environmental conditions. This may create a lead leakage in the nearby water bodies, an ecotoxicology problem. Effectively, the relatively high-water solubility of lead halide salts comprising perovskite may decompose to PbI2, hydroiodic acid, and methylamine. The degradation products could be evaluated with regard to ecotoxicity (e.g. Zebrafsh, Daphnia and C. elegans media)")
# Alternative input kept for experimentation (uncomment to run it instead):
#extract_entities("A mixture of dimethylformamide (DMF) and dimethyl sulfoxide (DMSO) is the most popular solvent combination for a perovskite precursor solution. DMF is necessary to ensure a good dissolution of lead iodide, but it is also the most toxic solvent")

['hydroiodic degradation products', 'ecotoxicity']