In [1]:
import stanza

In [2]:
# stanza.download("en")  # Download model (only needed once)
# processors="tokenize,pos" enables tokenization and Part-of-Speech (POS) tagging.
# This explanation is kept as a comment rather than a bare string literal so the
# cell does not echo it back as its output value.
nlp = stanza.Pipeline(lang="en", processors="tokenize,pos", use_gpu=True)  # Load model


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-02-25 22:34:44 INFO: Downloaded file to C:\Users\Soheil\stanza_resources\resources.json
2025-02-25 22:34:44 INFO: Downloading default packages for language: en (English) ...
2025-02-25 22:34:46 INFO: File exists: C:\Users\Soheil\stanza_resources\en\default.zip
2025-02-25 22:34:48 INFO: Finished downloading models and saved to C:\Users\Soheil\stanza_resources
2025-02-25 22:34:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-02-25 22:34:48 INFO: Downloaded file to C:\Users\Soheil\stanza_resources\resources.json
2025-02-25 22:34:49 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

2025-02-25 22:34:49 INFO: Using device: cuda
2025-02-25 22:34:49 INFO: Loading: tokenize
2025-02-25 22:34:50 INFO: Loading: mwt
2025-02-25 22:34:50 INFO: Loading: pos
2025-02-25 22:34:52 INFO: Done loading processors!


'\ntokenize,pos → Enables tokenization & POS tagging (Part-of-Speech tagging).\n'

In [3]:
# Sample input: an outer list of bios, where each bio is a list of sentence strings.
nested_sentences = [
    ["He is a skilled engineer.", "He works in AI and robotics."],
    ["She is a professional researcher.", "She specializes in NLP."]
]

In [4]:
# Sanity check: run the pipeline on a single sentence and print each word's UPOS tag.
sentence = nested_sentences[0][0]  # "He is a skilled engineer."
doc = nlp(sentence)  # Process using Stanza

# Check the processed output
# NOTE: `doc` stays defined at notebook scope after this cell and is read by
# later cells; the loop variables `sent` and `word` also remain defined.
for sent in doc.sentences:
    for word in sent.words:
        print(f"Word: {word.text}, POS: {word.upos}")

Word: He, POS: PRON
Word: is, POS: AUX
Word: a, POS: DET
Word: skilled, POS: ADJ
Word: engineer, POS: NOUN
Word: ., POS: PUNCT


In [5]:
# Nested-comprehension pattern:
#   [expression for sublist in main_list for item in sublist if condition]
# Here: walk every sentence of `doc`, then every word, keeping adjective texts.
adjectives = [
    token.text
    for parsed_sentence in doc.sentences
    for token in parsed_sentence.words
    if token.upos == "ADJ"
]
print(adjectives)  # Expected output: ['skilled']

['skilled']


In [7]:
# First sentence of the second bio.
sent2 = nested_sentences[1][0]
sent2

'She is a professional researcher.'

In [None]:
" What is word.text, doc.sentences, etc.?"
"""These are attributes of the Document object returned by a Stanza pipeline; they expose the structure of the processed text.
Other Natural Language Processing (NLP) libraries, such as spaCy, offer similar objects under different attribute names.

doc.sentences: is a list of sentence objects that represents the sentences in a document.

sent.words: is a list of word objects (tokens) in a sentence. Each word has attributes such as text, upos (Universal Part-of-Speech tag), and more.

Attribute	What It Represents
doc.sentences	List of sentence objects in the document
sent.words	List of word objects (tokens) in a sentence
word.text	The actual word (e.g., "big")
word.upos	The Part-of-Speech (POS) tag (e.g., "ADJ")

"""

In [None]:
# Adjectives found in `sent2`: parse it, then walk every sentence and every word,
# keeping the text of words tagged ADJ.
adejctive2 = [word.text for sent in nlp(sent2).sentences for word in sent.words if word.upos == "ADJ"]

In [8]:
# Display the parsed sentences of the current `doc`; the output lists each word's
# fields (id, text, upos, xpos, feats, start_char, end_char, ...).
doc.sentences

[[
   {
     "id": 1,
     "text": "He",
     "upos": "PRON",
     "xpos": "PRP",
     "feats": "Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
     "start_char": 0,
     "end_char": 2
   },
   {
     "id": 2,
     "text": "is",
     "upos": "AUX",
     "xpos": "VBZ",
     "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
     "start_char": 3,
     "end_char": 5
   },
   {
     "id": 3,
     "text": "a",
     "upos": "DET",
     "xpos": "DT",
     "feats": "Definite=Ind|PronType=Art",
     "start_char": 6,
     "end_char": 7
   },
   {
     "id": 4,
     "text": "skilled",
     "upos": "ADJ",
     "xpos": "JJ",
     "feats": "Degree=Pos",
     "start_char": 8,
     "end_char": 15
   },
   {
     "id": 5,
     "text": "engineer",
     "upos": "NOUN",
     "xpos": "NN",
     "feats": "Number=Sing",
     "start_char": 16,
     "end_char": 24,
     "misc": "SpaceAfter=No"
   },
   {
     "id": 6,
     "text": ".",
     "upos": "PUNCT",
     "xpos": ".",
     "star

In [16]:
# Collect adjectives sentence by sentence, mirroring the bio -> sentence nesting
# of `nested_sentences`.
adjectives_nested = []
for bio in nested_sentences:
    adjectives_per_bio = []
    for sentence in bio:
        doc = nlp(sentence)
        adjectives = [
            token.text
            for parsed_sentence in doc.sentences
            for token in parsed_sentence.words
            if token.upos == "ADJ"
        ]
        adjectives_per_bio.append(adjectives)
    adjectives_nested.append(adjectives_per_bio)

print(adjectives_nested)  # [[['skilled'], []], [['professional'], []]]

[[['skilled'], []], [['professional'], []]]


In [19]:
def extract_adj(x):
    """Extract adjectives from a nested list of sentences.

    Args:
        x: Iterable of bios, where each bio is an iterable of sentence strings.

    Returns:
        A list with one entry per bio. Each entry is a list with one item per
        input sentence, holding the texts of the words whose ``upos`` is
        ``"ADJ"`` according to the notebook-level ``nlp`` pipeline, in order
        of appearance.
    """
    adjectives_nested = []
    # Iterate over the argument itself rather than the global `nested_sentences`,
    # so the function works for any input it is given.
    for bio in x:
        adjectives_per_bio = []
        for sentence in bio:
            doc = nlp(sentence)
            adjectives = [word.text for sent in doc.sentences for word in sent.words if word.upos == "ADJ"]
            adjectives_per_bio.append(adjectives)
        adjectives_nested.append(adjectives_per_bio)
    return adjectives_nested

In [20]:
# Call the extractor with the sample bios and display the returned nested list.
ads = extract_adj(nested_sentences)
ads

[[['skilled'], []], [['professional'], []]]

In [21]:
# One adjective-free string per sentence of the current `doc`; the remaining
# word texts are re-joined with single spaces.
processed_texts = [
    " ".join(word.text for word in sent.words if word.upos != "ADJ")
    for sent in doc.sentences
]

In [None]:
def remo_extract_adj(sat_bio):
    """Split every sentence of every bio into its adjectives and the remaining text.

    Args:
        sat_bio: Iterable of bios, where each bio is an iterable of sentence strings.

    Returns:
        A tuple ``(adjectives_nested, processed_texts)``. Both are lists with one
        entry per bio, and each entry has one item per input sentence:
        ``adjectives_nested`` holds the list of word texts tagged ``"ADJ"``,
        ``processed_texts`` holds the other word texts joined with single spaces.
    """
    adjectives_nested = []
    processed_texts = []

    for bio in sat_bio:
        bio_adjectives = []
        bio_sentences = []

        for sentence in bio:
            parsed = nlp(sentence)  # run the notebook-level Stanza pipeline

            # Single pass over all words: route each one to the matching bucket.
            found = []
            kept = []
            for parsed_sentence in parsed.sentences:
                for token in parsed_sentence.words:
                    if token.upos == "ADJ":
                        found.append(token.text)
                    else:
                        kept.append(token.text)

            bio_adjectives.append(found)
            bio_sentences.append(" ".join(kept))

        # Preserve the bio -> sentence nesting in both outputs.
        adjectives_nested.append(bio_adjectives)
        processed_texts.append(bio_sentences)

    return adjectives_nested, processed_texts

# Example input: Nested list of sentences
# sat_bio = [
#     ["He is a skilled engineer.", "He works in AI and robotics."],
#     ["She is a professional researcher.", "She specializes in NLP."]
# ]

# Run the function
# adjectives, modified_bios = remo_extract_adj(sat_bio)

# # Print Results
# print("Extracted Adjectives:\n", adjectives)
# print("\nModified Bios:\n", modified_bios)