In [1]:
# Install spaCy's large English model (en_core_web_lg), which is loaded later with spacy.load().
# NOTE(review): `!` runs the command in a subshell, so `python` here may not be the
# interpreter the kernel is running on — confirm the model lands in the kernel's environment.
! python -m spacy download en_core_web_lg

[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz#egg=en_core_web_lg==2.3.1 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mCollecting en_core_web_lg==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz (782.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m782.7/782.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: en_core_web_lg
  Building wheel for en_core_web_lg (setup.py) ... [?25ldone
[?25h  Created wheel for en_core_web_lg: filename=en_core_web_lg-2.3.1-py3-none-any.whl size=782936102 sha256=4

In [2]:
import spacy
import re
# Load the large English spaCy pipeline once at module level; tokenize() below calls it as `nlp`.
nlp = spacy.load("en_core_web_lg")
def tokenize(text):
    """
    Break a text into sentences and return the lowercase words of each sentence.

    Newline characters are first replaced by spaces, then the module-level spaCy
    pipeline ``nlp`` is used to find sentence boundaries. Within each sentence,
    words are the matches of the pattern ``\\b\\w+\\b`` on the lowercased sentence
    text; any match for which ``str.isdigit()`` is true is discarded. Sentences
    that end up with no words are left out of the result.

    Parameters:
    text (str): The text to process; it may span several sentences.

    Returns:
    list: One inner list of lowercase word strings per non-empty sentence,
          in the order the sentences appear in the text.
    """
    # Abstracts in the dataset contain hard line breaks in the middle of sentences
    # (e.g. "...a strong capability for\nconditioning image generation,"),
    # so flatten them to spaces before sentence splitting.
    single_line = text.replace('\n', ' ')

    # Run the spaCy pipeline once over the whole text; .sents yields the sentences.
    parsed = nlp(single_line)

    # For every sentence: lowercase, pull out word-character runs, drop pure-digit tokens.
    words_per_sentence = (
        [
            token
            for token in re.findall(r'\b\w+\b', sentence.text.lower())
            if not token.isdigit()
        ]
        for sentence in parsed.sents
    )

    # Keep only the sentences that still contain at least one word.
    return [words for words in words_per_sentence if words]

In [3]:
# Demo: run tokenize() on a sample abstract-style text and print the per-sentence word lists.
# The literal is not a raw string, so every "\n" inside it is a real newline character
# (on top of the physical line breaks); tokenize() replaces all of them with spaces.
# NOTE(review): the string begins with an extra '"' (four opening quotes) and ends with a
# stray "'" before the closing quotes — these look like leftovers from a pasted repr;
# confirm they are not intended.
text=""""Passive acoustic monitoring is used widely in ecology, biodiversity, and\nconservation studies. 
Data sets collected via acoustic monitoring are often\nextremely large and built to be processed automatically using Artificial\nIntelligence and Machine learning models, 
which aim to replicate the work of\ndomain experts. These models, being supervised learning algorithms, need to be\ntrained on high quality annotations produced by experts. 
Since the experts are\noften resource-limited, a cost-effective process for annotating audio is needed\nto get maximal use out of the data. 
We present an open-source interactive audio\ndata annotation tool, NEAL (Nature+Energy Audio Labeller). 
Built using R and\nthe associated Shiny framework, the tool provides a reactive environment where\nusers can quickly annotate audio files and adjust settings that automatically\nchange the corresponding elements of the user interface. 
The app has been\ndesigned with the goal of having both expert birders and citizen scientists\ncontribute to acoustic annotation projects. 
The popularity and flexibility of R\nprogramming in bioacoustics means that the Shiny app can be modified for other\nbird labelling data sets, or even to generic audio labelling tasks. 
We\ndemonstrate the app by labelling data collected from wind farm sites across\nIreland.\n'"""
print(tokenize(text))

[['passive', 'acoustic', 'monitoring', 'is', 'used', 'widely', 'in', 'ecology', 'biodiversity', 'and', 'conservation', 'studies'], ['data', 'sets', 'collected', 'via', 'acoustic', 'monitoring', 'are', 'often', 'extremely', 'large', 'and', 'built', 'to', 'be', 'processed', 'automatically', 'using', 'artificial', 'intelligence', 'and', 'machine', 'learning', 'models', 'which', 'aim', 'to', 'replicate', 'the', 'work', 'of', 'domain', 'experts'], ['these', 'models', 'being', 'supervised', 'learning', 'algorithms', 'need', 'to', 'be', 'trained', 'on', 'high', 'quality', 'annotations', 'produced', 'by', 'experts'], ['since', 'the', 'experts', 'are', 'often', 'resource', 'limited', 'a', 'cost', 'effective', 'process', 'for', 'annotating', 'audio', 'is', 'needed', 'to', 'get', 'maximal', 'use', 'out', 'of', 'the', 'data'], ['we', 'present', 'an', 'open', 'source', 'interactive', 'audio', 'data', 'annotation', 'tool', 'neal', 'nature', 'energy', 'audio', 'labeller'], ['built', 'using', 'r', 'an