# 1.Clean Data

In [1]:
import pandas as pd
import chardet
from ftfy import fix_text
import csv
import re

In [2]:
low_memory=False

In [3]:
input_path = '720_unique_title_description.csv'
output_path = 'metadata_table_utf8.csv'

with open(input_path, 'rb') as f:
    raw_data = f.read(10000)
    result = chardet.detect(raw_data)
    encoding = result['encoding']

encoding

'utf-8'

In [4]:
with open(output_path, 'w', newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file)

    # read the input file
    with open(input_path, 'r', encoding=encoding) as input_file:
        reader = csv.reader(input_file, delimiter=',')
        for i, line in enumerate(reader):
            if i >= 100:  # only process the first 100 lines
                break
            
            # standardize the text in each field
            fixed_line = [fix_text(field) for field in line]
            
            # write the fixed line to the output file
            writer.writerow(fixed_line)

print(f"First 100 lines have been processed and saved as {output_path}.")


First 100 lines have been processed and saved as metadata_table_utf8.csv.


In [5]:
df = pd.read_csv(output_path)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


In [6]:
df.columns

Index(['title', 'description'], dtype='object')

In [7]:
df.head(3)

Unnamed: 0,title,description
0,"\nxThe new species of the genus Atanygnathus Jacobson, 1909 (Coleoptera: Styphylinidae: Staphylininae) from Réunion Island","\n\002This dataset contains the digitized treatments in Plazi based on the original journal article Kocian, Matúš, Hlaváč, Peter (2019): The new species of the genus Atanygnathus Jacobson, 1909 (Coleoptera: Styphylinidae: Staphylininae) from Réunion Island. Zootaxa 4612 (1): 141-144, DOI: https://doi.org/10.11646/zootaxa.4612.1.12"
1,"\n""\001Rivulus uakti sp. n. and R. amanapira sp. n. (Teleostei: Cyprinodontiformes: Rivulidae): two new species from the upper Rio Negro, Brazilian Amazon.","\ný\002This dataset contains the digitized treatments in Plazi based on the original journal article Wilson J. E. M. Costa (2004): Rivulus uakti sp. n. and R. amanapira sp. n. (Teleostei: Cyprinodontiformes: Rivulidae): two new species from the upper Rio Negro, Brazilian Amazon. Zootaxa 465: 1-12, URL: http://www.zoobank.org/urn:lsid:zoobank.org:pub:EFD7A94F-D68A-4A04-801C-C6B2E9DF76B8"
2,"\n;N2 Schafer Lab N2 (Bristol, UK) | 2012-11-09T15:37:36+00:00","\nÄ\r<blockquote>\n<p>This experiment is part of the <em>C.elegans behavioural database</em>. For more information and the complete collection of experiments visit http://movement.openworm.org</p>\n</blockquote>\n\n<ul>\n<li><b>preview link</b> : https://www.youtube.com/watch?v=V16MVNPcWQ4</li>\n<li><b>strain</b> : N2</li>\n<li><b>timestamp</b> : 2012-11-09T15:37:36+00:00</li>\n<li><b>gene</b> : -N/A-</li>\n<li><b>chromosome</b> : -N/A-</li>\n<li><b>allele</b> : -N/A-</li>\n<li><b>strain_description</b> : Schafer Lab N2 (Bristol, UK)</li>\n<li><b>sex</b> : hermaphrodite</li>\n<li><b>stage</b> : adult</li>\n<li><b>ventral_side</b> : clockwise</li>\n<li><b>media</b> : NGM agar low peptone</li>\n<li><b>arena</b> : <ul>\n <li><b>style</b> : petri</li>\n <li><b>size</b> : 35</li>\n <li><b>orientation</b> : away</li>\n </ul>\n</li>\n<li><b>food</b> : OP50</li>\n<li><b>habituation</b> : 30m wait</li>\n<li><b>who</b> : Laura Grundy</li>\n<li><b>protocol</b> : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts.</li>\n<li><b>lab</b> : <ul>\n <li><b>name</b> : William R Schafer</li>\n <li><b>location</b> : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK</li>\n </ul>\n</li>\n<li><b>software</b> : <ul>\n <li><b>name</b> : tierpsy (https://github.com/ver228/tierpsy-tracker)</li>\n <li><b>version</b> : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219</li>\n <li><b>featureID</b> : @OMG</li>\n </ul>\n</li>\n<li><b>base_name</b> : N2 on food L_2012_11_09__15_37_36___6___2</li>\n<li><b>total time (s)</b> : 899.033</li>\n<li><b>frames per second</b> : 30.03</li>\n<li><b>video micrometers per pixel</b> : 4.17406</li>\n<li><b>number of segmented skeletons</b> : 26993</li>\n</ul>"


In [8]:
def clean_text_with_structure(text):
    # Remove unnecessary escape sequences for single and double quotes
    text = re.sub(r'\\([\'"])', r'\1', text)  
    # Remove newline escape sequences followed by letters
    text = re.sub(r'\\n[_A-Za-z]', '', text)  
    # Remove newline escape sequences followed by non-alphanumeric characters
    text = re.sub(r'\\n[\W\d]+', '', text) 
    # Remove standalone 'n' or 'r' that might be remnants of newline characters
    text = re.sub(r'\b[nr]+\b', '', text)  
    # Replace multiple newlines or carriage returns with a single newline
    text = re.sub(r'[\n\r]+', '\n', text)
    # Remove non-ASCII characters to ensure clean text
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Remove trailing backslashes at the end of the text
    text = re.sub(r'\\$', '', text)  
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Trim leading and trailing whitespace
    return text.strip()

In [9]:
def clean_text_with_structure(text):
    # Remove unnecessary escape sequences for single and double quotes
    text = re.sub(r'\\([\'"])', r'\1', text)  
    # Remove newline escape sequences followed by letters
    text = re.sub(r'\\n[_A-Za-z]', '', text)  
    # Remove newline escape sequences followed by non-alphanumeric characters
    text = re.sub(r'\\n[\W\d]+', '', text) 
    # Replace multiple newlines or carriage returns with a single newline
    text = re.sub(r'[\n\r]+', '\n', text)
    # Remove non-ASCII characters to ensure clean text
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Remove trailing backslashes at the end of the text
    text = re.sub(r'\\$', '', text)  
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Trim leading and trailing whitespace
    return text.strip()

In [10]:
from bs4 import BeautifulSoup
# check if the text contains HTML tags
def is_html(text):
    return bool(re.search(r'<[^>]+>', text))

# extract text from HTML
def extract_text_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text(separator=' ', strip=True)

# process the text based on whether it is HTML or plain text
def process_html_text(text):
    if is_html(text):
        text = extract_text_from_html(text)
    return text

In [11]:
# clean the text in the 'title' and 'description' columns
df['title'] = df['title'].apply(clean_text_with_structure)
df['description'] = df['description'].apply(process_html_text)
df['description'] = df['description'].apply(clean_text_with_structure)

df.head(5)

Unnamed: 0,title,description
0,"The new species of the genus Atanygnathus Jacobson, 1909 (Coleoptera: Styphylinidae: Staphylininae) from Runion Island","This dataset contains the digitized treatments in Plazi based on the original journal article Kocian, Mat, Hlav, Peter (2019): The new species of the genus Atanygnathus Jacobson, 1909 (Coleoptera: Styphylinidae: Staphylininae) from Runion Island. Zootaxa 4612 (1): 141-144, DOI: https://doi.org/10.11646/zootaxa.4612.1.12"
1,"Rivulus uakti sp. n. and R. amanapira sp. n. (Teleostei: Cyprinodontiformes: Rivulidae): two new species from the upper Rio Negro, Brazilian Amazon.","\n\002This dataset contains the digitized treatments in Plazi based on the original journal article Wilson J. E. M. Costa (2004): Rivulus uakti sp. n. and R. amanapira sp. n. (Teleostei: Cyprinodontiformes: Rivulidae): two new species from the upper Rio Negro, Brazilian Amazon. Zootaxa 465: 1-12, URL: http://www.zoobank.org/urn:lsid:zoobank.org:pub:EFD7A94F-D68A-4A04-801C-C6B2E9DF76B8"
2,"N2 Schafer Lab N2 (Bristol, UK) | 2012-11-09T15:37:36+00:00","\n\r This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org nn preview link : https://www.youtube.com/watch?v=V16MVNPcWQ4 strain : N2 timestamp : 2012-11-09T15:37:36+00:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : clockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away n food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK n software : name : tierpsy (https://github.com/ver228/tierpsy-tracker) version : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219 featureID : @OMG n base_name : N2 on food L_2012_11_09__15_37_36___6___2 total time (s) : 899.033 frames per second : 30.03 video micrometers per pixel : 4.17406 number of segmented skeletons : 26993 \n"
3,"N2 Schafer Lab N2 (Bristol, UK) | 2010-07-27T10:25:00+01:00","r This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org nn preview link : https://www.youtube.com/watch?v=AvXCqT7A_w8 strain : N2 timestamp : 2010-07-27T10:25:00+01:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : anticlockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away n food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK n software : name : tierpsy (https://github.com/ver228/tierpsy-tracker) version : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219 featureID : @OMG n base_name : N2 on food R_2010_07_27__10_25___3___1 total time (s) : 898.102 frames per second : 24.0385 video micrometers per pixel : 4.53292 number of segmented skeletons : 21410 \n"
4,"N2 Schafer Lab N2 (Bristol, UK) | 2010-03-18T10:15:31+00:00","r This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org nn preview link : https://www.youtube.com/watch?v=x3dejLvvtKA strain : N2 timestamp : 2010-03-18T10:15:31+00:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : clockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away n food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK n software : name : tierpsy (https://github.com/ver228/tierpsy-tracker) version : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219 featureID : @OMG n base_name : N2 on food L_2010_03_18__10_15_31___7___1 total time (s) : 898.794 frames per second : 25.641 video micrometers per pixel : 4.31067 number of segmented skeletons : 18616 \n"


# 2. Extract Datatypes

## 2.1 Spacy

In [12]:
import spacy

nlp = spacy.load('en_core_web_sm')

text="This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org preview link : https://www.youtube.com/watch?v=AvXCqT7A_w8 strain : N2 timestamp : 2010-07-27T10:25:00+01:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : anticlockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK software : name : tierpsy (https://github.com/ver228/tierpsy-tracker) version : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219 featureID : @OMG base_name : N2 on food R_2010_07_27__10_25___3___1 total time (s) : 898.102 frames per second : 24.0385 video micrometers per pixel : 4.53292 number of segmented skeletons : 21410"

doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)


  from .autonotebook import tqdm as notebook_tqdm


C.elegans NORP
https://www.youtube.com/watch?v=AvXCqT7A_w8 CARDINAL
2010-07-27T10:25:00+01:00 DATE
Bristol GPE
UK GPE
NGM ORG
35 CARDINAL
30 CARDINAL
Laura Grundy PERSON
E. Yemini GPE
doi:10.1038 PERSON
30 minutes TIME
William R Schafer PERSON
MRC Laboratory of Molecular Biology ORG
Hills Road FAC
Cambridge GPE
0QH GPE
UK GPE
tierpsy PERSON
898.102 CARDINAL
second ORDINAL
24.0385 CARDINAL
4.53292 CARDINAL
21410 CARDINAL


## 2.2 Spacy with Transformer

In [23]:
import spacy

nlp = spacy.load('en_core_web_trf')

text="This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org preview link : https://www.youtube.com/watch?v=AvXCqT7A_w8 strain : N2 timestamp : 2010-07-27T10:25:00+01:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : anticlockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK software : name : tierpsy (https://github.com/ver228/tierpsy-tracker) version : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219 featureID : @OMG base_name : N2 on food R_2010_07_27__10_25___3___1 total time (s) : 898.102 frames per second : 24.0385 video micrometers per pixel : 4.53292 number of segmented skeletons : 21410"

doc = nlp(text)

for ent in doc.ents:
    # only print entities name
    print("entity: ", ent.text, "\n")

  model.load_state_dict(torch.load(filelike, map_location=device))
  with torch.cuda.amp.autocast(self._mixed_precision):


entity:  C.elegans 

entity:  2010-07 

entity:  Schafer Lab 

entity:  Bristol 

entity:  UK 

entity:  35 

entity:  30 

entity:  Laura Grundy 

entity:  E. Yemini 

entity:  30 minutes 

entity:  William R Schafer 

entity:  MRC Laboratory of Molecular Biology 

entity:  Hills Road 

entity:  Cambridge 

entity:  UK 

entity:  898.102 

entity:  24.0385 

entity:  4.53292 

entity:  21410 



In [14]:
ner_labels = nlp.get_pipe('ner').labels
print(ner_labels)

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


In [15]:
def tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc]

text = pd.Series(["This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org preview link : https://www.youtube.com/watch?v=V16MVNPcWQ4 strain : N2 timestamp : 2012-11-09T15:37:36+00:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : clockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK software : name : tierpsy (https://github.com/ver228/tierpsy-tracker) version : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219 featureID : @OMG base_name : N2 on food L_2012_11_09__15_37_36___6___2 total time (s) : 899.033 frames per second : 30.03 video micrometers per pixel : 4.17406 number of segmented skeletons : 26993"])
descriptions_tokenized = text.apply(tokenize)
print(descriptions_tokenized)

  with torch.cuda.amp.autocast(self._mixed_precision):


0    [This, experiment, is, part, of, the, C.elegans, behavioural, database, ., For, more, information, and, the, complete, collection, of, experiments, visit, http://movement.openworm.org, preview, link, :, https://www.youtube.com/watch?v=V16MVNPcWQ4, strain, :, N2, timestamp, :, 2012, -, 11, -, 09T15:37:36, +, 00:00, gene, :, -N, /, A-, chromosome, :, -N, /, A-, allele, :, -N, /, A-, strain_description, :, Schafer, Lab, N2, (, Bristol, ,, UK, ), sex, :, hermaphrodite, stage, :, adult, ventral_side, :, clockwise, media, :, NGM, agar, low, peptone, arena, :, style, :, petri, size, :, 35, orientation, :, away, food, :, OP50, habituation, :, 30, m, wait, who, :, Laura, Grundy, ...]
dtype: object


In [16]:
def pos_tagging(text):
    doc = nlp(text)
    return [(token.text, token.pos_) for token in doc]

descriptions_pos_tagged = text.apply(pos_tagging)
print(descriptions_pos_tagged)

0    [(This, DET), (experiment, NOUN), (is, AUX), (part, NOUN), (of, ADP), (the, DET), (C.elegans, PROPN), (behavioural, ADJ), (database, NOUN), (., PUNCT), (For, ADP), (more, ADJ), (information, NOUN), (and, CCONJ), (the, DET), (complete, ADJ), (collection, NOUN), (of, ADP), (experiments, NOUN), (visit, VERB), (http://movement.openworm.org, X), (preview, NOUN), (link, NOUN), (:, PUNCT), (https://www.youtube.com/watch?v=V16MVNPcWQ4, X), (strain, NOUN), (:, PUNCT), (N2, PROPN), (timestamp, NOUN), (:, PUNCT), (2012, NUM), (-, NUM), (11, NUM), (-, NUM), (09T15:37:36, NUM), (+, NOUN), (00:00, NUM), (gene, NOUN), (:, PUNCT), (-N, NOUN), (/, SYM), (A-, NOUN), (chromosome, NOUN), (:, PUNCT), (-N, NOUN), (/, NOUN), (A-, NOUN), (allele, NOUN), (:, PUNCT), (-N, NOUN), (/, SYM), (A-, NOUN), (strain_description, NOUN), (:, PUNCT), (Schafer, PROPN), (Lab, PROPN), (N2, PROPN), ((, PUNCT), (Bristol, PROPN), (,, PUNCT), (UK, PROPN), (), PUNCT), (sex, NOUN), (:, PUNCT), (hermaphrodite, ADJ), (stage, NO

In [17]:
def remove_stopwords(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_stop]

descriptions_no_stopwords = text.apply(remove_stopwords)

print(descriptions_no_stopwords)


0    [experiment, C.elegans, behavioural, database, ., information, complete, collection, experiments, visit, http://movement.openworm.org, preview, link, :, https://www.youtube.com/watch?v=V16MVNPcWQ4, strain, :, N2, timestamp, :, 2012, -, 11, -, 09T15:37:36, +, 00:00, gene, :, -N, /, A-, chromosome, :, -N, /, A-, allele, :, -N, /, A-, strain_description, :, Schafer, Lab, N2, (, Bristol, ,, UK, ), sex, :, hermaphrodite, stage, :, adult, ventral_side, :, clockwise, media, :, NGM, agar, low, peptone, arena, :, style, :, petri, size, :, 35, orientation, :, away, food, :, OP50, habituation, :, 30, m, wait, :, Laura, Grundy, protocol, :, Method, E., Yemini, et, al, ., doi:10.1038, /, nmeth.2560, ...]
dtype: object


## 2.3 Flair

In [18]:
from flair.models import SequenceTagger
from flair.data import Sentence

# load the NER tagger
tagger = SequenceTagger.load("ner")

sentence = Sentence("This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org preview link : https://www.youtube.com/watch?v=AvXCqT7A_w8 strain : N2 timestamp : 2010-07-27T10:25:00+01:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : anticlockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK software : name : tierpsy (https://github.com/ver228/tierpsy-tracker) version : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219 featureID : @OMG base_name : N2 on food R_2010_07_27__10_25___3___1 total time (s) : 898.102 frames per second : 24.0385 video micrometers per pixel : 4.53292 number of segmented skeletons : 21410")

# predict NER tags
tagger.predict(sentence)

# print the entities and their labels
for entity in sentence.get_spans("ner"):
    print(f"Entity: {entity.text}, Label: {entity.get_label('ner').value}")

2024-09-24 13:24:48,757 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Entity: C.elegans, Label: MISC
Entity: Schafer Lab N2, Label: MISC
Entity: Bristol, Label: LOC
Entity: UK, Label: LOC
Entity: Laura Grundy, Label: MISC
Entity: MRC Laboratory of Molecular Biology, Label: ORG
Entity: Hills Road, Label: ORG
Entity: Cambridge, Label: LOC
Entity: UK, Label: LOC


In [19]:
import spacy
from flair.models import SequenceTagger
from flair.data import Sentence

# load the spaCy NLP model
nlp = spacy.load('en_core_web_sm')

# load the Flair NER tagger
tagger = SequenceTagger.load('ner')

# define the text to be processed
sentence = Sentence("This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org preview link : https://www.youtube.com/watch?v=AvXCqT7A_w8 strain : N2 timestamp : 2010-07-27T10:25:00+01:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : anticlockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK software : name : tierpsy (https://github.com/ver228/tierpsy-tracker) version : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219 featureID : @OMG base_name : N2 on food R_2010_07_27__10_25___3___1 total time (s) : 898.102 frames per second : 24.0385 video micrometers per pixel : 4.53292 number of segmented skeletons : 21410")


# process the text with spaCy
sentence = Sentence(doc.text)
tagger.predict(sentence)

# print the entities and their labels
print(sentence.to_tagged_string())


2024-09-24 13:24:53,154 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Sentence[233]: "This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org preview link : https://www.youtube.com/watch?v=AvXCqT7A_w8 strain : N2 timestamp : 2010-07-27T10:25:00+01:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : anticlockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology,

## 2.4  GLiNER

In [20]:
from gliner import GLiNER

# load the GLiNER model
ner = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")

# define the text to be processed
text = """This experiment is part of the C.elegans behavioural database . For more information and the complete collection of experiments visit http://movement.openworm.org preview link : https://www.youtube.com/watch?v=AvXCqT7A_w8 strain : N2 timestamp : 2010-07-27T10:25:00+01:00 gene : -N/A- chromosome : -N/A- allele : -N/A- strain_description : Schafer Lab N2 (Bristol, UK) sex : hermaphrodite stage : adult ventral_side : anticlockwise media : NGM agar low peptone arena : style : petri size : 35 orientation : away food : OP50 habituation : 30m wait who : Laura Grundy protocol : Method in E. Yemini et al. doi:10.1038/nmeth.2560. Worm transferred to arena 30 minutes before recording starts. lab : name : William R Schafer location : MRC Laboratory of Molecular Biology, Hills Road, Cambridge, CB2 0QH, UK software : name : tierpsy (https://github.com/ver228/tierpsy-tracker) version : cbfc23eb4f1ac2f29be75ade7a937eed58a5b219 featureID : @OMG base_name : N2 on food R_2010_07_27__10_25___3___1 total time (s) : 898.102 frames per second : 24.0385 video micrometers per pixel : 4.53292 number of segmented skeletons : 21410"""

# predict NER tags
entities = ner(text)

# print the entities and their labels
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}")

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


TypeError: string indices must be integers