## This notebook contains code for:
#### * Named Entity Recognition (NER)
#### * Part-of-Speech Tagging (POS tagging)

In [1]:
# Install required libraries
# !pip install spacy; spacy download en_core_web_sm

In [2]:
import spacy
from spacy import displacy

In [3]:
# Sample sentence shared by the NER and POS-tagging demonstrations below.
input_seq = "The companies that would be releasing their quarterly reports tomorrow are Microsoft, 4pm, Google, 4pm, and AT&T, 6pm."

## NER

In [4]:
def extract_companies(text, nlp=None, visualize=True):
    """
    Extracts company names from the input text using Named Entity Recognition (NER).

    The text is run through a spaCy pipeline and every recognized entity labeled as an
    organization ("ORG") is collected. By default the recognized entities are also drawn
    with spaCy's `displacy.render`; the call passes `jupyter=True`, which explicitly
    enables displaCy's Jupyter mode instead of relying on auto-detection.

    Args:
        text (str): The input string containing text from which companies (organizations)
            should be extracted.
        nlp (callable, optional): An already-loaded spaCy pipeline to reuse. When omitted,
            the "en_core_web_sm" model is loaded inside this call, so callers processing
            many texts should load the model once and pass it in.
        visualize (bool, optional): Whether to render the entity visualization. Set to
            False to get only the returned list (e.g. outside a notebook). Defaults to True.

    Returns:
        List[str]: A list of company names (organization entities) found in the text, in
        the order they appear in the pipeline's `ents`.
    """
    # Loading a model is the expensive step; only do it when the caller gave us nothing.
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    # Run the pipeline on the input text
    doc = nlp(text)

    # Rendering is a display side effect, so it is kept optional
    if visualize:
        displacy.render(doc, style="ent", jupyter=True)

    # Keep only the entities tagged as organizations
    return [ent.text for ent in doc.ents if ent.label_ == "ORG"]

In [5]:
# Run NER on the sample sentence; the call also renders the entity visualization.
companies = extract_companies(input_seq)
# spacy.explain gives the human-readable description of the "ORG" label.
print("\nThe definition of the label 'ORG': " + spacy.explain("ORG"))
print("Companies:", companies)


The definition of the label 'ORG': Companies, agencies, institutions, etc.
Companies: ['Microsoft', 'Google', 'AT&T']


## POS Tagging

In [6]:
def extract_pos(text, nlp=None, visualize=True, pos_filter=("NOUN", "VERB", "ADJ", "PROPN")):
    """
    Extracts tokens with selected parts of speech (POS) from the input text using spaCy.

    The text is run through a spaCy pipeline and each token whose coarse-grained POS tag
    (`token.pos_`) is in `pos_filter` is collected. By default the dependency parse of the
    text is also drawn with spaCy's `displacy.render`; the call passes `jupyter=True`,
    which explicitly enables displaCy's Jupyter mode instead of relying on auto-detection.

    Args:
        text (str): The input text string from which parts of speech should be extracted.
        nlp (callable, optional): An already-loaded spaCy pipeline to reuse. When omitted,
            the "en_core_web_sm" model is loaded inside this call, so callers processing
            many texts should load the model once and pass it in.
        visualize (bool, optional): Whether to render the dependency visualization. Set to
            False to get only the returned list. Defaults to True.
        pos_filter (Iterable[str] or str, optional): POS tags to keep. A single tag may be
            given as a plain string. Defaults to nouns, verbs, adjectives and proper nouns.

    Returns:
        List[List[str]]: A list of two-element lists `[token_text, pos_tag]`, in token order.
    """
    # Loading a model is the expensive step; only do it when the caller gave us nothing.
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    # A bare string would otherwise be split into single characters by frozenset().
    if isinstance(pos_filter, str):
        pos_filter = (pos_filter,)
    wanted_tags = frozenset(pos_filter)

    # Run the pipeline on the input text
    doc = nlp(text)

    # Rendering is a display side effect, so it is kept optional
    if visualize:
        displacy.render(doc, style='dep', jupyter=True, options={'compact': True, 'distance': 100})

    # Collect [text, tag] pairs for the tokens whose POS tag was requested
    return [[token.text, token.pos_] for token in doc if token.pos_ in wanted_tags]

In [7]:
# extract_pos renders the displaCy dependency diagram as a side effect,
# so the caption is printed before calling it.
print("Visualization of the dependency parsing in the text")
pos_tags = extract_pos(input_seq)

Visualization of the dependency parsing in the text


In [8]:
# Show the [token, POS] pairs collected by extract_pos in the previous cell.
print(f"POS Tags:\n {pos_tags}")

POS Tags:
 [['companies', 'NOUN'], ['releasing', 'VERB'], ['quarterly', 'ADJ'], ['reports', 'NOUN'], ['tomorrow', 'NOUN'], ['Microsoft', 'PROPN'], ['pm', 'NOUN'], ['Google', 'PROPN'], ['pm', 'NOUN'], ['AT&T', 'PROPN'], ['pm', 'NOUN']]
