In [1]:
import spacy 

In [2]:
# Load the spaCy pipeline package "en_core_web_sm" once; every helper defined
# later in this notebook calls this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Example texts, one per technique demonstrated below. The suffix names the
# demo that consumes it: lem = lemmatization, tok = tokenization, ner = named
# entity recognition, sent = sentence segmentation, stop = stop-word removal,
# pos = part-of-speech tagging.
text_lem = "The cars are being driven carefully while the dogs are barking."
text_tok = "I am learning natural language processing using spaCy."
text_ner = "Barack Obama served as the 44th president of the United States and was born in Hawaii."
text_sent = "Machine learning is fascinating. It enables computers to learn from data. Natural language processing is a subfield of AI."
text_stop = "Stop words are common words that often do not add much meaning."
text_pos = "Natural language processing is an exciting field of artificial intelligence."

In [4]:
# Lemmatization
def lemmatization(text):
    """Return the lemma of every token in ``text``.

    Args:
        text: The string to process with the shared ``nlp`` pipeline.

    Returns:
        list: The ``lemma_`` attribute of each token, in document order.
    """
    lemmas = []
    for tok in nlp(text):
        lemmas.append(tok.lemma_)
    return lemmas
print("Lemmatization:", lemmatization(text_lem))

Lemmatization: ['the', 'car', 'be', 'be', 'drive', 'carefully', 'while', 'the', 'dog', 'be', 'bark', '.']


In [5]:
# Tokenization
def tokenization(text):
    """Split ``text`` into tokens using the shared ``nlp`` pipeline.

    Args:
        text: The string to tokenize.

    Returns:
        list: The ``text`` attribute of each token, in document order.
    """
    doc = nlp(text)
    return [token.text for token in doc]

# Tokenize once and reuse the result: every call to tokenization() re-runs the
# pipeline on the text, so calling it a second time only to count the tokens
# would repeat that work for no benefit.
tok_tokens = tokenization(text_tok)
print("Tokenization:", tok_tokens)
print("Number of Tokens:", len(tok_tokens))

Tokenization: ['I', 'am', 'learning', 'natural', 'language', 'processing', 'using', 'spaCy', '.']
Number of Tokens: 9


In [6]:
# Named Entity Recognition (NER)
def named_entity_recognition(text):
    """Collect the named entities the shared ``nlp`` pipeline finds in ``text``.

    Args:
        text: The string to analyse.

    Returns:
        list: One ``(entity text, entity label)`` tuple per item in the
        document's ``ents``, in the order they are yielded.
    """
    parsed = nlp(text)
    entities = []
    for span in parsed.ents:
        entities.append((span.text, span.label_))
    return entities

print("Named Entities:", named_entity_recognition(text_ner))

Named Entities: [('Barack Obama', 'PERSON'), ('44th', 'ORDINAL'), ('the United States', 'GPE'), ('Hawaii', 'GPE')]


In [7]:
# Sentence Segmentation
def sentence_segmentation(text):
    """Split ``text`` into sentences using the shared ``nlp`` pipeline.

    Args:
        text: The string to segment.

    Returns:
        list: The ``text`` of each span in the document's ``sents``, in order.
    """
    parsed = nlp(text)
    sentences = []
    for span in parsed.sents:
        sentences.append(span.text)
    return sentences

print("Sentences:", sentence_segmentation(text_sent))


Sentences: ['Machine learning is fascinating.', 'It enables computers to learn from data.', 'Natural language processing is a subfield of AI.']


In [8]:
# Stop Word Removal
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not flagged as stop words.

    Only the ``is_stop`` flag is consulted, so any other token (punctuation
    included) is kept.

    Args:
        text: The string to filter.

    Returns:
        list: The ``text`` of every token whose ``is_stop`` is falsy, in
        document order.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return kept

print("Without Stop Words:", remove_stopwords(text_stop))

Without Stop Words: ['Stop', 'words', 'common', 'words', 'add', 'meaning', '.']


In [9]:
# Parts of Speech (POS) Tagging
def pos_tagging(text):
    """Tag every token of ``text`` with its part of speech.

    Args:
        text: The string to tag.

    Returns:
        list: One ``(token text, pos_ tag, spacy.explain(pos_ tag))`` tuple
        per token, in document order.
    """
    tagged = []
    for tok in nlp(text):
        tag = tok.pos_
        tagged.append((tok.text, tag, spacy.explain(tag)))
    return tagged

print("POS Tagging:", pos_tagging(text_pos))

POS Tagging: [('Natural', 'ADJ', 'adjective'), ('language', 'NOUN', 'noun'), ('processing', 'NOUN', 'noun'), ('is', 'AUX', 'auxiliary'), ('an', 'DET', 'determiner'), ('exciting', 'ADJ', 'adjective'), ('field', 'NOUN', 'noun'), ('of', 'ADP', 'adposition'), ('artificial', 'ADJ', 'adjective'), ('intelligence', 'NOUN', 'noun'), ('.', 'PUNCT', 'punctuation')]


In [12]:
# All techniques combined: a two-sentence sample text that is passed to
# full_nlp_pipeline() further down in this cell.

text_wrap_up = "Albert Einstein, a theoretical physicist, was born in Germany. He developed the theory of relativity, which is one of the two pillars of modern physics."

def full_nlp_pipeline(text):
    """Apply every technique from this notebook to ``text`` with one pipeline call.

    Args:
        text: The string to analyse with the shared ``nlp`` pipeline.

    Returns:
        dict: Results keyed by "Sentences", "Tokens", "Without Stop Words",
        "Lemmas", "POS Tags" and "Named Entities", inserted in that order.
    """
    parsed = nlp(text)

    sentence_texts = []
    for span in parsed.sents:
        sentence_texts.append(span.text)

    # Build all per-token views in a single walk over the document.
    words = []
    kept_words = []
    base_forms = []
    tagged = []
    for tok in parsed:
        words.append(tok.text)
        if not tok.is_stop:
            kept_words.append(tok.text)
        base_forms.append(tok.lemma_)
        tagged.append((tok.text, tok.pos_, spacy.explain(tok.pos_)))

    entities = []
    for span in parsed.ents:
        entities.append((span.text, span.label_))

    # Insertion order matters: callers iterate the dict to print the report.
    output = {}
    output["Sentences"] = sentence_texts
    output["Tokens"] = words
    output["Without Stop Words"] = kept_words
    output["Lemmas"] = base_forms
    output["POS Tags"] = tagged
    output["Named Entities"] = entities
    return output

# Run the combined pipeline on the wrap-up text and print one
# "<label>: <value>" line per entry of the returned dict.
result = full_nlp_pipeline(text_wrap_up)

print("\nFull NLP Pipeline Output:")
for key in result:
    value = result[key]
    print(f"{key}: {value}")


Full NLP Pipeline Output:
Sentences: ['Albert Einstein, a theoretical physicist, was born in Germany.', 'He developed the theory of relativity, which is one of the two pillars of modern physics.']
Tokens: ['Albert', 'Einstein', ',', 'a', 'theoretical', 'physicist', ',', 'was', 'born', 'in', 'Germany', '.', 'He', 'developed', 'the', 'theory', 'of', 'relativity', ',', 'which', 'is', 'one', 'of', 'the', 'two', 'pillars', 'of', 'modern', 'physics', '.']
Without Stop Words: ['Albert', 'Einstein', ',', 'theoretical', 'physicist', ',', 'born', 'Germany', '.', 'developed', 'theory', 'relativity', ',', 'pillars', 'modern', 'physics', '.']
Lemmas: ['Albert', 'Einstein', ',', 'a', 'theoretical', 'physicist', ',', 'be', 'bear', 'in', 'Germany', '.', 'he', 'develop', 'the', 'theory', 'of', 'relativity', ',', 'which', 'be', 'one', 'of', 'the', 'two', 'pillar', 'of', 'modern', 'physic', '.']
POS Tags: [('Albert', 'PROPN', 'proper noun'), ('Einstein', 'PROPN', 'proper noun'), (',', 'PUNCT', 'punctuat