In [1]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer
# tables (used by the tokenization cells) and the WordNet corpus (used by the
# lemmatization cell). The last call's return value is echoed as the cell output.
nltk.download(info_or_id="punkt_tab")
nltk.download(info_or_id="wordnet")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
import spacy

from nltk.stem import PorterStemmer, WordNetLemmatizer

from nltk.tokenize import sent_tokenize, word_tokenize

In [8]:
# Load spaCy's NLP model
# The pipeline name is held in a constant so it is easy to spot and matches the
# package installed by the download cell at the top of the notebook.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [9]:

# Input text: a two-sentence sample that the NLTK and spaCy steps below operate on.

text = "John works at Google in California. He loves programming and playing football."

# 1. Segmentation (sentence tokenization)
# sent_tokenize splits the raw string into a list of sentence strings;
# the tokenization step iterates over this list.

sentences = sent_tokenize(text)

print("Segmentation:", sentences)

Segmentation: ['John works at Google in California.', 'He loves programming and playing football.']


In [10]:

# 2. Tokenization (word tokenization)
# Run NLTK's word tokenizer over each sentence, producing one token list per
# sentence so the sentence boundaries from step 1 are kept.
tokens = list(map(word_tokenize, sentences))
print("Tokenization:", tokens)

Tokenization: [['John', 'works', 'at', 'Google', 'in', 'California', '.'], ['He', 'loves', 'programming', 'and', 'playing', 'football', '.']]


In [11]:

# 3. Stemming
# Apply the Porter stemmer to every token, sentence by sentence. Stems are
# produced by rule-based suffix stripping, so they need not be real words
# (see 'googl' and 'footbal' in the output).
stemmer = PorterStemmer()
stemmed_tokens = [list(map(stemmer.stem, sentence_tokens)) for sentence_tokens in tokens]
print("Stemming:", stemmed_tokens)

Stemming: [['john', 'work', 'at', 'googl', 'in', 'california', '.'], ['he', 'love', 'program', 'and', 'play', 'footbal', '.']]


In [12]:
# 4. Lemmatization
# Look up each token's WordNet lemma, sentence by sentence.
# NOTE: lemmatize() is called without a `pos` argument, so WordNet's default
# part of speech (noun) is used. That is why verb forms such as 'programming'
# and 'playing' come back unchanged in the output.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [list(map(lemmatizer.lemmatize, sentence_tokens)) for sentence_tokens in tokens]
print("Lemmatization:", lemmatized_tokens)


Lemmatization: [['John', 'work', 'at', 'Google', 'in', 'California', '.'], ['He', 'love', 'programming', 'and', 'playing', 'football', '.']]


In [13]:

# 5. POS Tagging
# Run the full spaCy pipeline once over the raw text; the resulting `doc` is
# reused by the NER and dependency-parsing cells below.
doc = nlp(text)

# Pair every token's surface text with its coarse-grained part-of-speech tag.
pos_tags = [
    (word.text, word.pos_)
    for word in doc
]
print("POS Tagging:", pos_tags)

POS Tagging: [('John', 'PROPN'), ('works', 'VERB'), ('at', 'ADP'), ('Google', 'PROPN'), ('in', 'ADP'), ('California', 'PROPN'), ('.', 'PUNCT'), ('He', 'PRON'), ('loves', 'VERB'), ('programming', 'VERB'), ('and', 'CCONJ'), ('playing', 'VERB'), ('football', 'NOUN'), ('.', 'PUNCT')]


In [14]:


# 6. Named Entity Recognition (NER)
# Collect each entity span found by the pipeline as a (text, label) pair.
entities = [
    (span.text, span.label_)
    for span in doc.ents
]
print("Named Entities:", entities)


Named Entities: [('John', 'PERSON'), ('Google', 'ORG'), ('California', 'GPE')]


In [15]:
# 7. Parsing (Dependency Parsing)
# Print one row per token: token text -> dependency label -> head token text.

# Size the columns from the data instead of hard-coding a width of 10: a token
# longer than 10 characters (e.g. "programming") would otherwise overflow its
# field and push the rest of the row out of alignment. The previous width of 10
# is kept as a minimum so short inputs look the same as before.
MIN_COLUMN_WIDTH = 10
text_width = max([MIN_COLUMN_WIDTH, *(len(token.text) for token in doc)])
dep_width = max([MIN_COLUMN_WIDTH, *(len(token.dep_) for token in doc)])

for sent in doc.sents:
    for token in sent:
        print(f'{token.text:{text_width}} -> {token.dep_:{dep_width}} -> {token.head.text}')

John       -> nsubj      -> works
works      -> ROOT       -> works
at         -> prep       -> works
Google     -> pobj       -> at
in         -> prep       -> works
California -> pobj       -> in
.          -> punct      -> works
He         -> nsubj      -> loves
loves      -> ROOT       -> loves
programming -> xcomp      -> loves
and        -> cc         -> programming
playing    -> conj       -> programming
football   -> dobj       -> playing
.          -> punct      -> loves
