**Lower Casing**



In [20]:
s = "Hy my name is nikhil and it was a nice experience with NLP. Come on guys, Do play with NLP techniques.'"
text_lower = s.lower()
print(text_lower)


hy my name is nikhil and it was a nice experience with nlp. come on guys, do play with nlp techniques.'


**Tokenization**

In [21]:
import spacy
nlp=spacy.load('en_core_web_sm')
doc=nlp(s)
for i in doc:
  print(i.text)

Hy
my
name
is
nikhil
and
it
was
a
nice
experience
with
NLP
.
Come
on
guys
,
Do
play
with
NLP
techniques
.
'


**Removing Punctuations and Digits**

In [23]:
punct_digit=[i.text for i in doc if not i.is_punct and not i.is_digit]
print(punct_digit)

['Hy', 'my', 'name', 'is', 'nikhil', 'and', 'it', 'was', 'a', 'nice', 'experience', 'with', 'NLP', 'Come', 'on', 'guys', 'Do', 'play', 'with', 'NLP', 'techniques']


**Stop Words**

In [25]:
stopwords = [token.text for token in doc if not token.is_stop]
print(stopwords)

['Hy', 'nikhil', 'nice', 'experience', 'NLP', '.', 'Come', 'guys', ',', 'play', 'NLP', 'techniques', '.', "'"]


**Lemmatization**

In [26]:
lemmas = [token.lemma_ for token in doc]
print(lemmas)

['hy', 'my', 'name', 'be', 'nikhil', 'and', 'it', 'be', 'a', 'nice', 'experience', 'with', 'NLP', '.', 'come', 'on', 'guy', ',', 'do', 'play', 'with', 'NLP', 'technique', '.', "'"]


**Parts of Speech (POS)**

In [27]:
pos_tags = [(token.text, token.pos_) for token in doc]
print(pos_tags)


[('Hy', 'INTJ'), ('my', 'PRON'), ('name', 'NOUN'), ('is', 'AUX'), ('nikhil', 'ADJ'), ('and', 'CCONJ'), ('it', 'PRON'), ('was', 'AUX'), ('a', 'DET'), ('nice', 'ADJ'), ('experience', 'NOUN'), ('with', 'ADP'), ('NLP', 'PROPN'), ('.', 'PUNCT'), ('Come', 'VERB'), ('on', 'ADP'), ('guys', 'NOUN'), (',', 'PUNCT'), ('Do', 'AUX'), ('play', 'VERB'), ('with', 'ADP'), ('NLP', 'PROPN'), ('techniques', 'NOUN'), ('.', 'PUNCT'), ("'", 'PUNCT')]


**Named Entity Recognition**

In [28]:
entities = [(entity.text, entity.label_) for entity in doc.ents]
print(entities)

[('NLP', 'ORG'), ('NLP', 'ORG')]


**Removing Non-Alphabetic Tokens**

In [29]:
tokens_alpha = [token.text for token in doc if token.is_alpha]
print("Alphabetic Tokens:", tokens_alpha)

Alphabetic Tokens: ['Hy', 'my', 'name', 'is', 'nikhil', 'and', 'it', 'was', 'a', 'nice', 'experience', 'with', 'NLP', 'Come', 'on', 'guys', 'Do', 'play', 'with', 'NLP', 'techniques']


**Dependency parsing**

In [31]:
dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
print(dependencies)

[('Hy', 'intj', 'is'), ('my', 'poss', 'name'), ('name', 'nsubj', 'is'), ('is', 'ROOT', 'is'), ('nikhil', 'acomp', 'is'), ('and', 'cc', 'is'), ('it', 'nsubj', 'was'), ('was', 'conj', 'is'), ('a', 'det', 'experience'), ('nice', 'amod', 'experience'), ('experience', 'attr', 'was'), ('with', 'prep', 'experience'), ('NLP', 'pobj', 'with'), ('.', 'punct', 'was'), ('Come', 'advcl', 'play'), ('on', 'prep', 'Come'), ('guys', 'pobj', 'on'), (',', 'punct', 'Come'), ('Do', 'aux', 'play'), ('play', 'ROOT', 'play'), ('with', 'prep', 'play'), ('NLP', 'compound', 'techniques'), ('techniques', 'pobj', 'with'), ('.', 'punct', 'play'), ("'", 'punct', 'play')]


**Noun-Chunk Extraction**

In [32]:
noun_chunks = [chunk.text for chunk in doc.noun_chunks]
print("Noun Chunks:", noun_chunks)


Noun Chunks: ['my name', 'it', 'a nice experience', 'NLP', 'guys', 'NLP techniques']


**Normalizing Text (Lowercasing, Removing Punctuation and Stop Words)**

In [33]:
normalized_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
print(normalized_tokens)


['hy', 'nikhil', 'nice', 'experience', 'nlp', 'come', 'guy', 'play', 'nlp', 'technique']


**Custom Pipeline Component (Adding a Simple Custom Component)**

In [35]:
from spacy.tokens import Token

# Registering a new custom attribute
Token.set_extension("is_upper", getter=lambda token: token.text.isupper())

# Using the custom attribute
custom_attributes = [(token.text, token._.is_upper) for token in doc]
print(custom_attributes)


[('Hy', False), ('my', False), ('name', False), ('is', False), ('nikhil', False), ('and', False), ('it', False), ('was', False), ('a', False), ('nice', False), ('experience', False), ('with', False), ('NLP', True), ('.', False), ('Come', False), ('on', False), ('guys', False), (',', False), ('Do', False), ('play', False), ('with', False), ('NLP', True), ('techniques', False), ('.', False), ("'", False)]


**Sentence Boundary Detection**

In [36]:
sentences = [sent.text for sent in doc.sents]
print(sentences)


['Hy my name is nikhil and it was a nice experience with NLP.', "Come on guys, Do play with NLP techniques.'"]


**Extracting Subtrees**

In [37]:
subtrees = [list(token.subtree) for token in doc]
subtree_texts = [" ".join([t.text for t in subtree]) for subtree in subtrees]
print(subtree_texts)


['Hy', 'my', 'my name', 'Hy my name is nikhil and it was a nice experience with NLP .', 'nikhil', 'and', 'it', 'it was a nice experience with NLP .', 'a', 'nice', 'a nice experience with NLP', 'with NLP', 'NLP', '.', 'Come on guys ,', 'on guys', 'guys', ',', 'Do', "Come on guys , Do play with NLP techniques . '", 'with NLP techniques', 'NLP', 'NLP techniques', '.', "'"]


**Word Shape Features**

In [38]:
word_shapes = [(token.text, token.shape_) for token in doc]
print( word_shapes)


[('Hy', 'Xx'), ('my', 'xx'), ('name', 'xxxx'), ('is', 'xx'), ('nikhil', 'xxxx'), ('and', 'xxx'), ('it', 'xx'), ('was', 'xxx'), ('a', 'x'), ('nice', 'xxxx'), ('experience', 'xxxx'), ('with', 'xxxx'), ('NLP', 'XXX'), ('.', '.'), ('Come', 'Xxxx'), ('on', 'xx'), ('guys', 'xxxx'), (',', ','), ('Do', 'Xx'), ('play', 'xxxx'), ('with', 'xxxx'), ('NLP', 'XXX'), ('techniques', 'xxxx'), ('.', '.'), ("'", "'")]


**Checking Token Similarity**

In [39]:
token_similarity = [(token1.text, token2.text, token1.similarity(token2)) for token1 in doc for token2 in doc if token1 != token2]
print(token_similarity[:5])  # Displaying the first 5 for brevity


[('Hy', 'my', 0.2780427932739258), ('Hy', 'name', 0.09494823962450027), ('Hy', 'is', -0.024768082424998283), ('Hy', 'nikhil', 0.11030188202857971), ('Hy', 'and', 0.3212807774543762)]


  token_similarity = [(token1.text, token2.text, token1.similarity(token2)) for token1 in doc for token2 in doc if token1 != token2]


**Counter**

In [3]:
%pip install Counter

Collecting Counter
  Downloading Counter-1.0.0.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: Counter
  Building wheel for Counter (setup.py) ... [?25l[?25hdone
  Created wheel for Counter: filename=Counter-1.0.0-py3-none-any.whl size=5394 sha256=b6351e594a7448bffb3dbc2ad9153d3a870666bbea33e89e7a3039e64ba0d536
  Stored in directory: /root/.cache/pip/wheels/e3/02/6d/d5c0838427a060718c6060ae4d24da95a0e0df0d7a3dab8040
Successfully built Counter
Installing collected packages: Counter
Successfully installed Counter-1.0.0


In [4]:
from collections import Counter
count=Counter(s)
count

Counter({'h': 2,
         'y': 1,
         ' ': 3,
         'n': 3,
         'i': 4,
         'k': 1,
         'l': 1,
         'c': 2,
         'e': 5,
         'x': 1,
         'p': 1,
         'r': 1})