In [1]:
import spacy

In [3]:
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is reused by later cells.
nlp = spacy.load('en_core_web_sm')
# Sample sentence used by the tokenization example that follows.
text = "hello Marry, don't slap the green witch!"

In [5]:
# Lower-case the sentence, run it through the pipeline, and show each token as a plain string.
print(list(map(str, nlp(text.lower()))))

['hello', 'marry', ',', 'do', "n't", 'slap', 'the', 'green', 'witch', '!']


In [6]:
pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.9.11-cp38-cp38-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 13.2 MB/s eta 0:00:00
Downloading regex-2024.9.11-cp38-cp38-win_amd64.whl (274 kB)
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: regex, joblib, nltk
Successfully installed joblib-1.4.2 nltk-3.9.1 regex-2024.9.11
Note: you may need to restart the kernel to use updated packages.


In [7]:
from nltk.tokenize import TweetTokenizer

# Sample tweet whose hashtag, @-mention and emoticon run together without spaces.
tweet = "Snow White and the Seven Degrees #MakeAMovieCold@midnight:)"

In [9]:
# Instantiate NLTK's TweetTokenizer with its default settings.
token_tweet = TweetTokenizer()

# Lower-case the tweet, then tokenize it. In the output below the hashtag,
# the @-mention and the emoticon each come back as a single token.
token_tweet.tokenize(tweet.lower())

['snow',
 'white',
 'and',
 'the',
 'seven',
 'degrees',
 '#makeamoviecold',
 '@midnight',
 ':)']

### Unigrams, Bigrams, Trigrams, …, N-grams

In [10]:
def n_grams(text, n):
    """Return every contiguous n-gram of a sequence, in order.

    Args:
        text: Any sliceable sequence with a length. Typically a list of
            tokens; a plain string also works and yields character n-grams.
        n: Size of each n-gram. Must be at least 1.

    Returns:
        A list of the ``len(text) - n + 1`` consecutive length-``n`` slices
        of ``text``. Each item has whatever type slicing ``text`` produces
        (a list for a list of tokens, a str for a str). The list is empty
        when ``n`` is larger than ``len(text)``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 yields len(text) + 1 empty slices and a
        # negative n yields slices of inconsistent length, neither of which
        # is a meaningful n-gram list.
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    return [text[i:i + n] for i in range(len(text) - n + 1)]



# Same sample sentence as defined earlier in the notebook.
text = "hello Marry, don't slap the green witch!"

# Lower-cased spaCy tokens converted to plain strings; reused by the n-gram cells below.
cleaned_text = [str(token) for token in nlp(text.lower())]

n_grams(cleaned_text, 2) # Bigrams

[['hello', 'marry'],
 ['marry', ','],
 [',', 'do'],
 ['do', "n't"],
 ["n't", 'slap'],
 ['slap', 'the'],
 ['the', 'green'],
 ['green', 'witch'],
 ['witch', '!']]

In [11]:
n_grams(cleaned_text, 1) # Unigrams: each token ends up in its own one-element list

[['hello'],
 ['marry'],
 [','],
 ['do'],
 ["n't"],
 ['slap'],
 ['the'],
 ['green'],
 ['witch'],
 ['!']]

In [12]:
n_grams(cleaned_text, 3) # Trigrams: sliding window of three consecutive tokens

[['hello', 'marry', ','],
 ['marry', ',', 'do'],
 [',', 'do', "n't"],
 ['do', "n't", 'slap'],
 ["n't", 'slap', 'the'],
 ['slap', 'the', 'green'],
 ['the', 'green', 'witch'],
 ['green', 'witch', '!']]

### Lemmas and Stems

In [13]:
# Lemmatization: map each token to its lemma (base form), e.g. "was" -> "be".
# The sentence literal previously ended with a stray closing typographic quote
# (a copy-paste artefact) that added a spurious token; the Python 2 style `u`
# prefix was also dropped because it has no effect in Python 3.
doc = nlp("he was running late geese.")
for token in doc:
    print('{} ---> {}'.format(token, token.lemma_))

he ---> he
was ---> be
running ---> run
late ---> late


In [22]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
 
# Porter stemmer from NLTK.
ps = PorterStemmer()
 
# Words to stem. "was" and "geese" are irregular forms: per the output below
# the stemmer returns "wa" and "gees" for them, whereas the spaCy lemma shown
# earlier for "was" is "be".
words = ["program", "programs", "programmer", "programming", "was","geese"]
 
# Print each word next to its stem.
for w in words:
    print(w, " : ", ps.stem(w))

program  :  program
programs  :  program
programmer  :  programm
programming  :  program
was  :  wa
geese  :  gees


### Categorizing Sentences and Documents

### Categorizing Words: POS Tagging

In [17]:
# Part-of-speech tagging: parse the sentence in its original casing (it is not lower-cased here).
doc = nlp(text)

# Print each token next to its `pos_` attribute (INTJ, PROPN, VERB, ... in the output below).
for token in doc:
    print('{} -----> {}'.format(token, token.pos_))

hello -----> INTJ
Marry -----> PROPN
, -----> PUNCT
do -----> AUX
n't -----> PART
slap -----> VERB
the -----> DET
green -----> ADJ
witch -----> NOUN
! -----> PUNCT


In [20]:
### Categorizing Spans: Chunking and Named Entity Recognition
### Noun Phrase (NP) chunking
# Re-parse `text` here so the cell does not depend on whichever cell last assigned `doc`.
doc = nlp(text)

# Print every noun chunk next to its label (NP in the output below).
# The stray trailing line-continuation backslash after print(...) was removed:
# it would silently join any line added below onto this statement.
for chunk in doc.noun_chunks:
    print('{} ----> {}'.format(chunk, chunk.label_))

hello Marry ----> NP
the green witch ----> NP


### Structure of Sentences