In [1]:
import nltk

## Looking into tokenisation

In [3]:
text = "Tokenization is a fundamental preprocessing step in natural language processing (NLP). It involves breaking down a text into smaller units called tokens. Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word."

In [6]:
sentences = nltk.sent_tokenize(text)

In [7]:
sentences

['Tokenization is a fundamental preprocessing step in natural language processing (NLP).',
 'It involves breaking down a text into smaller units called tokens.',
 'Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word.']

In [9]:
words = nltk.word_tokenize(text)

In [10]:
words

['Tokenization',
 'is',
 'a',
 'fundamental',
 'preprocessing',
 'step',
 'in',
 'natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 '.',
 'It',
 'involves',
 'breaking',
 'down',
 'a',
 'text',
 'into',
 'smaller',
 'units',
 'called',
 'tokens',
 '.',
 'Tokens',
 'are',
 'the',
 'building',
 'blocks',
 'of',
 'natural',
 'language',
 'and',
 'can',
 'be',
 'as',
 'short',
 'as',
 'a',
 'single',
 'character',
 'or',
 'as',
 'long',
 'as',
 'an',
 'entire',
 'word',
 '.']

In [12]:
print(len(sentences)," ",len(words))

3   51


## Looking into stemming and lemmatization

In [40]:
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords

In [16]:
text_stem = "Tokenization is a fundamental preprocessing step in natural language processing (NLP). It involves breaking down a text into smaller units called tokens. Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word. The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval.Additionally, stemming is a related technique often employed in NLP preprocessing. Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings. This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms. Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems."

In [17]:
sentences2 = nltk.sent_tokenize(text_stem)

In [22]:
print(sentences2)
print(len(sentences2))

['Tokenization is a fundamental preprocessing step in natural language processing (NLP).', 'It involves breaking down a text into smaller units called tokens.', 'Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word.', 'The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval.Additionally, stemming is a related technique often employed in NLP preprocessing.', 'Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings.', 'This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms.', 'Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems.']
7


In [21]:
words2 = nltk.word_tokenize(text_stem)
print(words2)
print(len(words2))

['Tokenization', 'is', 'a', 'fundamental', 'preprocessing', 'step', 'in', 'natural', 'language', 'processing', '(', 'NLP', ')', '.', 'It', 'involves', 'breaking', 'down', 'a', 'text', 'into', 'smaller', 'units', 'called', 'tokens', '.', 'Tokens', 'are', 'the', 'building', 'blocks', 'of', 'natural', 'language', 'and', 'can', 'be', 'as', 'short', 'as', 'a', 'single', 'character', 'or', 'as', 'long', 'as', 'an', 'entire', 'word', '.', 'The', 'process', 'serves', 'to', 'structure', 'and', 'standardize', 'textual', 'data', ',', 'facilitating', 'tasks', 'such', 'as', 'statistical', 'analysis', ',', 'language', 'modeling', ',', 'and', 'information', 'retrieval.Additionally', ',', 'stemming', 'is', 'a', 'related', 'technique', 'often', 'employed', 'in', 'NLP', 'preprocessing', '.', 'Stemming', 'aims', 'to', 'reduce', 'words', 'to', 'their', 'base', 'or', 'root', 'form', ',', 'aiding', 'in', 'the', 'consolidation', 'of', 'similar', 'words', 'with', 'shared', 'meanings', '.', 'This', 'heuristic'

In [24]:
stemmer = PorterStemmer()

In [31]:
stopwords.words('english') # there are a total of 179 stop words i.e., words which are generally not too important and are meant to be filtered out

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [36]:
stem_words_final = []
for i in range(len(sentences2)):
    words = nltk.word_tokenize(sentences2[i])
    stem_words = [stemmer.stem(items) for items in words if items not in set(stopwords.words('english'))]
    stem_words_final.append(stem_words)

In [39]:
stem_words_final

[['token',
  'fundament',
  'preprocess',
  'step',
  'natur',
  'languag',
  'process',
  '(',
  'nlp',
  ')',
  '.'],
 ['it', 'involv', 'break', 'text', 'smaller', 'unit', 'call', 'token', '.'],
 ['token',
  'build',
  'block',
  'natur',
  'languag',
  'short',
  'singl',
  'charact',
  'long',
  'entir',
  'word',
  '.'],
 ['the',
  'process',
  'serv',
  'structur',
  'standard',
  'textual',
  'data',
  ',',
  'facilit',
  'task',
  'statist',
  'analysi',
  ',',
  'languag',
  'model',
  ',',
  'inform',
  'retrieval.addit',
  ',',
  'stem',
  'relat',
  'techniqu',
  'often',
  'employ',
  'nlp',
  'preprocess',
  '.'],
 ['stem',
  'aim',
  'reduc',
  'word',
  'base',
  'root',
  'form',
  ',',
  'aid',
  'consolid',
  'similar',
  'word',
  'share',
  'mean',
  '.'],
 ['thi',
  'heurist',
  'process',
  'involv',
  'remov',
  'prefix',
  'suffix',
  ',',
  'result',
  'stem',
  'may',
  'alway',
  'valid',
  'word',
  'help',
  'captur',
  'core',
  'mean',
  'relat',
  'word