In [1]:
import nltk

## Looking into tokenisation

In [3]:
text = "Tokenization is a fundamental preprocessing step in natural language processing (NLP). It involves breaking down a text into smaller units called tokens. Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word."

In [6]:
sentences = nltk.sent_tokenize(text)

In [7]:
sentences

['Tokenization is a fundamental preprocessing step in natural language processing (NLP).',
 'It involves breaking down a text into smaller units called tokens.',
 'Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word.']

In [9]:
words = nltk.word_tokenize(text)

In [10]:
words

['Tokenization',
 'is',
 'a',
 'fundamental',
 'preprocessing',
 'step',
 'in',
 'natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 '.',
 'It',
 'involves',
 'breaking',
 'down',
 'a',
 'text',
 'into',
 'smaller',
 'units',
 'called',
 'tokens',
 '.',
 'Tokens',
 'are',
 'the',
 'building',
 'blocks',
 'of',
 'natural',
 'language',
 'and',
 'can',
 'be',
 'as',
 'short',
 'as',
 'a',
 'single',
 'character',
 'or',
 'as',
 'long',
 'as',
 'an',
 'entire',
 'word',
 '.']

In [12]:
print(len(sentences)," ",len(words))

3   51


## Looking into stemming and lemmatization

In [40]:
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords

In [47]:
text_stem = "Tokenization is a fundamental preprocessing step in natural language processing (NLP). It involves breaking down a text into smaller units called tokens. Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word. The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval. Additionally, stemming is a related technique often employed in NLP preprocessing. Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings. This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms. Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems."

In [48]:
sentences2 = nltk.sent_tokenize(text_stem)

In [49]:
print(sentences2)
print(len(sentences2))

['Tokenization is a fundamental preprocessing step in natural language processing (NLP).', 'It involves breaking down a text into smaller units called tokens.', 'Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word.', 'The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval.', 'Additionally, stemming is a related technique often employed in NLP preprocessing.', 'Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings.', 'This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms.', 'Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems.']
8


In [50]:
words2 = nltk.word_tokenize(text_stem)
print(words2)
print(len(words2))

['Tokenization', 'is', 'a', 'fundamental', 'preprocessing', 'step', 'in', 'natural', 'language', 'processing', '(', 'NLP', ')', '.', 'It', 'involves', 'breaking', 'down', 'a', 'text', 'into', 'smaller', 'units', 'called', 'tokens', '.', 'Tokens', 'are', 'the', 'building', 'blocks', 'of', 'natural', 'language', 'and', 'can', 'be', 'as', 'short', 'as', 'a', 'single', 'character', 'or', 'as', 'long', 'as', 'an', 'entire', 'word', '.', 'The', 'process', 'serves', 'to', 'structure', 'and', 'standardize', 'textual', 'data', ',', 'facilitating', 'tasks', 'such', 'as', 'statistical', 'analysis', ',', 'language', 'modeling', ',', 'and', 'information', 'retrieval', '.', 'Additionally', ',', 'stemming', 'is', 'a', 'related', 'technique', 'often', 'employed', 'in', 'NLP', 'preprocessing', '.', 'Stemming', 'aims', 'to', 'reduce', 'words', 'to', 'their', 'base', 'or', 'root', 'form', ',', 'aiding', 'in', 'the', 'consolidation', 'of', 'similar', 'words', 'with', 'shared', 'meanings', '.', 'This', 'he

In [51]:
stemmer = PorterStemmer()

In [52]:
stopwords.words('english') # there are a total of 179 stop words i.e., words which are generally not too important and are meant to be filtered out

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [53]:
stem_words_final = []
for i in range(len(sentences2)):
    words = nltk.word_tokenize(sentences2[i])
    stem_words = [stemmer.stem(items) for items in words if items not in set(stopwords.words('english'))]
    stem_words_final.append(stem_words)

In [54]:
stem_words_final

[['token',
  'fundament',
  'preprocess',
  'step',
  'natur',
  'languag',
  'process',
  '(',
  'nlp',
  ')',
  '.'],
 ['it', 'involv', 'break', 'text', 'smaller', 'unit', 'call', 'token', '.'],
 ['token',
  'build',
  'block',
  'natur',
  'languag',
  'short',
  'singl',
  'charact',
  'long',
  'entir',
  'word',
  '.'],
 ['the',
  'process',
  'serv',
  'structur',
  'standard',
  'textual',
  'data',
  ',',
  'facilit',
  'task',
  'statist',
  'analysi',
  ',',
  'languag',
  'model',
  ',',
  'inform',
  'retriev',
  '.'],
 ['addit',
  ',',
  'stem',
  'relat',
  'techniqu',
  'often',
  'employ',
  'nlp',
  'preprocess',
  '.'],
 ['stem',
  'aim',
  'reduc',
  'word',
  'base',
  'root',
  'form',
  ',',
  'aid',
  'consolid',
  'similar',
  'word',
  'share',
  'mean',
  '.'],
 ['thi',
  'heurist',
  'process',
  'involv',
  'remov',
  'prefix',
  'suffix',
  ',',
  'result',
  'stem',
  'may',
  'alway',
  'valid',
  'word',
  'help',
  'captur',
  'core',
  'mean',
  'rela

In [55]:
lemmatizer = WordNetLemmatizer()

In [56]:
lemma_words_final = []
for i in range(len(sentences2)):
    words = nltk.word_tokenize(sentences2[i])
    lemma_words = [lemmatizer.lemmatize(items) for items in words if items not in set(stopwords.words('english'))]
    lemma_words_final.append(lemma_words)

In [57]:
lemma_words_final

[['Tokenization',
  'fundamental',
  'preprocessing',
  'step',
  'natural',
  'language',
  'processing',
  '(',
  'NLP',
  ')',
  '.'],
 ['It',
  'involves',
  'breaking',
  'text',
  'smaller',
  'unit',
  'called',
  'token',
  '.'],
 ['Tokens',
  'building',
  'block',
  'natural',
  'language',
  'short',
  'single',
  'character',
  'long',
  'entire',
  'word',
  '.'],
 ['The',
  'process',
  'serf',
  'structure',
  'standardize',
  'textual',
  'data',
  ',',
  'facilitating',
  'task',
  'statistical',
  'analysis',
  ',',
  'language',
  'modeling',
  ',',
  'information',
  'retrieval',
  '.'],
 ['Additionally',
  ',',
  'stemming',
  'related',
  'technique',
  'often',
  'employed',
  'NLP',
  'preprocessing',
  '.'],
 ['Stemming',
  'aim',
  'reduce',
  'word',
  'base',
  'root',
  'form',
  ',',
  'aiding',
  'consolidation',
  'similar',
  'word',
  'shared',
  'meaning',
  '.'],
 ['This',
  'heuristic',
  'process',
  'involves',
  'removing',
  'prefix',
  'suffix'

In [58]:
sentences2

['Tokenization is a fundamental preprocessing step in natural language processing (NLP).',
 'It involves breaking down a text into smaller units called tokens.',
 'Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word.',
 'The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval.',
 'Additionally, stemming is a related technique often employed in NLP preprocessing.',
 'Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings.',
 'This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms.',
 'Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems.']

## Looking into Bag of words

In [3]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
text_bow = "Tokenization is a fundamental preprocessing step in natural language processing (NLP). It involves breaking down a text into smaller units called tokens. Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word. The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval. Additionally, stemming is a related technique often employed in NLP preprocessing. Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings. This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms. Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems.In conjunction with tokenization and stemming, the Bag of Words (BoW) model is frequently employed in NLP. BoW represents a document as an unordered set of its constituent words, disregarding grammar and word order but capturing word frequency information. Each unique word in the document forms a token, and the resulting document is represented as a numerical vector, where each element corresponds to the frequency of a specific word in the vocabulary. BoW is a foundational technique in various NLP tasks, providing a simple yet effective way to represent and analyze textual data. While it lacks semantic understanding and context, BoW serves as a baseline for more advanced models and is particularly useful in applications such as text classification, sentiment analysis, and information retrieval."

In [62]:
ps = PorterStemmer()
wordnet = WordNetLemmatizer()

In [63]:
sentences3 = nltk.sent_tokenize(text_bow)
len(sentences3)

12

In [64]:
sentences3

['Tokenization is a fundamental preprocessing step in natural language processing (NLP).',
 'It involves breaking down a text into smaller units called tokens.',
 'Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word.',
 'The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval.',
 'Additionally, stemming is a related technique often employed in NLP preprocessing.',
 'Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings.',
 'This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms.',
 'Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems.In conju

In [87]:
corpus = []
for i in range(len(sentences3)):
    review = re.sub("[^a-zA-Z]",' ',sentences3[i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(item) for item in review if item not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
    print(review)

token fundament preprocess step natur languag process nlp
involv break text smaller unit call token
token build block natur languag short singl charact long entir word
process serv structur standard textual data facilit task statist analysi languag model inform retriev
addit stem relat techniqu often employ nlp preprocess
stem aim reduc word base root form aid consolid similar word share mean
heurist process involv remov prefix suffix result stem may alway valid word help captur core mean relat word form
stem contribut text normal commonli use enhanc effici text analysi inform retriev system conjunct token stem bag word bow model frequent employ nlp
bow repres document unord set constitu word disregard grammar word order captur word frequenc inform
uniqu word document form token result document repres numer vector element correspond frequenc specif word vocabulari
bow foundat techniqu variou nlp task provid simpl yet effect way repres analyz textual data
lack semant understand context 

In [88]:
corpus

['token fundament preprocess step natur languag process nlp',
 'involv break text smaller unit call token',
 'token build block natur languag short singl charact long entir word',
 'process serv structur standard textual data facilit task statist analysi languag model inform retriev',
 'addit stem relat techniqu often employ nlp preprocess',
 'stem aim reduc word base root form aid consolid similar word share mean',
 'heurist process involv remov prefix suffix result stem may alway valid word help captur core mean relat word form',
 'stem contribut text normal commonli use enhanc effici text analysi inform retriev system conjunct token stem bag word bow model frequent employ nlp',
 'bow repres document unord set constitu word disregard grammar word order captur word frequenc inform',
 'uniqu word document form token result document repres numer vector element correspond frequenc specif word vocabulari',
 'bow foundat techniqu variou nlp task provid simpl yet effect way repres analyz te

In [85]:
corpus_lemma = []
for i in range(len(sentences3)):
    review = re.sub("[^a-zA-Z]",' ',sentences3[i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(item) for item in review if item not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus_lemma.append(review)
    print(review)

tokenization fundamental preprocessing step natural language processing nlp
involves breaking text smaller unit called token
token building block natural language short single character long entire word
process serf structure standardize textual data facilitating task statistical analysis language modeling information retrieval
additionally stemming related technique often employed nlp preprocessing
stemming aim reduce word base root form aiding consolidation similar word shared meaning
heuristic process involves removing prefix suffix resulting stem may always valid word help capturing core meaning related word form
stemming contributes text normalization commonly used enhance efficiency text analysis information retrieval system conjunction tokenization stemming bag word bow model frequently employed nlp
bow represents document unordered set constituent word disregarding grammar word order capturing word frequency information
unique word document form token resulting document represe

In [86]:
print(corpus_lemma)

['tokenization fundamental preprocessing step natural language processing nlp', 'involves breaking text smaller unit called token', 'token building block natural language short single character long entire word', 'process serf structure standardize textual data facilitating task statistical analysis language modeling information retrieval', 'additionally stemming related technique often employed nlp preprocessing', 'stemming aim reduce word base root form aiding consolidation similar word shared meaning', 'heuristic process involves removing prefix suffix resulting stem may always valid word help capturing core meaning related word form', 'stemming contributes text normalization commonly used enhance efficiency text analysis information retrieval system conjunction tokenization stemming bag word bow model frequently employed nlp', 'bow represents document unordered set constituent word disregarding grammar word order capturing word frequency information', 'unique word document form tok

In [90]:
# creating the bag of words using Count Vectorizer
cv = CountVectorizer()
x = cv.fit_transform(corpus_lemma).toarray()

In [91]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 2, 0],
       [0, 0, 0, ..., 1, 0, 1],
       [0, 1, 0, ..., 0, 0, 0]])

In [92]:
cv.get_feature_names_out()

array(['additionally', 'advanced', 'aiding', 'aim', 'always', 'analysis',
       'analyze', 'application', 'bag', 'base', 'baseline', 'block',
       'bow', 'breaking', 'building', 'called', 'capturing', 'character',
       'classification', 'commonly', 'conjunction', 'consolidation',
       'constituent', 'context', 'contributes', 'core', 'corresponds',
       'data', 'disregarding', 'document', 'effective', 'efficiency',
       'element', 'employed', 'enhance', 'entire', 'facilitating', 'form',
       'foundational', 'frequency', 'frequently', 'fundamental',
       'grammar', 'help', 'heuristic', 'information', 'involves', 'lack',
       'language', 'long', 'may', 'meaning', 'model', 'modeling',
       'natural', 'nlp', 'normalization', 'numerical', 'often', 'order',
       'particularly', 'prefix', 'preprocessing', 'process', 'processing',
       'providing', 'reduce', 'related', 'removing', 'represent',
       'represented', 'represents', 'resulting', 'retrieval', 'root',
       's

## Looking into TF-IDF

In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer 

In [106]:
text_tfidf = "Tokenization is a fundamental preprocessing step in natural language processing (NLP). It involves breaking down a text into smaller units called tokens. Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word. The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval. Additionally, stemming is a related technique often employed in NLP preprocessing. Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings. This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms. Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems. In conjunction with tokenization and stemming, the Bag of Words (BoW) model is frequently employed in NLP. BoW represents a document as an unordered set of its constituent words, disregarding grammar and word order but capturing word frequency information. Each unique word in the document forms a token, and the resulting document is represented as a numerical vector, where each element corresponds to the frequency of a specific word in the vocabulary. BoW is a foundational technique in various NLP tasks, providing a simple yet effective way to represent and analyze textual data. While it lacks semantic understanding and context, BoW serves as a baseline for more advanced models and is particularly useful in applications such as text classification, sentiment analysis, and information retrieval. Additionally, TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic used in NLP and information retrieval to evaluate the importance of a word in a document relative to a collection of documents (corpus). TF-IDF takes into account both the frequency of a term within a document (TF) and the rarity of the term across the entire corpus (IDF). It helps in weighting the importance of terms, emphasizing those that are frequent in a document but rare in the overall corpus. TF-IDF is commonly used for tasks such as document ranking, information retrieval, and text mining. It provides a more nuanced representation of documents compared to simple word frequencies, making it a valuable tool in natural language processing."


In [107]:
len(nltk.sent_tokenize(text_tfidf))

18

In [108]:
wordnet = WordNetLemmatizer() 
corpus= []

In [109]:
sentences4 = nltk.sent_tokenize(text_tfidf)

In [110]:
sentences4

['Tokenization is a fundamental preprocessing step in natural language processing (NLP).',
 'It involves breaking down a text into smaller units called tokens.',
 'Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word.',
 'The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval.',
 'Additionally, stemming is a related technique often employed in NLP preprocessing.',
 'Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings.',
 'This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms.',
 'Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems.',
 'In 

In [113]:
for i in range(len(sentences4)):
    review = re.sub("[^a-zA-Z]",' ',sentences4[i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(item) for item in review if item not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
    print(review)

tokenization fundamental preprocessing step natural language processing nlp
involves breaking text smaller unit called token
token building block natural language short single character long entire word
process serf structure standardize textual data facilitating task statistical analysis language modeling information retrieval
additionally stemming related technique often employed nlp preprocessing
stemming aim reduce word base root form aiding consolidation similar word shared meaning
heuristic process involves removing prefix suffix resulting stem may always valid word help capturing core meaning related word form
stemming contributes text normalization commonly used enhance efficiency text analysis information retrieval system
conjunction tokenization stemming bag word bow model frequently employed nlp
bow represents document unordered set constituent word disregarding grammar word order capturing word frequency information
unique word document form token resulting document represe

In [114]:
print(corpus)

['tokenization fundamental preprocessing step natural language processing nlp', 'involves breaking text smaller unit called token', 'token building block natural language short single character long entire word', 'process serf structure standardize textual data facilitating task statistical analysis language modeling information retrieval', 'additionally stemming related technique often employed nlp preprocessing', 'stemming aim reduce word base root form aiding consolidation similar word shared meaning', 'heuristic process involves removing prefix suffix resulting stem may always valid word help capturing core meaning related word form', 'stemming contributes text normalization commonly used enhance efficiency text analysis information retrieval system', 'conjunction tokenization stemming bag word bow model frequently employed nlp', 'bow represents document unordered set constituent word disregarding grammar word order capturing word frequency information', 'unique word document form 

In [115]:
# creating the TF-IDF vectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()

In [116]:
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.18106353,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.16410647,
        0.        ]])

In [117]:
tfidf.get_feature_names_out()

array(['account', 'across', 'additionally', 'advanced', 'aiding', 'aim',
       'always', 'analysis', 'analyze', 'application', 'bag', 'base',
       'baseline', 'block', 'bow', 'breaking', 'building', 'called',
       'capturing', 'character', 'classification', 'collection',
       'commonly', 'compared', 'conjunction', 'consolidation',
       'constituent', 'context', 'contributes', 'core', 'corpus',
       'corresponds', 'data', 'disregarding', 'document', 'effective',
       'efficiency', 'element', 'emphasizing', 'employed', 'enhance',
       'entire', 'evaluate', 'facilitating', 'form', 'foundational',
       'frequency', 'frequent', 'frequently', 'fundamental', 'grammar',
       'help', 'heuristic', 'idf', 'importance', 'information', 'inverse',
       'involves', 'lack', 'language', 'long', 'making', 'may', 'meaning',
       'mining', 'model', 'modeling', 'natural', 'nlp', 'normalization',
       'nuanced', 'numerical', 'often', 'order', 'overall',
       'particularly', 'prefi

## Looking into Word2Vec

In [9]:
import re
from gensim.models import Word2Vec
from nltk.corpus import stopwords

In [5]:
text_word2vec = "Tokenization is a fundamental preprocessing step in natural language processing (NLP). It involves breaking down a text into smaller units called tokens. Tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word. The process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval. Additionally, stemming is a related technique often employed in NLP preprocessing. Stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings. This heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms. Stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems. In conjunction with tokenization and stemming, the Bag of Words (BoW) model is frequently employed in NLP. BoW represents a document as an unordered set of its constituent words, disregarding grammar and word order but capturing word frequency information. Each unique word in the document forms a token, and the resulting document is represented as a numerical vector, where each element corresponds to the frequency of a specific word in the vocabulary. BoW is a foundational technique in various NLP tasks, providing a simple yet effective way to represent and analyze textual data. While it lacks semantic understanding and context, BoW serves as a baseline for more advanced models and is particularly useful in applications such as text classification, sentiment analysis, and information retrieval. Additionally, TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic used in NLP and information retrieval to evaluate the importance of a word in a document relative to a collection of documents (corpus). TF-IDF takes into account both the frequency of a term within a document (TF) and the rarity of the term across the entire corpus (IDF). It helps in weighting the importance of terms, emphasizing those that are frequent in a document but rare in the overall corpus. TF-IDF is commonly used for tasks such as document ranking, information retrieval, and text mining. It provides a more nuanced representation of documents compared to simple word frequencies, making it a valuable tool in natural language processing. Moreover, Word2Vec is another influential technique in NLP for word representation. Unlike traditional methods that rely on discrete representations, Word2Vec embeds words in continuous vector spaces, capturing semantic relationships. The models, such as Skip-gram and Continuous Bag of Words (CBOW), use neural networks to learn distributed representations of words. Word2Vec enables the exploration of semantic similarities, analogies, and contextual understanding in a way that goes beyond the capabilities of traditional techniques. The resulting word embeddings from Word2Vec can be leveraged for various NLP tasks, including sentiment analysis, machine translation, and document clustering, providing a powerful approach to understanding and representing language in computational models."


In [7]:
len(nltk.sent_tokenize(text_word2vec))

23

In [28]:
review = re.sub(r'\[[0-9]*\]',' ',text_word2vec)
review = re.sub(r'\s+',' ',review)
review = review.lower()
review = re.sub(r'\d',' ',review)
review = re.sub(r'\s+',' ',review)

In [29]:
review

'tokenization is a fundamental preprocessing step in natural language processing (nlp). it involves breaking down a text into smaller units called tokens. tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word. the process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval. additionally, stemming is a related technique often employed in nlp preprocessing. stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings. this heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms. stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems. in conjunction with tokenization and

In [39]:
sentences5 = nltk.sent_tokenize(review)
sentences5

['tokenization is a fundamental preprocessing step in natural language processing (nlp).',
 'it involves breaking down a text into smaller units called tokens.',
 'tokens are the building blocks of natural language and can be as short as a single character or as long as an entire word.',
 'the process serves to structure and standardize textual data, facilitating tasks such as statistical analysis, language modeling, and information retrieval.',
 'additionally, stemming is a related technique often employed in nlp preprocessing.',
 'stemming aims to reduce words to their base or root form, aiding in the consolidation of similar words with shared meanings.',
 'this heuristic process involves removing prefixes or suffixes, resulting in stems that may not always be valid words but help in capturing the core meaning of related word forms.',
 'stemming contributes to text normalization and is commonly used to enhance the efficiency of text analysis and information retrieval systems.',
 'in 

In [40]:
sentence_words = [nltk.word_tokenize(sentence) for sentence in sentences5]

In [43]:
sentence_words

[['tokenization',
  'is',
  'a',
  'fundamental',
  'preprocessing',
  'step',
  'in',
  'natural',
  'language',
  'processing',
  '(',
  'nlp',
  ')',
  '.'],
 ['it',
  'involves',
  'breaking',
  'down',
  'a',
  'text',
  'into',
  'smaller',
  'units',
  'called',
  'tokens',
  '.'],
 ['tokens',
  'are',
  'the',
  'building',
  'blocks',
  'of',
  'natural',
  'language',
  'and',
  'can',
  'be',
  'as',
  'short',
  'as',
  'a',
  'single',
  'character',
  'or',
  'as',
  'long',
  'as',
  'an',
  'entire',
  'word',
  '.'],
 ['the',
  'process',
  'serves',
  'to',
  'structure',
  'and',
  'standardize',
  'textual',
  'data',
  ',',
  'facilitating',
  'tasks',
  'such',
  'as',
  'statistical',
  'analysis',
  ',',
  'language',
  'modeling',
  ',',
  'and',
  'information',
  'retrieval',
  '.'],
 ['additionally',
  ',',
  'stemming',
  'is',
  'a',
  'related',
  'technique',
  'often',
  'employed',
  'in',
  'nlp',
  'preprocessing',
  '.'],
 ['stemming',
  'aims',
  '

In [45]:
len(sentence_words)

23

In [47]:
for i in range(len(sentence_words)):
     sentence_words[i] = [word for word in sentence_words[i] if word not in stopwords.words('english')]

In [48]:
sentence_words

[['tokenization',
  'fundamental',
  'preprocessing',
  'step',
  'natural',
  'language',
  'processing',
  '(',
  'nlp',
  ')',
  '.'],
 ['involves', 'breaking', 'text', 'smaller', 'units', 'called', 'tokens', '.'],
 ['tokens',
  'building',
  'blocks',
  'natural',
  'language',
  'short',
  'single',
  'character',
  'long',
  'entire',
  'word',
  '.'],
 ['process',
  'serves',
  'structure',
  'standardize',
  'textual',
  'data',
  ',',
  'facilitating',
  'tasks',
  'statistical',
  'analysis',
  ',',
  'language',
  'modeling',
  ',',
  'information',
  'retrieval',
  '.'],
 ['additionally',
  ',',
  'stemming',
  'related',
  'technique',
  'often',
  'employed',
  'nlp',
  'preprocessing',
  '.'],
 ['stemming',
  'aims',
  'reduce',
  'words',
  'base',
  'root',
  'form',
  ',',
  'aiding',
  'consolidation',
  'similar',
  'words',
  'shared',
  'meanings',
  '.'],
 ['heuristic',
  'process',
  'involves',
  'removing',
  'prefixes',
  'suffixes',
  ',',
  'resulting',
  '

In [49]:
model = Word2Vec(sentence_words,min_count=1)

In [56]:
model.wv.key_to_index.keys()

dict_keys([',', '.', 'word', 'document', 'words', ')', '(', 'nlp', 'information', 'text', 'retrieval', 'language', 'frequency', 'analysis', 'stemming', 'tasks', 'vec', 'bow', 'tf-idf', 'models', 'technique', 'corpus', 'resulting', 'term', 'natural', 'semantic', 'capturing', 'understanding', 'used', 'commonly', 'bag', 'additionally', 'various', 'providing', 'simple', 'related', 'way', 'sentiment', 'employed', 'forms', 'documents', 'importance', 'representation', 'vector', 'numerical', 'tokenization', 'processing', 'serves', 'entire', 'data', 'textual', 'traditional', 'preprocessing', 'representations', 'tokens', 'involves', 'process', 'continuous', 'blocks', 'normalization', 'modeling', 'contributes', 'short', 'single', 'character', 'building', 'smaller', 'called', 'units', 'core', 'enhance', 'efficiency', 'systems', 'conjunction', 'breaking', 'step', 'model', 'frequently', 'represents', 'fundamental', 'meaning', 'valid', 'long', 'help', 'facilitating', 'often', 'aims', 'reduce', 'set',

In [55]:
model.wv.index_to_key

[',',
 '.',
 'word',
 'document',
 'words',
 ')',
 '(',
 'nlp',
 'information',
 'text',
 'retrieval',
 'language',
 'frequency',
 'analysis',
 'stemming',
 'tasks',
 'vec',
 'bow',
 'tf-idf',
 'models',
 'technique',
 'corpus',
 'resulting',
 'term',
 'natural',
 'semantic',
 'capturing',
 'understanding',
 'used',
 'commonly',
 'bag',
 'additionally',
 'various',
 'providing',
 'simple',
 'related',
 'way',
 'sentiment',
 'employed',
 'forms',
 'documents',
 'importance',
 'representation',
 'vector',
 'numerical',
 'tokenization',
 'processing',
 'serves',
 'entire',
 'data',
 'textual',
 'traditional',
 'preprocessing',
 'representations',
 'tokens',
 'involves',
 'process',
 'continuous',
 'blocks',
 'normalization',
 'modeling',
 'contributes',
 'short',
 'single',
 'character',
 'building',
 'smaller',
 'called',
 'units',
 'core',
 'enhance',
 'efficiency',
 'systems',
 'conjunction',
 'breaking',
 'step',
 'model',
 'frequently',
 'represents',
 'fundamental',
 'meaning',
 'va

In [65]:
vector_for_word = model.wv.get_vector("single")
vector_for_word

array([ 0.00201864,  0.00831493, -0.00829566, -0.00871039, -0.0005239 ,
       -0.00198984, -0.00237825, -0.00045772, -0.00513931, -0.008051  ,
        0.00826423, -0.00340255, -0.00285354, -0.00159801, -0.00126206,
       -0.00917961, -0.00175041, -0.00143581,  0.00864139, -0.00810593,
       -0.00480165, -0.00184953,  0.00282517, -0.00601818, -0.00556686,
       -0.00793623,  0.00548581, -0.00874214, -0.00326716,  0.00031419,
        0.00149079, -0.00990256,  0.00868073, -0.00213352,  0.00030848,
       -0.00869059, -0.00809868,  0.00369411,  0.0040338 ,  0.00288857,
        0.00584292, -0.00706011, -0.00128638, -0.00269416, -0.00135634,
        0.00441871,  0.00135374, -0.00748008,  0.00325538,  0.00628808,
       -0.00602691,  0.00324274, -0.00429614,  0.00881807,  0.00413512,
        0.00203307, -0.00101068, -0.00913577,  0.00862566, -0.0017808 ,
        0.00733564,  0.00382444, -0.0082812 , -0.00060924,  0.00270938,
       -0.00254367,  0.00979145, -0.00491189, -0.0072127 ,  0.00

In [64]:
similar = model.wv.most_similar('single')
similar

[('approach', 0.34905263781547546),
 ('disregarding', 0.3337511420249939),
 ('frequently', 0.24489283561706543),
 ('traditional', 0.23436366021633148),
 ('rare', 0.21372495591640472),
 ('forms', 0.19526058435440063),
 ('vocabulary', 0.1837596297264099),
 ('simple', 0.16039122641086578),
 ('various', 0.1596897840499878),
 ('weighting', 0.15936227142810822)]