In [57]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import tensorflow as tf
import numpy as np

In [30]:
# Fetch the NLTK resources used below: tokenizer models, the WordNet
# database (for lemmatization) and the stop-word lists.
for resource in ('punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Keep the English stop words in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
# Sample passage; the tokenization, vectorization and Word2Vec cells below all derive their input from it.
text = "Life is a complex journey filled with choices, lessons, and countless opportunities for growth. Every morning presents a new beginning, a fresh chance to shape the day according to our actions and mindset. People often underestimate the power of small efforts, forgetting that consistency in tiny steps leads to remarkable results over time. Success rarely happens overnight; it’s built on persistence, patience, and a willingness to learn from failure. Mistakes are not the end but the stepping stones that guide us toward wisdom. Relationships form the core of human experience, teaching us compassion, forgiveness, and understanding. Technology continues to reshape our world, connecting people across continents and transforming how we live, work, and communicate. Education empowers minds and fuels innovation, while curiosity keeps the flame of discovery alive. Nature, in its endless beauty, reminds us of balance, peace, and the importance of preserving our planet. The rhythm of the seasons mirrors the cycles in our lives—times of growth, change, and renewal. In moments of silence, we often find clarity and strength to face our challenges. Dreams give direction to our efforts, and discipline turns those dreams into achievements. Though the world is full of uncertainty, hope remains the most powerful force that drives humanity forward. Every person has a story worth telling, a purpose worth pursuing, and a destiny waiting to be shaped. Time passes swiftly, urging us to live meaningfully, appreciate the present, and cherish the people we love. Even in darkness, there’s always a spark of light waiting to guide us. Change is inevitable, but our response to it defines our character. The future belongs to those who dare to imagine it and work relentlessly to make it real. And in the end, it’s not the years in our life that matter most, but the life we put into those years."

Word and sentence tokenization

In [17]:
# Split the passage two ways: word-level tokens and whole sentences.
words, sent = word_tokenize(text), sent_tokenize(text)

# A single newline-separated print shows the token list, then the sentence list.
print(words, sent, sep="\n")

['Life', 'is', 'a', 'complex', 'journey', 'filled', 'with', 'choices', ',', 'lessons', ',', 'and', 'countless', 'opportunities', 'for', 'growth', '.', 'Every', 'morning', 'presents', 'a', 'new', 'beginning', ',', 'a', 'fresh', 'chance', 'to', 'shape', 'the', 'day', 'according', 'to', 'our', 'actions', 'and', 'mindset', '.', 'People', 'often', 'underestimate', 'the', 'power', 'of', 'small', 'efforts', ',', 'forgetting', 'that', 'consistency', 'in', 'tiny', 'steps', 'leads', 'to', 'remarkable', 'results', 'over', 'time', '.', 'Success', 'rarely', 'happens', 'overnight', ';', 'it', '’', 's', 'built', 'on', 'persistence', ',', 'patience', ',', 'and', 'a', 'willingness', 'to', 'learn', 'from', 'failure', '.', 'Mistakes', 'are', 'not', 'the', 'end', 'but', 'the', 'stepping', 'stones', 'that', 'guide', 'us', 'toward', 'wisdom', '.', 'Relationships', 'form', 'the', 'core', 'of', 'human', 'experience', ',', 'teaching', 'us', 'compassion', ',', 'forgiveness', ',', 'and', 'understanding', '.', 'T

Removing stop words

In [31]:
# Keep every token whose lowercase form is not an English stop word.
# Punctuation tokens are not stop words, so they pass through untouched.
filtered_sentence = []
for token in words:
    if token.lower() in stop_words:
        continue
    filtered_sentence.append(token)

print(filtered_sentence)

['Life', 'complex', 'journey', 'filled', 'choices', ',', 'lessons', ',', 'countless', 'opportunities', 'growth', '.', 'Every', 'morning', 'presents', 'new', 'beginning', ',', 'fresh', 'chance', 'shape', 'day', 'according', 'actions', 'mindset', '.', 'People', 'often', 'underestimate', 'power', 'small', 'efforts', ',', 'forgetting', 'consistency', 'tiny', 'steps', 'leads', 'remarkable', 'results', 'time', '.', 'Success', 'rarely', 'happens', 'overnight', ';', '’', 'built', 'persistence', ',', 'patience', ',', 'willingness', 'learn', 'failure', '.', 'Mistakes', 'end', 'stepping', 'stones', 'guide', 'us', 'toward', 'wisdom', '.', 'Relationships', 'form', 'core', 'human', 'experience', ',', 'teaching', 'us', 'compassion', ',', 'forgiveness', ',', 'understanding', '.', 'Technology', 'continues', 'reshape', 'world', ',', 'connecting', 'people', 'across', 'continents', 'transforming', 'live', ',', 'work', ',', 'communicate', '.', 'Education', 'empowers', 'minds', 'fuels', 'innovation', ',', '

Stemming

In [32]:
# Build the stemmer once and reuse it; the previous version constructed a
# fresh PorterStemmer for every token inside the comprehension.
stemmer = PorterStemmer()

# Reduce each remaining token to its Porter stem.
stemed_words = [stemmer.stem(token) for token in filtered_sentence]
print(stemed_words)

['life', 'complex', 'journey', 'fill', 'choic', ',', 'lesson', ',', 'countless', 'opportun', 'growth', '.', 'everi', 'morn', 'present', 'new', 'begin', ',', 'fresh', 'chanc', 'shape', 'day', 'accord', 'action', 'mindset', '.', 'peopl', 'often', 'underestim', 'power', 'small', 'effort', ',', 'forget', 'consist', 'tini', 'step', 'lead', 'remark', 'result', 'time', '.', 'success', 'rare', 'happen', 'overnight', ';', '’', 'built', 'persist', ',', 'patienc', ',', 'willing', 'learn', 'failur', '.', 'mistak', 'end', 'step', 'stone', 'guid', 'us', 'toward', 'wisdom', '.', 'relationship', 'form', 'core', 'human', 'experi', ',', 'teach', 'us', 'compass', ',', 'forgiv', ',', 'understand', '.', 'technolog', 'continu', 'reshap', 'world', ',', 'connect', 'peopl', 'across', 'contin', 'transform', 'live', ',', 'work', ',', 'commun', '.', 'educ', 'empow', 'mind', 'fuel', 'innov', ',', 'curios', 'keep', 'flame', 'discoveri', 'aliv', '.', 'natur', ',', 'endless', 'beauti', ',', 'remind', 'us', 'balanc', 

Lemmatization

In [33]:
# Build the lemmatizer once and reuse it; the previous version constructed a
# fresh WordNetLemmatizer for every token inside the comprehension.
lemmatizer = WordNetLemmatizer()

# lemmatize() is called without a part-of-speech argument, so each token is
# treated as a noun: plural nouns are singularized ('choices' -> 'choice'),
# verb forms such as 'filled' are left as-is, and 'us' comes out as 'u'.
lemmatized_words = [lemmatizer.lemmatize(token) for token in filtered_sentence]
print(lemmatized_words)

['Life', 'complex', 'journey', 'filled', 'choice', ',', 'lesson', ',', 'countless', 'opportunity', 'growth', '.', 'Every', 'morning', 'present', 'new', 'beginning', ',', 'fresh', 'chance', 'shape', 'day', 'according', 'action', 'mindset', '.', 'People', 'often', 'underestimate', 'power', 'small', 'effort', ',', 'forgetting', 'consistency', 'tiny', 'step', 'lead', 'remarkable', 'result', 'time', '.', 'Success', 'rarely', 'happens', 'overnight', ';', '’', 'built', 'persistence', ',', 'patience', ',', 'willingness', 'learn', 'failure', '.', 'Mistakes', 'end', 'stepping', 'stone', 'guide', 'u', 'toward', 'wisdom', '.', 'Relationships', 'form', 'core', 'human', 'experience', ',', 'teaching', 'u', 'compassion', ',', 'forgiveness', ',', 'understanding', '.', 'Technology', 'continues', 'reshape', 'world', ',', 'connecting', 'people', 'across', 'continent', 'transforming', 'live', ',', 'work', ',', 'communicate', '.', 'Education', 'empowers', 'mind', 'fuel', 'innovation', ',', 'curiosity', 'kee

Bag of words

In [44]:
# Bag of words: learn the vocabulary from the sentences and count how often
# each term occurs in each sentence.
bow_vectorizer = CountVectorizer()
vector = bow_vectorizer.fit_transform(sent)

# Convert to a dense array so the notebook can display the counts.
vector.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 2]])

TF-IDF

In [46]:
# TF-IDF: same sentence-by-term layout as the bag-of-words cell, but with
# TF-IDF weights instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
vector = tfidf_vectorizer.fit_transform(sent)

# Convert to a dense array so the notebook can display the weights.
vector.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.27457096, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.46072591]])

Word2Vec

In [51]:
# Tokenize each sentence on its own: Word2Vec is trained on a list of token lists.
# The loop variable has a distinct name so it no longer shadows the `sent` list.
# Tokens keep their original case, so 'Time' and 'time' are separate vocabulary entries.
word_sent = [nltk.word_tokenize(sentence) for sentence in sent]

# Train on a single worker thread with an explicit seed so repeated runs give
# the same vectors; multi-threaded training introduces ordering jitter.
# (Reproducibility across interpreter launches also needs PYTHONHASHSEED set.)
# min_count=1 keeps every token, which is needed for a corpus this small.
model = Word2Vec(
    sentences=word_sent,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

In [54]:
# Show the learned embedding for the token 'Life' (lookup is case-sensitive).
print("Vector for 'Life':")
life_vector = model.wv['Life']
print(life_vector)

Vector for 'Life':
[ 0.00749461  0.00469684  0.00042672  0.00411817  0.00872112 -0.00306155
 -0.00356727  0.00388608 -0.00264242 -0.00872831 -0.00943048 -0.00800499
  0.00842612  0.00200898 -0.00024964 -0.00820785  0.00500091  0.00206593
 -0.00075786 -0.00039594 -0.00807215  0.00693964 -0.00354194 -0.00985959
  0.00723518 -0.00809275  0.00932327 -0.00603082  0.00229515 -0.00419751
 -0.00708455  0.00992666 -0.00425352 -0.00951669 -0.00857374  0.00468986
  0.00177289 -0.00755032 -0.00894102 -0.00599845  0.00730261 -0.00775689
 -0.00318389 -0.00228503  0.0087466  -0.00371988  0.00648413 -0.00776862
  0.0062576   0.00659328 -0.00814886  0.0074376   0.00116337 -0.00191012
  0.00716026 -0.00702595  0.00415031 -0.00925199 -0.00864383 -0.00613809
 -0.00046363  0.0047042  -0.00901595  0.0082234  -0.00647626 -0.00213581
  0.00756515  0.00180519  0.00512723  0.00425424  0.00248461 -0.00024039
 -0.00993977  0.00317779  0.00881517 -0.00057553 -0.00265445  0.00246928
 -0.00276921 -0.00760018  0.0044

In [55]:
# List the vocabulary entries closest to 'Life' with their similarity scores.
print("Words similar to 'Life':")
life_neighbours = model.wv.most_similar('Life')
print(life_neighbours)

Words similar to 'Life':
[('relentlessly', 0.39609459042549133), ('new', 0.2035699188709259), ('time', 0.19592858850955963), ('telling', 0.19441048800945282), ('hope', 0.179193377494812), ('who', 0.17819646000862122), ('Time', 0.1777038872241974), ('passes', 0.17428982257843018), ('cherish', 0.16685201227664948), ('work', 0.1576065570116043)]


Word embeddings using TensorFlow

In [60]:
# Toy vocabulary and a word -> integer-id lookup for it.
vocab = ["king", "queen", "women", "boy", "man", "girl"]
index_dict = {word: i for i, word in enumerate(vocab)}

# Seed for the embedding initializer so the printed vector is the same on
# every run; without it the weights are drawn afresh each time.
EMBEDDING_SEED = 42

# One trainable 6-dimensional vector per vocabulary entry. The initializer
# uses the same uniform [-0.05, 0.05] range as the layer's default, only seeded.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(vocab),
    output_dim=6,
    embeddings_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.05, maxval=0.05, seed=EMBEDDING_SEED
    ),
)

word = "king"
# Shape (1, 1): a batch holding one sequence of one token id.
word_id = np.array([[index_dict[word]]])

# Calling the layer looks up the row for that id; the result has shape (1, 1, 6).
# The weights are untrained, so the values carry no semantic meaning yet.
embedding_vector = embedding_layer(word_id)

print("Word:", word)
print("Embedding vector:\n", embedding_vector.numpy())

Word: king
Embedding vector:
 [[[ 0.03222283 -0.01064516  0.04271019  0.0376575   0.04491914
   -0.01523045]]]
