In [2]:
import nltk
# Fetch every NLTK resource this notebook relies on. nltk.download() reports
# "already up-to-date" and does no extra work when a package is present.
#   punkt / punkt_tab : sentence/word tokenizer models used by word_tokenize.
#                       Recent NLTK releases load "punkt_tab" instead of the
#                       pickled "punkt" models; both are requested so the
#                       notebook runs on old and new NLTK versions alike.
#   stopwords         : English stopword list.
#   wordnet / omw-1.4 : data used by WordNetLemmatizer.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
import pandas as pd


[nltk_data] Downloading package punkt to C:\Users\Pacific
[nltk_data]     BD\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Pacific
[nltk_data]     BD\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Pacific
[nltk_data]     BD\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Pacific
[nltk_data]     BD\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Toy corpus: three short English sentences, one document per list entry.
corpus = [
    "I love machine learning. Machine learning is fun!",
    "I love NLP and I love deep learning.",
    "NLP is about text processing and understanding language."
]
corpus  # display the raw documents


['I love machine learning. Machine learning is fun!',
 'I love NLP and I love deep learning.',
 'NLP is about text processing and understanding language.']

In [4]:
# English stopwords held in a set, so the membership test in preprocess() is a hash lookup.
stop_words = set(stopwords.words("english"))
# One WordNet lemmatizer instance, reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace every character that is not ``a``-``z`` or whitespace with a
         space (this drops digits, punctuation and non-ASCII letters);
      3. collapse whitespace runs to a single space and trim both ends;
      4. tokenize with ``word_tokenize``;
      5. discard tokens found in the module-level ``stop_words`` set;
      6. lemmatize each remaining token with the module-level ``lemmatizer``.

    Args:
        text: The raw document string.

    Returns:
        The list of processed tokens, in their original order.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s]", " ", lowered)
    normalized = re.sub(r"\s+", " ", letters_only).strip()

    # Stopwords are filtered out before lemmatization, so the check is made
    # against the surface form of each token.
    kept = (token for token in word_tokenize(normalized) if token not in stop_words)
    return [lemmatizer.lemmatize(token) for token in kept]

# Run the preprocessing pipeline over every document, preserving corpus order.
processed = list(map(preprocess, corpus))
processed


[['love', 'machine', 'learning', 'machine', 'learning', 'fun'],
 ['love', 'nlp', 'love', 'deep', 'learning'],
 ['nlp', 'text', 'processing', 'understanding', 'language']]

In [5]:
# Vocabulary: every distinct token across all documents, in alphabetical order
# so that column positions are deterministic.
unique_tokens = {token for doc_tokens in processed for token in doc_tokens}
vocab = sorted(unique_tokens)
vocab


['deep',
 'fun',
 'language',
 'learning',
 'love',
 'machine',
 'nlp',
 'processing',
 'text',
 'understanding']

In [6]:
# Map each vocabulary word to its position in `vocab`; that position is the
# word's column index in the Bag-of-Words vectors.
word2idx = dict(zip(vocab, range(len(vocab))))
word2idx


{'deep': 0,
 'fun': 1,
 'language': 2,
 'learning': 3,
 'love': 4,
 'machine': 5,
 'nlp': 6,
 'processing': 7,
 'text': 8,
 'understanding': 9}

In [7]:
def bow_vector(tokens, word2idx):
    """Build the Bag-of-Words count vector for one tokenized document.

    Args:
        tokens: Iterable of string tokens belonging to a single document.
        word2idx: Mapping from vocabulary word to its index in the vector.

    Returns:
        A list of ints of length ``len(word2idx)`` where position
        ``word2idx[w]`` holds the number of occurrences of ``w`` in ``tokens``.

    Tokens that are not in ``word2idx`` (out-of-vocabulary words, e.g. from a
    document that was not used to build the vocabulary) are ignored instead of
    raising ``KeyError``, so the function can vectorize unseen text.
    """
    vec = [0] * len(word2idx)
    for w in tokens:
        idx = word2idx.get(w)
        if idx is not None:   # skip out-of-vocabulary tokens
            vec[idx] += 1     # count frequency
    return vec

# One count vector per preprocessed document, all sharing the same vocabulary index.
bow_vectors = list(map(lambda doc: bow_vector(doc, word2idx), processed))
bow_vectors


[[0, 1, 0, 2, 1, 2, 0, 0, 0, 0],
 [1, 0, 0, 1, 2, 0, 1, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 1, 1, 1, 1]]

In [8]:
# Document-term matrix: one row per document (labelled D1, D2, ...), one
# column per vocabulary word.
doc_labels = [f"D{n}" for n in range(1, len(corpus) + 1)]
bow_df = pd.DataFrame(bow_vectors, columns=vocab, index=doc_labels)
bow_df


Unnamed: 0,deep,fun,language,learning,love,machine,nlp,processing,text,understanding
D1,0,1,0,2,1,2,0,0,0,0
D2,1,0,0,1,2,0,1,0,0,0
D3,0,0,1,0,0,0,1,1,1,1


In [9]:
# Show each document's tokens next to its count vector, separated by a rule.
separator = "-" * 60
for doc_number, (doc_tokens, counts) in enumerate(zip(processed, bow_vectors), start=1):
    print(f"Document D{doc_number}:")
    print("Tokens:", doc_tokens)
    print("BoW:", counts)
    print(separator)


Document D1:
Tokens: ['love', 'machine', 'learning', 'machine', 'learning', 'fun']
BoW: [0, 1, 0, 2, 1, 2, 0, 0, 0, 0]
------------------------------------------------------------
Document D2:
Tokens: ['love', 'nlp', 'love', 'deep', 'learning']
BoW: [1, 0, 0, 1, 2, 0, 1, 0, 0, 0]
------------------------------------------------------------
Document D3:
Tokens: ['nlp', 'text', 'processing', 'understanding', 'language']
BoW: [0, 0, 1, 0, 0, 0, 1, 1, 1, 1]
------------------------------------------------------------
