In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from collections import defaultdict

## Data Preprocessing
- Tokenization: breaking a piece of text down into smaller units (tokens), such as words and punctuation marks
- Removing stopwords: filtering out very common English words such as "the", "that", and "a"

In [2]:
# Fetch the NLTK data packages this notebook relies on: the "punkt" tokenizer
# data and the "stopwords" corpus. Packages that are already installed are
# reported as up-to-date instead of being downloaded again.
# NOTE(review): newer NLTK releases may expect the "punkt_tab" resource for
# word_tokenize -- confirm against the installed NLTK version.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
# Sample sentence used to demonstrate tokenization and stopword removal.
# NOTE(review): "jummps" looks like a typo for "jumps"; it is left unchanged
# here so that the recorded output further down still matches.
sentence = "The crazy brown fox jummps over the lazy dog."


In [9]:
# Tokenize: split the sentence into word and punctuation tokens
# (the recorded output below shows the final "." kept as its own token)
words = word_tokenize(sentence)

In [10]:
# Get the list of stopwords in English; it is stored as a set so that the
# membership test in the filtering step is a hash lookup, not a list scan
stop_words = set(stopwords.words("english"))

In [11]:
# Drop every token whose lowercase form is an English stopword; the kept
# tokens retain their original casing and order
filtered_sentence = list(
    filter(lambda token: token.lower() not in stop_words, words)
)

In [12]:
# Join the kept tokens into a NEW variable instead of rebinding
# `filtered_sentence`. The token list stays intact, so this cell can be re-run
# safely; rebinding the name to a string would make a second run join that
# string character by character.
filtered_text = " ".join(filtered_sentence)
print("Original Sentence: ", sentence)
print("Filtered Sentence: ", filtered_text)

Original Sentence:  The crazy brown fox jummps over the lazy dog.
Filtered Sentence:  crazy brown fox jummps lazy dog .


## Creating a Vocabulary
- A vocabulary is the collection of unique words in a corpus of text; below, each word is also mapped to the number of times it occurs

In [13]:
# Toy corpus: ten short sentences used to build the word-frequency vocabulary
corpus = [
    "tokeize the words in this sentence",
    "remove the stopwords from the sentence",
    "this is a sample sentence",
    "this is another example",
    "this is a test sentence",
    "NLP is fun",
    "I love programming",
    "Python is great",
    "tokenization is the process of breaking down text into smaller pieces",
    "stopwords are common words that are usually ignored in text processing",
]

In [16]:
# Count how often each word occurs across the corpus.
# defaultdict(int) gives every unseen key a starting value of 0, so a count can
# be incremented without first checking whether the word is already present.
vocab = defaultdict(int)

# Lowercase each document and tokenize it with a regex (runs of word
# characters). The loop variables are deliberately NOT named `sentence` /
# `words`: those names hold the demo sentence and its tokens from the stopword
# section, and overwriting them here would make those earlier cells produce
# different results when re-run.
for document in corpus:
    for token in re.findall(r"\w+", document.lower()):
        vocab[token] += 1

# Convert to a regular dict ordered by descending frequency. The sort is
# stable, so words with equal counts keep their first-seen order.
sorted_vocab = dict(sorted(vocab.items(), key=lambda x: x[1], reverse=True))

print("Vocabulary: ", sorted_vocab)

Vocabulary:  {'is': 6, 'the': 4, 'this': 4, 'sentence': 4, 'words': 2, 'in': 2, 'stopwords': 2, 'a': 2, 'text': 2, 'are': 2, 'tokeize': 1, 'remove': 1, 'from': 1, 'sample': 1, 'another': 1, 'example': 1, 'test': 1, 'nlp': 1, 'fun': 1, 'i': 1, 'love': 1, 'programming': 1, 'python': 1, 'great': 1, 'tokenization': 1, 'process': 1, 'of': 1, 'breaking': 1, 'down': 1, 'into': 1, 'smaller': 1, 'pieces': 1, 'common': 1, 'that': 1, 'usually': 1, 'ignored': 1, 'processing': 1}


# Bag of Words Implementation

In [17]:
import string

In [18]:
# Re-declares the same ten-sentence toy corpus used in the vocabulary section
# (the contents are identical)
corpus = [
    "tokeize the words in this sentence",
    "remove the stopwords from the sentence",
    "this is a sample sentence",
    "this is another example",
    "this is a test sentence",
    "NLP is fun",
    "I love programming",
    "Python is great",
    "tokenization is the process of breaking down text into smaller pieces",
    "stopwords are common words that are usually ignored in text processing",
]

In [19]:
# Function to preprocess the text
def preprocess_text(text):
    """Turn a raw string into a list of lowercase, punctuation-free tokens.

    Args:
        text: The string to normalize.

    Returns:
        A list of tokens obtained by lowercasing ``text``, deleting every
        character listed in ``string.punctuation``, and splitting the result
        on whitespace. An empty or punctuation-only string yields ``[]``.
    """
    # Translation table that maps each ASCII punctuation character to "deleted"
    punctuation_remover = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)
    # split() with no arguments collapses runs of whitespace and never
    # produces empty tokens
    return cleaned.split()

In [20]:
# Run every sentence of the sample corpus through preprocess_text
processed_corpus = list(map(preprocess_text, corpus))
print("Processed Corpus: ", processed_corpus)

Processed Corpus:  [['tokeize', 'the', 'words', 'in', 'this', 'sentence'], ['remove', 'the', 'stopwords', 'from', 'the', 'sentence'], ['this', 'is', 'a', 'sample', 'sentence'], ['this', 'is', 'another', 'example'], ['this', 'is', 'a', 'test', 'sentence'], ['nlp', 'is', 'fun'], ['i', 'love', 'programming'], ['python', 'is', 'great'], ['tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'smaller', 'pieces'], ['stopwords', 'are', 'common', 'words', 'that', 'are', 'usually', 'ignored', 'in', 'text', 'processing']]
