**Student name:** PAV Limseng\
**Student ID:** e20211548
# TP3: Bag of Words

### 1. Implement a bag of words algorithm with Python

In [1]:
# The two toy documents used for the hand-written Bag of Words in section 1.
sentence1, sentence2 = (
    "Welcome to NLP Learning, Now Start Learning",
    "Learning is a good practice",
)

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from Utils import Utils
import string

In [3]:
# Step 1: lowercase each sentence and split it into tokens.
tokens1, tokens2 = (
    word_tokenize(text.lower()) for text in (sentence1, sentence2)
)

# Step 2: build one vocabulary shared by both token lists.
vocabulary = Utils.create_vocabulary([tokens1, tokens2])

# Step 3: drop English stop words and single punctuation characters
# from the vocabulary (the token lists themselves are left untouched).
punctuation = set(string.punctuation)
stop_words = set(nltk.corpus.stopwords.words('english'))
excluded_terms = stop_words | punctuation
filtered_vocabulary = [
    term for term in vocabulary if term not in excluded_terms
]

# Step 4: turn each token list into a vector over the filtered vocabulary.
bow_vector1, bow_vector2 = (
    Utils.create_bow_vector(doc_tokens, filtered_vocabulary)
    for doc_tokens in (tokens1, tokens2)
)

In [4]:
# Show every intermediate result of the manual pipeline, one per line.
print(
    tokens1,
    tokens2,
    vocabulary,
    filtered_vocabulary,
    bow_vector1,
    bow_vector2,
    sep="\n",
)

['welcome', 'to', 'nlp', 'learning', ',', 'now', 'start', 'learning']
['learning', 'is', 'a', 'good', 'practice']
[',', 'a', 'good', 'is', 'learning', 'nlp', 'now', 'practice', 'start', 'to', 'welcome']
['good', 'learning', 'nlp', 'practice', 'start', 'welcome']
[0, 2, 1, 0, 1, 1]
[1, 1, 0, 1, 0, 0]


### 2. Implement Bag of Words using SKLEARN

In [5]:
# Sample documents for the scikit-learn CountVectorizer demo.
# The leading space in the second sentence is kept exactly as given.
sentence1_1, sentence2_1 = (
    "This is a good job. I will not miss it for anything",
    " This is not good at all",
)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [7]:
# Count word occurrences per sentence, skipping scikit-learn's English stop words.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform([sentence1_1, sentence2_1])

# Present the sparse count matrix as a table with one column per learned word.
feature_names = vectorizer.get_feature_names_out()
dense_counts = X.toarray()
df = pd.DataFrame(dense_counts, columns=feature_names)
print(df)

   good  job  miss
0     1    1     1
1     1    0     0


### 3. Implement Bag of words using NLTK

In [8]:
# Step 1: Import necessary libraries from NLTK (already done at the beginning of the document)
# Step 2: Define the list of sample documents
sentences = [
    "I love natural language processing.",
    "Text classification is an important NLP task.",
    "NLTK provides useful tools for NLP.",
]
# Keep the individual names available as well.
sentence1_2, sentence2_2, sentence3_2 = sentences

In [9]:
# Step 3: Lowercase every document, then tokenise it into words
tokenized_sentences = [
    word_tokenize(lowered) for lowered in map(str.lower, sentences)
]

In [10]:
# Step 4: Remove stop words and punctuation from the tokens
stop_words = set(nltk.corpus.stopwords.words("english"))

# `str.isalnum()` is False for any token that contains a non-alphanumeric
# character, so punctuation tokens are already rejected by that test alone.
# The extra lookup in the `punctuation` set built in section 1 was redundant
# and is dropped, which removes this cell's dependency on that earlier variable.
filtered_sentences = [
    [word for word in tokens if word.isalnum() and word not in stop_words]
    for tokens in tokenized_sentences
]

In [11]:
# Step 5: Create a vocabulary by collecting all unique words from the processed documents.
# `Utils.create_vocabulary` is a project-local helper whose source is not part of this notebook.
# NOTE(review): the vocabulary printed in section 1 is sorted and de-duplicated, so this
# presumably returns a sorted list of unique tokens — confirm against Utils.
vocabulary_2 = Utils.create_vocabulary(filtered_sentences)

In [12]:
# Steps 6-7: build one BoW vector per filtered document over the shared
# vocabulary; the counting itself is delegated to Utils.create_bow_vector.
bow_vectors = [
    Utils.create_bow_vector(doc_tokens, vocabulary_2)
    for doc_tokens in filtered_sentences
]

In [13]:
# Step 8: Combine the vocabulary and the per-document vectors into a single
# word -> count mapping and print it under a heading.
bow_dict = Utils.bowvectors_to_dict(vocabulary_2, bow_vectors)
print('Bag of Words (BoW) representation:', bow_dict, sep='\n')

Bag of Words (BoW) representation:
{'classification': 1, 'important': 1, 'language': 1, 'love': 1, 'natural': 1, 'nlp': 2, 'nltk': 1, 'processing': 1, 'provides': 1, 'task': 1, 'text': 1, 'tools': 1, 'useful': 1}


### 4. Classify whether a movie review is positive or negative, using Bag of Words (from Sklearn) to pre-process the text and any model (RF, DT) for classification

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Load the IMDB review dataset from a CSV file next to the notebook.
imdb_csv_path = 'IMDB-Dataset.csv'
df = pd.read_csv(imdb_csv_path)

# Preview the first rows; as the cell's last expression it is rendered as a table.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [16]:
# Count missing values per column to confirm the dataset needs no imputation.
null_counts = df.isnull().sum()
print(null_counts)

review       0
sentiment    0
dtype: int64


In [17]:
# List the distinct sentiment labels present in the dataset.
sentiment_values = df['sentiment'].unique()
print(sentiment_values)

['positive' 'negative']


In [18]:
# Encode sentiment as integers. LabelEncoder assigns codes in sorted label
# order, so 0 = negative and 1 = positive.
le = LabelEncoder()
encoded_sentiment = le.fit_transform(df["sentiment"])
df["sentiment_label"] = encoded_sentiment

In [19]:
# 2. Train–Test Split
# Hold out 30% of the reviews for evaluation; the fixed seed makes the split repeatable.
review_texts = df["review"]
review_labels = df["sentiment_label"]
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    review_labels,
    test_size=0.3,
    random_state=42,
)

In [20]:
# 3. Bag-of-Words (CountVectorizer)
# English stop words are dropped via scikit-learn's built-in "english" list.
vectorizer = CountVectorizer(stop_words="english")
# Learn the vocabulary from the training reviews only, then reuse that fitted
# vocabulary to encode the test reviews, so the features never see test-set text.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [21]:
# 4A. Decision Tree Model
# `fit` returns the estimator itself, so construction and training are chained.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_vec, y_train)
dt_pred = dt.predict(X_test_vec)

# Per-class precision / recall / F1 on the held-out reviews, under a banner line.
dt_report = classification_report(y_test, dt_pred)
print("========== Decision Tree ==========", dt_report, sep="\n")

              precision    recall  f1-score   support

           0       0.72      0.74      0.73      7411
           1       0.74      0.72      0.73      7589

    accuracy                           0.73     15000
   macro avg       0.73      0.73      0.73     15000
weighted avg       0.73      0.73      0.73     15000



In [22]:
# 4B. Random Forest Model
# 200 trees over a high-dimensional bag-of-words matrix are slow to build on one
# core. n_jobs=-1 trains and predicts across all available cores; each tree's
# seed is still derived from random_state, so the fitted forest is unchanged.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train_vec, y_train)
rf_pred = rf.predict(X_test_vec)

# Per-class precision / recall / F1 on the held-out reviews.
print("========== Random Forest ==========")
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      7411
           1       0.87      0.87      0.87      7589

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000

