## Load the Preocessed Dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("dataset\processed_reviews.csv")
df.dropna(subset=["Review_text"], inplace=True) ## Dealing with empty rows
df.head(5)

Unnamed: 0,Review_text,Rating
0,liked,1
1,bought phone amazon using samsung m30s couple ...,1
2,awesome book reasonable price must buy,1
3,good,1
4,book fine bad contains nice concepts nicely ex...,1


# Lemmatizer

The WordNet Lemmatizer in NLTK reduces words to their base or dictionary form, called a lemma. Unlike stemming, it uses the WordNet lexical database to ensure valid words.

## Usage
- Converts inflected forms (e.g., running → run, better → good).
- Requires a WordNet installation (`nltk.download('wordnet')`).

This is often used in text preprocessing for NLP tasks to normalize words.



In [4]:
## Lemmatizer
import nltk
from nltk.stem import  WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nirma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
## Custom Function taking a sentance and lemmatizing the words in it
def lemmatize_words(text):
    return "".join([lemmatizer.lemmatize(word) for word in text])

In [7]:
## Apply Lemmatizer
df["Review_text"] = df["Review_text"].apply(lambda x: lemmatize_words(x))

In [9]:
df.head(5)

Unnamed: 0,Review_text,Rating
0,liked,1
1,bought phone amazon using samsung m30s couple ...,1
2,awesome book reasonable price must buy,1
3,good,1
4,book fine bad contains nice concepts nicely ex...,1


In [11]:
## Train Test Split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test= train_test_split(df["Review_text"], df["Rating"],test_size=0.20 , random_state=42 )

# Bag-Of-Words(BOW) and Term Frequency - Inverse Document Frequency(TFIDF) Implementation

## Text Vectorization and Naive Bayes Classification

This code demonstrates how to preprocess text data using Bag of Words (BoW) and TF-IDF, and apply a Naive Bayes classifier for text classification.

## Process Overview

### Bag of Words (BoW) Representation
- `CountVectorizer` converts text data into a BoW representation.
- This creates a sparse matrix where each row corresponds to a document, and each column represents the count of a specific word.

### TF-IDF Representation
- `TfidfVectorizer` transforms text into TF-IDF features.
- It accounts for word frequency within a document and across all documents to assign importance to words.

### Naive Bayes Classifier
- A `GaussianNB` model is trained using the generated BoW and TF-IDF features.
- BoW and TF-IDF features are separately used to build classifiers.
- Memory errors are mitigated by reducing data size before training.



In [12]:
from sklearn.feature_extraction.text import  CountVectorizer

bow = CountVectorizer() # Initialize CountVectorizer for Bag-of-Words (BoW) representation
X_train_bow = bow.fit_transform(X_train).toarray() # Fit the BoW model to the training data and transform it into an array
X_test_bow = bow.transform(X_test).toarray() # Transform the test data using the BoW model into an array

In [13]:
from sklearn.feature_extraction.text import  TfidfVectorizer

tfidf = TfidfVectorizer() # Initialize TfidfVectorizer for TF-IDF representation
X_train_tfidf = tfidf.fit_transform(X_train).toarray() # Fit the TF-IDF model to the training data and transform it into an array
X_test_tfidf = tfidf.transform(X_test).toarray() # Transform the test data using the TF-IDF model into an array

In [14]:
from sklearn.naive_bayes import GaussianNB ## Reduced Size cuz of memory errors

# Train a Gaussian Naive Bayes model on a subset of the Bag-of-Words data (to avoid memory issues)
nb_model_bow = GaussianNB().fit(X_train_bow[:30000],y_train[:30000])
# Train a Gaussian Naive Bayes model on a subset of the TF-IDF data (to avoid memory issues)
nb_model_tfidf = GaussianNB().fit(X_train_tfidf[:30000], y_train[:30000])

## Evaluation and Metrics

In [15]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [16]:
y_pred_bow = nb_model_bow.predict(X_test_bow)
y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)

In [17]:
print("BOW accuracy: ", accuracy_score(y_test, y_pred_bow))
print("TFIDF accuracy: ", accuracy_score(y_test, y_pred_tfidf))

BOW accuracy:  0.33863655098507955
TFIDF accuracy:  0.34836369631522546


## Saving the Model

In [18]:
import pickle

In [24]:
# Save the model 
with open('artifacts/models/nb_model_bow.pkl', 'wb') as file:
    pickle.dump(nb_model_bow, file)

with open('artifacts/models/nb_model_tfidf.pkl', 'wb') as file:
    pickle.dump(nb_model_tfidf, file)


# Save the vectorizer 
with open('artifacts/vectorizer/bow_vectorizer.pkl', 'wb') as file:
    pickle.dump(bow, file)

with open('artifacts/vectorizer/tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)