In [1]:
## Notebook-wide imports and one-time setup.

## Basic Data Handling
import pandas as pd  ## Handling datasets (CSV, Excel, JSON, etc.)
import numpy as np  ## For numerical operations and arrays

## Text Preprocessing
import re   ## Regex operations for cleaning text
import string  ## For punctuation handling
import nltk  ## Natural Language Toolkit (stopwords, stemming, etc.)
from nltk.corpus import stopwords  ## To remove common, meaningless words
from nltk.stem import PorterStemmer ## For stemming
from nltk.stem import WordNetLemmatizer  ## For lemmatization

## Sklearn feature extraction (turns text into numeric features)
from sklearn.feature_extraction.text import CountVectorizer  ## Bag of Words
from sklearn.feature_extraction.text import TfidfVectorizer  ## TF-IDF model

## Data Splitting and Model Building
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  ## Classifier for text data
from sklearn.linear_model import LogisticRegression  ## Another good text classifier
from sklearn.svm import SVC  ## Support vector classifier

## Model Evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Progress bar for loops and pandas apply
from tqdm import tqdm  ## To visually track long operations
tqdm.pandas()  ## Enables progress_apply() in pandas

## Download NLTK resources (only needed once per environment)
# nltk.download('stopwords')
# nltk.download('wordnet')  ## for lemmatizer
# nltk.download('omw-1.4')

In [2]:
## Fetch the WordNet corpus needed by the lemmatizer; NLTK reports
## "already up-to-date" when the package is present locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
## Fetch the NLTK stopword lists backing `nltk.corpus.stopwords`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
import os
import sys

# Make the project root importable so the `src` package resolves.
# ".." is resolved against the current working directory, so this assumes the
# kernel was started inside Notebook/ (one level below the project root).
project_root = os.path.abspath("..")
# Guard against adding the same entry again when the cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)

# Must come after the sys.path update above, otherwise `src` is not found.
# NOTE(review): the recorded output shows an NLTK wordnet download log when
# this import runs - presumably triggered inside src.text_preprocessing; confirm.
from src.text_preprocessing import preprocess_corpus

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Load the review dataset and add a preprocessed text column
import os

# Data/ sits next to Notebook/, so step up one level from the working directory.
data_path = os.path.join("..", "Data", "data_cleaned.csv")
df = pd.read_csv(data_path)

print("Loaded:", data_path)
print("Shape:", df.shape)

# Column that holds the raw review text in this dataset.
text_column = "Review text"

# preprocess_corpus is imported from src.text_preprocessing; its result is
# stored next to the raw text so both versions can be compared.
df["cleaned_review"] = preprocess_corpus(df[text_column])

# Sanity check: first five raw/cleaned pairs printed as plain text.
print("\nSample cleaned texts:\n")
print(df[[text_column, "cleaned_review"]].head(5).to_string(index=False))

Loaded: ..\Data\data_cleaned.csv
Shape: (8510, 9)

Sample cleaned texts:

                                                                                                                                                                                                                                                                                                                                                                                         Review text                                                                                                                                                                         cleaned_review
                                                                                                                                                                                        Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Features: the preprocessed review text. Target: the `Ratings` column.
X, y = df["cleaned_review"], df["Ratings"]

# Hold out 20% of the rows for evaluation; stratify on the target so the
# split keeps the label proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Unigram + bigram TF-IDF capped at 5000 features. The vectorizer is fitted
# on the training split only and then applied unchanged to the test split.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# fit() returns the estimator itself, so construction and training can chain.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Evaluate on the held-out split.
# NOTE(review): the recorded report shows very low recall for ratings 2-4 and
# 0.98 for rating 5 - looks like class imbalance dominates; worth revisiting.
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6421856639247944

Classification report:
               precision    recall  f1-score   support

           1       0.59      0.54      0.56       153
           2       0.31      0.07      0.11        61
           3       0.56      0.08      0.14       123
           4       0.20      0.02      0.03       349
           5       0.66      0.98      0.79      1016

    accuracy                           0.64      1702
   macro avg       0.46      0.34      0.33      1702
weighted avg       0.54      0.64      0.54      1702


Confusion matrix:
 [[ 82   3   1   4  63]
 [ 24   4   0   1  32]
 [ 18   2  10   6  87]
 [ 10   1   2   6 330]
 [  4   3   5  13 991]]
