In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Load dataset
df = pd.read_csv("C:\\Users\\maths\\Downloads\\Womens Clothing E-Commerce Reviews.csv")

# Drop missing values
df.dropna(inplace=True)

# Remove short reviews (<5 words)
df = df[df["Review Text"].str.split().str.len() >= 5]

# Text cleaning: Expand contractions, remove special chars, convert to lowercase
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text.lower())  # Remove special characters
    return " ".join([word for word in text.split() if word not in stop_words])

df["Review Text"] = df["Review Text"].apply(clean_text)

# Feature & Label Separation
X = df["Review Text"]
y = df["Recommended IND"]

# Train-Test Split (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text into numerical form (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Naïve Bayes Model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
"C:\\Users\\maths\\Downloads\\Womens Clothing E-Commerce Reviews.csv"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maths\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Accuracy: 0.8414354797658438
              precision    recall  f1-score   support

           0       0.89      0.15      0.26       718
           1       0.84      1.00      0.91      3211

    accuracy                           0.84      3929
   macro avg       0.86      0.57      0.59      3929
weighted avg       0.85      0.84      0.79      3929



'C:\\Users\\maths\\Downloads\\Womens Clothing E-Commerce Reviews.csv'