In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download NLTK resources.
# 'punkt' backs word_tokenize on older NLTK releases, while recent releases
# load the 'punkt_tab' package instead; fetching both keeps this cell runnable
# on a fresh environment regardless of the installed NLTK version.
# 'stopwords' provides the English stop-word list used during preprocessing.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Preprocessing function for product reviews with NaN handling

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache for the NLTK objects used by preprocess_text. Loading the
# stop-word list reads from disk, so doing it once (rather than once per row)
# matters when the function is applied to a whole DataFrame column.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, delete the characters in ``string.punctuation``,
    tokenize with ``word_tokenize``, drop English stop words, Porter-stem each
    remaining token, and join the result with single spaces.

    Args:
        text: The raw review. Any value is accepted; non-strings are converted
            with ``str()``.

    Returns:
        The processed text, or ``""`` when ``text`` is missing according to
        ``pd.isna`` (None, NaN, ``pd.NA``, ...). The result may also be ``""``
        when every token is punctuation or a stop word.
    """
    # Missing values short-circuit before any NLTK resource is touched.
    if pd.isna(text):
        return ""
    cleaned = str(text).lower().translate(_PUNCT_TABLE)
    stop_words, stemmer = _get_text_tools()
    return ' '.join(
        stemmer.stem(word)
        for word in word_tokenize(cleaned)
        if word not in stop_words
    )

# Load dataset
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so mixed-type columns are parsed consistently.
# NOTE(review): the saved output of this cell echoes this read_csv line, which
# looks like the tail of a pandas DtypeWarning - confirm it is gone after this change.
data = pd.read_csv('1429_1.csv', low_memory=False)

# Check for missing values
print(f"Missing reviews: {data['reviews.text'].isna().sum()}")

# Convert rating to sentiment. pd.cut bins are right-inclusive:
# (0, 2] -> negative, (2, 3] -> neutral, (3, 5] -> positive.
# Missing or out-of-range ratings fall in no bin and become NaN.
data['sentiment'] = pd.cut(data['reviews.rating'],
                           bins=[0, 2, 3, 5],
                           labels=['negative', 'neutral', 'positive'])

# Remove rows with NaN in sentiment (from invalid ratings).
# .copy() gives an independent frame, so the column assignment below never
# targets a slice of the frame that was loaded from disk.
data = data.dropna(subset=['sentiment']).copy()

# Check class distribution
print("\nClass distribution:")
print(data['sentiment'].value_counts())

# Preprocess text
data['processed_text'] = data['reviews.text'].apply(preprocess_text)

# Remove empty texts (missing reviews, or reviews that were only
# punctuation / stop words); they carry no features for the vectorizer.
data = data[data['processed_text'].str.len() > 0].copy()

# Verify no NaN values remain
print("\nNaN check after cleaning:")
print(data[['sentiment', 'processed_text']].isna().sum())

# Split data - stratified on sentiment so each class keeps the same share in
# the train and test partitions (the classes are heavily imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    data['processed_text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=data['sentiment']
)

# Create pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=10000,   # cap on vocabulary size
        ngram_range=(1, 2),   # unigrams and bigrams
        min_df=5,             # drop terms seen in fewer than 5 documents
        max_df=0.7            # drop terms seen in more than 70% of documents
    )),
    # multi_class is deliberately not passed: the argument is deprecated in
    # recent scikit-learn releases, and with the default solver a problem with
    # three classes is already fitted as a multinomial model.
    ('classifier', LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced'  # reweight classes inversely to their frequency
    ))
])

# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Example predictions
sample_reviews = [
    "This product is absolutely terrible. It broke after 2 days of use.",
    "The item works fine but could be better. It's just okay.",
    "I love this product! It exceeded all my expectations!",
    np.nan
]

processed_samples = [preprocess_text(review) for review in sample_reviews]

# Keep every raw review paired with its processed text and filter the pairs
# together. Filtering only the processed texts and then zipping against the
# full review list would attach predictions to the wrong reviews whenever an
# empty/missing review is not the last element.
scorable_samples = [
    (review, processed)
    for review, processed in zip(sample_reviews, processed_samples)
    if processed != ""
]

# Predict only when something is left to score; an empty object array keeps
# `predictions` array-like when every sample was empty.
if scorable_samples:
    predictions = pipeline.predict([processed for _, processed in scorable_samples])
else:
    predictions = np.array([], dtype=object)

for (review, _), pred in zip(scorable_samples, predictions):
    print(f"\nReview: {review}")
    print("Predicted sentiment:", pred)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ms2003\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ms2003\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  data = pd.read_csv('1429_1.csv')


Missing reviews: 1

Class distribution:
sentiment
positive    32316
neutral      1499
negative      812
Name: count, dtype: int64

NaN check after cleaning:
sentiment         0
processed_text    0
dtype: int64

Accuracy: 0.8548736462093863

Classification Report:
              precision    recall  f1-score   support

    negative       0.28      0.55      0.37       162
     neutral       0.16      0.39      0.23       300
    positive       0.98      0.88      0.93      6463

    accuracy                           0.85      6925
   macro avg       0.47      0.61      0.51      6925
weighted avg       0.92      0.85      0.88      6925


Confusion Matrix:
[[  89   46   27]
 [  66  118  116]
 [ 167  583 5713]]

Review: This product is absolutely terrible. It broke after 2 days of use.
Predicted sentiment: negative

Review: The item works fine but could be better. It's just okay.
Predicted sentiment: neutral

Review: I love this product! It exceeded all my expectations!
Predicted sentime