In [1]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
kindle_reviews = pd.read_csv('all_kindle_review.csv')

# Select relevant columns
reviews = kindle_reviews[['reviewText', 'rating']]

# Create sentiment labels (1 for positive, 0 for negative)
reviews['Sentiment'] = reviews['rating'].apply(lambda x: 1 if x >= 4 else (0 if x <= 2 else None))

# Drop neutral reviews
reviews = reviews.dropna(subset=['Sentiment'])

# Reset the index
reviews_cleaned = reviews[['reviewText', 'Sentiment']].reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['Sentiment'] = reviews['rating'].apply(lambda x: 1 if x >= 4 else (0 if x <= 2 else None))


In [3]:
# Text Preprocessing Function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join(char for char in text if char not in string.punctuation)
    # Remove stop words (optional, requires nltk stopwords)
    return text

# Apply preprocessing to review text
reviews_cleaned['reviewText'] = reviews_cleaned['reviewText'].apply(preprocess_text)


In [4]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=2500)

# Transform the review text into numerical features
X = tfidf.fit_transform(reviews_cleaned['reviewText']).toarray()
y = reviews_cleaned['Sentiment']


In [5]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Train a Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train, y_train)


In [7]:
# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8625
Classification Report:
               precision    recall  f1-score   support

         0.0       0.90      0.75      0.82       811
         1.0       0.85      0.94      0.89      1189

    accuracy                           0.86      2000
   macro avg       0.87      0.84      0.85      2000
weighted avg       0.87      0.86      0.86      2000

