In [9]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import joblib

# Download NLTK data if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Load dataset
df = pd.read_csv("Amazon-Product-Reviews.csv")

# Drop missing values
df = df.dropna()

# Drop duplicates
df = df.drop_duplicates()

# Convert text to lower case
df["review_body"] = df["review_body"].str.lower()

# Remove HTML tags
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub('', text)
df['review_body'] = df['review_body'].apply(remove_html_tags)

# Remove punctuation
def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation))
df['review_body'] = df['review_body'].apply(remove_punc)

# Remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)
df['review_body'] = df['review_body'].apply(remove_stopwords)

# Remove emojis
def remove_emoji(text):
    emoji_pattern = re.compile("["
                             u"\U0001F600-\U0001F64F"  # emoticons
                             u"\U0001F300-\U0001F5FF"  # symbols, pictograph
                             u"\U0001F680-\U0001F6FF"  # transport and map symbol
                             u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                             u"\U00002702-\U000027B0"
                             u"\U00002FC2-\U0001F251"
                             "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
df['review_body'] = df['review_body'].apply(remove_emoji)

# Tokenization and stemming
def preprocess_text(text):
    lancaster_stemmer = LancasterStemmer()
    words = nltk.word_tokenize(text)
    stemmed_words = [lancaster_stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

df['Processed_review_body'] = df['review_body'].apply(preprocess_text)

# Add sentiment based on star_rating
def star_rating_sentiment(rating):
    return 1 if rating in [4, 5] else 0

df['star_rating_sentiment'] = df['star_rating'].apply(star_rating_sentiment)

# Combine sentiments
df['combined_sentiment'] = df.apply(lambda row: 1 if (row['sentiment'] == 1 and row['star_rating_sentiment'] == 1) else 0, axis=1)

# Split the data
X = df['Processed_review_body']
y = df['combined_sentiment']

# Vectorize processed text
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vect = vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

# Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with Naive Bayes: {accuracy}")

# Save the trained model and vectorizer
joblib.dump(model, 'naive_bayes_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')


[nltk_data] Downloading package stopwords to C:\Users\Chong Pei
[nltk_data]     Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Chong Pei
[nltk_data]     Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy with Naive Bayes: 0.8847276264591439


['tfidf_vectorizer.joblib']