In [1]:
import pandas as pd

In [2]:
# Load the dataset
data = pd.read_csv('Reviews.csv', nrows=1000)

# Preview the head of the data
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# 1.0 Data Preprocessing 

In [3]:
# Select Score and Text column only
data = data[['Score', 'Text']]

In [4]:
# Find the number of duplicated rows
duplicate_text = data.duplicated()
print(duplicate_text.sum())

3


In [5]:
# Remove the duplicated rows except the first one
data = data.drop_duplicates(keep='first')
data.shape

(997, 2)

In [6]:
# Check any rows available
print(data) 

     Score                                               Text
0        5  I have bought several of the Vitality canned d...
1        1  Product arrived labeled as Jumbo Salted Peanut...
2        4  This is a confection that has been around a fe...
3        2  If you are looking for the secret ingredient i...
4        5  Great taffy at a great price.  There was a wid...
..     ...                                                ...
995      5  BLACK MARKET HOT SAUCE IS WONDERFUL.... My hus...
996      5  Man what can i say, this salsa is the bomb!! i...
997      5  this sauce is so good with just about anything...
998      1  Not hot at all. Like the other low star review...
999      2  I have to admit, I was a sucker for the large ...

[997 rows x 2 columns]


## 1.1 Cleaning & Standardization

In [7]:
import re

def clean_text(text):
    # Remove HTML tags
    cleaned_text = re.sub(r'<.*?>', '', text)
    
    # Remove URLs
    cleaned_text = re.sub(r'http\S+', '', cleaned_text)
    
    # Remove special characters and digits
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', cleaned_text)
    
    # Convert to lowercase
    cleaned_text = cleaned_text.lower()
    
    # Remove extra whitespaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    
    return cleaned_text.strip()

# Apply the clean_text function to the 'Text' column
data['Text'] = data['Text'].apply(clean_text)

# View the cleaned text data
data.head()


Unnamed: 0,Score,Text
0,5,i have bought several of the vitality canned d...
1,1,product arrived labeled as jumbo salted peanut...
2,4,this is a confection that has been around a fe...
3,2,if you are looking for the secret ingredient i...
4,5,great taffy at a great price there was a wide ...


In [8]:
# Check for missing values
data.isnull().sum()

Score    0
Text     0
dtype: int64

In [9]:
# Select relevant columns for sentiment analysis
data = data[['Score', 'Text']]

## 1.2 Tokenization & Stopword Removal

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens

# Tokenize the text into individual words
data['Tokens'] = data['Text'].apply(tokenize_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1.3 Lemmatization 

In [11]:
# Download the WordNet lemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize the tokens
data['Tokens'] = data['Tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [13]:
# Join the tokens back into sentences
data['Preprocessed_Text'] = data['Tokens'].apply(lambda x: ' '.join(x))

# Save the preprocessed data to a new CSV file
data.to_csv('processed.csv', index=False)

# Preview the preprocessed data
print(data)

     Score                                               Text  \
0        5  i have bought several of the vitality canned d...   
1        1  product arrived labeled as jumbo salted peanut...   
2        4  this is a confection that has been around a fe...   
3        2  if you are looking for the secret ingredient i...   
4        5  great taffy at a great price there was a wide ...   
..     ...                                                ...   
995      5  black market hot sauce is wonderful my husband...   
996      5  man what can i say this salsa is the bomb i ha...   
997      5  this sauce is so good with just about anything...   
998      1  not hot at all like the other low star reviewe...   
999      2  i have to admit i was a sucker for the large q...   

                                                Tokens  \
0    [bought, several, vitality, canned, dog, food,...   
1    [product, arrived, labeled, jumbo, salted, pea...   
2    [confection, around, century, light, pil

# 2.0 Feature Extraction

## 2.1 Bag of Words

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the preprocessed text data
bow_features = vectorizer.fit_transform(data['Preprocessed_Text'])

# Get the vocabulary (unique words)
vocabulary = vectorizer.get_feature_names_out()

# Print the shape of the BoW features and the vocabulary size
print("Bag of Words (BoW) feature shape:", bow_features.shape)
print("Vocabulary size:", len(vocabulary))


Bag of Words (BoW) feature shape: (997, 5937)
Vocabulary size: 5937


## 2.2 TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text data
tfidf_features = tfidf_vectorizer.fit_transform(data['Preprocessed_Text'])

# Get the vocabulary (unique words)
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()

# Print the shape of the TF-IDF features and the vocabulary size
print("TF-IDF feature shape:", tfidf_features.shape)
print("Vocabulary size:", len(tfidf_vocabulary))


TF-IDF feature shape: (997, 5937)
Vocabulary size: 5937


# 3.0 Model Selection

In [16]:
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

## 3.1 Lexicon-based Approach

In [17]:
# Download the VADER lexicon
nltk.download('vader_lexicon')

# Define a function to assign sentiment labels based on the 'Score' column
def assign_sentiment(score):
    
    if score >= 4:
        return 'Positive'
    elif score <= 2:
        return 'Negative'
    else:
        return 'Neutral'

# Assign sentiment labels based on the 'Score' column
data['Sentiment'] = data['Score'].apply(assign_sentiment)

# Initialize the sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Calculate sentiment scores for each review
data['Lexicon_Sentiment'] = data['Preprocessed_Text'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Map sentiment scores to labels
data['Lexicon_Sentiment_Label'] = data['Lexicon_Sentiment'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Neutral'))

# Evaluate the lexicon-based approach
lexicon_accuracy = accuracy_score(data['Sentiment'], data['Lexicon_Sentiment_Label'])
print("Accuracy of the Lexicon-based Approach:", lexicon_accuracy)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Accuracy of the Lexicon-based Approach: 0.8004012036108324


## 3.2 Machine-Learning based Approach

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Preprocessed_Text'], data['Sentiment'], test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# 4.0 Model Evaluation

In [20]:
# Train and evaluate Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
nb_predictions = nb_classifier.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, nb_predictions)
print("Naive Bayes Classifier Accuracy:", nb_accuracy)
print(classification_report(y_test, nb_predictions))

# Train and evaluate SVM classifier
svm_classifier = LinearSVC()
svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print("Support Vector Machine (SVM) Classifier Accuracy:", svm_accuracy)
print(classification_report(y_test, svm_predictions))

Naive Bayes Classifier Accuracy: 0.77
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00        29
     Neutral       0.00      0.00      0.00        17
    Positive       0.77      1.00      0.87       154

    accuracy                           0.77       200
   macro avg       0.26      0.33      0.29       200
weighted avg       0.59      0.77      0.67       200

Support Vector Machine (SVM) Classifier Accuracy: 0.795
              precision    recall  f1-score   support

    Negative       0.70      0.24      0.36        29
     Neutral       1.00      0.06      0.11        17
    Positive       0.80      0.98      0.88       154

    accuracy                           0.80       200
   macro avg       0.83      0.43      0.45       200
weighted avg       0.80      0.80      0.74       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 5.0 Discussion

### In a general sense, the strengths and weaknesses of the each model for sentiment classification are as follows:

## 5.1 Naive Bayes
### Strengths:
### a) Naive Bayes classifiers are straightforward, convenient and easy to use.
### b) Only need a minimal amount of training time, particularly when working with big datasets.
### c) Due to its independence assumption, Naive Bayes is highly suitable and adaptable to handle irrelevant features.

### Weaknesses:
### a) Complicated feature relationships and associationsÂ may be overlooked by NB classifier.
### b) The NB classifier makes the assumption that features are independent, which may not always be the case with data from the actual world.
### c) The NB classifier gives zero probability to a category and a feature that do not appear together in the training data, which might cause unexpected problems or issues to arise.

## 5.2 SVM 
### Strengths:
### a) SVM has a high accuracy rate, particularly in spaces with several dimensions.
### b) SVM is efficient in high-dimensional spaces when the number of dimensions exceeds the number of samples.
### c) SVM provide adaptability in selecting various kernel functions to represent intricate relationships.
### Weaknesses:
### a) SVM training may be computationally costly and intensive, particularly when dealing with massive datasets.
### b) SVM sensitivity to noise in the data might result in overfitting if improper regularization is not done.
### c) SVMs has low interpretability since it offers less information about the causes of predictions.

## 5.3 Comparison

### a) Based on the results, SVM has a marginal superiority over Naive Bayes in terms of accuracy, with a score of 0.795 compared to Naive Bayes' score of 0.77. 
### b) The simplicity of Naive Bayes makes it more interpretable when compared with SVM.
### c) Computationally speaking, Naive Bayes is more efficient than SVM.
### d) In terms of robustness, Naive Bayes exhibits more adaptability to irrelevant features, whereas SVM may have difficulties in handling noise present in the data.