In [11]:
import spacy
import pandas as pd
import spacy
from textblob import TextBlob

# Load the English language model
nlp = spacy.load('en_core_web_sm')

In [12]:
# Load dataset and drop missing values
df = pd.read_csv('amazon_product_reviews.csv')
clean_data = df.dropna(subset=['reviews.text'])

# Select column for preprocessing
cleaned = clean_data[['reviews.text']].copy()
text = cleaned['reviews.text']
text.head()

0    I order 3 of them and one of the item is bad q...
1    Bulk is always the less expensive way to go fo...
2    Well they are not Duracell but for the price i...
3    Seem to work as well as name brand batteries a...
4    These batteries are very long lasting the pric...
Name: reviews.text, dtype: object

In [14]:
def preprocess_text(text):
    """
    Preprocesses the input text by removing stopwords and performing basic text cleaning.
    """
    # Convert text to lowercase and remove whitespace
    doc = nlp(text.lower().strip())
    
    # Iterate through words in doc to remove stopwords and punctuation and return as lemmatised tokens
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    
    # Join the tokens back into a string with white spaces
    return ' '.join(tokens)

In [15]:
# Apply preprocessing function to the reviews column of the dataframe
cleaned['processed.text'] = cleaned['reviews.text'].apply(preprocess_text)

In [16]:
cleaned.head()

Unnamed: 0,reviews.text,processed.text
0,I order 3 of them and one of the item is bad q...,order 3 item bad quality miss backup spring pc...
1,Bulk is always the less expensive way to go fo...,bulk expensive way product like
2,Well they are not Duracell but for the price i...,duracell price happy
3,Seem to work as well as name brand batteries a...,work brand battery well price
4,These batteries are very long lasting the pric...,battery long last price great


In [17]:
def analyse_sentiment(text):
    """
    Analyses sentiment of the input text using TextBlob
    """

    # Analyse sentiment with TextBlob
    blob = TextBlob(text)
    # Using the polarity attribute
    polarity = blob.polarity

    # Determine sentiment based on polarity score
    if polarity > 0:
        sentiment = 'positive'
    elif polarity < 0:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
        
    return sentiment

In [18]:
# Apply sentiment analysis to the preprocessed text column
cleaned['sentiment'] = cleaned['processed.text'].apply(analyse_sentiment)

print(cleaned[['processed.text', 'sentiment']])

                                          processed.text sentiment
0      order 3 item bad quality miss backup spring pc...  negative
1                        bulk expensive way product like  negative
2                                   duracell price happy  positive
3                          work brand battery well price   neutral
4                          battery long last price great  positive
...                                                  ...       ...
28327  get 2 8 yr old twin 11 yr old well perfect way...  positive
28328       buy niece christmas gift.she 9 year old love  positive
28329  nice light internet browsing keep email view v...  positive
28330  tablet absolutely want watch tv show movie che...  positive
28331  ninety dollar expectionation low good table go...  positive

[28332 rows x 2 columns]


In [19]:
# Testing model on a review
# Select a review
review = cleaned['reviews.text'][555]   # Index of review number

# Analyse sentiment of the given review
sentiment_label = analyse_sentiment(review)

# Print the sentiment analysis function result
print("__________Sentiment Analysis__________\n")
print(f"Review text: {review}\n")
print(f"Sentiment:", sentiment_label)

__________Sentiment Analysis__________

Review text: Batteries were packaged good and seem fine

Sentiment: positive


In [20]:
def similarity_analysis(text1, text2):
    """
    Analyses the similarity between two reviews texts using the similarity function.
    """
    # Process the texts using spacy
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    
    # Calculate the similarity between the two documents and return the score
    similarity_score = doc1.similarity(doc2)
    
    return similarity_score

In [21]:
# Select two reviews for comparison
review1 = cleaned['reviews.text'][0]  # Index of the first review
review2 = cleaned['reviews.text'][1]  # Index of the second review

# Calculate similarity between the selected reviews
similarity_score = similarity_analysis(review1, review2)

print(f"Similarity score: {similarity_score:.2f}")

Similarity score: 0.42


  similarity_score = doc1.similarity(doc2)
