In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8-pastel')
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer
from wordcloud import WordCloud

# Importing the Dataset
The dataset contains Amazon US Book Reviews over the span of two decades from 1995 to present. This makes Amazon Customer Reviews a rich source of information for academic researchers in the fields of Natural Language Processing (NLP), Information Retrieval (IR), and Machine Learning (ML), amongst others.

My prime objective is to use a portion of the dataset to analyze recent reviews on books and cluster the customers based on sentiments.

In [None]:
# Read the tab-separated review dump; rows pandas cannot parse are skipped instead of aborting the load.
DATA_FILE = 'amazon_reviews_us_Books_v1_02.tsv'
df = pd.read_csv(DATA_FILE, sep='\t', on_bad_lines='skip')

# Basic Data Description
The following functions are carried out to get an overview of the dataset. Since there are very few missing values, the rows containing them are dropped.

In [None]:
# Peek at the first rows of the dataset
df.head()

In [None]:
# Draw five random rows for a look at the data that is not biased by file order
df.sample(5)

In [None]:
# Shape of the dataset as (rows, columns)
df.shape

In [None]:
# Summary statistics of the dataset
df.describe()

In [None]:
# Per-column information about the attributes
df.info()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Drop every row that contains a missing value
df = df.dropna()

In [None]:
# Confirm the drop: every per-column count should now be zero
df.isna().sum()

In [None]:
# Shape of the dataset after dropping rows with missing values
df.shape

# EDA
Conducting Exploratory Data Analysis on the dataset to derive insights. It has been observed that most customers have given the product a 5-star rating and that the ratio of good to bad ratings is quite high.

In [None]:
# Univariate analysis: number of reviews per star rating
df['star_rating'].value_counts()

In [None]:
# Horizontal bar chart of the number of reviews per star rating
sns.set_theme(style="darkgrid")
fig, ax = plt.subplots()
sns.countplot(data=df, y='star_rating', palette='dark', ax=ax)
plt.show()

In [None]:
# Pie chart of the share of reviews per star rating.
# The counts are computed once and reused for both the wedge sizes and the labels.
rating_counts = df['star_rating'].value_counts()
plt.pie(rating_counts, labels=rating_counts.index, autopct='%.0f%%')
plt.title('Share of reviews per star rating')
# plt.show() renders the figure without echoing the tuple returned by plt.pie.
plt.show()

In [None]:
# Parse review_date into datetimes, then derive the calendar year of each review
parsed_dates = pd.to_datetime(df["review_date"])
df["review_date"] = parsed_dates
df["review_year"] = parsed_dates.dt.year

In [None]:
# Bivariate analysis: mean vote count per star rating, then total votes plotted against review year
mean_votes_by_rating = df.groupby("star_rating")["total_votes"].mean()
print(mean_votes_by_rating)
sns.lineplot(x="review_year", y="total_votes", data=df)

#### Since the dataset is quite large and we are only interested in the more recent customer reviews, the dataset is subset to drop the oldest reviews (the filter below keeps reviews written from 1999 onwards).

#### It has been observed that most 5-star reviews have received only a few votes and may be misleading in our analysis; therefore, reviews with 8 or fewer total votes are dropped.

In [None]:
# Subset df to recent, well-voted reviews.
# Both thresholds are exclusive: a review is kept only if it has strictly more
# than MIN_TOTAL_VOTES votes and was written strictly after MIN_REVIEW_YEAR.
MIN_TOTAL_VOTES = 8.0
MIN_REVIEW_YEAR = 1998

# The year comparison has to be made on the column itself; comparing the column
# *name* with an integer ('review_year' > 1998) raises a TypeError.
recent_voted_mask = (df['total_votes'] > MIN_TOTAL_VOTES) & (df['review_year'] > MIN_REVIEW_YEAR)

# .copy() gives an independent frame so that later column assignments do not
# trigger pandas' SettingWithCopyWarning.
df = df[recent_voted_mask].copy()

#### After careful observation, I decided to build a 'review' column that combines the review headline and body with the overall star rating of the product and the total number of votes.

In [None]:
# Build one text field per row: "<stars> stars with <votes> votes. <headline>: <body>".
# The message is written as two adjacent f-strings inside parentheses because a
# quoted f-string literal cannot contain a raw line break (that is a SyntaxError).
df['review'] = df.apply(
    lambda row: (
        f"{row['star_rating']} stars with "
        f"{row['total_votes']} votes. {row['review_headline']}: {row['review_body']}"
    ),
    axis=1,
)

#### Preparing Review column through NLP techniques
In order to prepare the review column for analysis, we perform the following tasks
- Remove HTML tags
- Remove punctuations
- Remove stopwords
- Lemmatize
- represent the data as a frequency distribution of the words

We have used WordCloud to visualise the most frequently used words in both kinds of ratings [good = 3,4,5 and bad = 1,2]

In [None]:
# Remove HTML tags (e.g. <br />) from both text columns.
# 'review_body' is the column the later cleaning, word-cloud and TF-IDF steps read,
# so it has to be stripped too; otherwise tag names survive punctuation removal
# as ordinary tokens.
# Tags are replaced by a space rather than nothing so that the words on either
# side of a tag are not glued together.
html_tag_pattern = re.compile(r'<.*?>')
for text_column in ('review', 'review_body'):
    df[text_column] = df[text_column].apply(lambda x: html_tag_pattern.sub(' ', x))

In [None]:
# Strip every character that is neither a word character nor whitespace
df['review_body'] = df['review_body'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Lower-case the review text
df['review_body'] = df['review_body'].str.lower()

In [None]:
# Tokenise each review by splitting it on whitespace
df['review_body'] = df['review_body'].str.split()

In [None]:
# Remove stopwords
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The NLTK stopwords corpus is not installed in a fresh environment;
    # download it once and retry so the notebook survives Restart & Run All.
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Each review is a list of tokens at this point; keep only the non-stopword tokens.
df['review_body'] = df['review_body'].apply(lambda x: [word for word in x if word not in stop_words])

In [None]:
# Re-assemble each token list into a single space-separated string
df['review_body'] = df['review_body'].str.join(' ')

In [None]:
# Lemmatize every word of every review with NLTK's WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()
try:
    # The WordNet corpus is loaded lazily on first use; trigger that load here so a
    # missing corpus is detected before the whole column is processed.
    lemmatizer.lemmatize('books')
except LookupError:
    # Corpus not installed (fresh environment): fetch it once.
    nltk.download('wordnet', quiet=True)

df['review_body'] = df['review_body'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split())
)

In [None]:
# Inspect the first three processed reviews
df['review_body'].head(3)

In [None]:
# Word cloud of the most frequent words in good reviews (3, 4 and 5 stars).
# FreqDist is already imported in the notebook's import cell, so it is not re-imported here.
good_reviews = df.loc[df['star_rating'].isin([5.0, 4.0, 3.0]), 'review_body']
fdist = FreqDist(word for review in good_reviews for word in review.split())
wordcloud = WordCloud(width=800, height=400, random_state=21, max_font_size=110).generate_from_frequencies(fdist)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
# Title the figure so it can be told apart from the bad-review word cloud.
plt.title("Most frequent words in good reviews (3-5 stars)")
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the most frequent words in bad reviews (1 and 2 stars).
# FreqDist is already imported in the notebook's import cell, so it is not re-imported here.
bad_reviews = df.loc[df['star_rating'].isin([1.0, 2.0]), 'review_body']
fdist = FreqDist(word for review in bad_reviews for word in review.split())
wordcloud = WordCloud(width=800, height=400, random_state=21, max_font_size=110).generate_from_frequencies(fdist)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
# Title the figure so it can be told apart from the good-review word cloud.
plt.title("Most frequent words in bad reviews (1-2 stars)")
plt.axis("off")
plt.show()

# TF-IDF Vectorizer
TF-IDF transforms the customer reviews into numerical vectors. These vectors represent the importance of terms in each document, which can then be used in tasks like clustering, sentiment analysis, or classification. This transformation allows machine learning models to work effectively with textual data by providing them with a numerical representation of the content.

- Term Frequency (TF): Measures the frequency of a term in a document. A higher frequency indicates a term's significance within that document.
- Inverse Document Frequency (IDF): Measures how unique or rare a term is across all documents. Rare terms get higher scores, reducing the weight of common words that appear in most documents.

In [None]:
# Create a TF-IDF vectorizer limited to max_features=5000
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer to the processed review text and transform it into a TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(df['review_body'])

# Convert the TF-IDF matrix into a dense array (the PCA step below is fitted on this array)
tfidf_array = tfidf_matrix.toarray()

# Print the shape of the TF-IDF array
print(tfidf_array.shape)

# Principal Component Analysis
PCA is used to preprocess large textual data (e.g., vectors created using TF-IDF) by reducing its dimensionality. This simplifies the data, reduces computational load, and helps to enhance the performance of clustering algorithms applied to the transformed data.

In [None]:
# Create a PCA object with 200 components.
# random_state is pinned so that reruns give the same components when
# scikit-learn selects a randomized SVD solver for an input of this size.
pca = PCA(n_components=200, random_state=42)

# Fit the PCA object to the TF-IDF array and transform it into a lower-dimensional representation
pca_array = pca.fit_transform(tfidf_array)

# Print the shape of the PCA array
print(pca_array.shape)

# Standard Scaler Normalization
For clustering customer reviews or performing sentiment analysis, standardization ensures that all numerical features (here, the PCA components derived from the TF-IDF scores) are on the same scale, leading to more meaningful and accurate results.

In [None]:
# Standardise the PCA output with StandardScaler (fitted and applied in one step)
scaler = StandardScaler()
pca_array_scaled = scaler.fit_transform(pca_array)

# Clustering Algorithms

For clustering the customer reviews I have used three clustering algorithms, namely:
- KMeans Clustering
- Agglomerative Hierarchical Clustering
- DBSCAN (Density based Clustering Algorithm)

From the analysis, it has been estimated that DBSCAN has provided the most accurate results. 

In [None]:
def evaluate_clustering_quality(clustering_algorithm, X, n_clusters):
    """Fit a clustering estimator and score the resulting partition.

    Parameters
    ----------
    clustering_algorithm : estimator exposing ``fit_predict``
        Its ``n_clusters`` attribute is overwritten with ``n_clusters``
        before fitting, so the object passed in is modified.
    X : array-like
        Data to cluster; also the data the scores are computed on.
    n_clusters : int
        Number of clusters to request from the estimator.

    Returns
    -------
    tuple
        ``(silhouette score, Davies-Bouldin index)`` for the fitted labels.
    """
    setattr(clustering_algorithm, "n_clusters", n_clusters)
    labels = clustering_algorithm.fit_predict(X)
    return silhouette_score(X, labels), davies_bouldin_score(X, labels)

In [None]:
# Apply K-Means clustering with the Elbow Method.
# random_state is pinned so the elbow curve (and the elbow value read from it
# later in the notebook) is reproducible across runs.
kmeans = KMeans(random_state=42)
visualizer = KElbowVisualizer(kmeans, k=(2, 6))
visualizer.fit(pca_array_scaled)
# Render the elbow plot; the trailing semicolon suppresses the returned object's repr.
visualizer.show();

In [None]:
# Fit K-Means with a fixed number of clusters and record its quality scores.
# kmeans_n_clusters and kmeans_quality are read by the final comparison cell;
# without these definitions that cell fails with a NameError.
kmeans_n_clusters = 3
kmeans = KMeans(n_clusters=kmeans_n_clusters, random_state=42)
# evaluate_clustering_quality fits the model (via fit_predict) and returns
# (silhouette score, Davies-Bouldin index).
kmeans_quality = evaluate_clustering_quality(kmeans, pca_array_scaled, kmeans_n_clusters)

In [None]:
# Within-cluster sum of squares (inertia) for k = 1..10.
# A separate variable is used for the loop's models so the fitted `kmeans`
# from the previous cell is not overwritten.
cluster_counts = range(1, 11)
wcss = []
for k in cluster_counts:
    wcss_model = KMeans(n_clusters=k, init='k-means++', random_state=42)
    wcss_model.fit(pca_array_scaled)
    wcss.append(wcss_model.inertia_)

# Show the curve so the elbow can actually be read off.
fig, ax = plt.subplots()
ax.plot(list(cluster_counts), wcss, marker='o')
ax.set(title='Elbow curve (K-Means)', xlabel='Number of clusters', ylabel='WCSS (inertia)')
plt.show()

In [None]:
# Agglomerative clustering with the cluster count suggested by the elbow visualizer.
hierarchical_n_clusters = visualizer.elbow_value_
if hierarchical_n_clusters is None:
    # The visualizer reports None when it cannot locate an elbow; fall back to 3,
    # the cluster count this notebook uses for its fixed K-Means fit.
    hierarchical_n_clusters = 3
hierarchical = AgglomerativeClustering(n_clusters=hierarchical_n_clusters)
# Returns (silhouette score, Davies-Bouldin index).
hierarchical_quality = evaluate_clustering_quality(hierarchical, pca_array_scaled, hierarchical_n_clusters)

In [None]:
# Apply DBSCAN clustering with different epsilon values.
# Each entry of dbscan_results is (epsilon, silhouette score, Davies-Bouldin index).
dbscan_results = []
for epsilon in [0.1, 0.5, 1, 2, 5]:
    dbscan = DBSCAN(eps=epsilon)
    cluster_labels = dbscan.fit_predict(pca_array_scaled)

    # DBSCAN labels noise points -1; they are not a cluster, so leave them out of the scores.
    clustered = cluster_labels != -1
    n_found = np.unique(cluster_labels[clustered]).size

    # Both scores need at least two clusters; with fewer, silhouette_score raises ValueError.
    if n_found < 2:
        print(f"DBSCAN epsilon={epsilon}: {n_found} cluster(s) found, skipping quality scores")
        continue

    silhouette_avg = silhouette_score(pca_array_scaled[clustered], cluster_labels[clustered])
    db_index = davies_bouldin_score(pca_array_scaled[clustered], cluster_labels[clustered])
    dbscan_results.append((epsilon, silhouette_avg, db_index))

In [None]:
# Pick the best clustering run by Silhouette Score (higher is better) and report
# its Davies-Bouldin Index alongside.
# The K-Means / hierarchical results are (silhouette, db_index) pairs while the
# DBSCAN results are (epsilon, silhouette, db_index) triples, so every run is
# first normalised to (label, parameters, silhouette, db_index). This ensures the
# ranking compares the same metric for every algorithm.
candidates = [
    ("K-Means Clustering", f"n_clusters={kmeans_n_clusters}", kmeans_quality[0], kmeans_quality[1]),
    ("Hierarchical Clustering", f"n_clusters={hierarchical_n_clusters}", hierarchical_quality[0], hierarchical_quality[1]),
]
candidates += [
    ("DBSCAN Clustering", f"epsilon={eps_value}", eps_silhouette, eps_db_index)
    for eps_value, eps_silhouette, eps_db_index in dbscan_results
]

best_quality = max(candidates, key=lambda candidate: candidate[2])
best_name, best_params, best_silhouette, best_db_index = best_quality

print("Best Clustering Algorithm:")
print("----------------------------")
print(f"{best_name}: {best_params}, Silhouette Score={best_silhouette:.3f}, Davies-Bouldin Index={best_db_index:.3f}")

# Future Prospects

- Exploring other clustering algorithms such as HDBSCAN, GMM, Spectral and Mean-Shift Clustering
- Using derived results in sentiment analysis of the customers