In [None]:
from datasets import load_dataset, load_dataset_builder

import re
import numpy as np
import pandas as pd

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stopwords = stopwords.words("portuguese")
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from gensim import corpora
from gensim.models import LdaModel
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show the description text stored in the dataset builder's info.
load_dataset_builder("ruanchaves/b2w-reviews01").info.description

In [None]:
# Load the "train" split of the b2w-reviews01 dataset.
dataset = load_dataset("ruanchaves/b2w-reviews01", split="train")

In [None]:
# Display the loaded dataset object.
dataset

In [None]:
# Display the feature definitions of the dataset.
dataset.features

## Exploratory Data Analysis

In [None]:
# Distinct level-1 site categories, with every value cast to a string first.
np.unique(np.asarray(dataset["site_category_lv1"], dtype=str))

In [None]:
# Distinct level-2 site categories, with every value cast to a string first.
np.unique(np.asarray(dataset["site_category_lv2"], dtype=str))

In [None]:
# Distinct overall ratings (as 8-bit integers) together with how often each occurs.
np.unique(np.asarray(dataset["overall_rating"], dtype=np.int8), return_counts=True)

In [None]:
# Distinct "recommend_to_a_friend" values (cast to strings) with their frequencies.
np.unique(np.asarray(dataset["recommend_to_a_friend"], dtype=str), return_counts=True)

In [None]:
# Count the reviews per reviewer state; `states` is the (values, counts) pair
# returned by np.unique, and `reviews_by_states` maps each state to its count.
states = np.unique(np.asarray(dataset["reviewer_state"], dtype=str), return_counts=True)
state_names, state_counts = states
reviews_by_states = dict(zip(state_names, state_counts))

reviews_by_states

Some relations will be visualized considering the following features:
- Reviews overall rating
- Whether the reviewer would recommend the product to someone else

In [None]:
def plot_reviews(data, relation_column, figsize=(10, 30), hue=None):
    """
    Draw one count plot per overall rating (1 to 5), stacked vertically.

    Parameters
    ----------
    data
        Dataset object that supports ``filter(callable)`` and column access
        by name; each row must expose an ``"overall_rating"`` value.
    relation_column : str
        Name of the column whose values are counted on the x axis.
    figsize : tuple, default (10, 30)
        Size of the whole figure, forwarded to ``plt.subplots``.
    hue : str, optional
        Name of a second column used to split every bar by category.

    Notes
    -----
    Values of ``relation_column`` (and of ``hue``) are cast to strings once
    and the same array is used both for the plotted data and for the
    category order, so the bars and their ordering always agree. A missing
    value therefore appears as the category ``"None"``.
    """
    ratings = range(1, 6)
    _, axes = plt.subplots(len(ratings), figsize=figsize)

    for ax, rating in zip(axes, ratings):
        reviews = data.filter(lambda x: x["overall_rating"] == rating)

        x_values = np.array(reviews[relation_column], dtype="str")
        x_order = sorted(np.unique(x_values))

        if hue is None:
            sns.countplot(x=x_values, ax=ax, order=x_order)
        else:
            hue_values = np.array(reviews[hue], dtype="str")
            sns.countplot(
                x=x_values,
                hue=hue_values,
                ax=ax,
                order=x_order,
                hue_order=sorted(np.unique(hue_values)),
            )

        ax.set_title(f"Number of {float(rating)} ratings by {relation_column}")
        ax.set_xlabel(relation_column)
        # Rotate after plotting so the rotation applies to the category labels
        # that the count plot has just placed on the axis.
        ax.tick_params(axis="x", labelrotation=90)

    plt.show()

### Reviews and recommendations according to the states

In [None]:
plot_reviews(dataset, "reviewer_state")

In [None]:
plot_reviews(dataset, "reviewer_state", hue="recommend_to_a_friend")

### Reviews and recommendations according to the category (lv 1)

In [None]:
plot_reviews(dataset, "site_category_lv1", figsize=(10, 90))

In [None]:
plot_reviews(dataset, "site_category_lv1", figsize=(10, 90), hue="recommend_to_a_friend")

With this overview, we can move on to analyse the content of reviews

## Topic modelling

In [None]:
# Built once at definition time instead of once per review.
_PT_STEMMER = SnowballStemmer("portuguese")
_PT_STOPWORDS = frozenset(stopwords)
# Runs of Unicode letters only: no digits, underscores, punctuation or whitespace.
_LETTERS_PATTERN = re.compile(r"[^\W\d_]+")


def tokenize(ds):
    """
    Add a ``"tokens_list"`` entry with the stemmed tokens of ``ds["review_text"]``.

    The text is lower-cased and split into runs of letters. Portuguese
    stopwords are removed *before* stemming (the stopword list holds
    unstemmed words), and the remaining words are reduced with the
    Portuguese Snowball stemmer.

    Parameters
    ----------
    ds : dict-like
        A single record with a ``"review_text"`` key (a string or ``None``).

    Returns
    -------
    The same record, updated in place with the new ``"tokens_list"`` key.
    When the review text is ``None`` the list is ``["None"]``, which keeps
    the placeholder token used for reviews without text.
    """
    text = ds["review_text"]

    if text is None:
        word_list = ["None"]
    else:
        word_list = [
            _PT_STEMMER.stem(word)
            for word in _LETTERS_PATTERN.findall(text.lower())
            if word not in _PT_STOPWORDS
        ]

    ds["tokens_list"] = word_list
    return ds

In [None]:
# Tokenize every review, then build the gensim dictionary and the
# per-review bag-of-words representation from the token lists.
tokenized_dataset = dataset.map(tokenize)
texts = tokenized_dataset["tokens_list"]

dictionary = corpora.Dictionary(texts)

corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Preview only the first few token lists; displaying every review floods the output.
texts[:5]

In [None]:
# Baseline LDA model. The seed is fixed so that the topics (and everything
# derived from them below) are the same on every run.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=25,
    id2word=dictionary,
    alpha="auto",
    passes=1,  # a single pass, just for initial observation
    random_state=42,
)

In [None]:
# print_topics yields (topic_id, description) pairs; label each line with the
# model's own topic id rather than its position in the returned list, so the
# numbers match the topic ids used elsewhere.
for topic_id, topic_words in lda_model.print_topics(5):
    print(f"topic {topic_id}: {topic_words}")

Compute the topic probabilities for each review

In [None]:
# Build one column of probabilities per topic, one row per review.
topic_columns = [f"topic_{topic_id}" for topic_id in range(lda_model.num_topics)]
topic_dict = {column: [] for column in topic_columns}

for doc_corpus in corpus:
    # The model may omit topics with a negligible probability even when the
    # minimum is 0; looking every topic up with a 0.0 fallback keeps all
    # columns the same length.
    doc_topics = dict(lda_model.get_document_topics(doc_corpus, minimum_probability=0))

    for topic_id, column in enumerate(topic_columns):
        topic_dict[column].append(doc_topics.get(topic_id, 0.0))

topic_df = pd.DataFrame(topic_dict)
topic_df.head()

How strong is the propensity of the main topic of each review?

In [None]:
def find_topic(row, threshold=0.50):
    """
    Get the label of the first topic whose propensity is greater
    than ``threshold``.

    Parameters
    ----------
    row : pandas.Series
        Propensities indexed by topic label.
    threshold : float, default 0.50
        Exclusive lower bound a propensity must exceed.

    Returns
    -------
    The index label of the first value above ``threshold``, or ``None``
    when no value exceeds it.
    """
    targ_row = row[row > threshold]

    if targ_row.empty:
        return None

    # Take the label from the filtered row, not from the full row: the full
    # row's first label is always the first column, whatever its value.
    return targ_row.index[0]
    
def find_propensity(row, threshold=0.50):
    """
    Get the propensity of the first topic whose propensity is greater
    than ``threshold``.

    Parameters
    ----------
    row : pandas.Series
        Propensities indexed by topic label.
    threshold : float, default 0.50
        Exclusive lower bound a propensity must exceed.

    Returns
    -------
    The first value above ``threshold``, or ``None`` when no value
    exceeds it.
    """
    targ_row = row[row > threshold]

    # Test for emptiness explicitly; a truthiness test on the values would
    # treat a selected 0.0 as "nothing found" for thresholds below zero.
    if targ_row.empty:
        return None

    return targ_row.values[0]

In [None]:
# Select the per-topic probability columns by name ("topic_<id>") rather than
# by position, so the derived "topic" and "propensity" columns are never fed
# back into the row functions and the cell can be re-run safely.
topic_columns = [col for col in topic_df.columns if str(col).startswith("topic_")]

topic_df["topic"] = topic_df[topic_columns].apply(find_topic, axis=1)
topic_df["propensity"] = topic_df[topic_columns].apply(find_propensity, axis=1)

In [None]:
# Display the topic table, now including the "topic" and "propensity" columns.
topic_df

In [None]:
# Count of reviews per value of the "topic" column.
_, ax = plt.subplots()
sns.countplot(x="topic", data=topic_df, ax=ax);

Elbow method using the KMeans inertia to find the optimal number of clusters

In [None]:
# Elbow method: fit KMeans for k = 1..24 on the per-topic probabilities and
# record the inertia (sum of squared distances) of each fit.
# The feature columns are chosen by name ("topic_<id>") so that derived
# columns such as "topic" and "propensity" are excluded wherever they sit.
topic_features = topic_df[[col for col in topic_df.columns if str(col).startswith("topic_")]]
cluster_counts = range(1, 25)

sum_squared_dist = []

for k in cluster_counts:
    # Fixed seed so the inertia curve is identical across runs.
    km = KMeans(n_clusters=k, n_init='auto', random_state=42)
    km = km.fit(topic_features)
    sum_squared_dist.append(km.inertia_)

plt.plot(cluster_counts, sum_squared_dist, 'bx-')
plt.xlabel("Number of clusters")
plt.ylabel("Sum of squared distance")
plt.show()