In [1]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Analysis of Reviews on Olist

🎯 Now that you are familiar with NLP, let's analyze the reviews of Olist.

👇 Run the following cell to load the reviews dataset and install `unidecode`

In [2]:
!pip install -q unidecode

import pandas as pd

url = "https://wagon-public-datasets.s3.amazonaws.com/Machine%20Learning%20Datasets/ml_olist_nlp_reviews.csv"
df = pd.read_csv(url, low_memory = False)

df.head()



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Unnamed: 0.1,Unnamed: 0,review_id,length_review,review_score,order_id,product_category_name,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,0,7bc2406110b926393aa56f80a40eba40,0,4,73fc7af87114b39712e6da79b0a377eb,esporte_lazer,,,2018-01-18 00:00:00,2018-01-18 21:46:59,41dcb106f807e993532d446263290104,delivered,2018-01-11 15:30:49,2018-01-11 15:47:59,2018-01-12 21:57:22,2018-01-17 18:42:41,2018-02-02 00:00:00
1,1,80e641a11e56f04c1ad469d5645fdfde,0,5,a548910a1c6147796b98fdf73dbeba33,informatica_acessorios,,,2018-03-10 00:00:00,2018-03-11 03:05:13,8a2e7ef9053dea531e4dc76bd6d853e6,delivered,2018-02-28 12:25:19,2018-02-28 12:48:39,2018-03-02 19:08:15,2018-03-09 23:17:20,2018-03-14 00:00:00
2,2,228ce5500dc1d8e020d8d1322874b6f0,0,5,f9e4b658b201a9f2ecdecbb34bed034b,informatica_acessorios,,,2018-02-17 00:00:00,2018-02-18 14:36:24,e226dfed6544df5b7b87a48208690feb,delivered,2018-02-03 09:56:22,2018-02-03 10:33:41,2018-02-06 16:18:28,2018-02-16 17:28:48,2018-03-09 00:00:00
3,3,e64fb393e7b32834bb789ff8bb30750e,37,5,658677c97b385a9be170737859d3511b,ferramentas_jardim,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,de6dff97e5f1ba84a3cd9a3bc97df5f6,delivered,2017-04-09 17:41:13,2017-04-09 17:55:19,2017-04-10 14:24:47,2017-04-20 09:08:35,2017-05-10 00:00:00
4,4,f7c4243c7fe1938f181bec41a392bdeb,100,5,8e6bfb81e283fa7e4f11123a3fb894f1,esporte_lazer,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,5986b333ca0d44534a156a52a8e33a83,delivered,2018-02-10 10:59:03,2018-02-10 15:48:21,2018-02-15 19:36:14,2018-02-28 16:33:35,2018-03-09 00:00:00


In [3]:
df.shape

(98657, 17)

❓ **Question: Analyse the reviews to understand what could be the causes of the bad review scores** ❓

This challenge is not as guided as the previous ones. But here are some questions to ask yourself:

- Are all the reviews relevant ? 
- What about combining the title and the body of a review ?
- What cleaning operations would you apply to the reviews ?

🇧🇷 Some Brazilian expressions and their translations:

- `producto errado` = wrong product
- `ainda nao` = not yet
- `nao entregue` = not delivered
- `nao veio` = did not come
- `nao gostei` = did not like it
- `produto defeito` = defective product
- `nao functiona` = not working
- `produto diferente` = different product
- `pessima qualidade` = poor quality
- `veio defeito` = came defect
- `veio faltando` = came missing
- `veio errado` = came wrong

In [4]:
# Selecting only the columns that make sence
df = df[['review_score', 'review_comment_title', 'review_comment_message']]
df.head()

Unnamed: 0,review_score,review_comment_title,review_comment_message
0,4,,
1,5,,
2,5,,
3,5,,Recebi bem antes do prazo estipulado.
4,5,,Parabéns lojas lannister adorei comprar pela I...


In [5]:
# Selecting only the bad reviews (under 3)
df = df[df['review_score'] <= 3]

In [6]:
df.head()

Unnamed: 0,review_score,review_comment_title,review_comment_message
5,1,,
14,3,,
16,2,,"GOSTARIA DE SABER O QUE HOUVE, SEMPRE RECEBI E..."
18,3,,
19,1,Não chegou meu produto,Péssimo


In [None]:
# concat review title and review comment
def concat_comments(row):
    title = row['review_comment_title']
    message = row['review_comment_message']

    if pd.isna(title) and pd.isna(message):
        return np.nan
    else:
        return f"{'' if pd.isna(title) else title} {' ' if not pd.isna(title) and not pd.isna(message) else ''}{'' if pd.isna(message) else message}"

# Création d'une nouvelle colonne avec le résultat
df['review_full_comment'] = df.apply(concat_comments, axis=1)

In [8]:
df = df[['review_full_comment']]

In [9]:
df.dropna(inplace=True)

In [10]:
df.head()

Unnamed: 0,review_full_comment
16,"GOSTARIA DE SABER O QUE HOUVE, SEMPRE RECEBI ..."
19,Não chegou meu produto Péssimo
29,Não gostei ! Comprei gato por lebre
34,Sempre compro pela Internet e a entrega ocorr...
41,Nada de chegar o meu pedido.


In [11]:
# Cleaning the review comment
def cleaning(sentence):

    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers

    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation

    tokenized_sentence = word_tokenize(sentence) ## tokenize

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v")
        for word in tokenized_sentence
    ]

    cleaned_sentence = ' '.join(word for word in lemmatized)

    return cleaned_sentence

In [12]:
df['review_full_comment_clean'] = df['review_full_comment'].apply(cleaning)

In [13]:
df.head()

Unnamed: 0,review_full_comment,review_full_comment_clean
16,"GOSTARIA DE SABER O QUE HOUVE, SEMPRE RECEBI ...",gostaria de saber o que houve sempre recebi e ...
19,Não chegou meu produto Péssimo,não chegou meu produto péssimo
29,Não gostei ! Comprei gato por lebre,não gostei comprei gato por lebre
34,Sempre compro pela Internet e a entrega ocorr...,sempre compro pela internet e a entrega ocorre...
41,Nada de chegar o meu pedido.,nada de chegar o meu pedido


In [23]:
# vectorized clean reviews

vectorizer = TfidfVectorizer(ngram_range=(2,2))

vectorized_documents = vectorizer.fit_transform(df['review_full_comment_clean'])
vectorized_documents = pd.DataFrame(
    vectorized_documents.toarray(),
    columns = vectorizer.get_feature_names_out()
)

In [31]:
# Instantiate the LDA
n_components = 5
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fit the LDA on the vectorized documents
lda_model.fit(vectorized_documents)

In [32]:
def print_topics(lda_model, vectorizer, top_words):
    # 1. TOPIC MIXTURE OF WORDS FOR EACH TOPIC
    topic_mixture = pd.DataFrame(
        lda_model.components_,
        columns = vectorizer.get_feature_names_out()
    )

    # 2. FINDING THE TOP WORDS FOR EACH TOPIC
    ## Number of topics
    n_components = topic_mixture.shape[0]

    ## Top words for each topic
    for topic in range(n_components):
        print("-"*10)
        print(f"For topic {topic}, here are the the top {top_words} words with weights:")

        topic_df = topic_mixture.iloc[topic]\
            .sort_values(ascending = False).head(top_words)

        print(round(topic_df,3))


print_topics(lda_model, vectorizer, 5)

----------
For topic 0, here are the the top 5 words with weights:
não recebi        348.267
recebi produto    267.011
ainda não         183.459
foi entregue      151.956
produto não       137.203
Name: 0, dtype: float64
----------
For topic 1, here are the the top 5 words with weights:
bom produto    30.853
produto com    17.244
produto bom    16.854
com defeito    16.382
do produto     13.811
Name: 1, dtype: float64
----------
For topic 2, here are the the top 5 words with weights:
demorou muito     22.224
produto chegou    14.714
bom bom           12.304
muito para        11.352
muito ruim        10.542
Name: 2, dtype: float64
----------
For topic 3, here are the the top 5 words with weights:
muito bom        41.778
do produto       13.661
veio com         10.580
produto veio     10.378
produto muito    10.235
Name: 3, dtype: float64
----------
For topic 4, here are the the top 5 words with weights:
recebi apenas      11.242
comprei duas       10.999
apenas uma         10.989
produt

In [33]:
res = lda_model.transform(vectorized_documents)

In [34]:
df['topics'] = np.argmax(res, axis=1)

In [35]:
df['topics'].value_counts(normalize=True)

topics
0    0.499120
3    0.129707
4    0.127173
2    0.124428
1    0.119572
Name: proportion, dtype: float64

In [36]:
# Here are the 12 most relevent reason for bad reviews, rank per order :
topic_titles = {
    0: "Positive Feedback With Pending Delivery Issues",
    1: "Product Quality Complaints and Non-Delivery",
    2: "Delayed Delivery and Product Condition",
    3: "Successful Delivery and Customer Inquiries",
    4: "Orders Not Received",
    5: "Broken Items and Missed Deadlines",
    6: "Waiting or Requesting a Return",
    7: "Satisfied Despite Long Delivery Time",
    8: "Negative Experience and Delays",
    9: "Quality Feedback: Both Positive and Negative",
    10: "Defective Products and Dissatisfaction",
    11: "Positive Reviews with Emphasis on Quantity"
}

12 topics was a bit too much and too redontant or irrevelant for some reason of bad reviews. <br>
5 topics was way better to understand why people gives bad reviews <br>
We can see that the second reason is for review score of 3

In [None]:

topic_titles_ranked = {
    0: "Order Not Received / Missing Delivery",
    3: "Very Positive Feedback with Product Satisfaction",
    1: "Product Issues and Defects",
    2: "Late Delivery and Negative Experience",
    4: "Partial Orders or Missing Items"
}


In [39]:
topic_titles_ranked

{0: 'Order Not Received / Missing Delivery',
 3: 'Very Positive Feedback with Product Satisfaction',
 1: 'Product Issues and Defects',
 2: 'Late Delivery and Negative Experience',
 4: 'Partial Orders or Missing Items'}

In [40]:
df[df['topics'] == 3]

Unnamed: 0,review_full_comment,review_full_comment_clean,topics
53,recebi somente 1 controle Midea Split ESTILO....,recebi somente controle midea split estilo fal...,3
70,O produto não chegou no prazo estipulado e ca...,o produto não chegou no prazo estipulado e cau...,3
78,"Produto muito inferior, mal acabado.",produto muito inferior mal acabado,3
117,Este foi o pedido\r\nBalde Com 128 Peças - Bl...,este foi o pedido balde com peças blocos de mo...,3
121,comprei tres pacotes de cinco folhas cada de ...,comprei tres pacotes de cinco folhas cada de p...,3
...,...,...,...
98455,"Falta de consideração com o cliente, pois oco...",falta de consideração com o cliente pois ocorr...,3
98512,Teve um probleminha minha pasta multiuso pra ...,teve um probleminha minha pasta multiuso pra n...,3
98533,Ainda nao recevi o produto.,ainda nao recevi o produto,3
98578,"nao satifez minhas expectativas, não funciona...",nao satifez minhas expectativas não funciona c...,3


🏁 Congratulations. Instead of reading 90K+ reviews, you were able to detect the main reasons of dissatisfactions on Olist.

💾 Don't forget to `git add/commit/push`