#### 1. Setup

In [1]:
%pip install pandas numpy seaborn matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install nltk




In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

In [4]:
# Load the Amazon reviews dataset; the path is relative to the notebook's
# working directory.
reviews_amazon = pd.read_csv(r'Reviews_Amazon.csv')

In [5]:
# Preview the raw table as loaded from the CSV.
reviews_amazon

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [6]:
# Drop the review metadata columns that the rest of the notebook does not use.
# The result is rebound to the same name instead of using `inplace=True`, and
# `errors='ignore'` skips columns that are already gone, so this cell can be
# re-run on the same kernel without raising a KeyError.
reviews_amazon = reviews_amazon.drop(
    columns=['ProductId', 'UserId', 'Time', 'Summary',
             'ProfileName', 'HelpfulnessNumerator',
             'HelpfulnessDenominator'],
    errors='ignore')
reviews_amazon

Unnamed: 0,Id,Score,Text
0,1,5,I have bought several of the Vitality canned d...
1,2,1,Product arrived labeled as Jumbo Salted Peanut...
2,3,4,This is a confection that has been around a fe...
3,4,2,If you are looking for the secret ingredient i...
4,5,5,Great taffy at a great price. There was a wid...
...,...,...,...
568449,568450,5,Great for sesame chicken..this is a good if no...
568450,568451,2,I'm disappointed with the flavor. The chocolat...
568451,568452,5,"These stars are small, so you can give 10-15 o..."
568452,568453,5,These are the BEST treats for training and rew...


In [7]:
# The full table is too large to process on a CPU, so only the first
# 500 rows are kept as a working sample.
reviews_amazon = reviews_amazon.iloc[:500]

#### 2. Classificando as reviews com o RoBERTa

<br>

#### RoBERTa - Facebook (default)

##### 1. Na tokenização, as subwords têm como base os bytes em vez dos caracteres Unicode, característica também do GPT-2; isso aumenta o vocabulário disponível, em relação ao BERT padrão, em aproximadamente 20 mil palavras, ajudando o modelo a compreender palavras raras.
##### 2. O modelo RoBERTa é treinado em uma combinação de conjuntos de dados que totalizam 160 GB, enquanto o BERT é treinado inicialmente em 13 GB.

<br>

#### RoBERTa - Twitter (pretrained for Sentiment Analysis)

##### 1. Pré-treinado em cerca de 120 milhões de posts do twitter em inglês
##### 2. Tweets foram classificados e separados em 3 grupos: negativo, neutro, positivo

In [None]:
%pip install transformers scipy

In [9]:
import torch

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Checkpoint name of the Twitter RoBERTa sentiment model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Fetch the tokenizer and the model weights for the chosen checkpoint
tokenizer, model = (
    AutoTokenizer.from_pretrained(MODEL),
    AutoModelForSequenceClassification.from_pretrained(MODEL),
)

In [15]:
# Already-loaded (tokenizer, model) pairs keyed by checkpoint name, so that
# scoring many reviews loads each checkpoint once instead of once per review.
_ROBERTA_CACHE = {}


def _load_roberta(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _ROBERTA_CACHE:
        _ROBERTA_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSequenceClassification.from_pretrained(model_name),
        )
    return _ROBERTA_CACHE[model_name]


def polarity_scores_RoBERTa(review_text, model_op) -> dict:
    """Tokenize one review and score its sentiment with a RoBERTa checkpoint.

    Args:
        review_text: Text of the review to classify.
        model_op: Truthy selects "cardiffnlp/twitter-roberta-base-sentiment",
            whose output is unpacked as (negative, neutral, positive).
            Falsy selects 'FacebookAI/roberta-base', whose output is unpacked
            as (negative, positive) with the neutral score fixed at 0.

    Returns:
        dict with the softmax scores under 'RoBERTa_neg', 'RoBERTa_neu' and
        'RoBERTa_pos', plus 'Label': '-1', '0' or '1' for the largest score
        (on ties, negative wins over neutral, which wins over positive).
    """

    def apply_label(neg, neu, pos):
        # Checked in the order neg, neu, pos so ties resolve in that order.
        top = max(neg, neu, pos)
        if top == neg:
            return '-1'
        elif top == neu:
            return '0'
        else:
            return '1'

    def defineModel(model_op):
        # NOTE(review): 'FacebookAI/roberta-base' is presumably not fine-tuned
        # for sentiment, so its two scores may not be meaningful - confirm.
        if model_op:
            return "cardiffnlp/twitter-roberta-base-sentiment"
        return 'FacebookAI/roberta-base'

    # Pick the checkpoint and reuse its tokenizer/weights if already loaded
    model_name = defineModel(model_op)
    tokenizer, model = _load_roberta(model_name)

    # The tokenizer prepares the input in the format the model expects
    # (PyTorch tensors). No truncation is requested here; the loops that call
    # this function catch RuntimeError around the call.
    tokenized_text = tokenizer(review_text, return_tensors='pt')

    output = model(**tokenized_text)

    # First element of the output, first (only) item of the batch
    scores = softmax(output[0][0].detach().numpy())

    # One checkpoint yields neg, neu and pos; the other yields only
    # two values: neg and pos
    if model_op:
        neg, neu, pos = scores
    else:
        neu = 0
        neg, pos = scores

    return {'RoBERTa_neg' : neg,
            'RoBERTa_neu' : neu,
            'RoBERTa_pos' : pos,
            'Label' : apply_label(neg, neu, pos)}

In [None]:
def classifyReviews(model_op: bool, reviews : pd.DataFrame):
    """Score every review in ``reviews`` and return the scores joined to it.

    Each row's 'Text' is passed to ``polarity_scores_RoBERTa`` together with
    ``model_op``; the resulting dicts are collected under the row's 'Id'.
    Rows for which scoring raises RuntimeError are reported with a printed
    message and left out of the result.

    Returns:
        DataFrame with one row per scored review: 'Id', the score columns,
        and the columns of ``reviews`` attached through a left merge.
    """
    # Scores produced by the chosen model, keyed by review Id
    scores_by_id = {}

    # tqdm renders a progress bar while the rows are being processed
    for _, review in tqdm(reviews.iterrows(), total=len(reviews)):
        try:
            review_text = review['Text']
            review_id = review['Id']
            scores_by_id[review_id] = polarity_scores_RoBERTa(review_text,
                                                              model_op)
        except RuntimeError:
            print(f'Broke for id {review_id}')

    # One row per review Id, then bring back the original review columns
    scores_df = (
        pd.DataFrame(scores_by_id)
        .T
        .reset_index()
        .rename(columns={'index': 'Id'})
    )
    return scores_df.merge(reviews, how='left')

In [None]:
# Scores for each review, keyed by review Id
roberta_results = {}

# tqdm shows a progress bar while the loop iterates over the
# DataFrame rows
for i, row in tqdm(reviews_amazon.iterrows(), total=len(reviews_amazon)):
    try:
        text = row['Text']
        myid = row['Id']
        # polarity_scores_RoBERTa requires its second argument (model_op);
        # True selects the Twitter sentiment checkpoint, which yields the
        # negative / neutral / positive scores used by the plots below.
        roberta_result = polarity_scores_RoBERTa(text, True)
        roberta_results[myid] = roberta_result
    except RuntimeError:
        print(f'Broke for id {myid}')

In [None]:
# Turn the per-review score dicts into a table with one row per review Id,
# then attach the original review columns with a left merge.
roberta_results_df = (
    pd.DataFrame(roberta_results)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(reviews_amazon, how='left')
)

In [None]:
# Display the scored reviews.
roberta_results_df

#### 3. Visualização dos Dados

##### 3.1 Distribuição das reviews

In [None]:
def showPercents(df : pd.DataFrame):
    """Plot the share of each sentiment label as a bar chart and save it.

    Reads the 'Label' column of ``df``, draws one bar per label with its
    percentage written above it, writes the figure to
    'label_distribution.png' and then shows it.
    """
    # Relative frequency of each label, expressed as a percentage
    shares = df['Label'].value_counts(normalize=True) * 100

    # An explicit figure is created so it can be exported to PNG afterwards
    fig, ax = plt.subplots()
    shares.plot(kind='bar', color='lightgreen', ax=ax)

    # Write each bar's percentage just above its top edge
    for bar in ax.patches:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(f'{height:.2f}%',
                    (x_center, height),
                    ha='center', va='baseline', fontsize=10,
                    color='black', xytext=(0, 5),
                    textcoords='offset points')

    # Title and axis labels
    ax.set_title('Distribuição das Labels')
    ax.set_xlabel('Label')
    ax.set_ylabel('Porcentagem (%)')

    # Save the figure to disk, then display it
    fig.savefig('label_distribution.png', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# Plot (and save) the label distribution of the scored reviews.
showPercents(roberta_results_df)

In [None]:
from wordcloud import WordCloud

In [None]:
def showWordCloud(df : pd.DataFrame):
    """Render a word cloud built from the reviews in ``df['Text']``.

    Missing reviews are skipped and remaining values are converted to str
    before being joined. When no text is left (for example, an empty label
    subset) a message is printed and nothing is drawn.

    Args:
        df: DataFrame with a 'Text' column holding the review texts.
    """
    # Drop missing values and coerce to str so the join cannot fail on
    # NaN or other non-string cells
    reviews = df['Text'].dropna().astype(str)

    # Join every review into a single text
    text = " ".join(reviews).strip()

    # WordCloud.generate raises when it receives no words, so bail out early
    if not text:
        print('No review text available to build a word cloud.')
        return

    # Build the word cloud
    wordcloud = WordCloud(max_font_size=50, max_words=100,
                          background_color="white").generate(text)

    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
def splitDataframe(df : pd.DataFrame):
    """Split ``df`` into three frames according to its 'Label' column.

    Returns:
        Tuple ``(positive, negative, neutral)`` holding the rows whose
        'Label' equals '1', '-1' and '0' respectively.
    """
    labels = df['Label']
    by_label = {tag: df[labels == tag] for tag in ('1', '-1', '0')}
    return by_label['1'], by_label['-1'], by_label['0']

In [None]:
# Split the scored reviews into positive, negative and neutral frames
# based on their labels
df_pos, df_neg, df_neu = splitDataframe(roberta_results_df)

In [None]:
# Show a word cloud built from the reviews labelled negative
showWordCloud(df_neg)

In [None]:
# Show a word cloud built from the reviews labelled neutral
showWordCloud(df_neu)

In [None]:
# Show a word cloud built from the reviews labelled positive
showWordCloud(df_pos)