## 1 - Entendimento dos dados

In [17]:
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path.cwd().parent.parent 
DATA_DIR = PROJECT_ROOT / "case_a3data" # pasta raiz

try:
    csv_path = DATA_DIR / "books_rating.csv"
    df_begin = pd.read_csv(csv_path)
    print(f"✅ Sucesso!")
    display(df_begin.head(4))
except FileNotFoundError:
    print(f"❌ Erro: Arquivo não encontrado em:\n{csv_path}")
    print("Verifique a estrutura de pastas:")
    print(f"Diretório atual: {Path.cwd()}")
    print(f"Raiz do projeto: {PROJECT_ROOT}")

✅ Sucesso!


Unnamed: 0,Id,Title,Price,User_id,profileName,score,time,summary,text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."


In [3]:
print(df_begin.info(show_counts=True))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 9 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Id           3000000 non-null  object 
 1   Title        2999792 non-null  object 
 2   Price        481171 non-null   float64
 3   User_id      2438213 non-null  object 
 4   profileName  2438095 non-null  object 
 5   score        3000000 non-null  float64
 6   time         3000000 non-null  int64  
 7   summary      2999593 non-null  object 
 8   text         2999992 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 206.0+ MB
None


In [4]:
print(df_begin.isnull().sum())

Id                   0
Title              208
Price          2518829
User_id         561787
profileName     561905
score                0
time                 0
summary            407
text                 8
dtype: int64


## 2 - Limpeza dos dados

* Excluir linhas onde não há título do livro
* Excluir User_id e profileName onde há registros vazios
* converter data (UNIX PARA DATA)
* tratar os nulos
* Pode haver múltiplas avaliações para o mesmo livro-usuário. VERIFICAR

In [5]:
df_tratado = df_begin.copy()

#### Analisando colunas 'Title', 'User_id' e 'profileName'

In [6]:
# Contagem de valores nulos em 'Title', 'User_id' e 'profileName'
print(f'{df_tratado["Title"].isnull().sum()} valores nulos em Title')
print(f'{df_tratado["User_id"].isnull().sum()} valores nulos em User_id')
print(f'{df_tratado["profileName"].isnull().sum()} valores nulos em profileName')



208 valores nulos em Title
561787 valores nulos em User_id
561905 valores nulos em profileName


In [7]:
# Excluir linhas onde o título está vazio
df_tratado = df_tratado[df_tratado['Title'].notna()]

# Excluir User_id e profileName onde há registros vazios
# Exclui porque haverá análises de usuários com opiniões relevantes. Em um primeiro momento, vou excluir os nulos.
df_tratado = df_tratado[df_tratado['User_id'].notna()]
df_tratado = df_tratado[df_tratado['profileName'].notna()]

#### Converter tipo da coluna 'data'

In [8]:
# converter data de UNIX para DATA
df_tratado['time'] = pd.to_datetime(df_tratado['time'], unit='s')


#### Analisando coluna 'price'

In [9]:
# Filtrando as linhas que correspondem ao título
filtro = df_tratado['Title'] == 'The Idea of History'
df_idea = df_tratado.loc[filtro, ['Title', 'Price']]

print(df_idea.head(10))
print(df_idea['Price'].unique())


# Mantive o valores NaN, pois no momento, não irei avaliar os preços e, são muitas livros sem preço (mais de 2 milhões de registros)

                       Title  Price
2999990  The Idea of History    NaN
2999994  The Idea of History    NaN
2999996  The Idea of History    NaN
2999997  The Idea of History    NaN
2999998  The Idea of History    NaN
[nan]


#### Verificar e tratar valores nulos em 'summary' e 'text'

In [10]:
# CONTAGEM DOS VALORES NULOS EM 'summary' e 'text'
print(f'{df_tratado["summary"].isnull().sum()} valores nulos em summary')
print(f'{df_tratado["text"].isnull().sum()} valores nulos em text')



393 valores nulos em summary
1 valores nulos em text


In [11]:
# Preenchendo com string vazia os valores nulos
df_tratado['summary'] = df_tratado['summary'].fillna('')
df_tratado['text'] = df_tratado['text'].fillna('')

# Pensando na análise de sentimento, como são poucos registros, vou manter os valores nulos, gerando valores neutros na análise de sentimento


#### Verificar usuários duplicados para o mesmo livro

In [12]:
# Verificar duplicatas nas colunas 'Id', 'User_id' e 'text'
duplicatas = df_tratado.duplicated(subset=['Id', 'User_id'], keep=False)

# Contar quantas linhas são consideradas duplicadas
quantidade_duplicatas = duplicatas.sum()
print(f"Total de linhas duplicadas: {quantidade_duplicatas}")

# Exibir algumas linhas duplicadas
df_duplicadas = df_tratado[duplicatas].head(10)
print(df_duplicadas.head(10))

# NÃO ADICIONEI O PROFILE NAME, POIS UM USUÁRIO PODE MUDAR O NOME E COMENTAR A MESMA COISA

Total de linhas duplicadas: 71758
             Id                             Title  Price         User_id  \
162  0517150328   History of Magic and the Occult    NaN   AMKC1EJBUXDS2   
164  0517150328   History of Magic and the Occult    NaN   AMKC1EJBUXDS2   
198  B0007DVHU2            Treat yourself to life    NaN  A1RJD10TTI568L   
201  B0007DVHU2            Treat yourself to life    NaN  A1RJD10TTI568L   
656  B0000630MU      HTML: The Complete Reference    NaN  A332U346E9T5PU   
681  B0000630MU      HTML: The Complete Reference    NaN  A332U346E9T5PU   
724  050552421X  The Scarletti Curse (Candleglow)    NaN  A2A9BTNYLA9EA0   
726  050552421X  The Scarletti Curse (Candleglow)    NaN  A1PURG5ASALH79   
727  050552421X  The Scarletti Curse (Candleglow)    NaN  A1PURG5ASALH79   
776  050552421X  The Scarletti Curse (Candleglow)    NaN  A14MF63X40QDT2   

                             profileName  score       time  \
162                            Anita Fix    5.0 2001-05-06   
164  

In [13]:
# Primeiro, ordene o DataFrame pela coluna 'time'
df_tratado = df_tratado.sort_values('time')

# Em seguida, remova os duplicados considerando as colunas que identificam o livro e o usuário.
# Aqui, estamos usando 'Id' para identificar o livro e 'User_id' para o usuário.
# O parâmetro keep='last' garante que o último comentário (mais recente) seja mantido.
df_tratado = df_tratado.drop_duplicates(subset=['Id', 'User_id'], keep='last')

# Verificando o resultado
#print(df_final.shape)
print(df_tratado.head(3))


               Id                                              Title  Price  \
75747  0786280670  Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...    NaN   
75745  0786280670  Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...    NaN   
75746  0786280670  Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...    NaN   

              User_id                  profileName  score                time  \
75747   AWF1MPR7NZX07                         Mary    2.0 1969-12-31 23:59:59   
75745  A3LL5TMGX00LA1         Virginia Teacher Mom    2.0 1969-12-31 23:59:59   
75746  A2ZE8PHSFIQBLQ  Sarah Beagle "Sarah Beagle"    1.0 1969-12-31 23:59:59   

                                                 summary  \
75747  Disappointing...read My Life in France by Juli...   
75745                    For once, the movie was better.   
75746                                 Dazed and Confused   

                                                    text  
75747  I eagerly snatched this book up when I saw it ...  

In [14]:
# Depois de remover duplicatas, a contagem deve ser zero para o mesmo critério
duplicatas_final = df_tratado.duplicated(subset=['Id', 'User_id'], keep=False)
print("Total de linhas duplicadas após remoção:", duplicatas_final.sum())


Total de linhas duplicadas após remoção: 0


#### Concatenação das colunas summary e text

In [15]:
df_tratado.loc[:, 'text_concat'] = df_tratado['summary'] + ' ' + df_tratado['text']


#### DATAFRAME TRATADO / LIMPO

In [16]:
# Define a raiz do projeto e a pasta de dados, conforme sua estrutura
PROJECT_ROOT = Path.cwd().parent.parent
SAVE_DIR = PROJECT_ROOT / "case_a3data" / "app" / "data"

# Cria a pasta, se ela não existir
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Define o caminho completo para o arquivo a ser salvo
csv_save_path = SAVE_DIR / "books_rating_tratado.csv"

df_final = df_tratado.copy()  
df_final.reset_index(drop=True, inplace=True)
df_final.to_csv(csv_save_path, index=False)
print(f"Arquivo salvo com sucesso em: {csv_save_path}")



Arquivo salvo com sucesso em: c:\Users\Thiago_W\Desktop\A3_Case\case_a3data\app\data\books_rating_tratado.csv


REMOVI OS COMENTÁRIOS DO MESMO USUÁRIO PARA O LIVRO, MANTENDO APENAS O ÚLTIMO COMENTÁRIO

## 4 - ANÁLISE DE SENTIMENTO

AQUI EU POSSO TESTAR DOIS MODELOS: NLTK E bert-base-multilingual-uncased-sentiment (HUGGINFACE)

#### NLTK

In [16]:
df_teste = df_final[:1001].copy()
df_teste.head(3)

Unnamed: 0,Id,Title,Price,User_id,profileName,score,time,summary,text,text_concat
0,786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",,AWF1MPR7NZX07,Mary,2.0,1969-12-31 23:59:59,Disappointing...read My Life in France by Juli...,I eagerly snatched this book up when I saw it ...,Disappointing...read My Life in France by Juli...
1,786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",,A3LL5TMGX00LA1,Virginia Teacher Mom,2.0,1969-12-31 23:59:59,"For once, the movie was better.","I purchased this book after seeing, and truly ...","For once, the movie was better. I purchased th..."
2,786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",,A2ZE8PHSFIQBLQ,"Sarah Beagle ""Sarah Beagle""",1.0,1969-12-31 23:59:59,Dazed and Confused,I had such high hopes for this book and I was ...,Dazed and Confused I had such high hopes for t...


In [17]:
import nltk

## Precisamos do léxico VADER
#nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()



In [18]:
def analisar_sentimento(texto):
    # Gera um dicionário com scores
    # {'neg': 0.0, 'neu': 0.4, 'pos': 0.6, 'compound': 0.6696}
    return sia.polarity_scores(texto)

df_teste['sentiment_scores'] = df_teste['text_concat'].apply(analisar_sentimento)

df_teste['compound'] = df_teste['sentiment_scores'].apply(lambda d: d['compound'])

def classificar_sentimento(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df_teste['sentiment_label'] = df_teste['compound'].apply(classificar_sentimento)

# Visualizar o resultado
print(df_teste[['text_concat', 'sentiment_scores', 'compound', 'sentiment_label']].head(10))


                                         text_concat  \
0  Disappointing...read My Life in France by Juli...   
1  For once, the movie was better. I purchased th...   
2  Dazed and Confused I had such high hopes for t...   
3  Glad I found this Silver Pennies has made terr...   
4  An incomparable children's classic This book o...   
5  My favorite book of poetry from childhood My m...   
6  A joyful find! This book was given to me when ...   
7  What an Incredible Find A dear old friend gave...   
8  Found again... This book was given to me over ...   
9  Unequalled Collection of Children's Poetry for...   

                                    sentiment_scores  compound sentiment_label  
0  {'neg': 0.012, 'neu': 0.818, 'pos': 0.169, 'co...    0.9891        positive  
1  {'neg': 0.045, 'neu': 0.736, 'pos': 0.219, 'co...    0.9822        positive  
2  {'neg': 0.058, 'neu': 0.837, 'pos': 0.105, 'co...    0.8482        positive  
3  {'neg': 0.0, 'neu': 0.773, 'pos': 0.227, 'comp...    0.9

In [19]:
df_teste[0:5]

Unnamed: 0,Id,Title,Price,User_id,profileName,score,time,summary,text,text_concat,sentiment_scores,compound,sentiment_label
0,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",,AWF1MPR7NZX07,Mary,2.0,1969-12-31 23:59:59,Disappointing...read My Life in France by Juli...,I eagerly snatched this book up when I saw it ...,Disappointing...read My Life in France by Juli...,"{'neg': 0.012, 'neu': 0.818, 'pos': 0.169, 'co...",0.9891,positive
1,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",,A3LL5TMGX00LA1,Virginia Teacher Mom,2.0,1969-12-31 23:59:59,"For once, the movie was better.","I purchased this book after seeing, and truly ...","For once, the movie was better. I purchased th...","{'neg': 0.045, 'neu': 0.736, 'pos': 0.219, 'co...",0.9822,positive
2,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",,A2ZE8PHSFIQBLQ,"Sarah Beagle ""Sarah Beagle""",1.0,1969-12-31 23:59:59,Dazed and Confused,I had such high hopes for this book and I was ...,Dazed and Confused I had such high hopes for t...,"{'neg': 0.058, 'neu': 0.837, 'pos': 0.105, 'co...",0.8482,positive
3,B000G167FA,Silver Pennies,,A21KZ6WAO2P1P1,MossMonster,5.0,1969-12-31 23:59:59,Glad I found this,Silver Pennies has made terrific bedtime and q...,Glad I found this Silver Pennies has made terr...,"{'neg': 0.0, 'neu': 0.773, 'pos': 0.227, 'comp...",0.9169,positive
4,B000G167FA,Silver Pennies,,AAFZZHA2I598B,Byron C. Benson,5.0,1969-12-31 23:59:59,An incomparable children's classic,This book of children's poems has been enjoyed...,An incomparable children's classic This book o...,"{'neg': 0.0, 'neu': 0.701, 'pos': 0.299, 'comp...",0.9451,positive


In [20]:
# printando o valor de summary e text

# Escolha o índice que deseja visualizar (por exemplo, índice 0)
indice = 4

print(f"\nÍndice {indice}:")
print(f"\nSummary:\n{df_teste.loc[indice, 'summary']}")
print(f"\nText:\n{df_teste.loc[indice, 'text']}")
print(f"\nsentiment_scores:\n{df_teste.loc[indice, 'sentiment_scores']}")
print(f"\ncompound:\n{df_teste.loc[indice, 'compound']}")
print(f"\nsentiment_label:\n{df_teste.loc[indice, 'sentiment_label']}")



Índice 4:

Summary:
An incomparable children's classic

Text:
This book of children's poems has been enjoyed by three generations in our family. It captures the imagination of young children and makes a wonderful bedtime reading. I credit this book for giving me an early and continuing love of poetry.

sentiment_scores:
{'neg': 0.0, 'neu': 0.701, 'pos': 0.299, 'compound': 0.9451}

compound:
0.9451

sentiment_label:
positive


#### bert-base-multilingual-uncased-sentiment

In [21]:
# TESTE PARA AVALIAR EM QUANTO TEMPO A ANÁLISE DE SENTIMENTO É FEITA
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from time import time
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def sentiment_transformers(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Esse modelo retorna 5 classes (1 a 5 estrelas).
    # Podemos converter para algo entre -1 e +1, ou simplesmente pegar a classe argmax
    star_rating = torch.argmax(probs, dim=1).item() + 1
    return star_rating




df_final_teste = df_final[0:1001].copy()


start_time = time()
df_final_teste["bert_sentiment"] = df_final_teste["text_concat"].apply(sentiment_transformers)
print(df_final_teste[["text_concat", "bert_sentiment"]])
end_time = time()
print(f"Tempo de execução: {end_time - start_time} segundos")


                                            text_concat  bert_sentiment
0     Disappointing...read My Life in France by Juli...               2
1     For once, the movie was better. I purchased th...               3
2     Dazed and Confused I had such high hopes for t...               2
3     Glad I found this Silver Pennies has made terr...               5
4     An incomparable children's classic This book o...               5
...                                                 ...             ...
996   Ghost In The Shell is a very cool book ! I'am ...               5
997   This story has haunted me for over 25 years I ...               5
998   The Best Book Ever Written About a Womens' Bas...               5
999   well written and thurough coverage of Os/2 War...               5
1000  The Best Book Ever Written About a Womens' Bas...               5

[1001 rows x 2 columns]
Tempo de execução: 104.26567840576172 segundos


In [22]:
df_final_teste.head(5)

Unnamed: 0,Id,Title,Price,User_id,profileName,score,time,summary,text,text_concat,bert_sentiment
0,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",,AWF1MPR7NZX07,Mary,2.0,1969-12-31 23:59:59,Disappointing...read My Life in France by Juli...,I eagerly snatched this book up when I saw it ...,Disappointing...read My Life in France by Juli...,2
1,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",,A3LL5TMGX00LA1,Virginia Teacher Mom,2.0,1969-12-31 23:59:59,"For once, the movie was better.","I purchased this book after seeing, and truly ...","For once, the movie was better. I purchased th...",3
2,0786280670,"Julie and Julia: 365 Days, 524 Recipes, 1 Tiny...",,A2ZE8PHSFIQBLQ,"Sarah Beagle ""Sarah Beagle""",1.0,1969-12-31 23:59:59,Dazed and Confused,I had such high hopes for this book and I was ...,Dazed and Confused I had such high hopes for t...,2
3,B000G167FA,Silver Pennies,,A21KZ6WAO2P1P1,MossMonster,5.0,1969-12-31 23:59:59,Glad I found this,Silver Pennies has made terrific bedtime and q...,Glad I found this Silver Pennies has made terr...,5
4,B000G167FA,Silver Pennies,,AAFZZHA2I598B,Byron C. Benson,5.0,1969-12-31 23:59:59,An incomparable children's classic,This book of children's poems has been enjoyed...,An incomparable children's classic This book o...,5


É notório que o bert performou melhor em relação ao NLTK, para analise de sentimento