### 1.0 - Importações e configurações

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

### 2.0 - Bases

In [175]:
bd = pd.read_parquet(r'..\\..\\Desafio_A3Data\\Databases\\books_data.parquet', engine='pyarrow')
br = pd.read_parquet(r'..\\..\\Desafio_A3Data\\Databases\\books_rating.parquet', engine='pyarrow')

### 3.0 - Tratamentos

In [176]:
#-----------------------BOOKS RATING------------------------------

br['summary'].fillna('No summary', inplace=True)
br['text'].fillna('No text comments', inplace=True)

#Realizando agrupamento pelo título do livro e obtendo a média dos valores numéricos
temporary = br[['Title', 'Price', 'score', 'time']]
temporary = temporary.groupby('Title').mean()
temporary.reset_index(inplace=True)

#Identificando quantas avaliações cada livro recebeu pelos leitores
temporary2 = br.groupby('Title').size().reset_index(name='readersRatings')


#-----------------------BOOKS DATA--------------------------------

#Adicionando informações capturadas acima ao dataset dos livros
merge = bd.merge(temporary, on='Title', how='inner')
merge2 = merge.merge(temporary2, on='Title', how='inner')
bd = merge2

#Retirando caracteres poluentes das duas variáveis abaixo
bd['categories'] = bd['categories'].str.replace('[\[\]\'\"]', '', regex=True)
bd['authors'] = bd['authors'].str.replace('[\[\]\'\"]', '', regex=True)

#Reservando somente o ano de publicação, dado que não havia uma uniformidade no formato da data, alguns somente com o ano, outros no formato yyyy/mm/dd
bd['publishedDate'] = bd['publishedDate'].str[:4]

#Obtendo totais de avaliações para cada livro
bd['ratingsCount'].fillna(0, inplace=True)
bd['totalRatings'] = bd['readersRatings'] + bd['ratingsCount']

In [178]:
#Filtro realizado devido quantidade de cometários no dataset books_rating ser muito grande, 
#o que ocasionou uma limitação no tempo de procesamento dos campos textuais
#Retirando essa célula do processamento, o fluxo ocorre normalmente para o dataset completo
bd = bd[bd['readersRatings'] == 100]
selected_books = bd['Title'].unique()

br = br[br['Title'].isin(selected_books)]

In [8]:
#pip install transformers
#pip install tensorflow
#pip install tf-keras
#pip install ipywidgets

In [158]:
from transformers import pipeline

#Modelo de captação de sentimento usado
sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
    return_all_scores=True
)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [181]:
text = br['summary'][66533]
text

'Man vs Beast and the Birth of Civilization'

In [182]:
result = sentiment_classifier(text)
result

[[{'label': 'positive', 'score': 0.531891942024231},
  {'label': 'neutral', 'score': 0.18506908416748047},
  {'label': 'negative', 'score': 0.2830389142036438}]]

In [184]:
# Criando uma lista vazia para armazenar os resultados
positive = []
neutral = []
negative = []

# Iterando sobre cada valor da coluna 'publisher'
for value in tqdm(br['summary'], desc='Progress'):
    
    # Aplicando a função a cada valor
    result = sentiment_classifier(value)

    #Obtendo os valores dos resultados
    positive_value = result[0][0]['score']
    neutral_value = result[0][1]['score']
    negative_value = result[0][2]['score']

    # Adicionando o resultado à lista
    positive.append(positive_value)
    neutral.append(neutral_value)
    negative.append(negative_value)

# Criando uma nova coluna no DataFrame com os resultados
br['summary_posit_score'] = positive
br['summary_neut_score'] = neutral
br['summary_neg_score'] = negative

Progress:   0%|          | 0/4000 [00:00<?, ?it/s]

Progress: 100%|██████████| 4000/4000 [26:06<00:00,  2.55it/s]


In [191]:
#Função utilizada para obter resumo dos scores gerados pela função sentiment_classifier
def categorize(row):
    if (row['summary_posit_score'] + row['summary_neut_score']) > row['summary_neg_score']:
        return 'positivo'
    else:
        return 'negativo'

# Aplicando a função a cada linha do DataFrame
br['summary_score'] = br.apply(categorize, axis=1)

In [193]:
print(bd.shape)
bd.head(5)

(40, 15)


Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,Price,score,time,readersRatings,totalRatings
4553,Death in the Afternoon,Still considered one of the best books ever wr...,Ernest Hemingway,http://books.google.com/books/content?id=AdFQA...,http://books.google.nl/books?id=AdFQAQAAQBAJ&p...,Simon and Schuster,2014,https://play.google.com/store/books/details?id...,Literary Criticism,9.0,,3.96,1112270000.0,100,109.0
9230,The Geographer's Library,Item 1: An alembic is the top part of an appar...,Jon Fasman,http://books.google.com/books/content?id=H5ogA...,http://books.google.nl/books?id=H5ogAQAAIAAJ&q...,Penguin Press HC,2005,http://books.google.nl/books?id=H5ogAQAAIAAJ&d...,Fiction,33.0,,3.06,1167344000.0,100,133.0
13509,LifeSupport,From New York Times bestselling author Tess Ge...,Tess Gerritsen,http://books.google.com/books/content?id=s-bKp...,http://books.google.nl/books?id=s-bKpPSlbbUC&q...,Pocket Books,1998,http://books.google.nl/books?id=s-bKpPSlbbUC&d...,Fiction,2.0,,4.2,1118287000.0,100,102.0
14923,Population: 485 : Meeting Your Neighbors One S...,"Welcome to New Auburn, Wisconsin, where the lo...",Michael Perry,http://books.google.com/books/content?id=IOhWr...,http://books.google.nl/books?id=IOhWrgEACAAJ&d...,Harper Perennial,2007,http://books.google.nl/books?id=IOhWrgEACAAJ&d...,Travel,25.0,,4.62,1176783000.0,100,125.0
21679,The Light of Western Stars,This eBook features the unabridged text of ‘Th...,Zane Grey,http://books.google.com/books/content?id=EKDWD...,http://books.google.com/books?id=EKDWDwAAQBAJ&...,Delphi Classics,2017,https://play.google.com/store/books/details?id...,Fiction,0.0,25.8,4.38,1347519000.0,100,100.0


In [194]:
print(br.shape)
br.head(5)

(4000, 13)


Unnamed: 0,Id,Title,Price,User_id,profileName,score,time,summary,text,summary_posit_score,summary_neut_score,summary_neg_score,summary_score
66533,B000JWW1UG,Death in the Afternoon,,A1ZA12IECEKIY,Mark Cohen,4.0,1356048000,Man vs Beast and the Birth of Civilization,Bullfighting was born in Spain many centuries ...,0.531892,0.185069,0.283039,positivo
66534,B000JWW1UG,Death in the Afternoon,,A2UO3C0HLKCZ4F,R. Huitron,4.0,1354752000,if one is born outside of bullfighting culture...,A lot of times people end up hating something ...,0.190422,0.166635,0.642942,negativo
66535,B000JWW1UG,Death in the Afternoon,,AIHBELNQRGPVD,acacia,4.0,1350086400,death in the afternoon,I enjoyed the first part of the book all about...,0.095906,0.146184,0.757909,negativo
66536,B000JWW1UG,Death in the Afternoon,,A1N7T3TGQT3CS3,diogenes,4.0,1344988800,Excellent primer for bullfighting lore.,I've reread this book for the third time this ...,0.893192,0.067662,0.039146,positivo
66537,B000JWW1UG,Death in the Afternoon,,A2WXFSE5PJ70NV,Jim,5.0,1342137600,Classic,Excellent book about bullfighting and travelli...,0.63795,0.234253,0.127797,positivo


In [195]:
bd.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40 entries, 4553 to 183514
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Title           40 non-null     object 
 1   description     38 non-null     object 
 2   authors         38 non-null     object 
 3   image           36 non-null     object 
 4   previewLink     38 non-null     object 
 5   publisher       34 non-null     object 
 6   publishedDate   38 non-null     object 
 7   infoLink        38 non-null     object 
 8   categories      37 non-null     object 
 9   ratingsCount    40 non-null     float64
 10  Price           8 non-null      float64
 11  score           40 non-null     float64
 12  time            40 non-null     float64
 13  readersRatings  40 non-null     int64  
 14  totalRatings    40 non-null     float64
dtypes: float64(5), int64(1), object(9)
memory usage: 5.0+ KB


In [196]:
br.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 66533 to 2980494
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   4000 non-null   object 
 1   Title                4000 non-null   object 
 2   Price                726 non-null    float64
 3   User_id              3337 non-null   object 
 4   profileName          3337 non-null   object 
 5   score                4000 non-null   float64
 6   time                 4000 non-null   int64  
 7   summary              4000 non-null   object 
 8   text                 4000 non-null   object 
 9   summary_posit_score  4000 non-null   float64
 10  summary_neut_score   4000 non-null   float64
 11  summary_neg_score    4000 non-null   float64
 12  summary_score        4000 non-null   object 
dtypes: float64(5), int64(1), object(7)
memory usage: 566.5+ KB


In [198]:
bd.to_parquet(r'..\\..\\Desafio_A3Data\\Databases\\work_books_data.parquet', engine='pyarrow')
bd.to_parquet(r'..\\..\\Desafio_A3Data\\Databases\\work_books_rating.parquet', engine='pyarrow')

### 4.0 - EDA

In [4]:
bd = pd.read_parquet(r'..\\..\\Desafio_A3Data\\Databases\\work_books_data.parquet', engine='pyarrow')
br = pd.read_parquet(r'..\\..\\Desafio_A3Data\\Databases\\work_books_rating.parquet', engine='pyarrow')

In [14]:
import plotly.graph_objects as go

# Ordenar os dados pelo score do maior para o menor
bd_sorted = bd.sort_values(by='score', ascending=False)

# Dados para o gráfico
categorias = bd_sorted['Title']
valores = bd_sorted['score']

# Criação do gráfico de barras
fig = go.Figure(data=[
    go.Bar(name='Valores', x=categorias, y=valores)
])

# Personalização do layout do gráfico
fig.update_layout(
    title='Scores dos livros: avaliação dos leitores',
    xaxis_title='Livros',
    yaxis_title='Score'
)

# Exibir o gráfico
fig.show()

In [16]:
# Dados para o gráfico de barras com a variável 'publisher' no eixo x
categorias_editora = bd['publisher'].unique()
valores_editora = bd.groupby('publisher')['score'].mean()

# Ordenar os dados de scores médios por editora do maior para o menor
valores_editora_sorted = valores_editora.sort_values(ascending=False)

# Criação do gráfico de barras com a variável 'publisher' no eixo x
fig2 = go.Figure(data=[
    go.Bar(name='Valores', x=valores_editora_sorted.index, y=valores_editora_sorted)
])

# Personalização do layout do gráfico com a variável 'publisher' no eixo x
fig2.update_layout(
    title='Scores por editora: avaliação dos leitores',
    xaxis_title='Editora',
    yaxis_title='Score'
)

# Exibir o gráfico com a variável 'publisher' no eixo x
fig2.show()

In [21]:
# Dados para o gráfico de barras com a variável 'publisher' no eixo x
categorias_segmento = bd['categories'].unique()
valores_segmento = bd.groupby('categories')['score'].mean()

# Ordenar os dados de scores médios por segmento do maior para o menor
valores_segmento_sorted = valores_segmento.sort_values(ascending=False)

# Criação do gráfico de barras com a variável 'publisher' no eixo x
fig3 = go.Figure(data=[
    go.Bar(name='Valores', x=valores_segmento_sorted.index, y=valores_segmento_sorted)
])

# Personalização do layout do gráfico com a variável 'publisher' no eixo x
fig3.update_layout(
    title='Scores por categoria: avaliação dos leitores',
    xaxis_title='Categoria',
    yaxis_title='Score'
)

# Exibir o gráfico com a variável 'publisher' no eixo x
fig3.show()

In [20]:
# Dados para o gráfico de barras com a variável 'publisher' no eixo x
categorias_autor = bd['authors'].unique()
valores_autor = bd.groupby('authors')['score'].mean()

# Ordenar os dados de scores médios por autor do maior para o menor
valores_autor_sorted = valores_autor.sort_values(ascending=False)

# Criação do gráfico de barras com a variável 'publisher' no eixo x
fig4 = go.Figure(data=[
    go.Bar(name='Valores', x=valores_autor_sorted.index, y=valores_autor_sorted)
])

# Personalização do layout do gráfico com a variável 'publisher' no eixo x
fig4.update_layout(
    title='Scores por autor(a): avaliação dos leitores',
    xaxis_title='Autor(a)',
    yaxis_title='Score'
)

# Exibir o gráfico com a variável 'publisher' no eixo x
fig4.show()

In [128]:
test = br['text'][4]
test

'Philip Nel - Dr. Seuss: American IconThis is basically an academic overview of Seuss poetry, art, cartoons, and the problems with the commercialization of the Seuss name and works after his death. It is not, to any real extent, a biography. Those seeking such should move on.As an academic book it leans on the dry side. It assumes the reader has a fairly good knowledge of Children\'s Literature and 20th Century cartoons (not the animated kind). Not a book to begin your Dr. Seuss experience with. But if you have read them to your children and are interested about the writing style (there is a good chapter about his poetry) or his art style (not as good a chapter, but still interesting).What interested me the most was the deconstruction of the recent rush to "cash in" on Seuss by Hollywood and advertisers. I think that Nel wants to come down against it, but based on Seuss\' background (he started out drawing Flit ads) and the projects he approved during his lifetime; it is a tough argume

In [117]:
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [120]:
print(test)
result = summarizer(test, max_length=500, min_length=30, do_sample=False)
print(result)

Your max_length is set to 500, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


This is only for Julie Strain fans. It's a collection of her photos -- about 80 pages worth with a nice section of paintings by Olivia.If you're looking for heavy literary content, this isn't the place to find it -- there's only about 2 pages with text and everything else is photos.Bottom line: if you only want one book, the Six Foot One ... is probably a better choice, however, if you like Julie like I like Julie, you won't go wrong on this one either.
[{'summary_text': "This is only for Julie Strain fans . It's about 80 pages worth with a nice section of paintings by Olivia . There's only about 2 pages with text and everything else is photos ."}]


In [123]:
result[0]['summary_text']

"This is only for Julie Strain fans . It's about 80 pages worth with a nice section of paintings by Olivia . There's only about 2 pages with text and everything else is photos ."

In [None]:
# Criando uma lista vazia para armazenar os resultados
summary = []

# Iterando sobre cada valor da coluna 'publisher'
for value in tqdm(br['text'], desc='Progress'):
    # Aplicando a função a cada valor
    result = summarizer(value)

    summary_value = result[0]['summary_text']

    # Adicionando o resultado à lista
    summary.append(summary_value)
    
# Criando uma nova coluna no DataFrame com os resultados
br['summaryText'] = summary

In [None]:
# Crear el histograma
sns.histplot(br['time'], bins=40, kde=False)

# Añadir títulos y etiquetas
plt.title('Histograma con Seaborn')
plt.xlabel('Valor')
plt.ylabel('Frecuencia')

# Mostrar el gráfico
plt.show()