Importando bibliotecas

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

import pandas as pd
import collections
import logging
import random

Configurando nível de logs

In [2]:
# Show INFO-level log records (the gensim progress messages below are emitted
# at that level), prefixed with timestamp and severity.
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

Lendo o dataset de avaliações (arquivo parquet)

In [242]:
# Load the order-review table from the delivery layer of the dataset folder.
reviews_parquet_path = '../dataset/delivery/dlzd_olist_order_reviews.parquet.snappy'
frzd_olist_order_reviews = pd.read_parquet(reviews_parquet_path)

In [243]:
# Preview the first five reviews to check the loaded columns.
frzd_olist_order_reviews.head(n=5)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_comment_title_and_message,review_creation_date,review_answer_timestamp
0,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,[],"[parabens, lojas, lannister, adorei, comprar, ...","[parabens, lojas, lannister, adorei, comprar, ...",2018-03-01,2018-03-02 10:26:53
1,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,[recomendo],"[aparelho, eficiente, no, site, marca, do, apa...","[recomendo, aparelho, eficiente, no, site, mar...",2018-05-22,2018-05-23 16:45:47
2,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,[],"[mas, um, pouco, travando, pelo, valor, ta, boa]","[mas, um, pouco, travando, pelo, valor, ta, boa]",2018-02-16,2018-02-20 10:52:22
3,d21bbc789670eab777d27372ab9094cc,4fc44d78867142c627497b60a7e0228a,5,[otimo],"[loja, nota]","[otimo, loja, nota]",2018-07-10,2018-07-11 14:10:25
4,0e0190b9db53b689b285d3f3916f8441,79832b7cb59ac6f887088ffd686e1d5e,5,[],"[obrigado, pela, atencao, amim, dispensada]","[obrigado, pela, atencao, amim, dispensada]",2017-12-01,2017-12-09 22:58:58


In [244]:
# Key the table by review_id so every frame derived from it (train/test
# splits, final feature table) stays addressable by review.
# The guard plus re-assignment (instead of inplace=True) keeps this cell safe
# to re-run: once 'review_id' has become the index it is no longer a column,
# and calling set_index('review_id') a second time would raise KeyError.
if frzd_olist_order_reviews.index.name != 'review_id':
    frzd_olist_order_reviews = frzd_olist_order_reviews.set_index('review_id')

Dividindo entre conjuntos de treino e teste de forma estratificada

In [270]:
# X keeps the tokenised title+message together with the score (the score is
# reused later as the document tag); y is the score alone and only drives the
# stratification of the split.
feature_columns = ['review_comment_title_and_message', 'review_score']
target_columns = ['review_score']

X = frzd_olist_order_reviews.loc[:, feature_columns]
y = frzd_olist_order_reviews.loc[:, target_columns]

In [271]:
# Stratified hold-out split: 20% of the reviews go to the test set and the
# review_score distribution is preserved in both parts. Only the two X halves
# are kept because X already carries review_score next to the tokens.
split_frames = train_test_split(X, y, test_size=0.2, stratify=y, random_state=51)
X_train, X_test = split_frames[:2]

Criando as tags para marcar cada documento (a nota da avaliação é usada como tag).

In [272]:
# One TaggedDocument per training review: the words are the review tokens and
# the single tag is the review score, so the model learns one vector per score
# rather than one per review.
# NOTE(review): the tags are integers, and the build_vocab log reports
# "6 unique tags" although scores appear to range 1-5 — presumably gensim
# treats integer tags as direct indices, leaving an untrained vector at
# index 0. Confirm whether string tags would be preferable.
train_documents = [
    TaggedDocument(row['review_comment_title_and_message'].tolist(), [row['review_score']])
    for _, row in X_train.iterrows()
]

In [273]:
# Same tagging scheme as the training corpus, applied to the held-out reviews:
# tokens as words, review score as the only tag.
test_documents = [
    TaggedDocument(row['review_comment_title_and_message'].tolist(), [row['review_score']])
    for _, row in X_test.iterrows()
]

Definindo modelo Doc2vec

In [274]:
# Hyper-parameters of the Doc2Vec model, gathered in one place so they can be
# read (and tuned) independently of the constructor call.
doc2vec_params = dict(
    vector_size=400,      # dimensionality of every embedding vector
    min_count=3,          # words seen fewer than 3 times are left out of the vocabulary
    sample=10**-5,        # down-sampling threshold for very frequent words
    window=7,             # maximum distance between the target word and its context words
    # Lets the effective window vary between 1 and `window`.
    # NOTE(review): concatenation mode needs a fixed-size context, so this flag
    # may have no effect together with dm_concat=1 — confirm in the gensim docs.
    shrink_windows=True,
    hs=0,                 # no hierarchical softmax; negative sampling is used instead
    negative=5,           # 5 noise words are drawn per positive example
    dm=1,                 # PV-DM (distributed memory) training algorithm
    dm_concat=1,          # concatenate tag and context vectors (log: 6000-dimensional layer1)
    # NOTE(review): per the gensim docs dbow_words controls word-vector training
    # in DBOW mode (dm=0); it presumably has no effect with dm=1 — confirm.
    dbow_words=1,
    workers=8,            # number of worker threads used for training
    # Seed for the random number generator.
    # NOTE(review): gensim documents that fully deterministic runs also need
    # workers=1 and a fixed PYTHONHASHSEED, so with 8 workers results may still
    # vary between runs — confirm if strict reproducibility is required.
    seed=51,
)

model = Doc2Vec(**doc2vec_params)

2024-09-22 14:18:54,546 : INFO : using concatenative 6000-dimensional layer1
2024-09-22 14:18:54,548 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/c,d400,n5,w7,mc3,s1e-05,t8>', 'datetime': '2024-09-22T14:18:54.548398', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'created'}


Criando vocabulário

In [275]:
# Scan the training corpus once to collect the vocabulary and the document
# tags before any training pass.
model.build_vocab(corpus_iterable=train_documents)

2024-09-22 14:18:57,366 : INFO : collecting all words and their counts
2024-09-22 14:18:57,369 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2024-09-22 14:18:57,416 : INFO : PROGRESS: at example #10000, processed 83266 words (1884477 words/s), 6819 word types, 0 tags
2024-09-22 14:18:57,455 : INFO : collected 9668 word types and 6 unique tags from a corpus of 18721 examples and 157562 words
2024-09-22 14:18:57,456 : INFO : Creating a fresh vocabulary
2024-09-22 14:18:57,486 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=3 retains 3297 unique words (34.10% of original 9668, drops 6371)', 'datetime': '2024-09-22T14:18:57.486002', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
2024-09-22 14:18:57,489 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=3 leaves 149855 word corpus (95.11% of original 157562, dro

In [276]:
# Train for 70 passes over the tagged training reviews; total_examples comes
# from the corpus size recorded by build_vocab.
training_epochs = 70

model.train(
    corpus_iterable=train_documents,
    total_examples=model.corpus_count,
    epochs=training_epochs,
)

2024-09-22 14:18:59,575 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 8 workers on 3298 vocabulary and 6000 features, using sg=0 hs=0 sample=1e-05 negative=5 window=7 shrink_windows=True', 'datetime': '2024-09-22T14:18:59.575775', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'train'}
2024-09-22 14:19:00,588 : INFO : EPOCH 0: training on 157562 raw words (40721 effective words) took 1.0s, 41701 effective words/s
2024-09-22 14:19:01,338 : INFO : EPOCH 1: training on 157562 raw words (40669 effective words) took 0.7s, 56200 effective words/s
2024-09-22 14:19:02,101 : INFO : EPOCH 2: training on 157562 raw words (40706 effective words) took 0.8s, 54226 effective words/s
2024-09-22 14:19:02,892 : INFO : EPOCH 3: training on 157562 raw words (40549 effective words) took 0.8s, 52793 effective words/s
2024-09-22 14:19:03,648 : INFO : EPOCH 4: training on 157562 raw words (40721

Teste de 'sanidade', calculando similaridade de documentos em relação ao conjunto todo

In [293]:
# Sanity check on the training split: infer a vector for every review, rank the
# learned tag vectors by similarity to it, and record at which position the
# review's own score tag appears (0 = most similar). The inferred vectors are
# also kept as the review's embedding features.
train_ranks = []
train_features = []

# Rank against every tag vector so the review's own score is guaranteed to be
# in the result; the default topn=10 would truncate the list for larger tag
# sets and make .index() raise ValueError.
n_tag_vectors = len(model.dv)

for _, record in X_train.iterrows():
    # The previous call passed epochs=0, which gensim treats as "not given" and
    # replaces with model.epochs. Passing model.epochs explicitly keeps the
    # same behaviour without suggesting that zero inference passes are run.
    inferred_vector = model.infer_vector(
        record['review_comment_title_and_message'], epochs=model.epochs
    )
    train_features.append(inferred_vector)

    sims = model.dv.most_similar([inferred_vector], topn=n_tag_vectors)
    ranked_tags = [tag for tag, _ in sims]
    train_ranks.append(ranked_tags.index(record['review_score']))

X_train['document_embeddings'] = train_features

In [294]:
# How often the review's own score tag landed at each similarity rank
# (0 = it was the most similar tag vector).
train_rank_counts = collections.Counter(train_ranks)
train_rank_counts

Counter({0: 12714, 1: 3448, 2: 1023, 3: 618, 4: 496, 5: 422})

12714 de 18721 avaliações (67,91 %) têm como vetor de tag mais similar o da sua própria nota; considerando também o segundo vetor mais similar (3448), são 86,33 %

In [295]:
# Same sanity check on the held-out split: infer a vector for every test
# review, rank the learned tag vectors by similarity to it, and record the
# position of the review's own score tag (0 = most similar). The inferred
# vectors are also kept as the review's embedding features.
test_ranks = []
test_features = []

# Rank against every tag vector so the review's own score is guaranteed to be
# in the result; the default topn=10 would truncate the list for larger tag
# sets and make .index() raise ValueError.
n_tag_vectors = len(model.dv)

for _, record in X_test.iterrows():
    # The previous call passed epochs=0, which gensim treats as "not given" and
    # replaces with model.epochs. Passing model.epochs explicitly keeps the
    # same behaviour without suggesting that zero inference passes are run.
    inferred_vector = model.infer_vector(
        record['review_comment_title_and_message'], epochs=model.epochs
    )
    test_features.append(inferred_vector)

    sims = model.dv.most_similar([inferred_vector], topn=n_tag_vectors)
    ranked_tags = [tag for tag, _ in sims]
    test_ranks.append(ranked_tags.index(record['review_score']))

X_test['document_embeddings'] = test_features

In [296]:
# How often the review's own score tag landed at each similarity rank on the
# held-out split (0 = it was the most similar tag vector).
test_rank_counts = collections.Counter(test_ranks)
test_rank_counts

Counter({0: 2672, 1: 816, 4: 319, 2: 311, 5: 284, 3: 279})

2672 de 4681 avaliações (57,08 %) têm como vetor de tag mais similar o da sua própria nota; considerando também o segundo vetor mais similar (816), são 74,51 %

In [298]:
# Inspect one fixed training review: infer its vector with 10 passes and list
# every tag vector ordered from most to least similar.
doc_id = 66
selected_document = train_documents[doc_id]

inferred_vector = model.infer_vector(selected_document.words, epochs=10)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

document_text = ' '.join(selected_document.words)
print('Train Document ({}) | tag ({}): «{}»\n'.format(doc_id, selected_document.tags[0], document_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
sims

Train Document (66) | tag (2): «loja alem do frete ser muito caro produto leva quase dias uteis para chegar na sua casa»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/c,d400,n5,w7,mc3,s1e-05,t8>:



[(2, 0.6423330307006836),
 (5, 0.3176053762435913),
 (3, 0.10236295312643051),
 (0, -0.038047101348638535),
 (1, -0.10344215482473373),
 (4, -0.6692941784858704)]

Testando o modelo com o conjunto de teste

In [301]:
# Inspect a randomly chosen held-out review: infer its vector with 20 passes
# and list every tag vector ordered from most to least similar.
# The random module is not seeded here, so a different review is picked on
# each run.
last_position = len(test_documents) - 1
doc_id = random.randint(0, last_position)
selected_document = test_documents[doc_id]

inferred_vector = model.infer_vector(selected_document.words, epochs=20)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

document_text = ' '.join(selected_document.words)
print('Test Document ({}) | tag ({}): «{}»\n'.format(doc_id, selected_document.tags[0], document_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
sims

Test Document (4421) | tag (1): «botao liga desliga nao funciona»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/c,d400,n5,w7,mc3,s1e-05,t8>:



[(3, 0.6133701205253601),
 (1, 0.5921914577484131),
 (0, -0.006346350070089102),
 (2, -0.01563362032175064),
 (5, -0.40768080949783325),
 (4, -0.5910491347312927)]

Busca semântica

In [302]:
# Free-text query: split it on single spaces, infer a vector for the tokens
# and rank every score tag by similarity to that vector.
semantic_query = 'pessimo'
query_tokens = semantic_query.split(' ')

inferred_vector = model.infer_vector(query_tokens)
model.dv.most_similar([inferred_vector], topn=len(model.dv))

[(1, 0.8174888491630554),
 (2, 0.24454033374786377),
 (3, 0.08240251988172531),
 (0, 0.0015772177139297128),
 (4, -0.41590118408203125),
 (5, -0.6713008880615234)]

Salvando modelo Doc2Vec

In [303]:
# Persist the trained model; the log below shows gensim writing the large
# 'syn1neg' array to a sibling .npy file next to this path.
model_output_path = '../model/doc2vec_olist_reviews_model'
model.save(model_output_path)

2024-09-22 15:02:06,088 : INFO : Doc2Vec lifecycle event {'fname_or_handle': '../model/doc2vec_olist_reviews_model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-09-22T15:02:06.088911', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'saving'}
2024-09-22 15:02:06,089 : INFO : storing np array 'syn1neg' to ../model/doc2vec_olist_reviews_model.syn1neg.npy
2024-09-22 15:02:06,206 : INFO : not storing attribute cum_table
2024-09-22 15:02:06,213 : INFO : saved ../model/doc2vec_olist_reviews_model


Gerando dataset com features

In [305]:
# Preview the inferred embedding next to the score for the training reviews.
preview_columns = ['document_embeddings', 'review_score']
X_train.loc[:, preview_columns]

Unnamed: 0_level_0,document_embeddings,review_score
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1
93fd9c6fa65efb09369e376a50671957,"[-0.014303607, 0.0460826, 0.06431428, -0.04204...",1
c495c0857dbe5f29c0baf66303b76c70,"[0.026620146, -0.052907914, 0.012574526, -0.00...",5
1505b3f70ffd4ca313ab2a11f5122a64,"[-0.007346918, 0.01763874, -0.0022261313, 0.00...",1
78506167e7bd88b9872369f1f08cfbbf,"[0.0008607363, -0.0020089224, -0.0038993454, 0...",4
39fd531151e50b3921788e6f4f5998a2,"[0.00022667034, -0.00058941205, -0.0017346638,...",4
...,...,...
d6c83df262048aecd3ea81a870947ff5,"[-0.000409146, -0.005511231, 0.0038278988, -0....",3
f3eb1df9849ff68f0ab37e810e243f49,"[0.00025570183, -0.0008346692, -0.00032678223,...",5
bb60be11b24531adc0b311991c34b4f6,"[0.0029502513, 0.026504697, 0.09811896, -0.015...",1
fc68f150ba899eceb6714f05b57cf752,"[-0.0023710367, 0.0029868702, 0.002054534, 0.0...",1


In [307]:
# Stack the train and test reviews back into one table (train rows first),
# keeping only the embedding and the score.
output_columns = ['document_embeddings', 'review_score']

frzd_olist_reviews = pd.concat(
    [split_frame.loc[:, output_columns] for split_frame in (X_train, X_test)]
)

In [311]:
# Expand each embedding into one numeric column per component, keeping the
# review_id index of the source table.
features_dataset = pd.DataFrame(
    frzd_olist_reviews['document_embeddings'].to_list(),
    index=frzd_olist_reviews.index
)

# The expanded columns are labelled with integers (0, 1, ...), while the score
# column added below has a string label. Converting the component labels to
# strings keeps the column index homogeneous.
# NOTE(review): the truncated warning printed under the to_parquet cell is
# presumably pyarrow complaining about mixed-type column names; with string
# labels that warning should no longer be triggered — confirm on re-run.
features_dataset.columns = [str(column) for column in features_dataset.columns]

features_dataset['review_score'] = frzd_olist_reviews['review_score']

In [313]:
# Preview the first five rows of the expanded feature table.
features_dataset.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,391,392,393,394,395,396,397,398,399,review_score
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93fd9c6fa65efb09369e376a50671957,-0.014304,0.046083,0.064314,-0.042043,0.078966,0.103096,-0.074035,0.008829,0.092525,0.052451,...,-0.050971,0.060181,-0.059561,0.010675,-0.001023,-0.074058,0.092981,-0.013092,-0.127212,1
c495c0857dbe5f29c0baf66303b76c70,0.02662,-0.052908,0.012575,-0.008329,0.008591,-0.040909,0.061809,0.047803,-0.013991,-0.039676,...,0.001048,-0.013073,0.040552,0.053647,-0.019549,-0.023987,-0.09867,0.005631,0.114044,5
1505b3f70ffd4ca313ab2a11f5122a64,-0.007347,0.017639,-0.002226,0.000294,0.009467,0.040869,-0.037483,-0.020645,0.010395,0.022611,...,-0.025128,0.027613,-0.021887,-0.024825,0.007665,-0.005868,0.053988,-0.008634,-0.048408,1
78506167e7bd88b9872369f1f08cfbbf,0.000861,-0.002009,-0.003899,0.001644,-0.002695,-0.001742,0.00084,-0.001546,-0.003324,-0.002399,...,0.001652,-0.001388,0.001979,-0.002136,0.001121,0.004049,-0.000225,0.000636,0.00433,4
39fd531151e50b3921788e6f4f5998a2,0.000227,-0.000589,-0.001735,0.002609,-0.001102,-0.002125,-0.000116,-0.002329,-0.000847,0.000684,...,-0.001178,-0.001532,0.000821,-0.001306,-0.000218,0.001057,0.000293,-0.000349,0.003135,4


Salvando dataset com features na camada featured

In [317]:
# Write the feature table (one row per review, indexed by review_id) to the
# featured layer as a snappy-compressed parquet file via pyarrow.
features_output_path = '../dataset/featured/frzd_olist_document_embeddings.parquet.snappy'
parquet_options = {'engine': 'pyarrow', 'compression': 'snappy'}

features_dataset.to_parquet(path=features_output_path, **parquet_options)

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
