Importando bibliotecas

In [71]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

import pandas as pd
import collections
import logging
import random

Configurando nível de logs

In [72]:
# Timestamped INFO-level logging so gensim's vocabulary/training progress is shown in the cell outputs.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

Lendo arquivos de texto

In [116]:
# Load the pre-split review datasets (training and test) from the delivery layer.
train_reviews_path = '../dataset/delivery/dlzd_olist_order_reviews_training.parquet.snappy'
test_reviews_path = '../dataset/delivery/dlzd_olist_order_reviews_test.parquet.snappy'

X_train = pd.read_parquet(train_reviews_path)
X_test = pd.read_parquet(test_reviews_path)

Alterando rótulos para mais adequados ao modelo doc2vec

In [117]:
# Map the numeric sentiment codes to readable string labels; these labels are
# later used as the document tags of the Doc2Vec model.
sentiment_to_tag = {-1: 'negative', 0: 'neutral', 1: 'positive'}

# Same mapping for both splits; values not present in the mapping are left as-is by `replace`.
for reviews in (X_train, X_test):
    reviews['review_tag'] = reviews['review_sentiment'].replace(sentiment_to_tag)

Criando documentos etiquetados (TaggedDocument) para marcar cada documento com a sua tag de sentimento.

In [118]:
# One TaggedDocument per training review: the token sequence as words and the
# review's sentiment label as its single tag.
# `.tolist()` is called on each cell value — presumably a numpy array of tokens; verify against the dataset.
train_documents = [
    TaggedDocument(tokens.tolist(), [tag])
    for tokens, tag in zip(X_train['review_comment_title_and_message'], X_train['review_tag'])
]

In [119]:
# One TaggedDocument per test review: the token sequence as words and the
# review's sentiment label as its single tag.
# `.tolist()` is called on each cell value — presumably a numpy array of tokens; verify against the dataset.
test_documents = [
    TaggedDocument(tokens.tolist(), [tag])
    for tokens, tag in zip(X_test['review_comment_title_and_message'], X_test['review_tag'])
]

Definindo modelo Doc2vec

In [120]:
# Doc2Vec configuration. The model is only constructed here; the vocabulary is
# built and the model trained in later cells.
model = Doc2Vec(
    vector_size=400, # dimensionality of each embedding vector
    min_count=3, # ignore words that occur fewer than 3 times in the corpus
    sample=10**-5, # threshold used to randomly downsample very frequent words
    window=7, # maximum distance between the target word and the context words used to predict it
    shrink_windows=True, # sample the effective window size between 1 and `window` for each target word
    hs=0, # no hierarchical softmax; negative sampling is used instead
    negative=5, # number of random "noise" words drawn per target word for negative sampling
    dm=1, # PV-DM (distributed memory) training algorithm
    dm_concat=1, # concatenate the paragraph vector with the context word vectors instead of combining them
    dbow_words=1, # NOTE(review): gensim documents this flag for DBOW (dm=0) training; it likely has no effect with dm=1 — confirm
    workers=8, # number of worker threads used for training
    seed=51, # RNG seed; NOTE(review): gensim states a fully reproducible run also needs workers=1 and a fixed PYTHONHASHSEED — confirm
)

2024-09-23 19:50:42,985 : INFO : using concatenative 6000-dimensional layer1
2024-09-23 19:50:42,991 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/c,d400,n5,w7,mc3,s1e-05,t8>', 'datetime': '2024-09-23T19:50:42.991391', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'created'}


Criando vocabulário

In [121]:
# Scan the tagged training documents once to collect the word vocabulary and the document tags.
model.build_vocab(corpus_iterable=train_documents)

2024-09-23 19:50:44,263 : INFO : collecting all words and their counts
2024-09-23 19:50:44,266 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2024-09-23 19:50:44,301 : INFO : PROGRESS: at example #10000, processed 79054 words (2414843 words/s), 6711 word types, 3 tags
2024-09-23 19:50:44,320 : INFO : collected 8735 word types and 3 unique tags from a corpus of 16512 examples and 129470 words
2024-09-23 19:50:44,321 : INFO : Creating a fresh vocabulary
2024-09-23 19:50:44,335 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=3 retains 2980 unique words (34.12% of original 8735, drops 5755)', 'datetime': '2024-09-23T19:50:44.335078', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
2024-09-23 19:50:44,337 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=3 leaves 122544 word corpus (94.65% of original 129470, dro

In [122]:
# Number of passes over the training corpus.
training_epochs = 70

# Train on the tagged training documents; `corpus_count` was recorded by `build_vocab`.
model.train(
    corpus_iterable=train_documents,
    total_examples=model.corpus_count,
    epochs=training_epochs,
)

2024-09-23 19:50:44,846 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 8 workers on 2981 vocabulary and 6000 features, using sg=0 hs=0 sample=1e-05 negative=5 window=7 shrink_windows=True', 'datetime': '2024-09-23T19:50:44.846575', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'train'}
2024-09-23 19:50:45,402 : INFO : EPOCH 0: training on 129470 raw words (33797 effective words) took 0.5s, 64563 effective words/s
2024-09-23 19:50:45,944 : INFO : EPOCH 1: training on 129470 raw words (33940 effective words) took 0.5s, 65173 effective words/s
2024-09-23 19:50:46,487 : INFO : EPOCH 2: training on 129470 raw words (33702 effective words) took 0.5s, 64386 effective words/s
2024-09-23 19:50:47,014 : INFO : EPOCH 3: training on 129470 raw words (33971 effective words) took 0.5s, 67308 effective words/s
2024-09-23 19:50:47,597 : INFO : EPOCH 4: training on 129470 raw words (33915

Teste de 'sanidade': inferindo o vetor de cada avaliação e verificando em que posição (rank) a sua própria tag de sentimento aparece entre os vetores de tag mais similares

In [124]:
# For every training review: infer an embedding, then record at which position
# (0 = most similar) the review's own sentiment tag appears among the tag
# vectors ranked by similarity to that embedding.
train_features = []
train_ranks = []

# NOTE(review): gensim appears to treat a falsy `epochs` as "use model.epochs",
# so `epochs=0` does not skip inference — confirm against the gensim source.
for tokens, expected_tag in zip(X_train['review_comment_title_and_message'], X_train['review_tag']):
    inferred_vector = model.infer_vector(tokens, epochs=0)
    ranked_tags = [tag for tag, _similarity in model.dv.most_similar([inferred_vector])]

    train_features.append(inferred_vector)
    train_ranks.append(ranked_tags.index(expected_tag))

# Keep the inferred embeddings next to the reviews they were computed from.
X_train['document_embeddings'] = train_features

In [126]:
# Count how many training reviews landed on each rank (0 = own tag was the most similar).
collections.Counter(train_ranks)

Counter({0: 13378, 1: 2189, 2: 945})

13378 de 16512 avaliações (81,02 %) têm a própria tag de sentimento como vetor mais similar; considerando também o segundo vetor mais similar (2189), são 94,28 %

In [125]:
# For every test review: infer an embedding, then record at which position
# (0 = most similar) the review's own sentiment tag appears among the tag
# vectors ranked by similarity to that embedding.
test_features = []
test_ranks = []

# NOTE(review): gensim appears to treat a falsy `epochs` as "use model.epochs",
# so `epochs=0` does not skip inference — confirm against the gensim source.
for tokens, expected_tag in zip(X_test['review_comment_title_and_message'], X_test['review_tag']):
    inferred_vector = model.infer_vector(tokens, epochs=0)
    ranked_tags = [tag for tag, _similarity in model.dv.most_similar([inferred_vector])]

    test_features.append(inferred_vector)
    test_ranks.append(ranked_tags.index(expected_tag))

# Keep the inferred embeddings next to the reviews they were computed from.
X_test['document_embeddings'] = test_features

In [127]:
# Count how many test reviews landed on each rank (0 = own tag was the most similar).
collections.Counter(test_ranks)

Counter({0: 3057, 1: 642, 2: 430})

3057 de 4129 avaliações (74,04 %) têm a própria tag de sentimento como vetor mais similar; considerando também o segundo vetor mais similar (642), são 89,59 %

In [131]:
# Inspect one fixed training document: infer its vector and list the tag
# vectors ordered by similarity to it.
doc_id = 23
sample_document = train_documents[doc_id]

inferred_vector = model.infer_vector(sample_document.words, epochs=0)
sims = model.dv.most_similar([inferred_vector])

print('Train Document ({}) | tag ({}): «{}»\n'.format(doc_id, sample_document.tags[0], ' '.join(sample_document.words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
sims

Train Document (23) | tag (negative): «vendeu um produto entregou outro de qualidade infinitamente inferior»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/c,d400,n5,w7,mc3,s1e-05,t8>:



[('negative', 0.9044221639633179),
 ('neutral', -0.4551694691181183),
 ('positive', -0.599165141582489)]

Testando o modelo com o conjunto de teste

In [135]:
# Inspect one randomly chosen test document: infer its vector and list every
# tag vector ordered by similarity to it.
# The `random` module is not seeded in this cell, so the chosen document changes between runs.
doc_id = random.randrange(len(test_documents))
sample_document = test_documents[doc_id]

inferred_vector = model.infer_vector(sample_document.words, epochs=0)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

print('Test Document ({}) | tag ({}): «{}»\n'.format(doc_id, sample_document.tags[0], ' '.join(sample_document.words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
sims

Test Document (1928) | tag (positive): «recomendo nao vao se arrepender»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/c,d400,n5,w7,mc3,s1e-05,t8>:



[('negative', 0.8566605448722839),
 ('neutral', 0.25025463104248047),
 ('positive', -0.8897149562835693)]

Busca semântica

In [139]:
# Free-text query: split on single spaces, infer a vector for the tokens and
# rank the tag vectors by similarity to it.
semantic_query = 'nao gostei do produto horrivel'
query_tokens = semantic_query.split(' ')

inferred_vector = model.infer_vector(query_tokens, epochs=0)
model.dv.most_similar([inferred_vector])

[('negative', 0.8972170352935791),
 ('neutral', -0.09920229017734528),
 ('positive', -0.8382478356361389)]

Salvando modelo Doc2Vec

In [140]:
# Persist the trained model; per the log output, large arrays are stored in
# separate .npy files next to the main model file.
model.save('../model/doc2vec_olist_reviews_model')

2024-09-23 19:54:07,854 : INFO : Doc2Vec lifecycle event {'fname_or_handle': '../model/doc2vec_olist_reviews_model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-09-23T19:54:07.854922', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'saving'}
2024-09-23 19:54:07,858 : INFO : storing np array 'syn1neg' to ../model/doc2vec_olist_reviews_model.syn1neg.npy
2024-09-23 19:54:08,010 : INFO : not storing attribute cum_table
2024-09-23 19:54:08,017 : INFO : saved ../model/doc2vec_olist_reviews_model


Gerando dataset com features

In [146]:
# Keep only the inferred embedding and the numeric sentiment label for each split.
feature_columns = ['document_embeddings', 'review_sentiment']

X_train_doc2vec = X_train.loc[:, feature_columns]
X_test_doc2vec = X_test.loc[:, feature_columns]

In [147]:
# Expand the per-row embedding vectors into one column per dimension, keeping
# the original row index so the target column aligns on assignment.
X_train_doc2vec_embeddings = pd.DataFrame(
    X_train_doc2vec['document_embeddings'].to_list(),
    index=X_train_doc2vec.index
)

# The expanded columns get integer labels (0..n-1). Cast them to strings so the
# frame does not mix integer labels with the string label 'review_sentiment':
# Parquet column names are strings, and pyarrow warns that mixed-type column
# names are converted to strings and do not round-trip.
X_train_doc2vec_embeddings.columns = X_train_doc2vec_embeddings.columns.astype(str)

X_train_doc2vec_embeddings['review_sentiment'] = X_train_doc2vec['review_sentiment']

In [148]:
# Expand the per-row embedding vectors into one column per dimension, keeping
# the original row index so the target column aligns on assignment.
X_test_doc2vec_embeddings = pd.DataFrame(
    X_test_doc2vec['document_embeddings'].to_list(),
    index=X_test_doc2vec.index
)

# The expanded columns get integer labels (0..n-1). Cast them to strings so the
# frame does not mix integer labels with the string label 'review_sentiment':
# Parquet column names are strings, and pyarrow warns that mixed-type column
# names are converted to strings and do not round-trip.
X_test_doc2vec_embeddings.columns = X_test_doc2vec_embeddings.columns.astype(str)

X_test_doc2vec_embeddings['review_sentiment'] = X_test_doc2vec['review_sentiment']

Salvando dataset com features na camada featured

In [154]:
# Write the training feature table (embedding columns + sentiment label) as a
# snappy-compressed Parquet file, keeping the row index.
train_features_path = '../dataset/featured/frzd_olist_document_embeddings_train.parquet.snappy'

X_train_doc2vec_embeddings.to_parquet(
    path=train_features_path,
    engine='pyarrow',
    compression='snappy',
    index=True,
)

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


In [155]:
# Write the test feature table (embedding columns + sentiment label) as a
# snappy-compressed Parquet file, keeping the row index.
test_features_path = '../dataset/featured/frzd_olist_document_embeddings_test.parquet.snappy'

X_test_doc2vec_embeddings.to_parquet(
    path=test_features_path,
    engine='pyarrow',
    compression='snappy',
    index=True,
)