Importando bibliotecas

In [60]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

import pandas as pd
import collections
import logging
import random

Configurando nível de logs

In [61]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Lendo arquivos de texto

In [62]:
frzd_olist_order_reviews = pd.read_parquet('../dataset/delivery/dlzd_olist_order_reviews.parquet.snappy')

In [63]:
frzd_olist_order_reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_comment_title_and_message,review_creation_date,review_answer_timestamp
0,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,[],"[parabens, lojas, lannister, adorei, comprar, ...","[parabens, lojas, lannister, adorei, comprar, ...",2018-03-01,2018-03-02 10:26:53
1,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,[recomendo],"[aparelho, eficiente, no, site, marca, do, apa...","[recomendo, aparelho, eficiente, no, site, mar...",2018-05-22,2018-05-23 16:45:47
2,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,[],"[mas, um, pouco, travando, pelo, valor, ta, boa]","[mas, um, pouco, travando, pelo, valor, ta, boa]",2018-02-16,2018-02-20 10:52:22
3,d21bbc789670eab777d27372ab9094cc,4fc44d78867142c627497b60a7e0228a,5,[otimo],"[loja, nota]","[otimo, loja, nota]",2018-07-10,2018-07-11 14:10:25
4,0e0190b9db53b689b285d3f3916f8441,79832b7cb59ac6f887088ffd686e1d5e,5,[],"[obrigado, pela, atencao, amim, dispensada]","[obrigado, pela, atencao, amim, dispensada]",2017-12-01,2017-12-09 22:58:58


Dividindo entre conjunto de treino e teste de forma stratificada

In [82]:
X = frzd_olist_order_reviews.loc[:, ['review_comment_title_and_message']]
y = frzd_olist_order_reviews.loc[:, ['review_score']]

In [83]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=51
)

In [145]:
X

Unnamed: 0,review_comment_title_and_message
0,"[parabens, lojas, lannister, adorei, comprar, ..."
1,"[recomendo, aparelho, eficiente, no, site, mar..."
2,"[mas, um, pouco, travando, pelo, valor, ta, boa]"
3,"[otimo, loja, nota]"
4,"[obrigado, pela, atencao, amim, dispensada]"
...,...
23397,[aprovado]
23398,"[muito, bom, produto, ficamos, muito, satisfei..."
23399,"[otima, embalagem]"
23400,"[foto, enganosa, foto, muito, diferente, princ..."


Criando tag para armazenar marcar cada documento.

In [126]:
train_documents = []

for ordered_index, (index, data) in enumerate(X_train.iterrows()):
    train_documents.append(TaggedDocument(data['review_comment_title_and_message'].tolist(), [ordered_index]))

In [127]:
test_documents = []

for index, data in X_test.iterrows():
    test_documents.append(data['review_comment_title_and_message'].tolist())

Definindo modelo Doc2vec

In [138]:
model = Doc2Vec(
    vector_size=400, # tamanho do vetor de embedding
    min_count=5, # quantidade mínima de repetições palavras para entrarem no treinamento 
    sample=10**-5, # frequência teórica para ponderar palavras muito frequêntes
    window=3,
    hs=1, # utilizando amostragem negativa para acelarar o treinamento
    negative=20, # determina que 20 palavras aleatórias serão utilizadas para treinar a palavra predita
    dm=1, # usando modelo PV-DM
    dm_concat=1, # concatenando vetor de parágrafo com de palavras
    dbow_words=1, # treinando também vetor de palavras word2vec
    workers=8, # numero de cores utilizados no paralelismo
    seed=51, # fixando a aleatoriada para deixar o modelo reprodutível
)

2024-09-20 21:03:39,614 : INFO : using concatenative 2800-dimensional layer1
2024-09-20 21:03:39,618 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/c,d400,n20,hs,w3,mc5,s0.001,t8>', 'datetime': '2024-09-20T21:03:39.618330', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'created'}


Criando vocabulário

In [139]:
model.build_vocab(train_documents)

2024-09-20 21:03:42,255 : INFO : collecting all words and their counts
2024-09-20 21:03:42,259 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2024-09-20 21:03:42,359 : INFO : PROGRESS: at example #10000, processed 83266 words (870376 words/s), 6819 word types, 0 tags
2024-09-20 21:03:42,433 : INFO : collected 9668 word types and 18721 unique tags from a corpus of 18721 examples and 157562 words
2024-09-20 21:03:42,437 : INFO : Creating a fresh vocabulary
2024-09-20 21:03:42,488 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 2246 unique words (23.23% of original 9668, drops 7422)', 'datetime': '2024-09-20T21:03:42.487745', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
2024-09-20 21:03:42,491 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 146291 word corpus (92.85% of original 157562, 

In [140]:
model.train(
    train_documents,
    total_examples=model.corpus_count,
    epochs=32
)

2024-09-20 21:03:50,699 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 8 workers on 2247 vocabulary and 2800 features, using sg=0 hs=1 sample=0.001 negative=20 window=3 shrink_windows=True', 'datetime': '2024-09-20T21:03:50.699493', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'train'}
2024-09-20 21:03:51,732 : INFO : EPOCH 0 - PROGRESS: at 19.28% examples, 23087 words/s, in_qsize 13, out_qsize 0
2024-09-20 21:03:52,742 : INFO : EPOCH 0 - PROGRESS: at 62.32% examples, 37520 words/s, in_qsize 6, out_qsize 1
2024-09-20 21:03:52,761 : INFO : EPOCH 0: training on 157562 raw words (121784 effective words) took 2.0s, 59883 effective words/s
2024-09-20 21:03:53,799 : INFO : EPOCH 1 - PROGRESS: at 25.00% examples, 30379 words/s, in_qsize 11, out_qsize 1
2024-09-20 21:03:54,839 : INFO : EPOCH 1 - PROGRESS: at 62.17% examples, 36825 words/s, in_qsize 6, out_qsize 1
2024-09-20 21:0

Teste de 'sanidade', calculando similaridade de documentos em relação ao conjunto todo

In [141]:
ranks = []
second_ranks = []

for doc_id in range(len(train_documents)):
    inferred_vector = model.infer_vector(train_documents[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [142]:
collections.Counter(ranks)

Counter({0: 12864,
         1: 671,
         2: 312,
         3: 225,
         4: 190,
         5: 139,
         6: 117,
         7: 113,
         8: 86,
         9: 80,
         10: 74,
         14: 61,
         11: 57,
         15: 56,
         12: 55,
         17: 50,
         18: 46,
         24: 45,
         23: 45,
         13: 45,
         19: 44,
         20: 43,
         21: 34,
         16: 34,
         29: 34,
         31: 33,
         26: 31,
         35: 31,
         22: 30,
         25: 28,
         27: 27,
         36: 27,
         30: 25,
         34: 25,
         37: 24,
         28: 24,
         44: 23,
         46: 21,
         54: 21,
         88: 20,
         33: 20,
         45: 20,
         41: 20,
         48: 20,
         32: 20,
         39: 19,
         53: 19,
         79: 18,
         75: 18,
         113: 18,
         69: 18,
         60: 18,
         42: 17,
         55: 17,
         57: 17,
         77: 16,
         50: 16,
         40: 16,
         81: 

18248 de 22635 reclamações (80,62 %) são similares a elas mesmas, o que é um bom sinal

In [144]:
doc_id = 50

inferred_vector = model.infer_vector(train_documents[doc_id].words)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_documents[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_documents[sims[index][0]].words)))

Document (50): «veio no tempo certo direito achei bolsa pequena pequena demais mas gostei ta valendo»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/c,d400,n20,hs,w3,mc5,s0.001,t8>:

MOST (50, 0.968439519405365): «veio no tempo certo direito achei bolsa pequena pequena demais mas gostei ta valendo»

SECOND-MOST (761, 0.6666141152381897): «comprei pela primeira vez pra nunca mais decepcionadx»

MEDIAN (2571, 0.013635230250656605): «pedido cancelado unilateralmente pela lojas lannister sem dar satisfacao»

LEAST (17734, -0.6238632202148438): «claro super confiavel esta de parabens»



In [135]:
semantic_query = 'estou com problemas com meu numero da claro'

inferred_vector = model.infer_vector(semantic_query.split(' '))
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

print(u'%s %s: «%s»\n' % ('MOST', sims[index], ' '.join(train_documents[sims[index][0]].words)))

MOST (5807, -0.22642099857330322): «suporta ate»



Testando o modelo com o conjunto de teste

In [137]:
# Escolhendo aleatoriamente um documento do conjunto de teste
doc_id = random.randint(0, len(test_documents) - 1)
inferred_vector = model.infer_vector(test_documents[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Comparando este documento com o conjunto de treinamento
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_documents[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_documents[sims[index][0]].words)))

Test Document (3221): «otimo»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/c,d300,n20,w5,mc5,s1e-05,t8>:

MOST (8283, 0.22291600704193115): «muito rui eles nao ciporta com criente»

MEDIAN (12816, 0.0006949880626052618): «nota»

LEAST (13287, -0.24041034281253815): «toner ja esta instalado funcionando perfeitamente»



Salvando modelo Doc2Vec

In [27]:
model.save('../model/doc2vec_telecom_pandemic_claims')

2024-09-19 19:02:23,668 : INFO : Doc2Vec lifecycle event {'fname_or_handle': '../model/doc2vec_telecom_pandemic_claims', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-09-19T19:02:23.667075', 'gensim': '4.3.3', 'python': '3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'saving'}
2024-09-19 19:02:23,672 : INFO : storing np array 'syn1neg' to ../model/doc2vec_telecom_pandemic_claims.syn1neg.npy
2024-09-19 19:02:23,999 : INFO : not storing attribute cum_table
2024-09-19 19:02:24,126 : INFO : saved ../model/doc2vec_telecom_pandemic_claims
