# Latih word2vec dengan Gensim dan korpus Wikipedia bahasa Indonesia
https://medium.com/@diekanugraha/membuat-model-word2vec-bahasa-indonesia-dari-wikipedia-menggunakan-gensim-e5745b98714d

In [6]:
# Konversi Corpus Wikipedia Menjadi Teks
import io
import time
from datetime import timedelta

import gensim

if __name__ == '__main__':
    # Convert the Indonesian Wikipedia dump into a plain-text corpus:
    # one article per line, tokens separated by single spaces.
    dump_path = 'Corpus_wikipedia/idwiki-20231201-pages-articles-multistream.xml.bz2'
    text_path = 'Corpus_wikipedia/idwiki-20231201-pages-articles-multistream.txt'

    start_time = time.time()
    print('Loading Wikipedia corpus...')
    # NOTE(review): dictionary={} is presumably passed so WikiCorpus skips
    # its own vocabulary-building pass over the dump — confirm against the
    # gensim version in use.
    id_wiki = gensim.corpora.WikiCorpus(dump_path, dictionary={}, lower=True)

    # Stays 0 when the corpus yields no articles at all.
    article_count = 0
    with io.open(text_path, mode='w', encoding='utf-8') as wiki_txt:
        for article_count, text in enumerate(id_wiki.get_texts(), start=1):
            wiki_txt.write(' '.join(text))
            wiki_txt.write('\n')
            # Progress heartbeat every 10,000 articles.
            if article_count % 10000 == 0:
                print(f'{article_count} articles processed')

    print(f'Processing complete. {article_count} articles processed')
    finish_time = time.time()
    print(f'Elapsed time: {timedelta(seconds=finish_time - start_time)}')

Loading Wikipedia corpus...




10000 articles processed
20000 articles processed
30000 articles processed
40000 articles processed
50000 articles processed
60000 articles processed
70000 articles processed
80000 articles processed
90000 articles processed
100000 articles processed
110000 articles processed
120000 articles processed
130000 articles processed
140000 articles processed
150000 articles processed
160000 articles processed
170000 articles processed
180000 articles processed
190000 articles processed
200000 articles processed
210000 articles processed
220000 articles processed
230000 articles processed
240000 articles processed
250000 articles processed
260000 articles processed
270000 articles processed
280000 articles processed
290000 articles processed
300000 articles processed
310000 articles processed
320000 articles processed
330000 articles processed
340000 articles processed
350000 articles processed
360000 articles processed
370000 articles processed
380000 articles processed
390000 articles proce

In [9]:
# training model word2vec
import multiprocessing

from gensim.models import word2vec

if __name__ == '__main__':
    # Train a 200-dimensional word2vec model on the plain-text corpus
    # (read line by line via LineSentence) and save it to disk.
    import os

    corpus_path = 'Corpus_wikipedia/idwiki-20231201-pages-articles-multistream.txt'
    model_path = 'Model_wikipedia/idwiki_word2vec.model'

    start_time = time.time()
    print('Training word2vec model...')

    # Create the output directory up front so a long training run cannot be
    # lost at save time because the folder is missing.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Leave one core free for the rest of the system, but never request zero
    # workers: on a single-core machine cpu_count() - 1 would be 0.
    worker_count = max(1, multiprocessing.cpu_count() - 1)

    sentences = word2vec.LineSentence(corpus_path)
    model = word2vec.Word2Vec(sentences, vector_size=200, workers=worker_count)
    model.save(model_path)

    finish_time = time.time()
    print('Training complete. Elapsed time: {}'.format(timedelta(seconds=finish_time - start_time)))

Training word2vec model...
Training complete. Elapsed time: 0:37:54.195273


# Visualisasi word2vec dengan t-SNE

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

if __name__ == '__main__':
    # Project word vectors to 2-D with t-SNE and draw a labelled scatter plot.
    #
    # Only the first `max_words` vocabulary entries are plotted. Running
    # t-SNE on the entire vocabulary and annotating every single word is
    # impractically slow and yields an unreadable figure.
    max_words = 500

    start_time = time.time()
    print('Loading word2vec model...')
    model = word2vec.Word2Vec.load('Model_wikipedia/idwiki_word2vec.model')
    print('Model loaded. Generating t-SNE visualization...')

    # NOTE(review): with gensim's default vocabulary sorting, index_to_key is
    # ordered by descending frequency, so this slice keeps the most common
    # words — confirm if the model was trained with sorted_vocab disabled.
    labels = model.wv.index_to_key[:max_words]
    tokens = np.array([model.wv[word] for word in labels])

    # NOTE(review): newer scikit-learn releases rename `n_iter` to
    # `max_iter`; switch the keyword when upgrading scikit-learn.
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # Each row of new_values is the (x, y) position of the matching label.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

    finish_time = time.time()
    print('Elapsed time: {}'.format(timedelta(seconds=finish_time - start_time)))


Loading word2vec model...
Model loaded. Generating t-SNE visualization...


In [12]:
# Try the word2vec model on a word analogy using gensim's
# most_similar_cosmul method.
# laki:raja — perempuan:?

# Load the trained model from disk.
model_test = word2vec.Word2Vec.load('Model_wikipedia/idwiki_word2vec.model')

# Run the analogy query: 'perempuan' and 'raja' contribute positively,
# 'laki' negatively.
result_analogi = model_test.wv.most_similar_cosmul(
    positive=['perempuan', 'raja'],
    negative=['laki'],
)

# Show the list returned by the query.
print(result_analogi)

[('ratu', 0.9368245601654053), ('permaisuri', 0.8864334225654602), ('penguasa', 0.8795064687728882), ('firaun', 0.8777711987495422), ('kerajaan', 0.8759269714355469), ('permaisurinya', 0.855896532535553), ('sultan', 0.8493581414222717), ('rakyatnya', 0.8453277945518494), ('bangsawan', 0.8364384174346924), ('hulubalang', 0.8353058695793152)]


In [24]:
# Test with insurance-related words: 'manfaat' and 'allianz' contribute
# positively, 'prudential' negatively.
result_analogi = model_test.wv.most_similar_cosmul(
    positive=['manfaat', 'allianz'],
    negative=['prudential'],
)

# Show the list returned by the query.
print(result_analogi)

[('manfaatnya', 0.8674401640892029), ('kesegaran', 0.8583281636238098), ('keuntungan', 0.8545964360237122), ('konsumsi', 0.8523703217506409), ('bermanfaat', 0.8400729298591614), ('pakan', 0.8386765718460083), ('sumbangsih', 0.8291501998901367), ('kemanfaatan', 0.8284624814987183), ('nutrisi', 0.8277636170387268), ('rezeki', 0.8253624439239502)]


In [28]:
# Query with 'asuransi' and 'manulife' as positive terms only
# (no negative terms).
result_analogi = model_test.wv.most_similar_cosmul(
    positive=['asuransi', 'manulife'],
)

# Show the list returned by the query.
print(result_analogi)

[('reasuransi', 0.9945306181907654), ('takaful', 0.9732854962348938), ('tabungan', 0.9588580131530762), ('sukuk', 0.9221739768981934), ('anuitas', 0.9207339882850647), ('premi', 0.897469162940979), ('deposito', 0.8810760378837585), ('avrist', 0.8764405846595764), ('kpr', 0.8697476387023926), ('santunan', 0.8678401112556458)]
