# A training corpus

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 newsgroups dataset (subset='all' asks for every available split)
# and keep the raw post texts as the documents the model will be trained on.
newsgroups = fetch_20newsgroups(subset='all')
documents = newsgroups.data

# Preprocess the text data and feed it to the model

In [2]:
from gensim.models import Phrases

class Corpus:
    """Re-iterable stream of tokenised documents, optionally phrase-merged.

    Each iteration runs ``preprocessor`` over every entry of ``documents`` and
    yields the resulting token list. When phrase learning is enabled, a
    ``Phrases`` model is fitted once at construction time and every yielded
    token list is passed through it first.

    Parameters
    ----------
    documents : iterable
        Raw documents. The collection is traversed once to learn phrases and
        again on every call to ``iter(corpus)``, so it must support repeated
        iteration (e.g. a list); a one-shot generator would be exhausted
        after the first pass.
    preprocessor : callable
        Maps one raw document to a sequence of tokens.
    use_phrases : bool, default True
        If true, fit a ``Phrases`` model on the tokenised documents during
        construction and apply it while iterating.

    Attributes
    ----------
    phrases : Phrases or None
        The fitted phrase model, or ``None`` when no phrases were learned.
    phrases_have_been_identified : bool
        Whether ``phrases`` is fitted and applied during iteration.
    """

    def __init__(self, documents, preprocessor, use_phrases=True):
        self.documents = documents
        self.preprocessor = preprocessor
        # Define the attribute up front so `corpus.phrases` is always readable,
        # even when phrase learning is switched off.
        self.phrases = None
        self.phrases_have_been_identified = False
        if use_phrases:
            self._learn_phrases()

    def _phrases_identification(self):
        """Yield every document tokenised by ``preprocessor``, without phrase merging."""
        for doc in self.documents:
            yield self.preprocessor(doc)

    def __iter__(self):
        """Yield one token list per document, phrase-merged once phrases are learned."""
        for tokens in self._phrases_identification():
            if self.phrases_have_been_identified:
                yield self.phrases[tokens]
            else:
                yield tokens

    def _learn_phrases(self):
        """Fit the phrase model on the tokenised documents and enable it for iteration."""
        self.phrases = Phrases(self._phrases_identification())
        self.phrases_have_been_identified = True


# Train the model

In [None]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import logging
# Surface INFO-level progress messages from the libraries while the model is built.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Wrap the raw posts in a re-iterable stream of token lists (phrases are learned
# inside the Corpus constructor), then fit Word2Vec on it with default settings.
# NOTE(review): no seed/worker settings are passed, so vectors may vary between
# runs - confirm whether reproducibility matters here.
corpus = Corpus(documents=documents, preprocessor=simple_preprocess)
model = Word2Vec(sentences=corpus)

2024-11-22 11:55:56,776 : INFO : collecting all words and their counts
2024-11-22 11:55:56,777 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-11-22 11:55:59,267 : INFO : PROGRESS: at sentence #10000, processed 2815610 words and 961694 word types
2024-11-22 11:56:01,565 : INFO : collected 1453195 token types (unigram + bigrams) from a corpus of 5227570 words and 18846 sentences
2024-11-22 11:56:01,566 : INFO : merged Phrases<1453195 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2024-11-22 11:56:01,566 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1453195 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000> in 4.79s', 'datetime': '2024-11-22T11:56:01.566483', 'gensim': '4.3.3', 'python': '3.11.7 (main, Dec  4 2023, 18:10:11) [Clang 15.0.0 (clang-1500.1.0.2.5)]', 'platform': 'macOS-15.0.1-arm64-arm-64bit', 'event': 'created'}
2024-11-22 11:56:01,578 : INFO : collecting all words and their counts
2024-11-22 11:56:01,578 : INFO : 

# Examine the vocabulary

In [16]:
# index_to_key holds the model's vocabulary tokens addressed by integer index;
# display a 50-entry slice (positions 150-199) as a sample.
vocab = model.wv.index_to_key
vocab[150:200]

['very',
 'take',
 'things',
 'point',
 'have_been',
 'both',
 'made',
 'information',
 'find',
 'windows',
 'etc',
 'another',
 'part',
 'without',
 'government',
 'help',
 'apr',
 'program',
 'however',
 'lines_nntp',
 'll',
 'anyone',
 'case',
 'through',
 'much',
 'ax_max',
 'fact',
 'max_ax',
 'data',
 'sure',
 'too',
 'available',
 'never',
 'anything',
 'under',
 'number',
 'probably',
 'file',
 've',
 'world',
 'true',
 'better',
 'got',
 'does_not',
 'around',
 'game',
 'bit',
 'again',
 'state',
 'try']

# Examine the word vector

In [17]:
# Look up the learned embedding vector for the token "government".
model.wv["government"]

array([-1.7955906 ,  2.0579085 , -0.21899213,  0.57640946,  0.93009514,
        0.9880855 , -0.5923485 , -0.85724294, -0.47745353,  0.3002107 ,
        0.02649226, -2.3515918 ,  1.6552851 ,  0.6870508 ,  0.15229462,
       -2.0009549 ,  1.5693572 , -1.9632825 ,  1.1420659 , -5.4039636 ,
        2.2855954 ,  1.1865    ,  2.0600853 , -2.046923  ,  1.0506628 ,
        0.06809015, -2.7398582 ,  1.4598666 ,  2.6805813 ,  0.7936425 ,
       -0.46178615,  0.44689244, -1.3400604 ,  0.15905245,  1.9218838 ,
       -1.0814312 ,  1.2326314 ,  1.7156591 , -1.9361781 ,  1.373386  ,
       -0.79441994, -1.5766234 , -0.46121615, -2.5349312 ,  0.5340692 ,
       -1.2103965 ,  1.001188  ,  1.4004501 , -1.6127284 , -0.49637172,
       -0.32029074, -3.6038728 , -1.3122703 ,  1.2851677 , -2.6535141 ,
        0.20665184,  0.642841  , -1.0688782 , -1.3588166 ,  0.10809685,
        0.06344327,  3.0690608 , -0.60781074,  1.4185796 ,  1.3655983 ,
        0.24590139,  0.4982278 ,  0.91251576, -0.06000046, -0.04

# Find similar words

In [18]:
# List the 20 vocabulary entries scored as most similar to "government".
model.wv.most_similar("government", topn=20)

[('citizens', 0.7772632241249084),
 ('law_enforcement', 0.7499463558197021),
 ('nsa', 0.7466758489608765),
 ('administration', 0.7452960014343262),
 ('govt', 0.734394907951355),
 ('land', 0.7303112745285034),
 ('criminal', 0.7282655835151672),
 ('criminals', 0.727948009967804),
 ('israel', 0.7217745780944824),
 ('palestinians', 0.7214918732643127),
 ('economy', 0.7200716733932495),
 ('federal_government', 0.7188506126403809),
 ('property', 0.7177056670188904),
 ('authorities', 0.7070791125297546),
 ('illegal', 0.7069562673568726),
 ('federal', 0.7066277265548706),
 ('weapons', 0.7043834328651428),
 ('demand', 0.7030037045478821),
 ('laws', 0.7028645277023315),
 ('israeli', 0.7009912729263306)]