In [1]:
import gzip
import gensim
import logging

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
data_file="reviews_data.txt"

with open(data_file, 'rb') as f:
    for i,line in enumerate (f):
        print(line)
        break

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [4]:
def read_input_file(input_file):
    logging.info("reading file {0}...this may take a while".format(input_file))
    with open(input_file, 'rb') as f:
        for i, line in enumerate (f):
            if i%10000==0:
                logging.info ("read {0} reviews".format (i))
            yield gensim.utils.simple_preprocess (line)

# read the tokenized reviews into a list
# each review item becomes a series of words
# so this becomes a list of lists
documents = list (read_input_file (data_file))
logging.info ("Done reading data file")

2023-03-22 11:00:01,977 : INFO : reading file reviews_data.txt...this may take a while
2023-03-22 11:00:01,984 : INFO : read 0 reviews
2023-03-22 11:00:03,637 : INFO : read 10000 reviews
2023-03-22 11:00:05,306 : INFO : read 20000 reviews
2023-03-22 11:00:07,101 : INFO : read 30000 reviews
2023-03-22 11:00:08,813 : INFO : read 40000 reviews
2023-03-22 11:00:10,922 : INFO : read 50000 reviews
2023-03-22 11:00:12,744 : INFO : read 60000 reviews
2023-03-22 11:00:14,271 : INFO : read 70000 reviews
2023-03-22 11:00:15,652 : INFO : read 80000 reviews
2023-03-22 11:00:17,054 : INFO : read 90000 reviews
2023-03-22 11:00:18,400 : INFO : read 100000 reviews
2023-03-22 11:00:19,753 : INFO : read 110000 reviews
2023-03-22 11:00:21,108 : INFO : read 120000 reviews
2023-03-22 11:00:22,492 : INFO : read 130000 reviews
2023-03-22 11:00:24,257 : INFO : read 140000 reviews
2023-03-22 11:00:25,611 : INFO : read 150000 reviews
2023-03-22 11:00:27,017 : INFO : read 160000 reviews
2023-03-22 11:00:28,406 : 

In [5]:
model = gensim.models.Word2Vec (documents, vector_size=150, window=10, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

2023-03-22 11:00:46,251 : INFO : collecting all words and their counts
2023-03-22 11:00:46,251 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-03-22 11:00:46,489 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2023-03-22 11:00:46,741 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2023-03-22 11:00:47,038 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2023-03-22 11:00:47,328 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2023-03-22 11:00:47,642 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2023-03-22 11:00:47,973 : INFO : PROGRESS: at sentence #60000, processed 11013727 words, keeping 76787 word types
2023-03-22 11:00:48,236 : INFO : PROGRESS: at sentence #70000, processed 12637529 words, keeping 83200 word types
2023-03-22 11:00:48,460 : INFO : PROG

(303486796, 415193590)

In [12]:
w1 = "school"
model.wv.most_similar (positive=w1)

[('college', 0.6145505309104919),
 ('fived', 0.6014430522918701),
 ('students', 0.542107343673706),
 ('rollers', 0.5164012908935547),
 ('schoolers', 0.503605306148529),
 ('yrs', 0.5011335611343384),
 ('pitched', 0.49420368671417236),
 ('ceilinged', 0.48946911096572876),
 ('teachers', 0.48282700777053833),
 ('student', 0.48234280943870544)]

In [13]:
model.save("simpsons_word2vec.model")

2023-03-22 11:10:11,750 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'simpsons_word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-03-22T11:10:11.750220', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}
2023-03-22 11:10:11,751 : INFO : storing np array 'vectors' to simpsons_word2vec.model.wv.vectors.npy
2023-03-22 11:10:11,896 : INFO : storing np array 'syn1neg' to simpsons_word2vec.model.syn1neg.npy
2023-03-22 11:10:12,065 : INFO : not storing attribute cum_table
2023-03-22 11:10:12,096 : INFO : saved simpsons_word2vec.model
