In [1]:
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
import tempfile
import time

In [5]:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-download-auto-examples-tutorials-run-word2vec-py
def split(word):
    """Break ``word`` into a list holding each of its characters in order."""
    return list(word)

class MyCorpus:
    """Restartable iterable that streams a corpus file as character-level sentences.

    Every call to ``__iter__`` re-opens the corpus file and yields one sentence
    (a list of single-character strings) per line, so the corpus can be
    traversed several times without holding it in memory.

    Args:
        corpus_path: Path to a UTF-8 text file with one document per line,
            tokens not separated. Defaults to ``"only_ch.txt"``.
        progress_every: Print progress after every this many lines; a falsy
            value disables progress output.

    Attributes:
        start: ``time.time()`` captured at construction; progress timings are
            measured from this moment.
        count: Total number of lines yielded so far. It is never reset, so it
            keeps growing across repeated passes over the corpus.
    """

    def __init__(self, corpus_path="only_ch.txt", progress_every=50000):
        self.corpus_path = corpus_path
        self.progress_every = progress_every
        self.start = time.time()
        self.count = 0

    def __iter__(self):
        # The with-block guarantees the file handle is closed when the pass
        # finishes (or the generator is discarded) instead of leaking it.
        with open(self.corpus_path, encoding='utf-8') as corpus_file:
            for line in corpus_file:
                # one document per line, tokens not separated
                self.count += 1
                if self.progress_every and self.count % self.progress_every == 0:
                    end = time.time()
                    print(self.count)
                    print((end - self.start)/60)  # in mins
                # Drop the line terminator so "\n" is not emitted as a token,
                # then tokenise per character (rather than with
                # utils.simple_preprocess, which was tried and left disabled).
                yield split(line.rstrip("\n"))

In [6]:
# Streaming corpus object: each iteration over it re-reads the corpus file,
# so it can be handed to a consumer that needs several passes.
sentences = MyCorpus()

In [None]:
# Train a character-level Word2Vec model on the streamed corpus and save it.
start = time.time()
model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=150,  # dimensionality of each character embedding
    window=5,
    min_count=1,      # keep every character, however rare
    workers=4,
    sg=0)             # 0 selects CBOW rather than skip-gram
# Report wall-clock training time; `start` was previously captured but unused.
print(f"training took {(time.time() - start) / 60:.2f} min")
# NOTE(review): Word2Vec.save writes gensim's own serialized format, not plain
# text, despite the ".txt" suffix — confirm the intended extension.
model.save("pre_trained_char_150_iter5.txt")

50000
0.2589478890101115
100000
0.42760929266611736
150000
0.5699123422304789
200000
0.7028570969899496
250000
0.8304741700490316
300000
0.966543443997701
350000
1.090379520257314
400000
1.22016628185908
450000
1.8865447044372559
500000
2.6818228999773663
550000
3.3129217942555744
600000
3.931385171413422
650000
4.472918144861857
700000
5.0595053633054095
750000
5.629963751633962


In [None]:
# Look up the trained embedding for the character '王' ("king").
vec_king = model.wv['王']
# Bare last expression so the notebook displays the vector.
vec_king


In [None]:
# Print the first 10 vocabulary entries of the trained model.
# Bind `wv` explicitly so this cell runs on a fresh kernel instead of relying
# on a name that no other cell in the notebook defines.
wv = model.wv
for index, word in enumerate(wv.index_to_key[:10]):
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

In [None]:
# Save the trained model to a temporary file and load it back.
# delete=False keeps the file on disk after the with-block exits; nothing in
# this cell removes it afterwards.
with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp:
    temporary_filepath = tmp.name
    model.save(temporary_filepath)
    #
    # The model is now stored at temporary_filepath.
    # You can copy it to other machines, share it with others, etc.
    #
    # To load a saved model:
    #
    new_model = gensim.models.Word2Vec.load(temporary_filepath)

In [None]:
# visual result 
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    """Project the model's word vectors onto two dimensions with t-SNE.

    Args:
        model: Object exposing ``wv.vectors`` (the embedding matrix) and
            ``wv.index_to_key`` (the matching vocabulary entries).

    Returns:
        A tuple ``(x_vals, y_vals, labels)``: two lists with the first and
        second projected coordinate of every vector, and a numpy array of
        the vocabulary entries in the same order.
    """
    keyed_vectors = model.wv
    target_dims = 2  # size of the projected space (2D, 3D, etc)

    # Pull the embedding matrix and its labels out as numpy arrays.
    embedding = np.asarray(keyed_vectors.vectors)
    labels = np.asarray(keyed_vectors.index_to_key)  # fixed-width numpy strings

    # random_state=0 is passed to t-SNE as a fixed seed.
    projected = TSNE(n_components=target_dims, random_state=0).fit_transform(embedding)

    # Split the projected matrix into one coordinate list per axis.
    x_vals = list(projected[:, 0])
    y_vals = list(projected[:, 1])
    return x_vals, y_vals, labels


# Run the t-SNE projection over the trained model's whole vocabulary.
x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the labelled points with plotly, using each label as the marker text.

    Args:
        x_vals: X coordinate of every point.
        y_vals: Y coordinate of every point.
        labels: Text drawn at each point.
        plot_in_notebook: When true, render inline via ``iplot``; otherwise
            call ``plot`` with the filename 'word-embedding-plot.html'.
    """
    # plotly is imported lazily, so it is only required when this function is called.
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot the points with matplotlib and annotate a random subset.

    All points are drawn; at most 25 randomly chosen ones are annotated with
    their label.

    Args:
        x_vals: X coordinate of every point.
        y_vals: Y coordinate of every point.
        labels: Label of every point, indexable in the same order.
    """
    import matplotlib.pyplot as plt
    import random

    # Fixed seed so the same points are labelled on every run.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled data points (at most 25). The sample size is
    # clamped because random.sample raises ValueError when asked for more
    # items than the population holds.
    #
    indices = list(range(len(labels)))
    sample_size = min(25, len(indices))
    selected_indices = random.sample(indices, sample_size)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Choose the plotting backend from the runtime environment: get_ipython() is
# expected to be callable only under IPython/Jupyter. If calling it raises,
# fall back to matplotlib; otherwise use plotly.
try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

# Render the projected vocabulary with the selected backend.
plot_function(x_vals, y_vals, labels)