# In [1]:
import codecs
import numpy as np
import pandas as pd
 
from sklearn.manifold import TSNE
 
def main():
    """Run the pipeline: load vectors, project them to 2D, save as CSV.

    Reads at most 10000 vectors from ``vector.txt``, prints the shape of
    the loaded matrix, and writes the 2D coordinates together with the
    vocabulary to ``statics/vec.csv``.
    """
    source_path, limit, output_path = "vector.txt", 10000, "statics/vec.csv"

    words, embeddings = read_w2v(source_path, limit)
    # Report the matrix size before the projection step.
    print(embeddings.shape)
    to_csv(to_2d(embeddings), words, output_path)

def read_w2v(file_name, max_vec_num, dim=100):
    """Load tokens and their vectors from a space-separated text file.

    The first line of the file is always skipped (presumably the word2vec
    "<count> <dim>" header -- confirm against the file producer). Every
    other line is split at the first space into a token and the vector
    text; a row is kept only when the text parses to exactly ``dim``
    floats. Lines without a space (blank lines, a bare token) are skipped
    instead of raising.

    Args:
        file_name: Path of the UTF-8 encoded text file to read.
        max_vec_num: Stop reading once this many vectors have been kept.
        dim: Required number of components per vector (default 100).

    Returns:
        A ``(vocabulary, vectors)`` tuple: ``vocabulary`` is the list of
        kept tokens and ``vectors`` is a float32 array of shape
        ``(len(vocabulary), dim)`` with rows in the same order.
    """
    vocabulary = []
    # Collect rows in a list and stack once at the end; growing an array
    # with np.append copies the whole matrix on every row.
    rows = []
    with codecs.open(file_name, 'r', 'utf-8') as f:
        for line_no, line in enumerate(f):
            if line_no == 0:
                continue
            parts = line.strip().split(' ', 1)
            if len(parts) != 2:
                # No vector text on this line; nothing to parse.
                continue
            vocab, wv = parts
            v = np.fromstring(wv, sep=' ', dtype=np.float32)
            if v.shape[0] == dim:
                vocabulary.append(vocab)
                rows.append(v)
            if len(rows) == max_vec_num:
                break
    if not rows:
        return vocabulary, np.empty((0, dim), np.float32)
    return vocabulary, np.vstack(rows)
        

def to_2d(vectors):
    """Project ``vectors`` onto two dimensions with t-SNE.

    Uses a fixed ``random_state`` of 0 and 2000 optimisation iterations.
    As a side effect, numpy's global print options are switched to
    ``suppress=True``.

    Args:
        vectors: Array-like of samples, passed to ``TSNE.fit_transform``.

    Returns:
        The embedding returned by ``TSNE.fit_transform``.
    """
    tsne_kwargs = {"n_components": 2, "random_state": 0}
    try:
        # scikit-learn 1.5 renamed ``n_iter`` to ``max_iter`` and deprecated
        # the old name; prefer the new one so current releases keep working.
        tsne = TSNE(max_iter=2000, **tsne_kwargs)
    except TypeError:
        # Older scikit-learn releases only know ``n_iter``.
        tsne = TSNE(n_iter=2000, **tsne_kwargs)
    np.set_printoptions(suppress=True)
    return tsne.fit_transform(vectors)

def to_csv(Y, vocabulary, csv_path):
    """Write 2D coordinates and their tokens to a CSV file.

    The file has the columns ``index``, ``x``, ``y`` and ``vocab``, one row
    per point. Prints ``finish`` once the file has been written.

    Args:
        Y: Two-column data holding the x and y coordinates.
        vocabulary: Tokens, one per row of ``Y``, in the same order.
        csv_path: Destination path of the CSV file.
    """
    table = (
        pd.DataFrame(Y, columns=["x", "y"])
        .rename_axis("index")
        .assign(vocab=vocabulary)
    )
    table.to_csv(csv_path)
    print("finish")