In [2]:
import os
import sys
import gensim
import logging
import shutil

# Configure the root logger's message format and let INFO records through;
# gensim reports its loading/saving progress at INFO level.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.getLogger().setLevel(logging.INFO)

# Logger for this notebook, named after the file name of the launching script.
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

In [3]:
# Plain-text GloVe vectors to convert.
glove_pretrained_model = '/data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/glove.6B.300d.txt'
# Output directory for the gensim-compatible files: a 'convert2gensim' folder
# located next to the GloVe file.
glove_2_gensim_pretrained_model_path = glove_pretrained_model.rsplit('/', 1)[0] + '/convert2gensim'

In [11]:
def prepend_line(infile, outfile, line):
    """
    Create `outfile` containing `line` followed by the full content of `infile`.

    The body of `infile` is copied with shutil.copyfileobj rather than
    line by line.
    (source: http://stackoverflow.com/a/10850588/610569)

    :param infile: path of the text file to copy
    :param outfile: path of the file to write (opened in 'w' mode)
    :param line: header placed first; passed through str() and terminated with a newline
    :return: None
    """
    with open(infile, 'r') as source, open(outfile, 'w') as target:
        header = str(line)
        target.write(header + "\n")
        shutil.copyfileobj(source, target)


def prepend_slow(infile, outfile, line):
    """
    Slower way to prepend the line: re-create the input file line by line.

    Writes `line` (plus a newline) to `outfile`, then copies every line of
    `infile` after it.

    :param infile: path of the text file to copy
    :param outfile: path of the file to write (opened in 'w' mode)
    :param line: header placed first; passed through str() like prepend_line does,
                 so non-string headers (e.g. an int) are accepted too
    :return: None
    """
    with open(infile, 'r') as fin:
        with open(outfile, 'w') as fout:
            fout.write(str(line) + "\n")
            # Use a distinct loop name so the `line` parameter is not shadowed.
            for row in fin:
                fout.write(row)


def get_lines(glove_file_name):
    """
    Return the number of vectors and dimensions in a file in GloVe format.

    The file is read once: every line counts as one vector, and the
    dimensionality is the number of whitespace-separated fields on the first
    line minus one (the leading word).

    :param glove_file_name: path of the GloVe text file, opened through smart_open
    :return: tuple (num_lines, num_dims); (0, 0) for an empty file
    """
    num_lines = 0
    num_dims = 0
    with smart_open.smart_open(glove_file_name, 'r') as f:
        for num_lines, row in enumerate(f, 1):
            if num_lines == 1:
                # First field is the word itself, the rest are vector components.
                num_dims = len(row.split()) - 1
    return num_lines, num_dims


def convert_glove_model_2_gensim_w2c(glove_file, save_gensim_model_path):
    """
    Load a GloVe model, convert it to a gensim word2vec model and save it.

    Steps:
      1. Count vectors/dimensions with get_lines and build the
         "<num_vectors> <num_dims>" header line.
      2. Write "<glove file name>.converted_gensim_w2c.txt" into
         `save_gensim_model_path`: the header followed by the GloVe content.
      3. Load that file with KeyedVectors.load_word2vec_format and save it
         again as "<converted file>.model" (model.save) and
         "<converted file>.vector" (model.save_word2vec_format, text).

    :param glove_file: GloVe Model File, glove.6B.300d.txt, download from http://nlp.stanford.edu/projects/glove/
    :param save_gensim_model_path: directory that receives the converted files; created if missing
    :return: None
    """
    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and also parses on Python 3.
    glove_model_filename = os.path.basename(glove_file)
    print('glove model: {}'.format(glove_model_filename))

    print('convert glove model format to gensim model...')
    num_lines, dims = get_lines(glove_file)
    gensim_first_line = "{} {}".format(num_lines, dims)

    # Make sure the target directory exists before writing into it.
    if save_gensim_model_path and not os.path.isdir(save_gensim_model_path):
        os.makedirs(save_gensim_model_path)
    new_gensim_text = os.path.join(save_gensim_model_path,
                                   glove_model_filename + '.converted_gensim_w2c.txt')

    # Prepends the line. sys.platform is queried directly: the bare name
    # `platform` is not provided by any import visible in this notebook.
    # It is "linux2" on Python 2 and "linux" on Python 3.
    if sys.platform.startswith("linux"):
        prepend_line(glove_file, new_gensim_text, gensim_first_line)
    else:
        prepend_slow(glove_file, new_gensim_text, gensim_first_line)

    print('loads the newly created gensim model into gensim api...')
    # Demo: Loads the newly created glove_model.txt into gensim API.
    model = gensim.models.KeyedVectors.load_word2vec_format(new_gensim_text, binary=False)  # GloVe Model

    model.save(new_gensim_text + '.model')
    model.save_word2vec_format(new_gensim_text + '.vector', binary=False)
    print('done!')


In [12]:
# Convert the configured GloVe file and store the results in the configured
# output directory.
convert_glove_model_2_gensim_w2c(
    glove_file=glove_pretrained_model,
    save_gensim_model_path=glove_2_gensim_pretrained_model_path
)

glove model: glove.6B.300d.txt
convert glove model format to gensim model...


2017-09-04 15:16:27,775: INFO: loading projection weights from /data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/glove.6B.300d.txt.converted_gensim_w2c.txt


loads the newly created gensim model into gensim api...


2017-09-04 15:19:06,560: INFO: loaded (400001, 300) matrix from /data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/glove.6B.300d.txt.converted_gensim_w2c.txt
2017-09-04 15:19:07,016: INFO: saving KeyedVectors object under /data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/glove.6B.300d.txt.converted_gensim_w2c.txt.model, separately None
2017-09-04 15:19:07,095: INFO: not storing attribute syn0norm
2017-09-04 15:19:07,116: INFO: storing np array 'syn0' to /data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/glove.6B.300d.txt.converted_gensim_w2c.txt.model.syn0.npy
2017-09-04 15:19:18,109: INFO: saved /data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/glove.6B.300d.txt.converted_gensim_w2c.txt.model
2017-09-04 15:19:18,281: INFO: storing 400001x300 projection weights into /data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/glove.6B.300d.txt.converted_gensim_w2c.txt.vector


In [14]:
# Reload the KeyedVectors object written by model.save() during conversion.
f_model = ('/data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/'
           'glove.6B.300d.txt.converted_gensim_w2c.txt.model')
model = gensim.models.KeyedVectors.load(f_model)

2017-09-04 15:22:50,539: INFO: loading KeyedVectors object from /data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/glove.6B.300d.txt.converted_gensim_w2c.txt.model
2017-09-04 15:22:55,767: INFO: loading syn0 from /data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/glove.6B.300d.txt.converted_gensim_w2c.txt.model.syn0.npy with mmap=None
2017-09-04 15:23:02,981: INFO: setting ignored attribute syn0norm to None
2017-09-04 15:23:03,532: INFO: loaded /data/sunnymarkliu/pretrained_models/glove/Wikipedia_2014/convert2gensim/glove.6B.300d.txt.converted_gensim_w2c.txt.model


In [15]:
# Dimensionality of the loaded word vectors.
model.vector_size

300

In [17]:
# Sanity check: words ranked most similar to "queen", with their similarity scores.
model.most_similar("queen")

2017-09-04 15:23:52,447: INFO: precomputing L2-norms of word weight vectors


[(u'elizabeth', 0.6771447658538818),
 (u'princess', 0.635676383972168),
 (u'king', 0.6336469650268555),
 (u'monarch', 0.5814188122749329),
 (u'royal', 0.543052613735199),
 (u'majesty', 0.5350357294082642),
 (u'victoria', 0.5239557027816772),
 (u'throne', 0.5097099542617798),
 (u'lady', 0.5045416355133057),
 (u'crown', 0.49980056285858154)]

In [18]:
# Sanity check: words ranked most similar to "frog", with their similarity scores.
model.most_similar("frog")

[(u'toad', 0.6331218481063843),
 (u'frogs', 0.6233975887298584),
 (u'snake', 0.48968827724456787),
 (u'monkey', 0.4887806177139282),
 (u'toads', 0.4861146807670593),
 (u'squirrel', 0.47789764404296875),
 (u'species', 0.4764746427536011),
 (u'rodent', 0.45831966400146484),
 (u'parrot', 0.45635986328125),
 (u'spider', 0.4531068503856659)]

In [19]:
# Raw embedding vector stored for the word "man".
model['man']

array([-0.29784   , -0.13255   , -0.14505   , -0.22752   , -0.027429  ,
        0.11005   , -0.039245  , -0.0089607 , -0.18866   , -1.12129998,
        0.34793001, -0.30056   , -0.50103003, -0.031383  , -0.032185  ,
        0.018318  , -0.090429  , -0.14427   , -0.14306   , -0.057477  ,
       -0.020931  ,  0.56276   , -0.018557  ,  0.15167999, -0.25586   ,
       -0.081564  ,  0.28029999, -0.10585   , -0.16777   ,  0.21814001,
       -0.11845   ,  0.56475002, -0.12645   , -0.062461  , -0.68043   ,
        0.10507   ,  0.24793001, -0.20249   , -0.30726001,  0.42815   ,
        0.38378   , -0.19371   , -0.075951  , -0.058287  , -0.067195  ,
        0.2192    ,  0.56116003, -0.28156   , -0.13705   ,  0.45754001,
       -0.14670999, -0.18561999, -0.074146  ,  0.60737002,  0.07952   ,
        0.41023001,  0.18377   , -0.08532   ,  0.43794999, -0.34727001,
        0.2077    ,  0.50454003,  0.40244001,  0.1095    , -0.48078001,
       -0.22372   , -0.54619002, -0.20782   ,  0.13751   , -0.16

In [20]:
# Evaluate the vectors on the analogy questions file; per-section accuracy is
# reported through the INFO log.
# NOTE(review): the return value (per-section lists of question tuples) is echoed
# in full by the notebook — consider assigning it to a variable to keep the output short.
model.accuracy(questions='/data/sunnymarkliu/wikimedia/enwiki/evaluate_text/questions-words.txt')

2017-09-04 15:25:31,389: INFO: capital-common-countries: 95.3% (482/506)
2017-09-04 15:25:54,113: INFO: capital-world: 95.7% (2598/2715)
2017-09-04 15:25:56,423: INFO: currency: 18.2% (43/236)
2017-09-04 15:26:14,397: INFO: city-in-state: 59.6% (1314/2203)
2017-09-04 15:26:17,464: INFO: family: 92.7% (317/342)
2017-09-04 15:26:24,634: INFO: gram1-adjective-to-adverb: 21.6% (188/870)
2017-09-04 15:26:27,999: INFO: gram2-opposite: 32.9% (125/380)
2017-09-04 15:26:38,643: INFO: gram3-comparative: 88.3% (1176/1332)
2017-09-04 15:26:44,044: INFO: gram4-superlative: 79.5% (477/600)
2017-09-04 15:26:53,007: INFO: gram5-present-participle: 68.7% (639/930)
2017-09-04 15:27:05,463: INFO: gram6-nationality-adjective: 97.5% (1409/1445)
2017-09-04 15:27:17,953: INFO: gram7-past-tense: 61.2% (954/1560)
2017-09-04 15:27:27,014: INFO: gram8-plural: 83.2% (879/1056)
2017-09-04 15:27:33,816: INFO: gram9-plural-verbs: 59.1% (415/702)
2017-09-04 15:27:33,819: INFO: total: 74.0% (11016/14877)


[{'correct': [(u'ATHENS', u'GREECE', u'BAGHDAD', u'IRAQ'),
   (u'ATHENS', u'GREECE', u'BANGKOK', u'THAILAND'),
   (u'ATHENS', u'GREECE', u'BEIJING', u'CHINA'),
   (u'ATHENS', u'GREECE', u'BERLIN', u'GERMANY'),
   (u'ATHENS', u'GREECE', u'BERN', u'SWITZERLAND'),
   (u'ATHENS', u'GREECE', u'CAIRO', u'EGYPT'),
   (u'ATHENS', u'GREECE', u'CANBERRA', u'AUSTRALIA'),
   (u'ATHENS', u'GREECE', u'HANOI', u'VIETNAM'),
   (u'ATHENS', u'GREECE', u'HAVANA', u'CUBA'),
   (u'ATHENS', u'GREECE', u'HELSINKI', u'FINLAND'),
   (u'ATHENS', u'GREECE', u'ISLAMABAD', u'PAKISTAN'),
   (u'ATHENS', u'GREECE', u'KABUL', u'AFGHANISTAN'),
   (u'ATHENS', u'GREECE', u'MADRID', u'SPAIN'),
   (u'ATHENS', u'GREECE', u'MOSCOW', u'RUSSIA'),
   (u'ATHENS', u'GREECE', u'OSLO', u'NORWAY'),
   (u'ATHENS', u'GREECE', u'OTTAWA', u'CANADA'),
   (u'ATHENS', u'GREECE', u'PARIS', u'FRANCE'),
   (u'ATHENS', u'GREECE', u'ROME', u'ITALY'),
   (u'ATHENS', u'GREECE', u'STOCKHOLM', u'SWEDEN'),
   (u'ATHENS', u'GREECE', u'TEHRAN', u'IRAN