In [1]:
import numpy as np
import os
from random import shuffle
import re
import urllib.request
import zipfile
import lxml.etree

# Download the data archive.
# The download is skipped when the archive is already present, so re-running
# the notebook does not fetch it again.
TED_ZIP_URL = "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip"
TED_ZIP_PATH = "ted_en-20160408.zip"
if not os.path.isfile(TED_ZIP_PATH):
    # Fetch into a temporary name first and rename only on success, so an
    # interrupted download never leaves a truncated file under the final name.
    partial_path = TED_ZIP_PATH + ".part"
    urllib.request.urlretrieve(TED_ZIP_URL, filename=partial_path)
    os.replace(partial_path, TED_ZIP_PATH)

In [2]:
# Extract the subtitle text from the downloaded archive.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as archive:
    # Open the XML member in its own context manager so that its handle is
    # closed explicitly once parsing has finished.
    with archive.open('ted_en-20160408.xml', 'r') as xml_member:
        doc = lxml.etree.parse(xml_member)
# Concatenate every text node found under <content> elements, newline-separated.
input_text = '\n'.join(doc.xpath('//content/text()'))

In [3]:
# Clean up the text
# Step 1: strip every parenthesised span "( ... )" from the raw text.
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

# Step 2: build a flat list of sentence strings.
# For each line, an optional prefix of at most 20 non-colon characters followed
# by a colon is discarded; the remainder is split on '.' and empty pieces are
# dropped.
line_pattern = re.compile(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$')
sentences_strings_ted = [
    piece
    for raw_line in input_text_noparens.split('\n')
    for piece in line_pattern.match(raw_line).group('postcolon').split('.')
    if piece
]

# Step 3: turn each sentence into a list of lower-case words, treating any run
# of characters outside a-z / 0-9 as a separator.
separator_pattern = re.compile(r"[^a-z0-9]+")
sentences_ted = [
    separator_pattern.sub(" ", sentence.lower()).split()
    for sentence in sentences_strings_ted
]

In [4]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenised sentences.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` -- confirm against the installed gensim version.
model_ted = Word2Vec(
    sentences=sentences_ted,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)



In [5]:
# Print the learned vector of each sample word, in this order.
for sample_word in ('earth', 'man'):
    print(model_ted.wv[sample_word])

[-0.68391436 -0.25471973  0.6525762   0.4104716  -0.15702242 -2.3139324
 -0.22513187  0.9998831  -0.9485713   0.68279684 -0.37711877  1.0382502
  0.04595969 -0.55829126 -0.07567953 -1.3099108   0.2569745   0.49885386
  1.1250598  -1.2648084  -0.7343621   0.8196438  -0.23289843  0.4288456
  0.8723032  -0.08758009  2.2787936  -0.0853721   2.077441   -0.28898084
 -0.31309655  0.702591   -2.3045537  -1.7554103  -1.2319196   0.72069556
  0.6650018  -1.148195    1.1785841   0.45326874  1.7265474   0.31190673
 -0.95666075  0.33973533  0.1946441  -0.582915    1.6129501   2.5764961
  0.5560557  -0.99855125  0.64411294  1.2264754  -0.7171068   1.1212615
  0.8099502   0.728674    1.8731914   1.0267409  -0.4086136  -0.52536494
  0.10080434 -0.43553832 -0.16205995 -1.049386    0.37816978  1.8489345
 -2.128845   -2.0945213  -0.94764274 -1.1442782   0.30290273  0.38237146
  2.0778217  -1.9277537  -0.47328013 -1.3439789   0.66601175 -1.1618676
  2.306674   -0.18465856  0.18493979  0.30162436  1.200030

In [6]:
# Look up the words most similar to "earth"; the list of (word, score) pairs is
# displayed as the cell output.
earth_neighbours = model_ted.wv.most_similar('earth')
earth_neighbours

[('mars', 0.8177419304847717),
 ('planet', 0.8079463243484497),
 ('moon', 0.7199641466140747),
 ('sun', 0.6545634269714355),
 ('surface', 0.6520480513572693),
 ('ocean', 0.6383042335510254),
 ('orbit', 0.6329202651977539),
 ('continent', 0.632485568523407),
 ('land', 0.6294218897819519),
 ('horizon', 0.6091650724411011)]

In [7]:
# Print the most-similar lists for two similar-sounding words.
for probe_word in ("sun", "son"):
    print(model_ted.wv.most_similar(probe_word))

[('sky', 0.7832568883895874), ('surface', 0.7750268578529358), ('ocean', 0.7709890604019165), ('sea', 0.7641971111297607), ('atmosphere', 0.7530478835105896), ('wind', 0.7467349767684937), ('moon', 0.7372883558273315), ('light', 0.7301375865936279), ('river', 0.7301061153411865), ('ice', 0.7211363315582275)]
[('father', 0.9359591007232666), ('daughter', 0.9347150325775146), ('husband', 0.9121847152709961), ('brother', 0.8964835405349731), ('grandmother', 0.8833799362182617), ('mother', 0.8832564949989319), ('sister', 0.8821182250976562), ('wife', 0.8817569017410278), ('uncle', 0.8758556842803955), ('cousin', 0.8413651585578918)]


In [None]:
# Query the Word2Vec model for a word that may be missing from its vocabulary.
# The corpus was lower-cased before training, so the query is lower-cased too;
# otherwise the lookup could never match, whatever the corpus contains.
query_word = "Gastroenteritis".lower()
try:
    similar_words = model_ted.wv.most_similar(query_word)
except KeyError as err:
    # An unknown word raises KeyError. Report it instead of letting the
    # exception abort a full "Run All" of the notebook.
    similar_words = None
    print("Word2Vec has no vector for", repr(query_word), "-", err)
# Displayed as the cell output when the lookup succeeded (None shows nothing).
similar_words

In [None]:
from gensim.models import FastText

# Train a FastText model on the same tokenised sentences.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` -- confirm against the installed gensim version.
model_ted_fast = FastText(
    sentences=sentences_ted,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [None]:
# Query the FastText model with the same word.
# The training corpus was lower-cased, so the query is lower-cased as well to
# keep it consistent with the text the model was trained on.
query_word = "Gastroenteritis".lower()
model_ted_fast.wv.most_similar(query_word)