## Welcome to Data, Machines and the 🐍
<img src="https://raw.githubusercontent.com/NSF-EC/INFO490Assets/master/src/dmap/lessons/word-vectors/html/section00.png" align="left"/>

<a id="install"></a>
## Notebook Preparation for Lesson
Each lesson will start with a template (given in the course schedule).  
Once you open the notebook:
1. **save** it to your Google Drive (copy to drive) and share the notebook
2. **copy** the share ID to the `NOTEBOOK_ID` (and re-save the notebook)
3. **run** the next cell to install the IDE.

In [0]:
# After changing NOTEBOOK_ID to the shared ID
# 1. SAVE THE NOTEBOOK cmd+s
# 2. Re-RUN this code cell
# NOTEBOOK_ID and LESSON_ID are the two values passed to install_ide() below.
NOTEBOOK_ID  = '1GBamwb67eCSPp0YreXMIrFF-IZOT1nT3'  # change me!!
LESSON_ID    = 'dmap:text:word-vectors'       # keep this as is
VERSION_ID   = 25  # NOTE(review): not read by any visible cell -- confirm what consumes it

def install_ide(lesson_id, nb_id, reload=True):
  """Download the course Bootstrap module (if needed) and build the lesson IDE.

  lesson_id, nb_id and reload are forwarded unchanged to
  Bootstrap.BootStrap().create_ide(), whose result is returned.

  If anything in the import/creation step raises, a stand-in object is
  returned instead: it has `tester` and `reader` attributes on which any
  method call returns the tuple ("unable to test:<error text>", None), so
  later cells do not crash with an AttributeError.
  """
  import os
  # fetch Bootstrap.py only when it is not already in the working directory;
  # wget's stdout and stderr are redirected to out.txt (IPython shell escape)
  if not os.path.exists('Bootstrap.py'):
     !wget 'https://raw.githubusercontent.com/NSF-EC/INFO490Assets/master/src/tools/Bootstrap.py' -O Bootstrap.py > out.txt 2>&1 
  try:
    import Bootstrap, importlib
    # re-execute the module so a copy imported earlier in this session is refreshed
    importlib.reload(Bootstrap)

    boot = Bootstrap.BootStrap()
    return boot.create_ide(lesson_id, nb_id, reload)
  except Exception as e:
    # best-effort fallback: swallow the failure and hand back a stub IDE
    class Nop(object):
        # stores the error text; every unknown attribute resolves to nop()
        def __init__(self, e): self.e = e
        def nop(self, *args, **kw): return("unable to test:" + self.e, None)
        def __getattr__(self, _): return self.nop 
    class IDE():
      # both helpers share the same error message (str(e))
      tester=Nop(str(e))
      reader=Nop(str(e)) # RemoteReader ??
    return IDE()

# Build the IDE for this lesson/notebook and call the tester's hello_world.
# If installation failed, `ide` is the stub returned by install_ide and this
# call only returns an ("unable to test:...", None) tuple.
ide = install_ide(LESSON_ID, NOTEBOOK_ID)
ide.tester.hello_world()

# Lesson Word Embeddings

(run the next cell to read the first part of the lesson)

In [0]:
# run to read the next section
ide.reader.view_section(1)

# Words as Vectors

In [0]:
# run to read the next section
ide.reader.view_section(2)

In [0]:
import spacy
# Import the medium English spaCy model package, downloading it when the
# import fails, then load it into `nlp`.
try:
  # NOTE: this message is printed before the import is attempted, so it also
  # appears when the model is already installed
  print('installing model, please wait')
  import en_core_web_md  # don't use lg (too big)
except ImportError:
  print('need to download .. please wait')
  # IPython shell escape: run spaCy's downloader, then retry the import
  !python -m spacy download en_core_web_md
  import en_core_web_md 

# `nlp` is the loaded model object that the later code cells read from
nlp = en_core_web_md.load()

In [0]:
# run to read the next section
ide.reader.view_section(4)

In [0]:
# retrieve words from the English model vocabulary
# retrieve words from the English model vocabulary
# (cat, dog and car are reused by later cells in this notebook)
cat = nlp.vocab['cat']
dog = nlp.vocab['dog']
car = nlp.vocab['car']

# print the dimension of word vectors
print('vector length:', len(cat.vector))

# print the word vector
print('cat:', cat.vector)

In [0]:
# run to read the next section
ide.reader.view_section(6)

In [0]:
# Compare `dog` with itself, with `car` and with `cat` using the vocabulary
# entry's own similarity() method (all three were looked up in an earlier cell).
print('The similarity between dog and dog:', dog.similarity(dog))
print('The similarity between dog and car:', dog.similarity(car))
print('The similarity between dog and cat:', dog.similarity(cat))

In [0]:
# run to read the next section
ide.reader.view_section(8)

In [0]:
def sim(a, b):
    """Placeholder similarity: ignores `a` and `b` and always returns 0.

    NOTE(review): this looks like an exercise stub meant to be replaced with
    a real similarity computation -- confirm against the lesson text.
    """
    return 0
    
# try sim() on two of the vocabulary entries looked up earlier
print(sim(dog,car))
# or use the tester: print(ide.tester.test_fn(sim))

In [0]:
# run to read the next section
ide.reader.view_section(10)

In [0]:
def sim2(a, b):
    """Return the cosine similarity of two vectors.

    Computed as dot(a, b) / (||a|| * ||b||).

    Args:
        a: first vector (anything numpy can treat as a 1-D numeric array).
        b: second vector, same length as `a`.

    Returns:
        The cosine of the angle between `a` and `b`. If either vector has
        zero norm the division yields nan, as numpy does not raise here.
    """
    # imported locally because no earlier cell imports numpy at module level,
    # so a bare `np` would raise NameError
    import numpy as np

    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
  
# NOTE: despite their names, `man` and `woman` hold the vectors for
# 'father' and 'mother'
man = nlp.vocab['father'].vector
woman = nlp.vocab['mother'].vector
# offset vector from 'mother' to 'father'
d1 = man - woman

uncle = nlp.vocab['uncle'].vector
aunt = nlp.vocab['aunt'].vector
# offset vector from 'aunt' to 'uncle'
d2 = uncle - aunt

# cosine similarity between the two offset vectors
print(sim2(d1, d2))

In [0]:
# run to read the next section
ide.reader.view_section(12)

In [0]:
def most_similar(word, topn=10):
    """Return the `topn` vocabulary entries most similar to `word`.

    Only entries of `nlp.vocab` that have a vector, are lower-case and whose
    lower-cased text differs from that of `word` are considered. They are
    ranked by `word.similarity(entry)`, highest first.
    """

    def is_candidate(entry):
        # lower-case entries with a vector, excluding the query word itself
        return (
            entry.has_vector
            and entry.is_lower
            and entry.lower_ != word.lower_
        )

    ranked = list(filter(is_candidate, nlp.vocab))
    # most similar first
    ranked.sort(key=word.similarity, reverse=True)
    return ranked[:topn]

# list the text of the (default 10) entries ranked closest to `car`
neighbors = most_similar(car)
print([w.text for w in neighbors])

In [0]:
# run to read the next section
ide.reader.view_section(14)

In [0]:
def find_closest(v, exclude):
    """Return the 10 vocabulary entries whose vectors are closest to `v`.

    Closeness is cosine similarity between `v` and each entry's vector.
    Only entries of `nlp.vocab` that have a vector and are lower-case are
    considered.

    Args:
        v: query vector (assumed to have the same length as the vocabulary
            vectors).
        exclude: strings to filter out; an entry is skipped when its
            lower-cased text contains any of them as a substring, so the
            words that formed the query are not returned.

    Returns:
        A list of at most 10 vocabulary entries, most similar first.
    """
    # imported locally because no earlier cell imports numpy at module level
    import numpy as np

    def cos(v1, v2):
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    # valid ensures we don't include any of the words
    # that are part of the equation (input)
    def valid(w):
        if not (w.has_vector and w.is_lower):
            return False
        return not any(t in w.lower_ for t in exclude)

    w_set = [w for w in nlp.vocab if valid(w)]
    # rank against the `v` argument; the previous code referenced an
    # undefined name (`result`) here
    candidates = sorted(w_set, key=lambda w: cos(v, w.vector), reverse=True)
    return candidates[:10]

In [0]:
# run to read the next section
ide.reader.view_section(16)

In [0]:
def reduce_dimensions(labels, perplexity=None):
    """Project the word vectors of `labels` to two dimensions with t-SNE.

    Args:
        labels: words to look up in `nlp.vocab`; one vector per word.
        perplexity: t-SNE perplexity. When None (the default) it is set to
            30 (scikit-learn's default) or, for short word lists, to one
            less than the number of words.

    Returns:
        The array produced by TSNE.fit_transform: one 2-D point per label,
        in the same order as `labels`.
    """
    from sklearn.manifold import TSNE
    import numpy as np

    data = np.array([nlp.vocab[w].vector for w in labels])

    if perplexity is None:
        # scikit-learn requires perplexity < n_samples, so the default of 30
        # is rejected for small inputs such as a four-word list
        perplexity = min(30.0, max(len(data) - 1, 1))

    # reduce to two
    tsne_model = TSNE(n_components=2, perplexity=perplexity)
    return tsne_model.fit_transform(data)

def plot_results(data_2d, labels):
    """Scatter-plot 2-D points and annotate each one with its label.

    Column 0 of `data_2d` is used as x and column 1 as y; the i-th label is
    drawn next to the i-th row. A grid is added and the figure is returned.
    """
    import matplotlib
    import matplotlib.pyplot as plt

    figure, ax = plt.subplots()
    ax.scatter(data_2d[:, 0], data_2d[:, 1], s=100)

    # put each word slightly up and to the right of its point
    for row, text in enumerate(labels):
        point = (data_2d[row, 0], data_2d[row, 1])
        ax.annotate(
            text,
            point,
            xytext=(2, 3),
            textcoords='offset points',
        )

    ax.grid()
    return figure

# reduce the vectors of four example words to 2-D and plot them with labels
labels = ['king', 'man', 'queen', 'woman']
data = reduce_dimensions(labels)
fig = plot_results(data, labels)

In [0]:
# run to read the next section
ide.reader.view_section(18)

# Lesson Assignment

In [0]:
# run to read the next section
ide.reader.view_section(19)

In [0]:
def find_analogy(three_words, nlp):
   """Assignment placeholder: currently returns None for any input.

   The `nlp` parameter shadows the notebook-level `nlp` inside this function.

   NOTE(review): the expected contract for `three_words` and the return value
   is not visible in this notebook -- confirm against the lesson text.
   """
   return None

In [0]:
# run to read the next section
ide.reader.view_section(21)

# Bonus Vectors

In [0]:
# run to read the next section
ide.reader.view_section(22)

# Test and Submit

In [0]:
# run to read the next section
ide.reader.view_section(23)

In [0]:
# Uncomment one of the next two lines to run the notebook tests
# (the second passes verbose=True).
# print(ide.tester.test_notebook()) 
# print(ide.tester.test_notebook(verbose=True)) 

# NOTE(review): presumably fetches the notebook for submission -- confirm
# what download_solution() does in the Bootstrap tooling.
ide.tester.download_solution()