# Some remarks after week 1

## numpy

In [1]:
# easy multiplication
import numpy as np

In [4]:
A = np.ones([5,5])
B = np.ones([5,5])
A@B

array([[ 5.,  5.,  5.,  5.,  5.],
       [ 5.,  5.,  5.,  5.,  5.],
       [ 5.,  5.,  5.,  5.,  5.],
       [ 5.,  5.,  5.,  5.,  5.],
       [ 5.,  5.,  5.,  5.,  5.]])

## Latex

$2^3$

## Using unnecessary for loop will increase run time

This is because numpy uses SIMD. See

https://docs.scipy.org/doc/numpy/release.html

https://en.wikipedia.org/wiki/SIMD

In [5]:
def BadDotProduct(x,y):
    dp = 0
    for i in range(len(x)):
        dp += x[i]*y[i]
    return dp

In [6]:
def GoodDotProduct(x,y):
    return sum(x*y)

In [7]:
n = 1000
x = np.random.randn(n)
y = np.random.rand(n)

In [8]:
import timeit

In [10]:
%time GoodDotProduct(x,y)
%time BadDotProduct(x,y)

CPU times: user 239 µs, sys: 21 µs, total: 260 µs
Wall time: 254 µs
CPU times: user 751 µs, sys: 2 µs, total: 753 µs
Wall time: 764 µs


25.924448280734715

## GloVe: Global Vectors for Word Representation

https://nlp.stanford.edu/projects/glove/

I am downloading pretrained data (glove_50_glove_100.tgz) from fast.ai (100d)

http://files.fast.ai/models/

In [11]:
import json

In [12]:
path = "./data/"

In [13]:
vecs = np.load(path + "glove_vectors_100d.npy")

In [14]:
with open(path + "words.txt") as f:
    content = f.readlines()
words = [x.strip() for x in content] 

In [15]:
wordidx = json.load(open(path + "wordsidx.txt"))

words has all the words

In [16]:
len(words)

400000

In [17]:
words[:10]

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s"]

In [18]:
words[500:510]

['working',
 'community',
 'eight',
 'groups',
 'despite',
 'level',
 'largest',
 'whose',
 'attacks',
 'germany']

wordidx allows us to look up a word in order to find out its index:

In [19]:
wordidx['happy']

1751

In [20]:
vecs[1751]

array([-0.090436  ,  0.19636001,  0.29473999, -0.47705999, -0.80435997,
        0.30779999, -0.55204999,  0.58453   , -0.17056   , -0.84846002,
        0.19528   ,  0.23671   ,  0.46827   , -0.58977002, -0.12163   ,
       -0.24697   , -0.072944  ,  0.17259   , -0.0485    ,  0.95270002,
        0.50629002,  0.58497   , -0.19367   , -0.45458999, -0.031095  ,
        0.51633   , -0.24052   , -0.1007    ,  0.53627002,  0.024225  ,
       -0.50161999,  0.73692   ,  0.49467999, -0.34744   ,  0.89336997,
        0.057439  , -0.19126999,  0.39333001,  0.21182001, -0.89837003,
        0.078704  , -0.16344   ,  0.45260999, -0.41095999, -0.19498999,
       -0.13489   , -0.016313  , -0.021849  ,  0.17136   , -1.24129999,
        0.079503  , -0.91144001,  0.35699001,  0.36289001, -0.24934   ,
       -2.11960006,  0.14534   ,  0.52964002,  0.90134001,  0.033603  ,
        0.022809  ,  0.70625001, -1.03620005, -0.59808999,  0.70591998,
       -0.072793  ,  0.67032999,  0.52762997, -0.47806999, -0.67

In [21]:
np.linalg.norm(vecs[1751])

5.2950091

In [22]:
vecs[wordidx['python']]

array([ 0.24934   ,  0.68317997, -0.044711  , -1.38419998, -0.0073079 ,
        0.65100002, -0.33958   , -0.19785   , -0.33925   ,  0.26690999,
       -0.033062  ,  0.15915   ,  0.89547002,  0.53999001, -0.55817002,
        0.46245   ,  0.36722001,  0.18889999,  0.83188999,  0.81421   ,
       -0.11835   , -0.53463   ,  0.24157999, -0.038864  ,  1.19070005,
        0.79352999, -0.12308   ,  0.66420001, -0.77618998, -0.45713001,
       -1.05400002, -0.20557   , -0.13296001,  0.12239   ,  0.88458002,
        1.02400005,  0.32288   ,  0.82104999, -0.069367  ,  0.024211  ,
       -0.51418   ,  0.87269998,  0.25759   ,  0.91526002, -0.64221001,
        0.041159  , -0.60207999,  0.54631001,  0.66075999,  0.19796   ,
       -1.13929999,  0.79514003,  0.45965999, -0.18463001, -0.64130998,
       -0.24929   , -0.40193999, -0.50786   ,  0.80579001,  0.53364998,
        0.52732003,  0.39247   , -0.29883999,  0.009585  ,  0.99953002,
       -0.061279  ,  0.71935999,  0.32901001, -0.052772  ,  0.67

In [23]:
def cosine_angle(x,y):
    return sum(x*y)/np.linalg.norm(x)/np.linalg.norm(y)

In [24]:
wordidx['mom']

6075

In [25]:
wordidx['mother']

808

In [27]:
cosine_angle(vecs[wordidx['father']],vecs[wordidx['mother']])

0.86566615869991792

In [28]:
cosine_angle(2*vecs[wordidx['new']],vecs[wordidx['antique']])

0.31497272673471055