## Creating a System of Equations

In [1]:
# Hand-written 4-component toy vectors, one per word
sword = [10, 1, 5, 1]
knife = [10, 1, 5, 9]
herring = [2, 2, 2, 6]

In [2]:
import numpy as np

In [3]:
def euclidean(vector1, vector2):
    """Return the Euclidean distance between two vectors.

    Both arguments are converted to NumPy arrays and subtracted
    element-wise; the result is the square root of the sum of the
    squared differences.
    """
    difference = np.asarray(vector1) - np.asarray(vector2)
    return np.sqrt(np.square(difference).sum())

In [4]:
# Pairwise Euclidean distances between the three toy vectors, one per line
print(
    euclidean(sword, knife),
    euclidean(sword, herring),
    euclidean(knife, herring),
    sep="\n",
)

8.0
9.9498743710662
9.1104335791443


In [5]:
# This import needed an older version of numpy (1.26.4); it did not work with the most recent version (2.2.3)
import gensim.downloader as api

In [8]:
# Fetch the 'word2vec-google-news-300' word vectors (word2vec built from
# Google News) through gensim's downloader API; `vectors` is then indexed
# by word, e.g. vectors['sword'].
vectors = api.load('word2vec-google-news-300')



In [10]:
# Show the raw embedding stored for the word 'sword'
print(vectors['sword'])

[ 0.51953125  0.1875      0.31445312 -0.20605469 -0.0078125   0.375
  0.22558594 -0.02441406 -0.06445312  0.27929688  0.02746582 -0.24511719
 -0.21582031  0.13574219 -0.27148438 -0.09130859 -0.06884766 -0.08349609
  0.14160156 -0.14160156  0.24316406 -0.23730469  0.32421875 -0.00582886
 -0.12792969  0.0201416   0.07617188 -0.10742188  0.16894531 -0.12988281
  0.07958984  0.2265625   0.11035156  0.12792969  0.02856445  0.01965332
 -0.06933594  0.21875    -0.06738281 -0.04370117  0.23046875  0.07714844
  0.49804688 -0.14550781  0.23632812 -0.10009766  0.02893066 -0.16699219
  0.09814453 -0.24804688 -0.09082031  0.3515625  -0.00439453 -0.29296875
  0.00793457 -0.140625   -0.10888672  0.00212097 -0.13476562 -0.02575684
 -0.02148438  0.10888672  0.07324219  0.15332031 -0.06835938 -0.01831055
  0.08544922 -0.39257812  0.03979492  0.12890625  0.10595703 -0.13476562
  0.05224609 -0.25       -0.16113281 -0.11523438  0.00117493 -0.17480469
  0.04248047 -0.42382812  0.03710938 -0.02770996 -0.0644

In [11]:
# Euclidean distances between pairs of word2vec embeddings, one per line
print(
    euclidean(vectors['sword'], vectors['knife']),
    euclidean(vectors['sword'], vectors['herring']),
    euclidean(vectors['car'], vectors['van']),
    sep="\n",
)

3.2766972
4.9384727
2.608656


In [12]:
def dot_product(vector1, vector2):
    """Return the dot product of two equally shaped vectors.

    Parameters
    ----------
    vector1, vector2 : array-like
        Sequences of numbers (lists, NumPy arrays, or anything
        ``np.asarray`` can convert).

    Returns
    -------
    NumPy scalar
        Sum of the element-wise products.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. The earlier
        index-based loop only iterated over ``len(vector1)``, so a longer
        ``vector2`` was silently truncated.
    """
    first = np.asarray(vector1)
    second = np.asarray(vector2)
    if first.shape != second.shape:
        raise ValueError(
            f"dot_product needs vectors of the same shape, got {first.shape} and {second.shape}"
        )
    # Element-wise multiply then sum: the same arithmetic as the per-index
    # loop, done in one vectorised step.
    return np.sum(first * second)

def vector_norm(vector):
    """Return the length of `vector`: the square root of its dot product with itself."""
    return np.sqrt(dot_product(vector, vector))

def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors, rounded to 4 decimals.

    Computed as the dot product divided by the product of the two vector
    norms. There is no guard for zero-length vectors: the division is
    performed as-is.
    """
    numerator = dot_product(vector1, vector2)
    denominator = vector_norm(vector1) * vector_norm(vector2)
    return np.round(numerator / denominator, 4)

In [13]:
# Cosine similarities for the same word pairs measured with euclidean() earlier
print(
    cosine_similarity(vectors['sword'], vectors['knife']),
    cosine_similarity(vectors['sword'], vectors['herring']),
    cosine_similarity(vectors['car'], vectors['van']),
    sep="\n",
)

0.5576
0.0529
0.6116


## Manipulating Vectors with Mathematical Calculations

In [14]:
# Pull the four word embeddings out of the loaded model, one name per word
king, queen, man, woman = (
    vectors[word] for word in ('king', 'queen', 'man', 'woman')
)

In [15]:
# Vector arithmetic on the embeddings: start from 'king', subtract 'man', add 'woman'
newvector = king-man+woman

In [17]:
# Compare the arithmetic result with the 'queen' embedding:
# cosine similarity first, then Euclidean distance
print(
    cosine_similarity(newvector, queen),
    euclidean(newvector, queen),
    sep="\n",
)

0.7301
2.298658


In [18]:
# Cosine similarity and Euclidean distance for the pair 'fish' / 'herring'
print(
    cosine_similarity(vectors['fish'], vectors['herring']),
    euclidean(vectors['fish'], vectors['herring']),
    sep="\n",
)

0.6992
2.7537737


## Detecting Plagiarism With word2vec

In [19]:
# Baseline: both measures applied to a vector and itself
# (cosine similarity on 'the', Euclidean distance on 'having')
print(
    cosine_similarity(vectors['the'], vectors['the']),
    euclidean(vectors['having'], vectors['having']),
    sep="\n",
)

1.0
0.0


In [20]:
# Word pairs compared one at a time: cosine similarity for
# trouble/problem and put/insert, Euclidean distance for come/approach
print(
    cosine_similarity(vectors['trouble'], vectors['problem']),
    euclidean(vectors['come'], vectors['approach']),
    cosine_similarity(vectors['put'], vectors['insert']),
    sep="\n",
)

0.5327
2.9844923
0.3435


## Using Skip-Thoughts

In [22]:
import tensorflow_hub as hub
# Load the 'universal-sentence-encoder-large/5' model from tfhub.dev;
# `embed` is later called on a list of sentences.
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-large/5')














In [23]:
# Four test sentences: a quotation, a reworded version of it, and two
# unrelated lines. No backslash continuations are needed inside brackets.
Sentences = [
    "The trouble with having an open mind, of course, is that people will insist on coming along and trying to put things in it.",
    "The problem with having an open mind is that people will insist on approaching and trying to insert things into your mind.",
    "To be or not to be, that is the question",
    "Call me Ishmael",
]

In [24]:
# Encode every sentence in the list with the loaded model
embeddings=embed(Sentences)

In [25]:
# Inspect the first entry of the encoder output
print(embeddings[0])

tf.Tensor(
[ 9.70248657e-04 -5.99743053e-02 -2.84198509e-03  7.49066239e-03
  7.74949491e-02 -1.00518751e-03 -7.75496513e-02  4.12208289e-02
 -1.55480194e-03 -1.11693345e-01  2.58275699e-02 -1.15299895e-02
 -3.84846426e-05 -4.07183729e-02  3.69430296e-02  6.66357949e-02
 -2.35660914e-02  8.99885036e-03 -2.71403249e-02 -5.69594046e-03
  5.81006221e-02 -7.52891833e-03 -1.46736680e-02  3.34217250e-02
 -2.83413939e-02 -2.66162232e-02  8.27136561e-02  2.97723804e-02
 -6.22679852e-02 -1.03822220e-02 -2.17885282e-02 -6.10623099e-02
 -1.82683337e-02 -1.00083120e-01 -4.11456749e-02  1.05086416e-02
  5.70789874e-02  2.31803339e-02  1.20034218e-02 -7.01201381e-03
  1.10151106e-02  2.03509480e-02  6.42644241e-02 -3.29411551e-02
  1.36516206e-02 -3.94356735e-02 -2.31789015e-02  3.67378853e-02
 -4.63136248e-02 -8.05744082e-02  3.57695073e-02  7.44108036e-02
  4.48199958e-02  7.86103867e-03  1.17291128e-02  7.10513117e-03
 -3.36441658e-02 -2.01602671e-02  2.59170309e-02 -2.25279741e-02
 -6.86499402e-

In [26]:
# Cosine similarity between the encodings of the first two sentences
print(cosine_similarity(embeddings[0],embeddings[1]))

0.8525


In [27]:
# Cosine similarity of the first sentence's encoding against the third and fourth
print(
    cosine_similarity(embeddings[0], embeddings[2]),
    cosine_similarity(embeddings[0], embeddings[3]),
    sep="\n",
)

0.0656
-0.0705


## Topic Modeling

In [31]:
# Sentence list for the clustering example: a mix of remarks about food
# and remarks about writing/books. This rebinds the name `Sentences`, so
# any list assigned to it earlier is replaced from here on.
Sentences = [
    "The corn and cheese are delicious when they're roasted together",
    "Several of the scenes have rich settings but weak characterization",
    "Consider adding extra seasoning to the pork",
    "The prose was overwrought and pretentious",
    "There are some nice brisket slices on the menu",
    "It would be better to have a chapter to introduce your main plot ideas",
    "Everything was cold when the waiter brought it to the table",
    "You can probably find it at a cheaper price in bookstores"
]

In [44]:
# Encode the current `Sentences` list; this rebinds `embeddings`
embeddings = embed(Sentences)

In [36]:
import pandas as pd

In [45]:
# Convert the encoder output for each sentence to a NumPy array.
arrays = [np.array(embeddings[i]) for i in range(len(Sentences))]

# Stack the per-sentence vectors into one (sentences x dimensions) matrix.
# The width is taken from the vectors themselves rather than a hard-coded
# 512, so a model with a different embedding size works unchanged.
# float64 values and column-major ("F") layout are kept deliberately so the
# matrix has the same dtype and memory order as before.
sentencematrix = np.asfortranarray(np.stack(arrays), dtype=np.float64)

# One DataFrame row per sentence, one integer-labelled column per dimension
pandasmatrix = pd.DataFrame(sentencematrix)

In [46]:
from sklearn.cluster import KMeans

In [47]:
# Two-cluster k-means over the sentence-embedding columns.
# No random_state is passed, so the fit is not pinned to a seed.
m = KMeans(n_clusters=2)
m.fit(pandasmatrix)

In [48]:
# Attach each row's cluster label and its original sentence text as two
# extra columns next to the embedding columns
pandasmatrix = pandasmatrix.assign(topic=m.labels_, sentences=Sentences)

In [49]:
# Cluster contents can differ between runs: KMeans begins from randomly
# chosen starting points, so which sentences fall under label 0 or 1 may change.
print(
    pandasmatrix.loc[pandasmatrix['topic'] == 0, 'sentences'],
    pandasmatrix.loc[pandasmatrix['topic'] == 1, 'sentences'],
    sep="\n",
)

1    Several of the scenes have rich settings but w...
3            The prose was overwrought and pretentious
5    It would be better to have a chapter to introd...
7    You can probably find it at a cheaper price in...
Name: sentences, dtype: object
0    The corn and cheese are delicious when they're...
2          Consider adding extra seasoning to the pork
4       There are some nice brisket slices on the menu
6    Everything was cold when the waiter brought it...
Name: sentences, dtype: object
