In [23]:
# Three short example documents used as the toy corpus for this notebook.
corpus = [
    "machine learning is very important for the future",
    "deep learning is transforming the world",
    "AI, machine learning, and deep learning are part of a data scientist's toolkit",
]

In [26]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
def preprocess(txt):
    """Normalize a raw string into space-separated, stopword-free lowercase words.

    The text is lowercased, every character that is not an ASCII letter
    (punctuation, digits, whitespace) is replaced by a space, and tokens found
    in scikit-learn's ENGLISH_STOP_WORDS are dropped. No lemmatization is done.

    Returns the surviving tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', txt.lower())
    kept = []
    for token in letters_only.split():
        if token in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return " ".join(kept)
    
    

In [27]:
# Sanity check: stopwords, punctuation and digits should all be stripped from the result.
preprocess("The sky is blue, not a cloud in the sky! This is 2021")

'sky blue cloud sky'

In [28]:
# Clean every document; note this rebinds `corpus`, so the raw strings are no longer available afterwards.
corpus = list(map(preprocess, corpus))

In [29]:
# Inspect the cleaned corpus.
corpus

['machine learning important future',
 'deep learning transforming world',
 'ai machine learning deep learning data scientist s toolkit']

In [35]:
#we need to convert our text to numbers
#To do that, we use vectorizers
#CountVectorizer is imported as well, but here we use a TfidfVectorizer (TF-IDF weights rather than raw counts)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
v = TfidfVectorizer()
dtm = v.fit_transform(corpus) #dtm = document-term matrix: one row per document, one column per vocabulary term

In [36]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out(); .tolist() keeps `vocab` a plain list of str
# as before. Fall back to the old method only on older releases.
if hasattr(v, "get_feature_names_out"):
    vocab = v.get_feature_names_out().tolist()
else:
    vocab = v.get_feature_names()
vocab

['ai',
 'data',
 'deep',
 'future',
 'important',
 'learning',
 'machine',
 'scientist',
 'toolkit',
 'transforming',
 'world']

In [37]:
# Show the document-term matrix as a dense array (fine for a toy corpus of this size).
dtm.toarray()

array([[0.        , 0.        , 0.        , 0.5844829 , 0.5844829 ,
        0.34520502, 0.44451431, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.44451431, 0.        , 0.        ,
        0.34520502, 0.        , 0.        , 0.        , 0.5844829 ,
        0.5844829 ],
       [0.39066946, 0.39066946, 0.29711419, 0.        , 0.        ,
        0.46147135, 0.29711419, 0.39066946, 0.39066946, 0.        ,
        0.        ]])

In [38]:
import pandas as pd
# Label each weight column with its vocabulary term so every row reads as one document.
df = pd.DataFrame(data=dtm.toarray(), columns=vocab)
df

Unnamed: 0,ai,data,deep,future,important,learning,machine,scientist,toolkit,transforming,world
0,0.0,0.0,0.0,0.584483,0.584483,0.345205,0.444514,0.0,0.0,0.0,0.0
1,0.0,0.0,0.444514,0.0,0.0,0.345205,0.0,0.0,0.0,0.584483,0.584483
2,0.390669,0.390669,0.297114,0.0,0.0,0.461471,0.297114,0.390669,0.390669,0.0,0.0


In [42]:
# Weights of the first document (row 0) as a plain list.
a = list(df.iloc[0])
a

[0.0,
 0.0,
 0.0,
 0.5844829010200651,
 0.5844829010200651,
 0.34520501686496574,
 0.444514311537431,
 0.0,
 0.0,
 0.0,
 0.0]

In [43]:
# Sum of squared weights of the row; a result of ~1 means the row has unit Euclidean (L2) length.
sum([x * x for x in a])

0.9999999999999998

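Sparse matrix:
[0 0 0 0 0 0 0 1 0 0 0 0.....
........
...] Let us assume this is a very large matrix (100000 x 1000000) in which almost every entry is zero.

Storing all of those zeros is wasteful, so a sparse format keeps only the non-zero entries. For example, one row can be stored as [row length, [positions of the non-zero entries], [their values]]:
[1000000, [5, 1005, 999000], [23, 11, 5]]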

In [44]:
# Densify once and unpack the first three document rows, instead of rebuilding
# the whole dense matrix separately for each row.
a, b, c = dtm.toarray()[:3]
a

array([0.        , 0.        , 0.        , 0.5844829 , 0.5844829 ,
       0.34520502, 0.44451431, 0.        , 0.        , 0.        ,
       0.        ])

a = [1,2,3]
b = [1,3,3]
Euclidean distance = math.sqrt((1 - 1)**2 + (2 - 3)**2 + (3 - 3)**2) = 1.0

In [45]:
import numpy as np
import math
def euclidean(a,b):
    """Return the Euclidean (straight-line) distance between two vectors.

    ``a`` and ``b`` are iterables of numbers; they are paired element by
    element with ``zip``, so extra trailing elements of the longer one are
    ignored. The result is the square root of the summed squared differences.
    """
    squared_diffs = []
    for left, right in zip(a, b):
        squared_diffs.append((left - right) ** 2)
    return math.sqrt(np.sum(squared_diffs))
    

In [49]:
def cosine_distance(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    NOTE: despite the name, the value returned is the cosine *similarity*
    (1.0 for vectors pointing the same way, 0.0 for orthogonal ones). The name
    and return value are kept so existing calls keep working; use
    ``1 - cosine_distance(a, b)`` when a true distance is needed.

    Parameters
    ----------
    a, b : array-like of numbers
        One-dimensional vectors of equal length. Lists and tuples are accepted
        as well as NumPy arrays.

    Returns
    -------
    numpy.float64
        ``a . b / (|a| * |b|)``, or NaN when either vector has zero length
        (the angle is undefined in that case).
    """
    # Coerce so plain sequences work too; a bare list has no .dot method.
    a = np.asarray(a)
    b = np.asarray(b)
    numerator = a.dot(b) #dot product of a and b
    length_of_a = math.sqrt(np.sum(a ** 2))
    length_of_b = math.sqrt(np.sum(b ** 2))
    denominator = length_of_a * length_of_b
    if denominator == 0:
        # Zero-length vector: return NaN explicitly instead of dividing by zero.
        return np.float64(np.nan)
    return numerator / denominator

In [51]:
# These two vectors point in almost the same direction, so the cosine of the angle between them is close to 1.
cosine_distance(np.array([1,2,3]),np.array([2,3,4]))

0.9925833339709303