In [21]:
from sentence_transformers import SentenceTransformer, util 

In [13]:
model = SentenceTransformer("all-MiniLM-L6-v2")
model 

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [14]:
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of strings.",
    "The quick brown fox jumps over the lazy dog.",
]

# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)
embeddings 

array([[-0.01371733, -0.04285158, -0.01562865, ...,  0.10017826,
         0.12365725, -0.04229672],
       [ 0.05273376,  0.0432319 ,  0.03611053, ...,  0.07960209,
         0.08644354, -0.04385225],
       [ 0.04393357,  0.05893439,  0.04817842, ...,  0.05216277,
         0.05610653,  0.10206392]], dtype=float32)

In [15]:
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Incomplete Embedding: \n", embedding[:10])
    

Sentence: This framework generates embeddings for each input sentence
Incomplete Embedding: 
 [-0.01371733 -0.04285158 -0.01562865  0.01405375  0.03955377  0.12179632
  0.02943334 -0.03175237  0.03549596 -0.079314  ]
Sentence: Sentences are passed as a list of strings.
Incomplete Embedding: 
 [ 0.05273376  0.0432319   0.03611053  0.02490369 -0.03182166  0.08561208
  0.10055906 -0.00032588 -0.00795473 -0.01028667]
Sentence: The quick brown fox jumps over the lazy dog.
Incomplete Embedding: 
 [ 0.04393357  0.05893439  0.04817842  0.07754809  0.02674441 -0.03762958
 -0.00260509 -0.05994302 -0.00249604  0.02207279]


In [18]:
sentences1 = [
    "The cat sits outside",
    "A man is playing guitar",
    "The new movie is awesome",
]

sentences2 = [
    "The dog plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
]


In [19]:
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
embeddings1, embeddings2

(tensor([[ 0.1392,  0.0030,  0.0470,  ...,  0.0641, -0.0163,  0.0636],
         [ 0.0227, -0.0014, -0.0056,  ..., -0.0225,  0.0846, -0.0283],
         [-0.1004, -0.0774, -0.0014,  ..., -0.0010,  0.0718,  0.0221]]),
 tensor([[ 0.0163, -0.0700,  0.0384,  ...,  0.0447,  0.0254, -0.0023],
         [ 0.0054, -0.0920,  0.0140,  ...,  0.0167, -0.0086, -0.0424],
         [-0.0842, -0.0592, -0.0010,  ..., -0.0157,  0.0764,  0.0389]]))

In [22]:
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(
        sentences1[i], sentences2[i], cosine_scores[i][i]
    ))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939


In [23]:
sentences3 = [
    "The cat sits outside",
    "A man is playing guitar",
    "I love pasta",
    "The new movie is awesome",
    "The cat plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
    "Do you like pizza?",
]

In [24]:
embeddings3 = model.encode(sentences3, convert_to_tensor=True)
embeddings3 

tensor([[ 0.1392,  0.0030,  0.0470,  ...,  0.0641, -0.0163,  0.0636],
        [ 0.0227, -0.0014, -0.0056,  ..., -0.0225,  0.0846, -0.0283],
        [-0.1025, -0.0541,  0.0108,  ...,  0.1097,  0.0851, -0.0738],
        ...,
        [ 0.0054, -0.0920,  0.0140,  ...,  0.0167, -0.0086, -0.0424],
        [-0.0842, -0.0592, -0.0010,  ..., -0.0157,  0.0764,  0.0389],
        [-0.1047,  0.0302, -0.0049,  ...,  0.0555,  0.0570, -0.0948]])

In [25]:
cosine_scores = util.cos_sim(embeddings, embeddings)
cosine_scores 

tensor([[1.0000, 0.5527, 0.1181],
        [0.5527, 1.0000, 0.1012],
        [0.1181, 0.1012, 1.0000]])

In [27]:
pairs = []
for i in range(cosine_scores.shape[0]):
    for j in range(cosine_scores.shape[1]):
        pairs.append({"index": [i, j], "score": cosine_scores[i][j]})

# Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x["score"], reverse=True)

In [34]:
for pair in pairs[0:10]:
    i, j = pair["index"]
    print("{} \t {} \t Score: {:.1f}".format(
        sentences[i], sentences[j], pair["score"]
    ))

Sentences are passed as a list of strings. 	 Sentences are passed as a list of strings. 	 Score: 1.0
The quick brown fox jumps over the lazy dog. 	 The quick brown fox jumps over the lazy dog. 	 Score: 1.0
This framework generates embeddings for each input sentence 	 This framework generates embeddings for each input sentence 	 Score: 1.0
This framework generates embeddings for each input sentence 	 Sentences are passed as a list of strings. 	 Score: 0.6
Sentences are passed as a list of strings. 	 This framework generates embeddings for each input sentence 	 Score: 0.6
This framework generates embeddings for each input sentence 	 The quick brown fox jumps over the lazy dog. 	 Score: 0.1
The quick brown fox jumps over the lazy dog. 	 This framework generates embeddings for each input sentence 	 Score: 0.1
Sentences are passed as a list of strings. 	 The quick brown fox jumps over the lazy dog. 	 Score: 0.1
The quick brown fox jumps over the lazy dog. 	 Sentences are passed as a list of

In [36]:
# use sentence transformer for sentences comparison 
import pandas as pd 

movies_df = pd.read_csv('../../movie_dataset/movies_metadata.csv')
movies_df.head(1), movies_df.columns  

  movies_df = pd.read_csv('../../movie_dataset/movies_metadata.csv')


(   adult  ... vote_count
 0  False  ...     5415.0
 
 [1 rows x 24 columns],
 Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
        'imdb_id', 'original_language', 'original_title', 'overview',
        'popularity', 'poster_path', 'production_companies',
        'production_countries', 'release_date', 'revenue', 'runtime',
        'spoken_languages', 'status', 'tagline', 'title', 'video',
        'vote_average', 'vote_count'],
       dtype='object'))

In [37]:
# taking all overviews of all movies 
all_overviews = movies_df.overview.tolist() 
len(all_overviews) 

45466

In [39]:
embeddings_movies = model.encode(all_overviews[:100], convert_to_tensor=True)
embeddings_movies  

tensor([[ 0.0634,  0.0010,  0.0932,  ...,  0.0154,  0.0446,  0.0220],
        [ 0.0863,  0.0446, -0.0405,  ..., -0.0033, -0.0293, -0.0266],
        [-0.1009,  0.0374, -0.0009,  ...,  0.0568, -0.0262,  0.0183],
        ...,
        [-0.0612, -0.0261, -0.0052,  ...,  0.0711,  0.0009, -0.0289],
        [-0.0189,  0.0755,  0.0105,  ...,  0.0831,  0.0385, -0.0305],
        [-0.0727, -0.0136, -0.0711,  ...,  0.0009, -0.0264, -0.0036]])

In [40]:
cosine_scores_movies  = util.cos_sim(embeddings_movies, embeddings_movies)
cosine_scores_movies  

tensor([[1.0000, 0.3099, 0.0994,  ..., 0.0916, 0.0919, 0.1399],
        [0.3099, 1.0000, 0.2149,  ..., 0.1619, 0.0512, 0.2492],
        [0.0994, 0.2149, 1.0000,  ..., 0.1748, 0.0484, 0.2267],
        ...,
        [0.0916, 0.1619, 0.1748,  ..., 1.0000, 0.0721, 0.1308],
        [0.0919, 0.0512, 0.0484,  ..., 0.0721, 1.0000, 0.1819],
        [0.1399, 0.2492, 0.2267,  ..., 0.1308, 0.1819, 1.0000]])

In [49]:
pairs = []
for i in range(cosine_scores_movies.shape[0]):
    for j in range(cosine_scores_movies.shape[1]):
        if i != j and cosine_scores_movies[i][j] < 1.0:
            pairs.append({"index": [i, j], "score": cosine_scores_movies[i][j]})

# Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x["score"], reverse=True)

for pair in pairs[0:3]:
    i, j = pair["index"]
    print("{} \t \n\n{} \n\n\t Score: {:.1f}".format(
        all_overviews[i], all_overviews[j], pair["score"]
    ))

A justice drama based on a true story about a man on death row who in his last days forms a strong relationship with a nun who teaches him forgiveness and gives him spirituality as she accompanies him to his execution. Susan Sarandon won an Oscar for best female actress for her convincing portrayal of Sister Helen Prejean. 	 

Susan wants to work in television and will therefore do anything it takes, even if it means killing her husband. A very dark comedy from independent director Gus Van Sant with a brilliant Nicole Kidman in the leading role. 

	 Score: 0.6
Susan wants to work in television and will therefore do anything it takes, even if it means killing her husband. A very dark comedy from independent director Gus Van Sant with a brilliant Nicole Kidman in the leading role. 	 

A justice drama based on a true story about a man on death row who in his last days forms a strong relationship with a nun who teaches him forgiveness and gives him spirituality as she accompanies him to hi