# __[:+:]__ SBERT Embeddings Test

In [2]:
from sentence_transformers import SentenceTransformer, util, InputExample, losses
import numpy as np
import matplotlib.pyplot as plt
import pickle
import json
import plotly.express as px
from umap import UMAP


### Get Data

In [3]:
def _read_docs(path):
    """Read a text file and return one document per line.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of strings, one per line of the file, with the trailing
        newline removed. Blank lines are kept, so list indices still match
        the file's line numbers.
    """
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the files are UTF-8 -- adjust if they are not.
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


def _preview(docs, n=5):
    """Print the first `n` documents, or fewer if the list is shorter."""
    # Slicing (instead of range(n)) avoids an IndexError on short files.
    for i, doc in enumerate(docs[:n]):
        print(i, ': ', doc)


# Ideological statements, one per line.
corpus = _read_docs("ideological_corpus.txt")
print("[+] -- Loaded ", len(corpus), ' docs ---------------------------------------------------------------------------------------------|')
_preview(corpus)

# Transcript lines, one per line.
transcript = _read_docs("transcript.txt")
print("[+] -- Loaded ", len(transcript), ' docs ---------------------------------------------------------------------------------------------|')
_preview(transcript)

[+] -- Loaded  70  docs ---------------------------------------------------------------------------------------------|
0 :  Support for Israeli settlements in the West Bank is crucial for security.

1 :  Palestinian statehood should be recognized and supported by the international community.

2 :  Economic cooperation between Israel and Palestine can lead to peace.

3 :  Military action is necessary to protect Israeli borders from threats.

4 :  Human rights abuses against Palestinians must be addressed by global organizations.

[+] -- Loaded  238  docs ---------------------------------------------------------------------------------------------|
0 :  Speaker 1 [0.01s - 26.53s]:  You're saying that you think it's the two narratives. Yeah, but we can't hear you very loudly. Maybe it's much better, much better.

1 :  Speaker 3 [4.60s - 25.21s]:  Yeah, I was just saying that I think it's pretty obvious why Native Americans would side with Palestinians. I didn't really know that there were

### Embed Sentences

In [7]:
# Instantiate the sentence-embedding model (the MiniLM checkpoint).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the two sample sentences, one encode() call per sentence.
emb1, emb2 = (
    model.encode(sentence)
    for sentence in (
        "This is a red cat with a hat.",
        "Have you seen my red cat?",
    )
)

# Compare the two embeddings with cosine similarity and print the score.
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim)



Cosine-Similarity: tensor([[0.6153]])


In [6]:
# Encode the whole corpus in a single call so the model can batch the
# sentences internally instead of being invoked once per line.
# Given a list of sentences, encode() returns one embedding row per sentence.
corpusEmb_arr = model.encode(corpus)

# Print "<number of docs> <embedding size>". Using .shape avoids indexing
# element 0, which would raise on an empty corpus.
print(*corpusEmb_arr.shape)

70 384


### UMAP Dimensionality Reduction

In [None]:
# Reference example: UMAP on the iris dataset bundled with plotly.
# `df` was never defined in this notebook, so the cell failed with a
# NameError on a fresh kernel; load the dataset explicitly.
df = px.data.iris()

# Label-based slice: every column up to and including 'petal_width'.
features = df.loc[:, :'petal_width']

# Fixed random_state keeps the projections reproducible between runs.
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(features)
proj_3d = umap_3d.fit_transform(features)

# Colour each projected point by its species label.
fig_2d = px.scatter(
    proj_2d, x=0, y=1,
    color=df.species, labels={'color': 'species'}
)
fig_3d = px.scatter_3d(
    proj_3d, x=0, y=1, z=2,
    color=df.species, labels={'color': 'species'}
)
fig_3d.update_traces(marker_size=5)

fig_2d.show()
fig_3d.show()

#### 2D Projection

In [20]:

# Project the corpus embeddings down to 2 dimensions, using cosine distance
# as the UMAP metric. random_state is fixed for reproducibility.
umapper_2d = UMAP(n_components=2, init='random', random_state=0, metric='cosine')
proj_2d = umapper_2d.fit_transform(corpusEmb_arr)

# One point per corpus line; hovering over a point shows its sentence.
fig_2d = px.scatter(
    proj_2d, x=0, y=1,
    hover_name=[doc.strip() for doc in corpus],
    title="UMAP 2D projection of corpus sentence embeddings",
)
fig_2d.update_layout(xaxis_title="UMAP 1", yaxis_title="UMAP 2")
fig_2d.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



#### 3D Projection

In [19]:
# Project the corpus embeddings down to 3 dimensions, using cosine distance
# as the UMAP metric. random_state is fixed for reproducibility.
umapper_3d = UMAP(n_components=3, init='random', random_state=0, metric='cosine')
proj_3d = umapper_3d.fit_transform(corpusEmb_arr)

# One point per corpus line; hovering over a point shows its sentence.
fig_3d = px.scatter_3d(
    proj_3d, x=0, y=1, z=2,
    hover_name=[doc.strip() for doc in corpus],
    title="UMAP 3D projection of corpus sentence embeddings",
)
fig_3d.update_layout(
    scene=dict(xaxis_title="UMAP 1", yaxis_title="UMAP 2", zaxis_title="UMAP 3")
)
fig_3d.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



## TODO
- [+] Install packages
- [+] Embed corpus lines
- [+] UMAP Dim reduction
- [ ] Explore different sentence transformers models
- [ ] Try other dimensionality reduction techniques