# __[:+:]__ SBERT Embeddings Test

In [27]:
from sentence_transformers import SentenceTransformer, util, InputExample, losses
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import plotly.express as px
from umap import UMAP
import pandas as pd
import numpy as np
import umap.plot
import pickle
import json

### Get Data

In [37]:
def _load_docs(path, preview=3):
    """Read a text file into a list of lines and print a short preview.

    Parameters
    ----------
    path : str
        Text file to read; each line is treated as one document.
    preview : int
        Maximum number of leading lines echoed after the count banner.

    Returns
    -------
    list of str
        The lines exactly as returned by ``readlines()`` (trailing newlines kept).
    """
    # Explicit encoding so the result does not depend on the platform's locale default.
    # NOTE(review): assumes the text files are UTF-8 -- confirm.
    with open(path, "r", encoding="utf-8") as f:
        docs = f.readlines()
    print("[+] -- Loaded ", len(docs), ' docs ---------------------------------------------------------------------------------------------|')
    # Slicing instead of indexing keeps the preview safe for files shorter than `preview` lines.
    for i, doc in enumerate(docs[:preview]):
        print(i, ': ', doc)
    return docs


corpus = _load_docs("ideological_corpus.txt")
transcript = _load_docs("transcript.txt")

# Read csv of generated reference opinions as dataframe
refCluster_df = pd.read_csv('referenceClusters.csv')
print(' ---------------------------------------------------------------------------------------------|\n', refCluster_df.head(), '\n')

[+] -- Loaded  70  docs ---------------------------------------------------------------------------------------------|
0 :  Support for Israeli settlements in the West Bank is crucial for security.

1 :  Palestinian statehood should be recognized and supported by the international community.

2 :  Economic cooperation between Israel and Palestine can lead to peace.

[+] -- Loaded  238  docs ---------------------------------------------------------------------------------------------|
0 :  Speaker 1 [0.01s - 26.53s]:  You're saying that you think it's the two narratives. Yeah, but we can't hear you very loudly. Maybe it's much better, much better.

1 :  Speaker 3 [4.60s - 25.21s]:  Yeah, I was just saying that I think it's pretty obvious why Native Americans would side with Palestinians. I didn't really know that there were Palestinians, that there were Native Americans who were pro-Israel. Sorry, is this better?

2 :  Speaker 3 [26.65s - 39.19s]:  Yeah, I was just saying that I'm not s

### Load Model and Embed 
Choose and load a model from https://sbert.net/docs/sentence_transformer/pretrained_models.html

In [29]:
#[+:]-- Pick the pretrained checkpoint to load.
#       'all-mpnet-base-v2': top quality for general purpose
#       'all-MiniLM-L6-v2' : smaller, faster, but decent quality
MODEL_NAME = 'all-mpnet-base-v2'
# MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

#[+:]-- Sanity check: embed two example sentences, one encode call each
emb1, emb2 = (
    model.encode(example)
    for example in (
        "This is a red cat with a hat.",
        "Have you seen my red cat?",
    )
)

#[+:]-- Cosine similarity score between the two example embeddings
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim)


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



Cosine-Similarity: tensor([[0.6053]])


In [48]:
# corpusEmb_arr= [model.encode(i) for i in corpus]
# print(len(corpusEmb_arr), len(corpusEmb_arr[0]))

#[+:] Extract only the opinion sentences (without the stance labels)
sentences = refCluster_df.loc[:, 'opinion']
print('\nOpinion Sentences sans Labels: \n', sentences.head(), '\n')

#[+:] Embed the extracted sentences in one batched call.
#     Passing the whole list lets the model batch the inputs instead of running
#     one forward pass per sentence; the result is a single 2-D numpy array
#     (one row per sentence) rather than a Python list of 1-D arrays.
sent_emb = model.encode(sentences.tolist())
print('Embeded Sentences: ', len(sent_emb), 'x', len(sent_emb[0]))

70 768
Opinion Sentences: 
 0    Israel has the right to exist as a sovereign J...
1    Israel's security measures are necessary to pr...
2    The Israeli government's efforts to negotiate ...
3    Israel's withdrawal from Gaza in 2005 was a si...
4    Hamas' use of civilian areas for launching att...
Name: opinion, dtype: object 

Embeded Sentences:  100 x 768


### UMAP Dimensionality Reduction & Visualization

#### 2D Projection

In [31]:

#[+] Also try "metric= cosine" arg for UMAP
# umapper_2d= UMAP(n_components= 2, init= 'random', random_state=0, metric='cosine')
# proj_2d= umapper_2d.fit_transform(corpusEmb_arr)
# fig_2d= px.scatter(proj_2d, x=0, y=1)
# fig_2d.show()

#### 3D Projection

In [165]:
#[+]-- UMAP on corpus
# umapper_3d= UMAP(n_components= 3, init= 'random', random_state=0, metric='cosine')
# proj_3d= umapper_3d.fit_transform(corpusEmb_arr)
# fig_3d= px.scatter_3d(proj_3d, x=0, y=1, z=2)
# fig_3d.update_traces(marker_size=7)
# fig_3d.show()

#[+]-- UMAP on generated opinions
# umap_3d = UMAP(n_components=3, init='random', random_state=0, metric='cosine')
# proj_3d = umap_3d.fit_transform(sent_emb)
# print(len(proj_3d), len(proj_3d[0]))
# fig_3d = px.scatter_3d(
#     proj_3d,
#     x=0, y=1, z=2,
#     color=refCluster_df.stance, 
#     labels={'color': 'stance'}
# )
# fig_3d.update_traces(marker_size=5)
# fig_3d.show()



#[+] Reduce every sentence embedding to three UMAP components (cosine metric, fixed seed)
umap_3d = UMAP(
    n_components=3,
    init='random',
    random_state=0,
    metric='cosine',
)
proj_3d = umap_3d.fit_transform(sent_emb)

#[+] Create a new dataframe where each sentence is now its embedding projection, plus its stance label
proj_3d_df = pd.DataFrame(proj_3d, columns=list('xyz'))
proj_3d_df['stance'] = refCluster_df['stance']
# Rows 49 and 50 are printed as a spot check (in the recorded output they carry different stance labels).
for row_idx in (49, 50):
    print(proj_3d_df.iloc[row_idx], '\n')

#[+] Echo the same two rows from the raw array, then plot the dataframe of projected embeddings in 3d
print(type(proj_3d), '\n', proj_3d[49:51])
fig_3d = px.scatter_3d(
    data_frame=proj_3d_df,
    x='x',
    y='y',
    z='z',
    color='stance',
    #labels={'color': 'stance'}
)
fig_3d.update_traces(marker_size=5)
fig_3d.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



x           9.101793
y           7.909717
z           7.746987
stance    pro-Israel
Name: 49, dtype: object 

x               6.83225
y              6.648385
z              8.856343
stance    pro-Palestine
Name: 50, dtype: object 

<class 'numpy.ndarray'> 
 [[9.101793  7.909717  7.7469873]
 [6.8322496 6.6483846 8.856343 ]]


## Take the Average of Each Embedding Cluster

In [169]:
# Select the pro-Israel opinion texts in one step (row mask + column), so the
# name `pro_isr` only ever refers to the Series of sentences.
pro_isr = refCluster_df.loc[refCluster_df['stance'] == "pro-Israel", 'opinion']
# An empty selection would silently produce a NaN centroid below; fail loudly instead.
assert len(pro_isr) > 0, "no rows with stance == 'pro-Israel' in refCluster_df"

# Preview up to five sentences; head() is safe even when fewer than five rows match.
for opinion in pro_isr.head(5):
    print(opinion)

# Encode the whole cluster in one batched call; the result is a 2-D array with one row per sentence.
pro_isr_emb = model.encode(pro_isr.tolist())
print(pro_isr_emb.shape)

# Cluster centroid: element-wise mean over the sentence axis.
pro_isr_avg = np.mean(pro_isr_emb, axis=0)
print(pro_isr_avg.shape)

Israel has the right to exist as a sovereign Jewish state.
Israel's security measures are necessary to protect its citizens from terrorism.
The Israeli government's efforts to negotiate peace have been met with consistent rejection from Palestinian leadership.
Israel's withdrawal from Gaza in 2005 was a significant concession aimed at fostering peace.
Hamas' use of civilian areas for launching attacks justifies Israel's self-defense actions.
(50, 768)
(768,)


In [70]:
# Select the pro-Palestine opinion texts in one step (row mask + column), so the
# name `pro_pal` only ever refers to the Series of sentences.
pro_pal = refCluster_df.loc[refCluster_df['stance'] == "pro-Palestine", 'opinion']
# An empty selection would silently produce a NaN centroid below; fail loudly instead.
assert len(pro_pal) > 0, "no rows with stance == 'pro-Palestine' in refCluster_df"
print(pro_pal.head())

# Encode the whole cluster in one batched call; the result is a 2-D array with one row per sentence.
pro_pal_emb = model.encode(pro_pal.tolist())
print(pro_pal_emb.shape)

# Cluster centroid: element-wise mean over the sentence axis.
pro_pal_avg = np.mean(pro_pal_emb, axis=0)
print(pro_pal_avg.shape)

50    Israel's continued expansion of settlements in...
51    The Israeli blockade of Gaza has created a hum...
52    Israel's use of disproportionate force in Gaza...
53    The Israeli government's refusal to engage in ...
54    Israel's occupation of Palestinian territories...
Name: opinion, dtype: object
(50, 768)
(768,)


### Find Nearest Cluster for a given Opinion

In [173]:
# Embed the opinion to classify.
opinion_emb = model.encode("Israel is comitting genocide")

# Project the two cluster averages and the opinion into 3D for plotting.
# NOTE(review): only three points are fitted here, and the recorded output shows UMAP
# warning that n_neighbors is larger than the dataset size -- confirm the plot is meaningful.
umap_3d = UMAP(
    n_components=3,
    init='random',
    random_state=0,
    metric='cosine',
)
proj_3d = umap_3d.fit_transform([pro_isr_avg, pro_pal_avg, opinion_emb])
print('Embedded: \n', proj_3d)

# One labelled row per projected point, in the same order as the list passed to UMAP.
proj_3d_df = pd.DataFrame(proj_3d, columns=list('xyz')).assign(
    stance=['pro-IS', 'pro-PS', 'Opinion']
)
print(proj_3d_df)

fig_3d = px.scatter_3d(
    proj_3d_df,
    x='x',
    y='y',
    z='z',
    color='stance',
    text='stance',
    #labels={'color': np.array(mapping.keys())}
)
fig_3d.update_traces(marker_size=10)
fig_3d.show()

# Cosine similarity between the opinion and each cluster average; report the larger one
# (when cos1 is not strictly greater, the pro-Palestine branch is taken).
cos1 = util.cos_sim(opinion_emb, pro_isr_avg)
cos2 = util.cos_sim(opinion_emb, pro_pal_avg)
verdict = ("pro Israel |  ", cos1) if cos1 > cos2 else ("pro Palestine | ", cos2)
print(*verdict)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1



Embedded: 
 [[-5.162796   3.8370411 15.713148 ]
 [-5.3541427  4.223686  14.909268 ]
 [-6.0192103  4.875412  15.2080965]]
          x         y          z   stance
0 -5.162796  3.837041  15.713148   pro-IS
1 -5.354143  4.223686  14.909268   pro-PS
2 -6.019210  4.875412  15.208097  Opinion


pro Palestine |  tensor([[0.7082]])


## TODO
- [+] Install packages
- [+] Embed corpus lines
- [+] UMAP Dim reduction
- [+] Get the Average Embedding for the Cluster, and compare 
- [ ] Explore different sentence transformers models
- [ ] Explore clustering methods (read documentation)
- [ ] Embed LLM-generated polarized opinions as references
- [ ] Try other dimensionality reduction techniques


## Possible Ideas
-   Recreate Lupin: Classify embedding within a spectrum between two polar opposite references
-   KNN-based algo: instead of using a spectrum, simply classify each opinion based on its nearest neighbors in vector space, or some variation of that
    - Given the sometimes ambiguous and overlapping nature of ideologies, this might be used as an alternative to classifying based on discrete categories
    - ex: "given this person's opinion, he is most ideologically aligned with politicians x and y"
    - We could also create an embedding of an entire ideology as an average of a collection of quotes, and do classification based on the single nearest neighbor

-   Recreate the 2019 Party Embedding method using SBERT
    - Embed each opinion in the dataset with a party/ideology label
    - Classify each new opinion based on distance to that label (need more research to confirm this method)