In [4]:
import os
# Configure a system proxy for the Jupyter notebook process;
# without it, connecting to Hugging Face may fail.
proxy = 'http://192.168.50.182:7890'
# Export the same proxy URL under both the lowercase and uppercase
# spellings of the HTTP and HTTPS proxy variables.
os.environ.update(
    dict.fromkeys(
        ('http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY'),
        proxy,
    )
)

In [5]:
from sentence_transformers import SentenceTransformer  # https://www.sbert.net/
import numpy as np

# The required model is downloaded automatically on first use.
# Model card: https://huggingface.co/BAAI/bge-large-en-v1.5
MODEL_NAME = 'BAAI/bge-large-en-v1.5'
model = SentenceTransformer(MODEL_NAME)

def embedding(texts: list[str]) -> np.ndarray:
    """
    Embed a list of texts with the module-level ``model``.

    Args:
        texts: The texts to encode.

    Returns:
        The result of ``model.encode`` for ``texts``, i.e. the embedding of
        every input text. Encoding is requested with
        ``normalize_embeddings=True``.
    """
    return model.encode(texts, normalize_embeddings=True)

def calculateSimilarity(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """
    Compute the similarity between two sets of embeddings.

    Args:
        a: Embeddings as an ndarray.
        b: Embeddings as an ndarray.

    Returns:
        ``a @ b.T``. For 2-D inputs, entry ``[i, j]`` is the dot product of
        row ``i`` of ``a`` with row ``j`` of ``b``; this equals cosine
        similarity only when the rows are unit-normalized.
    """
    b_transposed = b.T
    return a @ b_transposed

In [6]:
# Smoke test.
# Known to run successfully on Python 3.9.18.
text1 = ['sample1', 'sample2']
text2 = ['sample3', 'sample4']
embedding1, embedding2 = embedding(text1), embedding(text2)
print(embedding1)
# Pairwise similarity between the two batches (same as embedding1 @ embedding2.T).
print(calculateSimilarity(embedding1, embedding2))

[[ 0.0156503   0.03648347  0.01993614 ... -0.00103826 -0.00561563
  -0.03185157]
 [ 0.00716446  0.04983097  0.04704388 ... -0.0212795  -0.00602324
  -0.01915275]]
[[0.8282864 0.833615 ]
 [0.8557756 0.8541577]]


In [8]:
# Text content taken from the "top" node.
top_text = "LIVE FACT-CHECK: Clinton is right, Trump encouraged Japan to get a nuclear weapon. Mostly true."

# Longer article text accompanying the claim above. Because of where the
# triple quotes sit, the literal begins and ends with a newline character.
top_extra_text = """
Hillary Clinton says Donald Trump argued U.S. should 'encourage' Japan to get nuclear weapons
During a high-profile foreign policy speech, Democratic presidential candidate Hillary Clinton repeated a series of past statements by Donald Trump to show that her Republican rival poses risks for international relations.

One of these past statements by Trump involved whether Japan should become a nuclear-armed state.
"It's no small thing when he suggests that America should withdraw our military support for Japan, encourage them to get nuclear weapons," Clinton said in her June 2, 2016, speech. 

Is Clinton correct that Trump encouraged Japan to get nuclear weapons?

What Trump has said

As we noted earlier, Clinton’s evidence includes interviews and comments Trump has made on the campaign trail.

When Trump sat for a March 26 interview with the New York Times, he was asked directly, "Would you object if (Japan) got their own nuclear arsenal, given the threat that they face from North Korea and China?"

Trump responded in part, "There’ll be a point at which we’re just not going to be able to do it anymore. Now, does that mean nuclear? It could mean nuclear. It’s a very scary nuclear world. Biggest problem, to me, in the world, is nuclear, and proliferation. At the same time, you know, we’re a country that doesn’t have money."

Let’s set aside Trump’s internal inconsistency -- saying that Japan might need nuclear weapons, and then, just seconds later, saying that the world’s "biggest problem" is nuclear proliferation.

If you read his words closely, Trump didn’t quite say he was encouraging Japan to get nuclear weapons, but he did go right up to the line. He was leaving the option on the table.

Having seen Trump’s comments to the New York Times, CNN’s Anderson Cooper asked him about it in a CNN town hall in Milwaukee on March 29.

Answering Cooper’s question about breaking with Japan’s non-nuclear stance, Trump said, "At some point we have to say, you know what, we're better off if Japan protects itself against this maniac in North Korea."

In fact, during the town hall, Cooper gave Trump multiple opportunities to back off his suggestion that Japan may need nuclear weapons, but Trump never did. He said, in succession, "maybe it's going to have to be time to change," "at some point we have to say, you know what, we're better off," "it's going to happen, anyway," and "wouldn't you rather in a certain sense have Japan have nuclear weapons?"
"""

# Each text is wrapped in a one-element list before being embedded.
# NOTE(review): the encoder may truncate long inputs at its maximum sequence
# length -- confirm whether all of top_extra_text is actually embedded.
top_text_embedding, top_extra_text_embedding = (
    embedding([top_text]),
    embedding([top_extra_text]),
)

In [11]:
# Result computed after the tail of the article text was trimmed and the
# shortened text was pasted into this notebook.
top_similarity = calculateSimilarity(top_extra_text_embedding, top_text_embedding)
print(top_similarity)

[[0.80357563]]
