In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import json
import torch
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from scipy.spatial.distance import cdist
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from nlp_function import pick_random_keys, stopwords_func, lower_processing

In [2]:
## Read in data
def _load_json(path):
    """Load one JSON file from disk.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# Read in training data (claim)
tclaim_data = _load_json('../project-data/train-claims.json')

# Read in development data (claim)
dclaim_data = _load_json('../project-data/dev-claims.json')

# Read in test data (claim)
uclaim_data = _load_json('../project-data/test-claims-unlabelled.json')

# Read in evidence data
evi_data = _load_json('../project-data/evidence.json')

## Preprocessing - Lowercase operation of the case
# lower_processing comes from the project-local nlp_function module; the second
# argument selects which text to lowercase (claim text vs. evidence text).
tclaim_data = lower_processing(tclaim_data, "claim_text")
dclaim_data = lower_processing(dclaim_data, "claim_text")
uclaim_data = lower_processing(uclaim_data, "claim_text")
evi_data = lower_processing(evi_data, 'evidence')

# ## Remove stopwords from claims and evidence (optional)
# stopwords_func also comes from nlp_function; it is given the NLTK English
# stopword set and a tag saying whether the data holds claims or evidence.
stop_words = set(stopwords.words('english'))
tclaim_data = stopwords_func(stop_words, "claim", tclaim_data)
dclaim_data = stopwords_func(stop_words, "claim", dclaim_data)
uclaim_data = stopwords_func(stop_words, "claim", uclaim_data)
evi_data = stopwords_func(stop_words, "evidence", evi_data)

# ## Create claim-evidence pair based on training set
# train_pairs = []
# for i in tclaim_data.values():
#     for j in i["evidences"]:
#         train_pairs.append((i["claim_text"], evi_data[j], 1))

# ## insert negative sample to the training set
# for i in tclaim_data.values():
#     excluded_keys = i["evidences"]
#     random_keys = pick_random_keys(evi_data, excluded_keys, len(excluded_keys))
#     for j in random_keys:
#         train_pairs.append((i["claim_text"], evi_data[j], 0))

In [3]:
# Obtain sentence list: training claims first, then test claims, so the
# embedding matrix can be split back into the two sets by position.
train_sentences = [claim["claim_text"] for claim in tclaim_data.values()]
test_sentences = [claim["claim_text"] for claim in uclaim_data.values()]
sentence_dict = {"train": len(train_sentences), "test": len(test_sentences)}
sentence_list = train_sentences + test_sentences
if sentence_dict["train"] == 0:
    raise ValueError("No training claims available for nearest-neighbour lookup")

# Tunables for this cell
EMBED_BATCH_SIZE = 32  # sentences per forward pass; bounds peak memory
TOP_K = 3              # number of nearest training claims kept per test claim

# Start embedding with bert
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# List of sentences
sentences = sentence_list

# Tokenize, encode and embed the sentences in mini-batches instead of pushing
# the whole corpus through the model in a single forward pass. Each batch is
# padded to its own longest sentence; the attention mask produced by the
# tokenizer is forwarded to the model together with the input ids.
cls_batches = []
with torch.no_grad():
    for start in range(0, len(sentences), EMBED_BATCH_SIZE):
        encoded_inputs = tokenizer.batch_encode_plus(
            sentences[start:start + EMBED_BATCH_SIZE],
            add_special_tokens=True,
            padding='longest',
            truncation=True,
            return_tensors='pt'
        )
        model_output = model(**encoded_inputs)
        # Hidden state at position 0 of every sequence is the sentence vector
        cls_batches.append(model_output.last_hidden_state[:, 0, :])
sentence_embeddings = torch.cat(cls_batches, dim=0)

# Embed sentences and obtain test set vectors
embeddings = sentence_embeddings.numpy()
train_matrix = embeddings[:sentence_dict["train"]]
test_matrix = embeddings[sentence_dict["train"]:]

# Capture the closest training instances (indices) to each test claim.
# distances[t, r] is the Euclidean distance between test row t and train row r.
# A stable argsort orders neighbours nearest-first and breaks ties by the lower
# training index, so the single best match is simply the first neighbour.
# top_k is clamped so that a training set smaller than TOP_K does not fail.
top_k = min(TOP_K, train_matrix.shape[0])
distances = cdist(test_matrix, train_matrix, metric='euclidean')
nearest = np.argsort(distances, axis=1, kind='stable')[:, :top_k]
test_train_index = nearest.tolist()
best_train_index = nearest[:, 0].tolist()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# 0-R Classification: most frequent training label and how often it occurs.
# These statistics are informational only; the assignment below copies labels
# from the nearest training claim rather than using the majority label.
label_list = [claim["claim_label"] for claim in tclaim_data.values()]
strings = label_list
counter = Counter(strings)
most_common = counter.most_common(1)
most_frequent_string, frequency = most_common[0] if most_common else (None, 0)

# Assign label and evidence to the test set
train_key_list = list(tclaim_data.keys())

# The index lists are built in another cell; refuse to continue if they do not
# line up one-to-one with the test claims (e.g. stale kernel state).
if not (len(best_train_index) == len(test_train_index) == len(uclaim_data)):
    raise ValueError(
        "Nearest-neighbour indices do not match the number of test claims; "
        "re-run the embedding cell before assigning predictions"
    )

for claim, best_idx, neighbour_idxs in zip(
        uclaim_data.values(), best_train_index, test_train_index):
    # Label comes from the single closest training claim
    claim["claim_label"] = tclaim_data[train_key_list[best_idx]]["claim_label"]
    # Evidence is pooled from all retained neighbours
    evidence_list = []
    for neighbour_idx in neighbour_idxs:
        evidence_list += tclaim_data[train_key_list[neighbour_idx]]["evidences"]
    # Neighbours may share evidence ids; keep the first occurrence of each
    claim["evidences"] = list(dict.fromkeys(evidence_list))

# Save the test set result
file_path = '../test-claims-predictions.json'
with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(uclaim_data, json_file)

In [12]:
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Step 1: Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 2: Prepare input sentence
sentence = "This is an example sentence."

# Step 3: Tokenize and encode the sentence
# `tokens` holds the word pieces only (handy for inspection). The model input
# is built by calling the tokenizer directly so that the [CLS]/[SEP] special
# tokens are added; convert_tokens_to_ids alone would leave them out.
tokens = tokenizer.tokenize(sentence)
encoded = tokenizer(sentence, add_special_tokens=True, return_tensors='pt')
input_ids = encoded["input_ids"]

# Step 4: Generate word embeddings
with torch.no_grad():
    outputs = model(**encoded)
    # Drop the first ([CLS]) and last ([SEP]) positions so that the rows of
    # word_embeddings line up one-to-one with `tokens`.
    word_embeddings = outputs.last_hidden_state.squeeze(0)[1:-1]

# Step 5: Combine word embeddings to get sentence embedding (mean over tokens)
sentence_embedding = torch.mean(word_embeddings, dim=0)

# Step 6: Optional - Convert to NumPy array or other formats
sentence_embedding = sentence_embedding.numpy()

# Step 7: Print the shape of the sentence embedding
print("Sentence embedding:", (sentence_embedding.shape))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Sentence embedding: (768,)


In [16]:
# Inspect the dimensions of the per-token embedding tensor from the example cell
word_embeddings.size()

torch.Size([6, 768])

In [None]:
# NOTE: this cell repeats the BERT embedding + nearest-neighbour pipeline that
# already appears earlier in the notebook; keep only one copy when tidying up.

# Obtain sentence list: training claims first, then test claims, so the
# embedding matrix can be split back into the two sets by position.
train_sentences = [claim["claim_text"] for claim in tclaim_data.values()]
test_sentences = [claim["claim_text"] for claim in uclaim_data.values()]
sentence_dict = {"train": len(train_sentences), "test": len(test_sentences)}
sentence_list = train_sentences + test_sentences
if sentence_dict["train"] == 0:
    raise ValueError("No training claims available for nearest-neighbour lookup")

# Tunables for this cell
EMBED_BATCH_SIZE = 32  # sentences per forward pass; bounds peak memory
TOP_K = 3              # number of nearest training claims kept per test claim

# Start embedding with bert
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# List of sentences
sentences = sentence_list

# Tokenize, encode and embed the sentences in mini-batches instead of pushing
# the whole corpus through the model in a single forward pass. Each batch is
# padded to its own longest sentence; the attention mask produced by the
# tokenizer is forwarded to the model together with the input ids.
cls_batches = []
with torch.no_grad():
    for start in range(0, len(sentences), EMBED_BATCH_SIZE):
        encoded_inputs = tokenizer.batch_encode_plus(
            sentences[start:start + EMBED_BATCH_SIZE],
            add_special_tokens=True,
            padding='longest',
            truncation=True,
            return_tensors='pt'
        )
        model_output = model(**encoded_inputs)
        # Hidden state at position 0 of every sequence is the sentence vector
        cls_batches.append(model_output.last_hidden_state[:, 0, :])
sentence_embeddings = torch.cat(cls_batches, dim=0)

# Embed sentences and obtain test set vectors
embeddings = sentence_embeddings.numpy()
train_matrix = embeddings[:sentence_dict["train"]]
test_matrix = embeddings[sentence_dict["train"]:]

# Capture the closest training instances (indices) to each test claim.
# distances[t, r] is the Euclidean distance between test row t and train row r.
# A stable argsort orders neighbours nearest-first and breaks ties by the lower
# training index, so the single best match is simply the first neighbour.
# top_k is clamped so that a training set smaller than TOP_K does not fail.
top_k = min(TOP_K, train_matrix.shape[0])
distances = cdist(test_matrix, train_matrix, metric='euclidean')
nearest = np.argsort(distances, axis=1, kind='stable')[:, :top_k]
test_train_index = nearest.tolist()
best_train_index = nearest[:, 0].tolist()

In [17]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the pre-trained RoBERTa tokenizer and model
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

# Example sentence
sentence = "This is an example sentence."

# Tokenize and encode the sentence
encode_options = {
    "add_special_tokens": True,
    "padding": 'longest',
    "truncation": True,
    "return_tensors": 'pt',
}
encoded_input = tokenizer.encode_plus(sentence, **encode_options)

# Generate the sentence embedding: run the model without gradient tracking and
# keep the hidden state at the first token position of each sequence
with torch.no_grad():
    model_output = model(**encoded_input)
    cls_vectors = model_output.last_hidden_state[:, 0, :]

# Convert the result to a NumPy array and show its shape
sentence_embeddings = cls_vectors.numpy()
sentence_embeddings.shape

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(1, 768)