In [78]:
import sys
import io, os
import numpy as np
from typing import Optional, Union, List, Dict, Tuple
import logging
import argparse
from prettytable import PrettyTable
import transformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn
import torch.distributed as dist
from datasets import load_dataset
from scipy.stats import spearmanr
from train import DataTrainingArguments

from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrainedTokenizerBase

In [79]:
# Pick the compute device first: the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Restore the tokenizer and the encoder saved under ./result/, placing the encoder on that device
tokenizer = AutoTokenizer.from_pretrained("./result/")
model = AutoModel.from_pretrained("./result/").to(device)

Some weights of BertModel were not initialized from the model checkpoint at ./result/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [80]:
import pandas as pd

# Read the test sentence pairs; the loop below reads the `sent0` and `sent1` columns.
file_path = "./data/COMPLAINTS_RECEIVED_2020_2023/com_data/test.csv"
# Decide the separator from the file extension rather than from "csv" appearing anywhere in the path.
test_dataset = pd.read_csv(file_path, sep=',' if file_path.lower().endswith('.csv') else None)

# Number of leading rows to encode for this quick check; raise it to score more of the file.
NUM_TEST_PAIRS = 2
# Never request more rows than the file holds, so a short CSV cannot raise a lookup error.
num_pairs = min(NUM_TEST_PAIRS, len(test_dataset))

test_feature_list = []
for i in range(num_pairs):
    # Positional lookups (.iloc) do not depend on the DataFrame's index labels.
    text_a = test_dataset['sent0'].iloc[i]
    text_b = test_dataset['sent1'].iloc[i]
    # Each sentence is tokenized on its own; no explicit max_length is passed, so
    # padding='max_length' pads up to the tokenizer's own maximum length.
    encoded_sent1 = tokenizer(text_a, truncation=True, padding='max_length', return_tensors="pt")
    encoded_sent2 = tokenizer(text_b, truncation=True, padding='max_length', return_tensors="pt")
    test_feature_list.append((encoded_sent1, encoded_sent2))

test_feature_list

[({'input_ids': tensor([[  101,  2026,  4316,  3631,  2091,  2006,  2260,  1013,  2603,  1013,
           16798,  2475,  1010,  2009,  2790,  3194,  4945,  1012,  2009,  3047,
            2096,  1045,  2001,  7194,  2000,  2026,  3105,  1999,  1996,  3944,
            1012,  1045,  9083,  2026,  2482,  2006,  5581,  2843,  1998,  2106,
            2025,  2693,  2009,  1012,  2349,  2000,  4234,  6209,  1010,  1045,
            2001,  4039,  2000, 15805,  1996,  4316,  2000,  2026,  7541, 27005,
           16743,  5605,  1006, 11427,  2015,  6341,  1010,  9108,  1007,  2127,
            2260,  1013,  2676,  1013, 16798,  2475,  1012,  2320,  2026,  4316,
            2001, 18948,  1010,  1996, 27005, 16743,  5605,  6727,  2033,  2008,
            2026,  4316,  2018,  1037,  2561,  1997,  2176,  1006,  1018,  1007,
           17722,  1010,  2021,  2028,  1997,  1996, 17722, 13735,  1012,  1045,
            6727,  1996, 27005, 16743,  5605,  2008,  1045,  2196,  2363,  9131,
           268

In [81]:
from torch.utils.data import Dataset, DataLoader
class TestDataset(Dataset):
    """Map-style dataset that serves already-prepared samples unchanged.

    Args:
        data: Sized, indexable collection of samples.
        tokenizer: Accepted but ignored; nothing is tokenized here.
        max_len: Accepted but ignored.
    """

    def __init__(self, data, tokenizer, max_len=256):
        # Only the samples are kept; the other two arguments are not stored.
        self.data = data

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        samples = self.data
        return samples[index]

    def __len__(self):
        """Return the number of stored samples."""
        samples = self.data
        return len(samples)

# Wrap the encoded pairs and serve them one pair per batch, in their original order.
# NOTE: this rebinds `test_dataset`, which previously held the DataFrame read from the CSV.
test_dataset = TestDataset(data=test_feature_list, tokenizer=tokenizer)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

# test_dataset.data

In [82]:
from torch.nn import CosineSimilarity
from tqdm import tqdm

def _first_token_embedding(model, encoded, device):
    """Run one tokenized batch through ``model`` and return the hidden state at position 0."""
    model_inputs = {}
    for key in ("input_ids", "attention_mask", "token_type_ids"):
        tensor = encoded.get(key)
        # Some tokenizers emit no token_type_ids; forward only the fields that exist.
        if tensor is not None:
            # Each sample was tokenized as its own single-row batch, so the collated
            # tensor is expected to carry a singleton axis 1 — squeeze it away.
            model_inputs[key] = tensor.squeeze(1).to(device)
    # Position 0 of the last layer is used as the sentence embedding
    # (the [CLS] token for the BERT checkpoint loaded in this notebook).
    return model(**model_inputs).last_hidden_state[:, 0]


def evaluate(model, dataloader, device):
    """Score every sentence pair by cosine similarity of their first-token embeddings.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``.
        dataloader: Iterable yielding ``(sent1, sent2)`` pairs of tokenizer outputs
            (mappings with ``input_ids``, ``attention_mask`` and, optionally,
            ``token_type_ids``).
        device: Torch device the inputs are moved to before the forward pass.

    Returns:
        A flat list of Python floats, one similarity per sentence pair, in
        iteration order. Empty when ``dataloader`` yields nothing.
    """
    model.eval()
    cos = CosineSimilarity(dim=-1)

    sim_scores = []
    with torch.no_grad():
        for sent1, sent2 in tqdm(dataloader):
            # Each sentence is encoded from its OWN fields (sentence 1 previously
            # borrowed sentence 2's token_type_ids).
            sent1_pred = _first_token_embedding(model, sent1, device)
            sent2_pred = _first_token_embedding(model, sent2, device)

            # One score per row of the batch, so batch sizes above 1 also work.
            sim = cos(sent1_pred, sent2_pred)
            sim_scores.extend(sim.reshape(-1).tolist())

    return sim_scores



# Score every test pair, then reduce the per-pair similarities to their arithmetic mean
scores = evaluate(model=model, dataloader=test_dataloader, device=device)
avg_sim = np.mean(scores)

100%|██████████| 2/2 [00:09<00:00,  4.71s/it]


In [83]:
scores

[0.40635037422180176, 0.23144163191318512]

In [84]:
avg_sim

0.31889600306749344