# Generate encodings from dataset

#### Imports

In [1]:
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import notebook_login
from torch import device, cuda, save, load
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

# Do not truncate long cell values (e.g. full bug descriptions) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Hugging Face Hub organization and model repository of the fine-tuned encoder.
organization = 'luiz-and-robert-thesis'
# model_name is also reused further down to name the saved encoding file.
model_name = 'all-mpnet-base-newtriplets-v2-lr-1e-8-m-1-e-3'
# Load the sentence-transformers model identified by '<organization>/<model_name>'.
model = SentenceTransformer(f'{organization}/{model_name}')

In [3]:
# Grab the transformer held by the first module of the SentenceTransformer pipeline for inspection.
auto_model = model._first_module().auto_model

In [None]:
# List the name of every parameter of the underlying transformer.
for name, param in auto_model.named_parameters():
    print(name)

In [None]:
# Print the module structure of the transformer held by the first pipeline module.
print(model[0].auto_model)

In [3]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
# The result is stored under its own name so that the imported torch `device`
# constructor is not overwritten: rebinding `device` made this cell fail with
# "'torch.device' object is not callable" when it was run a second time.
target_device = device('cuda' if cuda.is_available() else 'cpu')
# Move the model to the selected device; the returned model is the cell output.
model.to(target_device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

#### Log in to Hugging Face 

In [4]:
# Interactive Hugging Face login; renders a login widget in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#### Load the dataset

In [4]:
# Name of the dataset to encode; it selects the Excel file that is read below.
dataset = 'hallsnas'
split_path = f'./encoding_data/whole_datasets/{dataset}_dataset.xlsx'
# Use the sheet's 'ID' column as the DataFrame index, so each row is labelled by its report id.
dataset_df = pd.read_excel(split_path, index_col='ID')

In [5]:
# Preview the first rows to confirm the sheet was read with ID as the index.
dataset_df.head()

Unnamed: 0_level_0,description
ID,Unnamed: 1_level_1
1,"When language=SWE set, other options are not visible unless you scrool"
2,"When Language = ENG, on the parking zone page, swedish words still remain"
3,"Random registration numbers are allowed. For example, ""LAKDHSBSKS"","
4,"Pressing change password, switched the language"
5,"Missing feedback during login, if info is wrong"


In [16]:
# Number of rows (bug reports) in the loaded dataset.
print(len(dataset_df))

75


In [6]:
# Print the first ten rows as a quick sanity check of the loaded descriptions.
test_df = dataset_df.iloc[0:10]
for index, row in test_df.iterrows():
    # .iloc[0] reads the first column by position. Plain row[0] on a Series with
    # string labels is deprecated in pandas and emits a FutureWarning.
    print(index, row.iloc[0]) # short
    print('*********************************************************************************')
    # print() # long
    # print('----------------------------------------------------------------------------------')

   
    

1 When language=SWE set, other options are not visible unless you scrool
*********************************************************************************
2 When Language = ENG, on the parking zone page, swedish words still remain
*********************************************************************************
3 Random registration numbers are allowed. For example, "LAKDHSBSKS", 
*********************************************************************************
4 Pressing change password, switched the language 
*********************************************************************************
5 Missing feedback during login, if info is wrong
*********************************************************************************
6 Customer was defaulted during login. Was not obvious it needed to be changed
*********************************************************************************
7 No validation when entering a reg (e.g. can enter too many characters, speical characters etc)
***********

  print(index, row[0]) # short


#### Encode the dataset with **long** description only

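- Before running, check that the column position read inside the function matches the description column of the loaded dataset.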

In [7]:

def encode_long_desc(desc_col=0):
    """Encode one description column of ``dataset_df``, row by row.

    Reads the notebook-level ``dataset_df`` and ``model``.

    Args:
        desc_col: Zero-based position of the column to encode (the index
            column is not counted). Defaults to 0, the position this function
            has always read.

    Returns:
        A DataFrame with one row per row of ``dataset_df`` and two columns:
        ``encoded_desc`` (the value returned by ``model.encode``, or ``None``
        when encoding that row raised) and ``bug_id`` (the row's index label).
    """
    # prefix = 'Represent this sentence for searching relevant passages:'
    encoded_desc = []
    report_id_list = []
    amount_of_none = 0
    # Remember only the first failure so a systematic problem (e.g. a wrong
    # column position) is visible without printing one line per row.
    first_failure = None

    # iterrows() has no length, so pass total= to get a real progress bar.
    for index, row in tqdm(dataset_df.iterrows(), total=len(dataset_df)):
        try:
            # Encode the description of each bug report.
            # .iloc is explicit positional access; row[<int>] on a Series with
            # string labels is deprecated in pandas.
            bug_desc = row.iloc[desc_col]
            # desc_w_prefix = f'{prefix} {bug_desc}'
            encoded = model.encode(bug_desc)
        except Exception as exc:
            # Keep the row (with no encoding) so ids and encodings stay aligned.
            encoded = None
            amount_of_none += 1
            if first_failure is None:
                first_failure = (index, exc)
        # Add the encoded description and the report id to their corresponding lists
        encoded_desc.append(encoded)
        report_id_list.append(index)

    # Create a new dataframe with the encoded description and the bug id as columns
    dataset_encoded_long_desc_df = pd.DataFrame()
    dataset_encoded_long_desc_df["encoded_desc"] = encoded_desc
    dataset_encoded_long_desc_df["bug_id"] = report_id_list
    print('Amount of none: ', amount_of_none)
    if first_failure is not None:
        print(f'First failure at report {first_failure[0]}: {first_failure[1]!r}')
    return dataset_encoded_long_desc_df

#### Encode dataset with **long** description and **short** description

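- Before running, check that the column positions read inside the function match the long- and short-description columns of the loaded dataset.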

In [10]:
def encode_all_descs(long_desc_col=5, short_desc_col=10):
    """Encode '<short description> - <long description>' for every row of ``dataset_df``.

    Reads the notebook-level ``dataset_df`` and ``model``.

    Args:
        long_desc_col: Zero-based position of the long-description column
            (index column not counted). Defaults to 5, the position this
            function has always read.
        short_desc_col: Zero-based position of the short-description column.
            Defaults to 10, the position this function has always read.
            NOTE(review): these defaults only fit datasets that have at least
            11 columns; confirm them against the dataset being encoded.

    Returns:
        A DataFrame with one row per row of ``dataset_df`` and two columns:
        ``encoded_desc`` (the value returned by ``model.encode``, or ``None``
        when reading or encoding that row raised) and ``bug_id`` (the row's
        index label).
    """
    encoded_both_descs = []
    report_id_list_both = []
    amount_of_none_both = 0
    # Remember only the first failure so a systematic problem (e.g. a column
    # position that does not exist) is visible without one line per row.
    first_failure = None

    # Iterate over the whole dataset; total= gives tqdm a real progress bar.
    for index, row in tqdm(dataset_df.iterrows(), total=len(dataset_df)):
        try:
            # Read the long and short description by position. .iloc is
            # explicit positional access; row[<int>] on a Series with string
            # labels is deprecated in pandas.
            long_desc = row.iloc[long_desc_col]
            short_desc = row.iloc[short_desc_col]

            both_descs_enc = model.encode(f'{short_desc} - {long_desc}')
        except Exception as exc:
            # Keep the row (with no encoding) so ids and encodings stay aligned.
            both_descs_enc = None
            amount_of_none_both += 1
            if first_failure is None:
                first_failure = (index, exc)

        # Add the encoded descriptions and the report id to their corresponding lists
        encoded_both_descs.append(both_descs_enc)
        report_id_list_both.append(index)

    # Create a new dataframe with the encoded description and the bug id as columns
    dataset_encoded_both_descs_df = pd.DataFrame()
    dataset_encoded_both_descs_df["encoded_desc"] = encoded_both_descs
    dataset_encoded_both_descs_df["bug_id"] = report_id_list_both
    print('Amount of none: ', amount_of_none_both)
    if first_failure is not None:
        print(f'First failure at report {first_failure[0]}: {first_failure[1]!r}')
    return dataset_encoded_both_descs_df


#### Create encodings
- Choose the encoding function that matches the columns available in the loaded dataset.

In [8]:
# Encode the dataset. The result has one row per report with the columns
# 'encoded_desc' and 'bug_id'.
dataset_encoded_df = encode_long_desc()


  bug_desc = row[0]
  bug_desc = row[0]
75it [00:06, 10.88it/s]

Amount of none:  0





In [9]:
# Set the bug_id column as the label/index
dataset_encoded_df = dataset_encoded_df.set_index('bug_id')
dataset_encoded_df.head()

Unnamed: 0_level_0,encoded_desc
bug_id,Unnamed: 1_level_1
1,"[-0.012449062, -0.092524394, -0.017673265, 0.002895186, 0.02720169, -0.015151012, -0.004381051, -0.037702523, -0.031185754, 0.04736474, -0.02110277, 0.019928655, 0.0018177388, 0.06205484, 0.0015263349, 0.03835473, 0.038207788, -0.032465987, -0.028833048, -0.023916893, -0.021420507, 0.027777428, 0.032728445, -0.0072980504, 0.006813667, -0.018500652, 0.011122118, -0.04713803, 0.03657829, 0.042367034, -0.0057813195, 0.01377896, 0.06016402, -0.083999574, 2.1059294e-09, -0.06978049, -0.028868992, -0.04513728, -0.034817226, 0.002376858, 0.009903133, 0.028890455, -0.0044354103, 0.027343677, -0.025273385, -0.0070431926, 0.067718744, -0.037865084, -0.021001492, 0.008561814, 0.0139156375, 0.020511908, -0.040787242, -0.06298054, -0.05384617, 0.05706853, -0.037981786, -0.044503044, -0.003445838, 0.081666514, -0.0055157626, 0.021962428, 0.018705072, -0.00085091346, -0.06611018, -0.039541267, -0.09861349, -0.031353116, 0.055846997, -0.009330287, 0.0034196097, -0.01748743, -0.056697, 0.0051408587, 0.0029826767, 0.0559428, 0.023468878, 0.038080476, 0.011883072, -0.007458741, 0.037435997, -0.0006762537, -0.0052739223, 0.0018576672, 0.03679174, 0.13208881, -0.03843927, 0.048644472, 0.010914257, -0.024274623, 0.040144797, 0.0004855854, -0.019451734, -0.034940403, -0.07372645, 0.029502837, 0.033977173, -0.023075428, 0.024646096, -0.025119083, ...]"
2,"[0.04845379, -0.053859193, -0.027601514, 0.080250874, 0.014596229, -0.026597595, 0.005098598, 0.0017407266, -0.046224978, 0.041573215, 0.011309656, 0.008565416, 0.04685088, -0.025584769, -0.021070935, 0.003504769, -0.005192894, -0.07812236, -0.0718258, -0.031463876, 0.026281537, 0.033029173, 0.07439548, 0.03032604, 0.019203128, 0.063931115, 0.068456694, -0.05768675, 0.047437996, 0.015129595, 0.00819023, 0.019268272, 0.004051413, -0.03388242, -4.6209525e-08, 0.019323094, -0.011263088, -0.029185325, 0.0072537796, 0.020786807, -0.023988413, 0.051854976, 0.009607638, -0.008339368, 0.009887089, -0.071015455, 0.02543404, -0.049205597, 0.014228862, -0.014511232, -0.008239816, 0.0077734664, -0.058145396, -0.006987947, 0.012914433, -0.04191411, 0.010509629, -0.022468533, 0.008126526, 0.03230042, -0.05495346, 0.07007522, 0.030143436, 0.004592181, -0.003121501, 0.02584868, 0.022746965, -0.010968083, 0.050254416, -0.0027160093, -0.016713172, -0.017041104, 0.0007999684, -0.030747183, -0.038547337, 0.016047843, 0.02640158, -0.026368335, -0.003369438, -0.007937032, -0.0015147903, -0.048717923, 0.003215749, 0.056932207, -0.016600333, 0.066823475, 0.020802757, 0.013932891, 0.026326682, -0.013689484, 0.010722183, 0.008719767, -0.0037163764, -0.015598681, 0.0011525258, -0.0033720976, 0.009093802, -0.016338404, -0.03128844, -0.045197256, ...]"
3,"[0.055456948, -0.030786538, -0.004833868, -0.0012621245, -0.018625606, -0.03753863, 0.0850965, -0.017336057, 0.041074976, 0.023015758, 0.06537891, -0.07698511, 0.01706933, 0.008784045, -0.010717102, 0.0068811523, -0.052019693, -0.07657726, 0.0055672154, -0.024288246, -0.066195026, 0.026874207, -0.020503297, -0.030709947, 0.016121084, 0.06339809, 0.013100616, -0.013601379, 0.037821695, -0.059644844, -0.00019120179, 0.01844968, 0.02568871, 0.066230506, -6.4566805e-08, 0.0010323711, -0.03192729, -0.016420431, -0.06041742, -0.017598446, 0.014297013, 0.059014887, -0.0042054825, -0.04575712, -0.008098375, 0.024319226, 0.04357607, -0.016541561, 0.022513267, 0.038991377, 0.012295822, -0.06242748, 0.01742732, 0.00058891665, 0.055962943, 0.0359747, 0.047414783, 0.048657976, -0.03770038, 0.0543985, 0.019464307, -0.015626628, -0.008898938, 0.018368185, 0.0634514, 0.03489135, -0.06140488, -0.03396135, -0.0034304115, 0.045957636, -0.0015618409, -0.030988364, 0.05286459, 0.0114955455, -0.0138486, -0.031693, -0.027084703, -0.032458954, 0.033272218, -0.015273403, -0.023350913, 0.048479047, 0.003936214, -0.043815512, 0.04093461, -0.045293275, 0.039366588, 0.0060466835, -0.09125485, -0.036653224, -0.011744236, -0.01307399, -0.015474083, 0.0342948, 0.010214261, -0.035319984, -0.041806053, -0.048552744, 0.0076939464, -0.0027550291, ...]"
4,"[0.001497677, -0.049357854, 0.0034110672, 0.04489217, -0.009351993, 0.033400074, 0.0025348344, 0.02189892, -0.024831329, 0.027077116, -0.022180792, -0.019161906, 0.019847836, -0.029743494, 0.0027317512, 0.066544876, -0.036205888, -0.05082816, -0.03902675, -0.0055699428, 0.006560797, 0.011912659, 0.03855453, 0.023960257, 0.0051961513, 0.0013040325, 0.03463954, -0.021269267, 0.03060023, 0.021334546, -0.035255693, -0.024091102, -0.0104584815, -0.020580424, -3.9494363e-08, 0.001274326, 0.026710462, 0.010114482, -0.048702724, 0.03625887, -0.022539217, -0.008108059, 0.042494517, 0.004909311, -0.013397507, -0.017374184, -0.042962175, 0.040782746, -0.04129434, 0.04500941, -0.0031560054, -0.037171666, -0.03353834, -0.03689272, -0.02441456, -0.04882995, 0.008884389, -0.03381481, 0.005753698, 0.020469822, -0.028936787, -0.020747013, -0.028795963, -0.04041505, -0.075942315, 0.01512095, 0.00575623, 0.008335664, 0.029551953, -0.059676275, -0.052618016, 0.031208472, 0.035186708, 0.0011873728, 0.032177113, 0.03570217, 0.058505177, 0.016271545, -0.0220057, -0.031084822, 0.02340672, -0.047249246, 0.024163555, 0.03659894, -0.01845455, 0.025272207, 0.0127228135, 0.035491753, -0.022524305, -0.049672622, 0.05237138, 0.028611438, -0.011546091, -0.045834035, -0.025635745, -0.0036783915, 0.021016732, -0.0073958095, -0.018121367, -0.0904242, ...]"
5,"[0.02467378, -0.013197404, 0.0070057577, 0.026782928, 0.017893186, 0.031826552, -0.01109933, -0.0030212556, 0.028234806, 0.018390551, 0.018668, -0.09424613, 0.059941527, -0.015974628, 0.017785963, 0.06289032, -0.020752471, -0.039348368, -0.046459373, 0.0055256463, 0.024188995, -0.00800296, -0.025332265, 0.002503007, -0.030153193, -0.007301149, -0.019037232, -0.004666953, 0.01704771, -0.0049391696, -0.0061849393, 0.048022207, -0.05296472, 8.717233e-05, -2.7600898e-08, -0.035519205, 0.079139344, 0.009307289, -0.08462576, 0.009951849, -0.020942166, -0.029504556, 0.054782502, 0.017567614, -0.008839945, -0.029301036, -0.0004540201, 0.037119605, -0.00070383947, 0.017464034, -0.010068344, -0.011072724, -0.0057729343, -0.0025238951, 0.05633411, 0.07138526, 0.052757613, -0.019559482, 0.00662819, 0.0008602988, 0.010041436, -0.026795963, -0.0059630037, -0.0051750927, -0.0062321792, 0.04341822, 0.03363324, -0.007061799, -0.012730375, -0.00030306928, 0.045926843, 0.048060246, 0.0086743925, -0.042881515, 0.04960991, -0.011663261, 0.03425464, -0.0038100102, -0.048540473, -0.05044749, -0.03230027, -0.0062125884, -0.00721457, 0.04308988, 0.0065578404, -0.026106382, 0.021755893, -0.014113004, -0.093254164, 0.01811432, -0.00060839334, 0.016625982, 0.03260216, -0.010131796, -0.06388746, -0.017106254, -0.0021509624, 0.018872958, -0.021627283, 0.018257555, ...]"


In [10]:
# Sanity check: the cosine similarity of an encoding with itself should be 1.
# Both lookups read the encoding stored under index label 5.
enc1 = dataset_encoded_df.loc[5, 'encoded_desc']
enc2 = dataset_encoded_df.loc[5, 'encoded_desc']
cossim = util.cos_sim(enc1, enc2)
print(cossim)

tensor([[1.0000]])


#### Convert encoded dataset to dict and save to pytorch file

In [11]:
# Convert to a nested dict of the form {column name: {index label: value}},
# so an encoding is later looked up as enc_dict['encoded_desc'][<bug id>].
enc_dict = dataset_encoded_df.to_dict()

In [12]:
# Build the output location from the `dataset` and `model_name` configured
# earlier in the notebook, so that switching `dataset` cannot silently write the
# encodings under another dataset's name.
parent_folder = f'./encoding_data/encoded_splits/{dataset}_dataset'
# all-mpnet-lr5e-8-margin-1-ep-5-bs-32
file_name = f'{model_name}_{dataset}_dataset_enc.pt'
path = f'{parent_folder}/{file_name}'
# Serialise the dict with torch.save; the parent folder must already exist.
save(enc_dict, path)

### Load saved encodings

In [2]:
# Load a previously saved encoding dict. Note that this path points at a Firefox
# test split, not at the file written by the save cell of this notebook.
# `load` is torch.load from the imports cell, which must have been run in the
# current kernel first (otherwise: NameError: name 'load' is not defined).
# NOTE(review): torch.load unpickles the file; only load files from a trusted source.
loaded_enc_dict = load('./encoding_data/encoded_splits/mpnet_m-5_ep-3/mpnet_firefox_test_enc_m-5_ep-3.pt')

NameError: name 'load' is not defined

In [33]:
# Sanity check on the loaded file: both lookups read the encoding stored under
# id 391030, so the cosine similarity should be 1.
load_enc1 = loaded_enc_dict['encoded_desc'][391030]
load_enc2 = loaded_enc_dict['encoded_desc'][391030]
cossim = util.cos_sim(load_enc1, load_enc2)
print(cossim)

tensor([[1.]])


In [None]:
# Experiment: what does the model do when a description is missing (None)?
# Work on a copy of the row so that blanking the description cannot write
# through to dataset_df or trigger pandas' SettingWithCopyWarning.
br_test = dataset_df.iloc[2].copy()

# print(br_test.loc['description'])
br_test.loc['description'] = None

# br_test.loc['description']
enc_desc_test = model.encode(br_test.loc['description'])
# enc_desc_test

In [None]:
# Compare the encoding from the None-description experiment with a loaded one.
try:
    cossim_test = util.cos_sim(enc_desc_test, load_enc1)
except Exception as exc:
    # Report why the comparison failed instead of hiding the error.
    print('fail')
    print(repr(exc))
else:
    # Print only on success: printing after the try block raised NameError (or
    # showed a stale value from an earlier run) whenever cos_sim failed.
    print(cossim_test)
