# Generate encodings from dataset

#### Imports

In [None]:
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import notebook_login
from torch import device, cuda, save, load
import pandas as pd
from tqdm import tqdm

# Disable column-width truncation so long text cells are shown in full when
# a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)


#### Log in to Hugging Face

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

#### Get the model from Hugging Face


In [2]:
# Hub namespace and model name; `model_name` is also reused later to build
# the output file name.
organization = 'luiz-and-robert-thesis'
model_name = 'all-mpnet-base-newtriplets-v2-lr-1e-8-m-1-e-3'
# Load the model identified by "<organization>/<model_name>".
model = SentenceTransformer(f'{organization}/{model_name}')

#### Use CUDA to run on the GPU

In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
# `torch.device` is reached through the module on purpose: this cell rebinds
# the imported name `device` to the selected device, so calling `device(...)`
# directly would fail with "object is not callable" when the cell is re-run.
import torch

device = torch.device('cuda' if cuda.is_available() else 'cpu')
# Only the last expression of a cell is echoed, so print the device explicitly.
print(device)
# Move the model to the selected device.
model.to(device)

#### Get the dataset to encode

In [4]:
# Name of the dataset to encode; it selects the Excel file read below.
dataset = 'hallsnas'
split_path = f'./encoding_data/whole_datasets/{dataset}_dataset.xlsx'
# The sheet's 'ID' column becomes the DataFrame index; the encoding functions
# later store that index value as the bug id.
dataset_df = pd.read_excel(split_path, index_col='ID')

In [None]:
# Preview the first rows of the loaded dataset.
dataset_df.head()

In [None]:
# Number of rows in the loaded dataset.
print(len(dataset_df))

In [None]:
# Check that the dataset was loaded correctly: print the index value and the
# first column of the first ten rows.
test_df = dataset_df.iloc[:10]
for index, row in test_df.iterrows():
    # Positional access goes through .iloc; `row[0]` on a label-indexed
    # Series is deprecated in pandas.
    print(index, row.iloc[0])

#### Encode the dataset with **long** description only

- Make sure to select the correct column index (the position used to read the description from each row)!


In [7]:
def encode_long_desc(desc_column: int = 0) -> pd.DataFrame:
    """Encode one description column of every row in ``dataset_df``.

    Reads the module-level ``dataset_df`` and ``model``.

    Args:
        desc_column: Zero-based position, within each row, of the column that
            holds the description text. Defaults to 0.

    Returns:
        A DataFrame with one row per input row and two columns:
        ``encoded_desc`` (the result of ``model.encode``, or ``None`` when
        encoding that row failed) and ``bug_id`` (the row's index value in
        ``dataset_df``).
    """
    encoded_desc = []
    report_id_list = []
    amount_of_none = 0

    # Passing `total` lets tqdm show a real progress bar; iterrows() is a
    # generator and has no length of its own.
    for report_id, row in tqdm(dataset_df.iterrows(), total=len(dataset_df)):
        try:
            # Positional access via .iloc; `row[0]` on a label-indexed Series
            # is deprecated in pandas.
            bug_desc = row.iloc[desc_column]
            encoded = model.encode(bug_desc)
        except Exception:
            # Best effort: keep a None placeholder so ids and encodings stay
            # aligned. `Exception` (not a bare except) lets KeyboardInterrupt
            # stop a long-running encoding.
            encoded = None
            amount_of_none += 1
        encoded_desc.append(encoded)
        report_id_list.append(report_id)

    # Create a new dataframe with the encoded description and the bug id as columns
    dataset_encoded_long_desc_df = pd.DataFrame()
    dataset_encoded_long_desc_df["encoded_desc"] = encoded_desc
    dataset_encoded_long_desc_df["bug_id"] = report_id_list
    print('Amount of none: ', amount_of_none)
    return dataset_encoded_long_desc_df

#### Encode dataset with **long** description and **short** description

- Check that the column indices used for the long and the short description are correct!

In [2]:
def encode_all_descs(long_desc_column: int = 5,
                     short_desc_column: int = 10) -> pd.DataFrame:
    """Encode the combined short and long description of every row in ``dataset_df``.

    Each row is encoded as the single text ``"<short> - <long>"``. Reads the
    module-level ``dataset_df`` and ``model``.

    Args:
        long_desc_column: Zero-based position, within each row, of the long
            description. Defaults to 5.
        short_desc_column: Zero-based position, within each row, of the short
            description. Defaults to 10.

    Returns:
        A DataFrame with one row per input row and two columns:
        ``encoded_desc`` (the result of ``model.encode``, or ``None`` when
        encoding that row failed) and ``bug_id`` (the row's index value in
        ``dataset_df``).
    """
    encoded_both_descs = []
    report_id_list_both = []
    amount_of_none_both = 0

    # Passing `total` lets tqdm show a real progress bar; iterrows() is a
    # generator and has no length of its own.
    for report_id, row in tqdm(dataset_df.iterrows(), total=len(dataset_df)):
        try:
            # Positional access via .iloc; integer keys on a label-indexed
            # Series are deprecated in pandas.
            long_desc = row.iloc[long_desc_column]
            short_desc = row.iloc[short_desc_column]
            # NOTE(review): a missing (NaN) cell is formatted as the text
            # "nan" by the f-string instead of raising, so it is not counted
            # as a failure below - confirm this is intended.
            both_descs_enc = model.encode(f'{short_desc} - {long_desc}')
        except Exception:
            # Best effort: keep a None placeholder so ids and encodings stay
            # aligned. `Exception` (not a bare except) lets KeyboardInterrupt
            # stop a long-running encoding.
            both_descs_enc = None
            amount_of_none_both += 1

        encoded_both_descs.append(both_descs_enc)
        report_id_list_both.append(report_id)

    # Create a new dataframe with the encoded description and the bug id as columns
    dataset_encoded_both_descs_df = pd.DataFrame()
    dataset_encoded_both_descs_df["encoded_desc"] = encoded_both_descs
    dataset_encoded_both_descs_df["bug_id"] = report_id_list_both
    print('Amount of none: ', amount_of_none_both)
    return dataset_encoded_both_descs_df


#### Create encodings
- Choose the right encoding method

In [None]:
# Encode the dataset using the long description only.
# Call encode_all_descs() here instead to encode short + long descriptions.
dataset_encoded_df = encode_long_desc()


In [None]:
# Set the bug_id column as the label/index so that encodings can be looked up
# by bug id.
dataset_encoded_df = dataset_encoded_df.set_index('bug_id')
# Preview the result.
dataset_encoded_df.head()

#### Test the encodings

In [None]:
# Sanity check: the same entry is read twice and compared with itself, so the
# printed cosine similarity is expected to be 1 (up to floating-point error).
# NOTE(review): `[5]` is presumably a lookup by bug_id label, since the frame
# is indexed by bug_id - confirm that id 5 exists in the dataset.
enc1 = dataset_encoded_df['encoded_desc'][5]
enc2 = dataset_encoded_df['encoded_desc'][5]
cossim = util.cos_sim(enc1, enc2)
print(cossim)

#### Convert encoded dataset to dict and save to PyTorch file

In [11]:
# Convert to a nested dict of the form {column name: {index label: value}}.
enc_dict = dataset_encoded_df.to_dict()

In [12]:
import os

# Build the output location from the `dataset` and `model_name` variables
# instead of repeating the dataset name by hand, so the saved file always
# matches the dataset that was actually encoded.
parent_folder = f'./encoding_data/encoded_splits/{dataset}_dataset'
# all-mpnet-lr5e-8-margin-1-ep-5-bs-32
file_name = f'{model_name}_{dataset}_dataset_enc.pt'
path = f'{parent_folder}/{file_name}'
# Create the output folder first; saving fails if the parent folder is missing.
os.makedirs(parent_folder, exist_ok=True)
save(enc_dict, path)

### Load the encoded dataset using PyTorch

In [None]:
# Load a previously saved encoding dict from disk.
# NOTE(review): this path is hard-coded and differs from the path written by
# the save cell - confirm it points at the intended file.
# NOTE(review): torch.load unpickles the file, so only load files from a
# trusted source.
loaded_enc_dict = load('./encoding_data/encoded_splits/mpnet_m-5_ep-3/mpnet_firefox_test_enc_m-5_ep-3.pt')

In [None]:
# Sanity check on the loaded data: the same entry (key 391030) is read twice
# and compared with itself, so the printed cosine similarity is expected to
# be 1 (up to floating-point error).
# NOTE(review): 391030 is presumably a bug id present in the loaded file - confirm.
load_enc1 = loaded_enc_dict['encoded_desc'][391030]
load_enc2 = loaded_enc_dict['encoded_desc'][391030]
cossim = util.cos_sim(load_enc1, load_enc2)
print(cossim)