## Install Dependencies

### Torch and Transformers

#### Torch

In [None]:
! pip install torch

#### Huggingface Transformers

In [None]:
! pip install transformers

#### Tensorboard

In [None]:
! pip install tensorboard

#### SciPy

In [None]:
! pip install scipy

### Other Installs

***Not necessary on Google Colab***

In [None]:
! conda install -c intel mkl --yes

In [None]:
! conda install mkl --yes

In [None]:
! conda update mkl --yes

In [None]:
! conda update -n base -c defaults conda --yes

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/samiha/miniconda3

  added / updated specs:
    - conda


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    boltons-23.0.0             |  py310h06a4308_0         430 KB
    conda-23.3.1               |  py310h06a4308_0         970 KB
    cryptography-39.0.1        |  py310h9ce1e76_0         1.4 MB
    jsonpatch-1.32             |     pyhd3eb1b0_0          15 KB
    jsonpointer-2.1            |     pyhd3eb1b0_0           9 KB
    packaging-23.0             |  py310h06a4308_0          68 KB
    pyopenssl-23.0.0           |  py310h06a4308_0          97 KB
    requests-2.29.0            |  py310h06a4308_0          97 KB
    sqlite-3.41.2              |       h5eee18b_0         1.2 MB
    tqdm-4.65.0                |  py310h2f386ee_0         132 KB
    tzdata-20

## Load Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import pickle
import json
import gc
import time
from scipy.spatial import distance

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
import torch

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Working Directory

In [None]:
# Google Drive
# working_dir = r'/content/drive/MyDrive/Work/CodeBERT'
working_dir = r'/content/drive/MyDrive/ReportCC++/CodeBERT'

# Every later cell builds its paths from working_dir, so make a missing
# directory visible instead of staying silent.
if os.path.exists(working_dir):
    print(working_dir)
else:
    print('WARNING: working directory not found: ' + working_dir)

/content/drive/MyDrive/ReportCC++/CodeBERT


## Pre-processing

In [None]:
! python os.path.join(working_dir, 'dataset', 'preprocess.py')

## Train the Model

In [None]:
! python os.path.join(working_dir, 'code', 'run.py') \
    --output_dir=os.path.join(working_dir, 'code', 'saved_models') \
    --model_type=roberta \
    --tokenizer_name=microsoft/codebert-base \
    --model_name_or_path=microsoft/codebert-base \
    --do_train \
    --train_data_file=os.path.join(working_dir, 'dataset', 'train.jsonl') \
    --eval_data_file=os.path.join(working_dir, 'dataset', 'valid.jsonl') \
    --test_data_file=os.path.join(working_dir, 'dataset', 'test.jsonl') \
    --epoch 5 \
    --block_size 400 \
    --train_batch_size 32 \
    --eval_batch_size 64 \
    --learning_rate 2e-5 \
    --max_grad_norm 1.0 \
    --evaluate_during_training \
    --seed 123456  2>&1 | tee os.path.join(working_dir, 'code', 'train.log')

## Load the Model

**Load the Tokenizer**

In [None]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

**Load the Pre-trained Model**

In [None]:
model = AutoModel.from_pretrained("microsoft/codebert-base")

**Load the Saved Weights**

In [None]:
checkpoint_path = os.path.join(working_dir, 'code', 'saved_models', 'checkpoint-best-acc', 'model.bin')

# map_location='cpu' lets a checkpoint that was saved on a GPU load on a
# CPU-only runtime as well.
state_dict = torch.load(checkpoint_path, map_location='cpu')

# strict=False ignores every key that does not match the model, so report how
# many keys were missing / unexpected instead of letting that pass unnoticed.
# NOTE(review): if run.py saves the model wrapped in another module, the keys
# may carry a prefix and few or no weights would be loaded - check the counts.
load_result = model.load_state_dict(state_dict, strict=False)
print('Missing keys:', len(load_result.missing_keys))
print('Unexpected keys:', len(load_result.unexpected_keys))
load_result

In [None]:
# Tokenize a natural-language query and a code snippet separately.
nl_tokens = tokenizer.tokenize("return maximum value")
code_tokens = tokenizer.tokenize("def max(a,b): if a>b: return a else return b")

# Sequence layout: <cls> query <sep> code <eos>
tokens = [tokenizer.cls_token, *nl_tokens, tokenizer.sep_token, *code_tokens, tokenizer.eos_token]
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

# Add a leading batch dimension of size 1 and keep the first model output.
input_batch = torch.tensor(tokens_ids).unsqueeze(0)
context_embeddings = model(input_batch)[0]
context_embeddings

## Create and Save Embeddings

### Load the Data
We'll load `test.jsonl` and create the embeddings

In [None]:
df = pd.read_json(os.path.join(working_dir, 'dataset', 'test.jsonl'), lines= True)

We are done with testing. Let's load the whole dataset from the `function.json` file.

In [None]:
df = pd.read_json(os.path.join(working_dir, 'dataset', 'function.json'))

In [None]:
df.shape

(27318, 4)

In [None]:
len(df['func'].unique())

27258

In [None]:
# Can cause silent death of the code!!
# os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [None]:
# `%memit` is an IPython magic, not a shell command, so `! %memit` only made
# the shell print "fg: no job control". Report the kernel's peak memory with
# the standard library instead.
import resource

# NOTE: ru_maxrss is reported in kilobytes on Linux (bytes on macOS).
peak_rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print('Peak resident memory (MB):', peak_rss_kb / 1024)

/bin/bash: line 1: fg: no job control


In [None]:
# resource.setrlimit(resource.RLIMIT_DATA, (2**30, 2**30))

In [None]:
df.head(5)

Unnamed: 0,project,commit_id,target,func
0,FFmpeg,973b1a6b9070e2bf17d17568cbaf4043ce931f51,0,static av_cold int vdadec_init(AVCodecContext ...
1,FFmpeg,321b2a9ded0468670b7678b7c098886930ae16b2,0,static int transcode(AVFormatContext **output_...
2,FFmpeg,5d5de3eba4c7890c2e8077f5b4ae569671d11cf8,0,"static void v4l2_free_buffer(void *opaque, uin..."
3,FFmpeg,32bf6550cb9cc9f487a6722fe2bfc272a93c1065,0,"int ff_get_wav_header(AVFormatContext *s, AVIO..."
4,FFmpeg,57d77b3963ce1023eaf5ada8cba58b9379405cc8,0,"int av_opencl_buffer_write(cl_mem dst_cl_buf, ..."


In [None]:
df.iloc[0]['func']

### Create the Embeddings

In [None]:
vectors_table_padded = []
vectors_calculation_passes = []
row_counter = 0
pass_counter = 0
completed_passes = 0

In [None]:
# If we need to re-run the code after failing in the middle of execution
completed_passes = 27

**Method to create embeddings**  
This function is applied to each row of the dataframe loaded above. It takes the code (`func`), cleans it up, creates the tensor embeddings, converts them to a NumPy array, and appends the result to the list `vectors_table_padded` for further processing.

In [None]:
def create_padded_vector(x):
    """Embed one source string with the loaded model and buffer the padded result.

    The text is tokenized (truncation to 510 tokens is requested, plus the
    cls/sep markers), run through ``model``, zero-padded to a 512 x 768
    matrix, flattened and appended to the global ``vectors_table_padded``.
    Every time the buffer holds 1000 vectors it is pickled to a numbered chunk
    file and also appended to the combined pickle, then cleared.

    Rows within the first ``completed_passes * 1000`` rows are skipped so that
    an interrupted run can be resumed.

    Args:
        x: The source code text of one row (the notebook applies this
            function to ``df['func']``).

    Returns:
        None. Results are accumulated in module-level state and on disk.
    """
    global vectors_table_padded
    global vectors_calculation_passes
    global row_counter

    chunk_size = 1000   # rows per pass / per chunk file
    max_tokens = 512    # rows of the zero-padded embedding matrix
    hidden_size = 768   # columns of the zero-padded embedding matrix

    row_counter = row_counter + 1

    # If the process failed after X passes (X * 1000 entries)
    # we can skip those rows and begin our calculation from (X * 1000 + 1) row
    if completed_passes > 0 and row_counter <= completed_passes * chunk_size:
        if row_counter % chunk_size == 0:
            # Still record the pass so that chunk file numbering continues
            # where the previous run stopped.
            vectors_calculation_passes.append(time.time())
            print()
            print('Skipped pass: ' + str(len(vectors_calculation_passes)))
            print('Processed ' + str(row_counter) + ' entries.')
            print()
        return

    x = x.replace('\n', ' ').replace('"', ' ')

    tokens = [tokenizer.cls_token] + tokenizer.tokenize(x, truncation=True, max_length=510) + [tokenizer.sep_token]

    torch.set_printoptions(profile="full")

    try:
        # Inference only: without no_grad() every forward pass would also
        # build an autograd graph that is never used.
        with torch.no_grad():
            input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokens))[None, :]
            context_embeddings = model(input_ids)[0]
    except Exception as e:
        # NOTE(review): a failing row is dropped from the buffer, so from that
        # point on the chunks no longer line up with dataframe row numbers,
        # which the row-based resume logic above assumes.
        print()
        print(str(e))
        print()
        return

    # Last (and, for a batch of one, only) item of the batch: (tokens, hidden).
    source = context_embeddings[-1]

    # Zero-pad to a fixed size so that every row yields a vector of equal length.
    target = torch.zeros(max_tokens, hidden_size)
    target[:source.size(0)] = source
    vectors_table_padded.append(target.numpy().reshape(-1))
    print('.', end='')

    # Since we are working with large data now
    # we will write the vectors to disk and
    # reset the array after every 1000 rows.
    if (len(vectors_table_padded) > 0) and (len(vectors_table_padded) % chunk_size == 0):
        vectors_calculation_passes.append(time.time())

        with open(os.path.join(working_dir, 'dataset', 'embed_chunks', 'outputCodeEmbeddingCodeBERT_' + str(len(vectors_calculation_passes)) + '.pkl'), 'wb') as handle:
            pickle.dump(vectors_table_padded, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Opened in append mode: every run adds further pickles to this file.
        with open(os.path.join(working_dir, 'dataset', 'outputCodeEmbeddingCodeBERT_all.pkl'), 'ab+') as handle:
            pickle.dump(vectors_table_padded, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Reset the array to save memory
        vectors_table_padded = []

        print()
        print('Finished pass: ' + str(len(vectors_calculation_passes)))
        print('Processed ' + str(len(vectors_calculation_passes) * chunk_size) + ' entries.')
        print()

In [None]:
df['func'].apply(create_padded_vector)


Skipped pass: 1
Processed 1000 entries.


Skipped pass: 2
Processed 2000 entries.


Skipped pass: 3
Processed 3000 entries.


Skipped pass: 4
Processed 4000 entries.


Skipped pass: 5
Processed 5000 entries.


Skipped pass: 6
Processed 6000 entries.


Skipped pass: 7
Processed 7000 entries.


Skipped pass: 8
Processed 8000 entries.


Skipped pass: 9
Processed 9000 entries.


Skipped pass: 10
Processed 10000 entries.


Skipped pass: 11
Processed 11000 entries.


Skipped pass: 12
Processed 12000 entries.


Skipped pass: 13
Processed 13000 entries.


Skipped pass: 14
Processed 14000 entries.


Skipped pass: 15
Processed 15000 entries.


Skipped pass: 16
Processed 16000 entries.


Skipped pass: 17
Processed 17000 entries.


Skipped pass: 18
Processed 18000 entries.


Skipped pass: 19
Processed 19000 entries.


Skipped pass: 20
Processed 20000 entries.


Skipped pass: 21
Processed 21000 entries.


Skipped pass: 22
Processed 22000 entries.


Skipped pass: 23
Processed 23000 entries.


Skipp

0        None
1        None
2        None
3        None
4        None
         ... 
27313    None
27314    None
27315    None
27316    None
27317    None
Name: func, Length: 27318, dtype: object

**Handle the extra chunk**  
After the last full pass there can be leftover rows that did not fill a complete chunk of 1000. We need to write those vectors to a file as well.

In [None]:
# Flush the vectors that are still buffered after the last full pass.
# Guarded so that an empty buffer (row count is an exact multiple of 1000, or
# this cell is run a second time) does not write an empty chunk file and
# record a pass that never happened.
if len(vectors_table_padded) > 0:
    vectors_calculation_passes.append(time.time())

    with open(os.path.join(working_dir, 'dataset', 'embed_chunks', 'outputCodeEmbeddingCodeBERT_' + str(len(vectors_calculation_passes)) + '.pkl'), 'wb') as handle:
        pickle.dump(vectors_table_padded, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Opened in append mode: every run adds further pickles to this file.
    with open(os.path.join(working_dir, 'dataset', 'outputCodeEmbeddingCodeBERT_all.pkl'), 'ab+') as handle:
        pickle.dump(vectors_table_padded, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print()
    print('Finished pass: ' + str(len(vectors_calculation_passes)))
    # All earlier passes hold 1000 rows each; the last one holds the leftovers.
    print('Processed ' + str((len(vectors_calculation_passes) * 1000 + len(vectors_table_padded) - 1000)) + ' entries.')
    print()

    # Reset the array to save memory
    vectors_table_padded = []
else:
    print('No leftover vectors to write.')


Finished pass: 28
Processed 27318 entries.



In [None]:
print('Finished processing in pass: ' + str(len(vectors_calculation_passes)))

Finished processing in pass: 28


**Clean up the pickle**  
Open and combine the dumps

In [None]:
# Read back every object that was appended to the combined pickle file.
# Each pickle.load() returns one dump; EOFError marks the end of the file.
vectors_table_padded = []
chunks_loaded = 0
with open(os.path.join(working_dir, 'dataset', 'outputCodeEmbeddingCodeBERT_all.pkl'), 'rb') as handle:
    while True:
        try:
            chunk = pickle.load(handle)
        except EOFError:
            break
        vectors_table_padded.append(chunk)
        chunks_loaded += 1
        print('Loaded chunk: ' + str(chunks_loaded))

Loaded chunk: 1
Loaded chunk: 2
Loaded chunk: 3
Loaded chunk: 4
Loaded chunk: 5
Loaded chunk: 6
Loaded chunk: 7
Loaded chunk: 8
Loaded chunk: 9
Loaded chunk: 10
Loaded chunk: 11
Loaded chunk: 12
Loaded chunk: 13
Loaded chunk: 14
Loaded chunk: 15
Loaded chunk: 16
Loaded chunk: 17
Loaded chunk: 18
Loaded chunk: 19
Loaded chunk: 20
Loaded chunk: 21
Loaded chunk: 22
Loaded chunk: 23
Loaded chunk: 24
Loaded chunk: 25
Loaded chunk: 26
Loaded chunk: 27
Loaded chunk: 28
Loaded chunk: 29
Loaded chunk: 30
Loaded chunk: 31
Loaded chunk: 32
Loaded chunk: 33
Loaded chunk: 34


In [None]:
len(vectors_table_padded[30][0])

393216

In [None]:
vectors_table_padded[0][0][0]

0.049869653

**Combine the chunks**

In [None]:
# Rebuild one flat list of vectors from the numbered chunk files (1..28), e.g.
# /content/drive/MyDrive/Work/CodeBERT/dataset/embed_chunks/outputCodeEmbeddingCodeBERT_1.pkl
vectors_table_padded = []
chunk_dir = os.path.join(working_dir, 'dataset', 'embed_chunks')
for chunk_number in range(1, 29):
    chunk_path = os.path.join(chunk_dir, 'outputCodeEmbeddingCodeBERT_' + str(chunk_number) + '.pkl')
    with open(chunk_path, 'rb') as handle:
        vectors_table_padded.extend(pickle.load(handle))

In [None]:
len(vectors_table_padded)

27318

In [None]:
len(vectors_table_padded[0])

393216

**Save the full embedding file**

In [None]:
with open(os.path.join(working_dir, 'dataset', 'outputCodeEmbeddingCodeBERT_all.pkl'), 'wb') as handle:
    pickle.dump(vectors_table_padded, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
vectors_table_padded[:1]

[array([ 0.04986965, -0.07156827, -0.27428073, ..., -0.12229227,
        -0.5068423 ,  0.29837942], dtype=float32)]

In [None]:
len(vectors_table_padded)

2732

In [None]:
# Try some garbage collection. Although this doesn't work on Google Colab!
del df
del model
del tokenizer
# Garbage collection
gc.collect()

0

##### Create Dataframe and Save
This part is not necessary. Keeping here just in case.

In [None]:
d = pd.DataFrame.from_records(vectors_table_padded)

In [None]:
d

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,393206,393207,393208,393209,393210,393211,393212,393213,393214,393215
0,-0.041774,-0.129617,-0.310114,0.263362,0.25308,-0.414258,0.378246,-0.069554,0.400794,-0.628032,...,0.410113,-0.123037,-0.386891,1.279073,0.034445,0.033662,0.842423,-0.095404,-0.53622,0.512918
1,-0.102907,-0.177674,-0.372653,0.221366,0.229542,-0.287794,0.310212,-0.112319,0.389502,-0.583723,...,0.390833,-0.096609,-0.338342,1.238803,-0.061955,0.087662,0.774239,-0.022931,-0.553467,0.50844
2,0.038611,-0.120939,-0.44198,0.434366,0.365734,-0.351603,0.352667,-0.310955,0.449977,-0.676956,...,0.410262,-0.126431,-0.383458,1.404115,0.155906,0.001532,0.645607,-0.130769,-0.518799,0.315326
3,-0.06385,-0.109174,-0.361763,0.294476,0.319243,-0.302241,0.30933,-0.184119,0.31801,-0.584183,...,0.404721,-0.042341,-0.458128,1.283927,-0.012934,-0.023772,0.839537,-0.113522,-0.534425,0.281317
4,0.013163,-0.132875,-0.430251,0.265636,0.220051,-0.505864,0.437079,-0.165107,0.36425,-0.783481,...,0.355474,-0.072757,-0.360147,1.344247,0.023357,-0.015144,0.735134,0.029075,-0.487036,0.396164


In [None]:
d.to_csv(os.path.join(working_dir, 'dataset', 'outputCodeEmbeddingCodeBERT_df.txt'), sep=' ', index=False, header=False)

## Create Similarity Matrix

### Load the Data

In [None]:
vectors_table_padded = None

In [None]:
# Load the pickle file
with open(os.path.join(working_dir, 'dataset', 'embedding_all_np.pkl'), 'rb') as handle:
    vectors_table_padded = pickle.load(handle)

The pickle file above should already hold a NumPy array, which saves the memory needed to convert a list to NumPy. If that doesn't work, we can load the NumPy dump instead, as in the next cell.

In [None]:
# Load the npy file
# with open(os.path.join(working_dir, 'dataset', 'embedding_all_np_direct.npy'), 'rb') as handle:
#     vectors_table_padded = np.load(handle)

In [None]:
type(vectors_table_padded)

numpy.ndarray

In [None]:
vectors_table_padded.shape

(27318, 393216)

In [None]:
vectors_table_padded[0]

array([ 0.04986965, -0.07156827, -0.27428073, ..., -0.12229227,
       -0.5068423 ,  0.29837942], dtype=float32)

In [None]:
len(vectors_table_padded)

27318

### Calculate Similarity

#### Manually Calculate

In [None]:
table_length = len(vectors_table_padded)

In [None]:
similarity_array = np.zeros((table_length, table_length))

**Calculate in chunks**  
We'll calculate the similarity one **1000 x 1000** block at a time, then update the similarity array row-wise and column-wise (the matrix is symmetric). This way we only need to calculate half of the table.

In [None]:
# Load Pre-calculated similarity array
with open(os.path.join(working_dir, 'dataset', 'similarity_all.npy'), 'rb') as handle:
    similarity_array = np.load(handle)

In [None]:
similarity_array.shape

In [None]:
np.fill_diagonal(similarity_array, 1)

#### Use the chunks of the vector
Load chunks of the embedding vector in the memory to do the calculation.

In [None]:
# Fill the similarity matrix block by block from the embedding chunk files.
# Only blocks with j >= i are computed; the mirrored block is filled with the
# transpose because the matrix is symmetric.
all_start_time = time.time()
start_time = time.time()

chunk_size = 1000   # rows per embedding chunk file
num_chunks = 28     # number of chunk files in embed_chunks
chunk_dir = os.path.join(working_dir, 'dataset', 'embed_chunks')

for i in range(1, num_chunks + 1):
    row_start_time = time.time()
    start_index_i = (i-1) * chunk_size
    end_index_i = min(i * chunk_size, table_length)
    with open(os.path.join(chunk_dir, 'outputCodeEmbeddingCodeBERT_' + str(i) + '.pkl'), 'rb') as handle:
        chunk_i = pickle.load(handle)
    for j in range(i, num_chunks + 1):
        start_index_j = (j-1) * chunk_size
        end_index_j = min(j * chunk_size, table_length)
        if j == i:
            # Diagonal block: the chunk is already in memory, no second read.
            chunk_j = chunk_i
        else:
            with open(os.path.join(chunk_dir, 'outputCodeEmbeddingCodeBERT_' + str(j) + '.pkl'), 'rb') as handle:
                chunk_j = pickle.load(handle)

        similarity = 1 - distance.cdist(chunk_i, chunk_j, metric='cosine')

        # Row-wise update
        similarity_array[start_index_i : end_index_i, start_index_j : end_index_j] = similarity
        # Column-wise update (the transpose, same as np.rot90(np.fliplr(...)))
        similarity_array[start_index_j : end_index_j, start_index_i : end_index_i] = similarity.T

        print()
        print('Processed: (', start_index_i, '-', end_index_i, ',', start_index_j, '-', end_index_j, ')')
        print('Execution time:', (time.time() - start_time), 'seconds')
        # Save as NumPy file
        # The whole matrix is rewritten after every block so that an
        # interrupted run loses at most one block of work.
        with open(os.path.join(working_dir, 'dataset', 'similarity_all.npy'), 'wb') as handle:
            np.save(handle, similarity_array)
            print('Saved similarity upto batch: (', start_index_i, '-', end_index_i, ',', start_index_j, '-', end_index_j, ')')
        print()
        start_time = time.time()
    print('Finished batch upto batch:', i)
    print('Total time for the batch upto[', i, '] took', (time.time() - row_start_time), 'seconds')
    print('*' * 20)
    print()
print('~' * 20)
print('Finished processing all batches in', (time.time() - all_start_time), 'seconds')
print('~' * 20)


#### Use the whole vector
Not very efficient since we are loading the whole vector in the memory.

In [None]:
past_execution_i = 1000
past_execution_j = 17001

In [None]:
# Fill the similarity matrix in blocks of 500 x 500 from the in-memory
# embedding array. Column blocks start one past the row block's first index,
# so the main diagonal itself is not written here.
all_start_time = time.time()
start_time = time.time()
i = 0

while i < table_length:
    row_start_time = time.time()

    # similarity_array[i][i] = 1

    j = i + 1

    start_of_i = i
    i = min(i + 500, table_length)

    while j < table_length:
        start_of_j = j
        j = min(j + 500, table_length)

        # Resume support: (past_execution_i, past_execution_j) is the first
        # block that still has to be computed. Blocks are visited row by row,
        # so every block in an earlier row, and every earlier block in the
        # resume row, is skipped. (Previously, blocks of earlier rows with
        # start_of_j >= past_execution_j were computed again.)
        # NOTE(review): assumes the loaded similarity_array already holds the
        # skipped blocks - confirm before relying on a resumed run.
        in_earlier_row = start_of_i < past_execution_i
        earlier_in_resume_row = (start_of_i == past_execution_i) and (start_of_j < past_execution_j)
        if in_earlier_row or earlier_in_resume_row:
            print()
            print('Skipped: (', start_of_i, '-', i, ',', start_of_j, '-', j, ')')
            print()
            continue

        similarity = 1 - distance.cdist(vectors_table_padded[start_of_i : i], vectors_table_padded[start_of_j : j], metric='cosine')

        # Row-wise update
        similarity_array[start_of_i : i, start_of_j : j] = similarity
        # Column-wise update (the transpose, same as np.rot90(np.fliplr(...)))
        similarity_array[start_of_j : j, start_of_i : i] = similarity.T

        print()
        print('Processed: (', start_of_i, '-', i, ',', start_of_j, '-', j, ')')
        print('Execution time:', (time.time() - start_time), 'seconds')
        # Save as NumPy file
        with open(os.path.join(working_dir, 'dataset', 'similarity_all.npy'), 'wb') as handle:
            np.save(handle, similarity_array)
            print('Saved similarity upto batch: (', start_of_i, '-', i, ',', start_of_j, '-', j, ')')
        print()
        start_time = time.time()
    print('Finished batch upto row:', i)
    print('Total time for the batch upto[', i, '] took', (time.time() - row_start_time), 'seconds')
    print('*' * 20)
    print()
print('~' * 20)
print('Finished processing all rows in', (time.time() - all_start_time), 'seconds')
print('~' * 20)

In [None]:
similarity_array[0:100, 0:100]

In [None]:
# Manual Calculation
# dot = (vectors_table_padded.T @ vectors_table_padded)
# norm = ((vectors_table_padded * vectors_table_padded).sum(0, keepdims=True) ** .5)
# v_norm = vectors_table_padded/norm

In [None]:
# cosine_similarity = (v_norm.T @ v_norm)

In [None]:
# cosine_distance = 1 - (v_norm.T @ v_norm)

#### Use SciPy

In [None]:
vectors_table_padded = np.asarray(vectors_table_padded)

In [None]:
vectors_table_padded.shape

(27318, 393216)

In [None]:
# Save as pickle file
with open(os.path.join(working_dir, 'dataset', 'embedding_all_np.pkl'), 'wb') as handle:
    pickle.dump(vectors_table_padded, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open(os.path.join(working_dir, 'dataset', 'embedding_all_np_direct.npy'), 'wb') as handle:
    np.save(handle, vectors_table_padded)

In [None]:
# Use Scipy Distance Calculation
similarity_array = 1 - distance.cdist(vectors_table_padded, vectors_table_padded, metric='cosine')

In [None]:
similarity_array.shape

(2732, 2732)

In [None]:
similarity_array

array([[1.        , 0.78262403, 0.75921619, ..., 0.76991508, 0.78127878,
        0.74125   ],
       [0.78262403, 1.        , 0.75355589, ..., 0.77375163, 0.78496239,
        0.7260659 ],
       [0.75921619, 0.75355589, 1.        , ..., 0.75280698, 0.75300894,
        0.73755132],
       ...,
       [0.76991508, 0.77375163, 0.75280698, ..., 1.        , 0.78454357,
        0.75005791],
       [0.78127878, 0.78496239, 0.75300894, ..., 0.78454357, 1.        ,
        0.73438539],
       [0.74125   , 0.7260659 , 0.73755132, ..., 0.75005791, 0.73438539,
        1.        ]])

#### Save the Similarity Values

In [None]:
# Save as pickle file
with open(os.path.join(working_dir, 'dataset', 'similarity_all.pkl'), 'wb') as handle:
    pickle.dump(similarity_array, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Save as NumPy file
with open(os.path.join(working_dir, 'dataset', 'similarity_all.npy'), 'wb') as handle:
    np.save(handle, similarity_array)

In [None]:
# Save as NP CSV
np.savetxt(os.path.join(working_dir, 'dataset', 'similarity_all.txt'), similarity_array, delimiter=' ', fmt='%.8f')

## Test the saved data

In [None]:
similarity_array = None

**From NumPy**

In [None]:
with open(os.path.join(working_dir, 'dataset', 'similarity_all.npy'), 'rb') as handle:
    similarity_array = np.load(handle)

In [None]:
type(similarity_array)

numpy.ndarray

In [None]:
similarity_array.shape

(27318, 27318)

In [None]:
similarity_array[:-10, :-10]

array([[1.        , 0.74163253, 0.58722038, ..., 0.75991286, 0.44038302,
        0.66806784],
       [0.74163253, 1.        , 0.56161037, ..., 0.74189584, 0.42780822,
        0.67684237],
       [0.58722038, 0.56161037, 1.        , ..., 0.57599907, 0.58375795,
        0.64999793],
       ...,
       [0.75991286, 0.74189584, 0.57599907, ..., 1.        , 0.42880097,
        0.67855398],
       [0.44038302, 0.42780822, 0.58375795, ..., 0.42880097, 1.        ,
        0.49533879],
       [0.66806784, 0.67684237, 0.64999793, ..., 0.67855398, 0.49533879,
        1.        ]])

**From Pickle**

In [None]:
with open(os.path.join(working_dir, 'dataset', 'similarity_all.pkl'), 'rb') as handle:
    similarity_array = pickle.load(handle)

In [None]:
type(similarity_array)

numpy.ndarray

In [None]:
similarity_array.shape

(27318, 27318)

In [None]:
similarity_array[:-10, :-10]

array([[1.        , 0.74163253, 0.58722038, ..., 0.75991286, 0.44038302,
        0.66806784],
       [0.74163253, 1.        , 0.56161037, ..., 0.74189584, 0.42780822,
        0.67684237],
       [0.58722038, 0.56161037, 1.        , ..., 0.57599907, 0.58375795,
        0.64999793],
       ...,
       [0.75991286, 0.74189584, 0.57599907, ..., 1.        , 0.42880097,
        0.67855398],
       [0.44038302, 0.42780822, 0.58375795, ..., 0.42880097, 1.        ,
        0.49533879],
       [0.66806784, 0.67684237, 0.64999793, ..., 0.67855398, 0.49533879,
        1.        ]])

**From CSV**

In [None]:
similarity_array = np.loadtxt(os.path.join(working_dir, 'dataset', 'similarity_all.txt'), delimiter=' ')

In [None]:
type(similarity_array)

numpy.ndarray

In [None]:
similarity_array.shape

(27318, 27318)

In [None]:
similarity_array[:-10, :-10]

array([[1.        , 0.74163253, 0.58722038, ..., 0.75991286, 0.44038302,
        0.66806784],
       [0.74163253, 1.        , 0.56161037, ..., 0.74189584, 0.42780822,
        0.67684237],
       [0.58722038, 0.56161037, 1.        , ..., 0.57599907, 0.58375795,
        0.64999793],
       ...,
       [0.75991286, 0.74189584, 0.57599907, ..., 1.        , 0.42880097,
        0.67855398],
       [0.44038302, 0.42780822, 0.58375795, ..., 0.42880097, 1.        ,
        0.49533879],
       [0.66806784, 0.67684237, 0.64999793, ..., 0.67855398, 0.49533879,
        1.        ]])