# Distance
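Compute all-vs-all Euclidean distances between K-mer count vectors at K=2 (the shortest vectors, 16 counts each, so this should be the fastest case to compute).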

In [10]:
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2022-10-25 15:16:56.214988
Python 3.10.6
sklearn 1.1.2


In [30]:
# Input locations. Directory and file name are joined by plain string
# concatenation below, so each directory must end with a '/'.
# NOTE(review): these are absolute, machine-specific paths -- adjust before re-running elsewhere.
# RCI table: ATLAS_DIR+RCI_FILE is passed to load_rci().
ATLAS_DIR = 'D:/Adjeroh/Localization/LncAtlas/'
RCI_FILE = 'CNRCI_coding_train_genes.csv'
# Count table: GENCODE_DIR+COUNTS_FILE is passed to load_counts().
GENCODE_DIR = 'D:/Adjeroh/Localization/GenCode/'
COUNTS_FILE='CNRCI_coding_train_counts.K2.2byte.csv'

In [31]:
def load_rci(filepath,cell_line_index):
    """Load one cell line's value per gene from a comma-separated file.

    The first line of the file is treated as a header and skipped.
    In every other non-blank line, column 0 holds the gene id (without
    version) and column 1+cell_line_index holds the value to load.

    Args:
        filepath: path of the CSV file to read.
        cell_line_index: zero-based cell line index; cell line 0 is in column 1.

    Returns:
        dict mapping gene id (str) to value (float). If a gene id occurs
        more than once, the last occurrence wins.

    Raises:
        IndexError: if a non-blank data line has too few columns.
        ValueError: if the selected field cannot be parsed as a float.
    """
    data = {}
    with open(filepath,'r') as handle:
        next(handle, None)            # skip the header line (tolerates an empty file)
        for row in handle:
            row = row.strip()
            if not row:
                # Blank lines (e.g. a trailing empty line) carry no data;
                # without this guard they would raise IndexError below.
                continue
            fields = row.split(',')
            gene_id = fields[0]           # gene id (without version) is in column 0
            value = float(fields[1+cell_line_index])  # cell line 0 is in column 1
            data[gene_id]=value
    return data
# Read the values for cell line 0 (file column 1) into a gene_id -> float dict.
gene_rci = load_rci(ATLAS_DIR+RCI_FILE,0)


In [33]:
def distance(a,b):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Args:
        a, b: numeric vectors (numpy arrays or plain sequences) of the same shape.

    Returns:
        The distance as a numpy float64 scalar.
    """
    # Promote to float64 BEFORE subtracting. Subtracting narrow integer arrays
    # directly is done in their own dtype and silently wraps around
    # (e.g. uint8 0-1 -> 255), which would corrupt the distance.
    diff = np.asarray(a,dtype=np.float64) - np.asarray(b,dtype=np.float64)
    # This works because the Euclidean distance is the l2 norm, and the default value of the ord parameter in numpy.linalg.norm is 2.
    # https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
    return np.linalg.norm(diff)

In [38]:
def load_counts(filepath,min_rci,max_rci,gene_rci):
    """Load count vectors for rows whose gene passes an RCI filter.

    The first line of the file is treated as a header and skipped.
    In every other non-blank line, column 0 is the gene id, column 1 is
    the transcript id and all remaining columns are integer counts.

    A row is kept only if its gene id is a key of gene_rci, the gene's
    value is not NaN, and the value lies in the interval (min_rci, max_rci].

    Args:
        filepath: path of the CSV file to read.
        min_rci: exclusive lower bound, or None for no lower bound.
        max_rci: inclusive upper bound, or None for no upper bound.
        gene_rci: dict mapping gene id to a float value.

    Returns:
        list of (gene_id, transcript_id, vector) triples in file order,
        where vector is a numpy int16 array of the count columns.

    Raises:
        IndexError: if a non-blank data line has fewer than two columns.
        Conversion errors from numpy propagate if a count field is not
        a valid int16 value.
    """
    data = []
    with open(filepath,'r') as handle:
        next(handle, None)   # skip the header line (tolerates an empty file)
        for row in handle:
            row = row.strip()
            if not row:
                continue     # blank lines carry no data
            fields = row.split(',')
            gene_id = fields[0]
            transcript_id = fields[1]
            if gene_id not in gene_rci:
                continue
            rci = gene_rci[gene_id]
            # NaN never compares equal to anything (not even itself), so a
            # comparison like `rci != float('nan')` is always True and filters
            # nothing. Test for NaN explicitly.
            if np.isnan(rci):
                continue
            if min_rci is not None and not rci>min_rci:
                continue
            if max_rci is not None and not rci<=max_rci:
                continue
            vector = np.asarray(fields[2:],dtype=np.int16)
            data.append((gene_id,transcript_id,vector))
    return data
# min_rci=2, max_rci=None: keep only rows whose gene has a value > 2 (no upper bound).
data = load_counts(GENCODE_DIR+COUNTS_FILE,2,None,gene_rci)
# Number of (gene_id, transcript_id, vector) triples that passed the filter.
total = len(data)
print('Total counts loaded =', total)

Total counts loaded = 1069


In [39]:
# Number of ordered (left, right) pairs an all-vs-all comparison has to visit.
print(total**2)

1142761


In [40]:
# Timing experiment: compute the distance for every ordered pair of loaded
# vectors (including each vector against itself) and keep the results in
# memory only -- nothing is saved yet. The goal is just to see how long it takes.
print('Vector length = ',len(data[0][2]))
print(datetime.now())
tock = 1000   # print a progress line after every `tock` left-hand rows
values=[]
for left_index in range(total):
    left_vector = data[left_index][2]
    # One full row of the distance matrix, appended in right-index order.
    values.extend(
        distance(left_vector, data[right_index][2])
        for right_index in range(total)
    )
    if (left_index+1) % tock == 0:
        print(left_index,datetime.now())
print('done')
print(datetime.now())

Vector length =  16
2022-10-25 15:37:22.376477
999 2022-10-25 15:37:26.234927
done
2022-10-25 15:37:26.508205


In [44]:
# Summary statistics over all computed distances.
# The distances are stored as float16 but the mean and standard deviation
# are accumulated in float64.
# NOTE(review): float16 rounds each stored distance and cannot represent
# values above 65504 -- confirm that precision is acceptable here.
ary = np.array(values,dtype=np.float16)
print('mean',  ary.mean(dtype=np.float64))
print('stdev', ary.std(dtype=np.float64))

mean 130.5098460377706
stdev 172.6066773156202


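Times for all-vs-all on a larger run (a progress line is printed every 1000 rows; each 1000 rows took roughly 65–70 seconds, about 21 minutes in total):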

Vector length =  16  
2022-10-25 14:27:11.502706  
999 2022-10-25 14:28:15.763160  
1999 2022-10-25 14:29:20.491432  
2999 2022-10-25 14:30:25.117940  
3999 2022-10-25 14:31:31.273742  
4999 2022-10-25 14:32:41.525587  
5999 2022-10-25 14:33:52.231153  
6999 2022-10-25 14:35:02.161528  
7999 2022-10-25 14:36:12.259136  
8999 2022-10-25 14:37:22.127670  
9999 2022-10-25 14:38:32.026037  
10999 2022-10-25 14:39:41.712526  
11999 2022-10-25 14:40:50.733244  
12999 2022-10-25 14:42:00.245487  
13999 2022-10-25 14:43:10.268282  
14999 2022-10-25 14:44:21.594595  
15999 2022-10-25 14:45:31.889860  
16999 2022-10-25 14:46:42.125385  
17999 2022-10-25 14:47:51.963612  
2022-10-25 14:48:21.804961  
