# Distance
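All-vs-all Euclidean distances between normalized K-mer count vectors, intended for K=2 (which should be the fastest to compute). Note: the counts file configured below is named `CNRCI_coding_train_counts.K3.2byte.csv`, so confirm which K is actually being used.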

In [1]:
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2022-10-25 17:00:10.471158
Python 3.10.6
sklearn 1.1.2


In [2]:
# Input locations. Each directory string is concatenated directly with its
# file name in do_all(), so the trailing '/' on the directories is required.
# NOTE(review): these are absolute paths on a local D: drive; adjust them
# before running on another machine.
ATLAS_DIR = 'D:/Adjeroh/Localization/LncAtlas/'
# RCI table read by load_rci(): gene id in column 0, one value column per cell line.
RCI_FILE = 'CNRCI_coding_train_genes.csv'
GENCODE_DIR = 'D:/Adjeroh/Localization/GenCode/'
# K-mer count table read by load_counts(): gene id, transcript id, then counts.
# NOTE(review): the file name suggests K=3 counts stored in 2 bytes - confirm.
COUNTS_FILE='CNRCI_coding_train_counts.K3.2byte.csv'

In [3]:
def load_rci(filepath,cell_line_index):
    """Read one cell line's RCI value per gene from a comma-separated file.

    The first line of the file is always treated as a header and ignored.
    Every later non-blank line must hold the gene id in column 0 followed
    by one numeric column per cell line. Blank lines are skipped.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    cell_line_index : int
        Zero-based cell line selector; cell line 0 is read from column 1.

    Returns
    -------
    dict
        Maps gene id (str) to the selected value (float). A field such as
        'nan' is stored as float NaN. If a gene id occurs on several
        lines, the last occurrence wins.

    Raises
    ------
    IndexError
        If a data line has too few columns for ``cell_line_index``.
    ValueError
        If the selected field cannot be parsed by ``float``.
    """
    data = {}
    with open(filepath,'r') as handle:
        next(handle, None)   # the first line is the header, whatever it contains
        for row in handle:
            if not row.strip():
                # Tolerate blank lines (e.g. an empty line at end of file)
                # instead of failing with IndexError on the column lookup.
                continue
            fields = row.split(',')
            gene_id = fields[0]           # gene id (without version) is in column 0
            # float() ignores surrounding whitespace, including the newline
            # that is still attached when the last column is selected.
            data[gene_id] = float(fields[1+cell_line_index])  # cell line 0 is in column 1
    return data


In [4]:
def distance(a,b):
    """Return the Euclidean distance between the arrays ``a`` and ``b``.

    The element-wise difference is handed to ``numpy.linalg.norm`` with its
    default ``ord``, which for a 1-D input is the L2 norm.
    See https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
    """
    difference = a - b
    return np.linalg.norm(difference)

In [5]:
def normalize(vec):
    """Scale a vector so that its elements sum to 1.

    Parameters
    ----------
    vec : array-like
        Values to rescale, e.g. integer K-mer counts.

    Returns
    -------
    numpy.ndarray
        ``vec`` divided by the sum of its elements. When that sum is zero
        (for example an all-zero count vector) a float zero vector of the
        same shape is returned instead, because dividing by zero would
        produce NaN/inf entries that propagate into every later distance
        and into the summary statistics.
    """
    total = np.sum(vec)
    if total == 0:
        return np.zeros(np.shape(vec), dtype=np.float64)
    return vec / total

In [6]:
def load_counts(filepath,min_rci,max_rci,gene_rci):
    """Load normalized count vectors for transcripts whose gene RCI is in range.

    The first line of the file is always treated as a header and ignored.
    Every later non-blank line holds a gene id, a transcript id, and then
    the integer counts, all comma-separated. Blank lines are skipped.

    Parameters
    ----------
    filepath : str
        Path of the CSV counts file.
    min_rci : number or None
        Exclusive lower bound: a row is kept only if ``rci > min_rci``.
        ``None`` disables the lower bound.
    max_rci : number or None
        Inclusive upper bound: a row is kept only if ``rci <= max_rci``.
        ``None`` disables the upper bound.
    gene_rci : dict
        Maps gene id to RCI value. Rows whose gene id is missing from this
        mapping, or whose RCI is NaN, are dropped.

    Returns
    -------
    list of tuple
        One ``(gene_id, transcript_id, normalized_vector)`` triple per kept
        row, in file order. The vector is the row's counts parsed as
        ``np.int16`` and passed through ``normalize``.
    """
    data = []
    with open(filepath,'r') as handle:
        next(handle, None)   # the first line is the header, whatever it contains
        for row in handle:
            row = row.strip()
            if not row:
                continue   # blank line: nothing to parse
            fields = row.split(',')
            gene_id = fields.pop(0)
            transcript_id = fields.pop(0)
            if gene_id not in gene_rci:
                continue
            rci = gene_rci[gene_id]
            # NaN never compares equal to anything (itself included), so a
            # test like `rci != float('nan')` is always True; np.isnan is
            # the reliable way to drop genes without an RCI value.
            if np.isnan(rci):
                continue
            if min_rci is not None and rci <= min_rci:
                continue   # lower bound is exclusive
            if max_rci is not None and rci > max_rci:
                continue   # upper bound is inclusive
            vector = np.asarray(fields,dtype=np.int16)
            data.append((gene_id,transcript_id,normalize(vector)))
    return data

In [7]:
def compute_distances(left_data,right_data):
    """Return the Euclidean distance for every (left, right) pair of vectors.

    Parameters
    ----------
    left_data, right_data : sequence of tuple
        Records whose element at index 2 is the vector to compare. All
        vectors are expected to be 1-D and of equal length.

    Returns
    -------
    list of float
        ``len(left_data) * len(right_data)`` distances in left-major order:
        all right records for the first left record, then all right records
        for the second left record, and so on.

    Notes
    -----
    The distances from one left vector to all right vectors are computed
    in a single vectorized call rather than one Python-level call per pair;
    the quantity is still the L2 norm of the difference, as in distance().
    A progress line is printed after every 1000 left records.
    """
    tock = 1000
    values=[]
    if len(left_data) == 0 or len(right_data) == 0:
        return values   # no pairs to compare
    # Stack the right-hand vectors once so each left vector is compared
    # against all of them by broadcasting.
    right_matrix = np.stack([record[2] for record in right_data])
    for left_index, left_record in enumerate(left_data):
        left_vector = left_record[2]
        row = np.linalg.norm(right_matrix - left_vector, axis=1)
        values.extend(row.tolist())
        if (left_index + 1) % tock == 0:
            print(datetime.now(),'processed',left_index)
    return values

In [8]:
def show_stats(ary):
    """Print the mean and the standard deviation of ``ary``, one per line.

    Both statistics are computed with ``dtype=np.float64`` regardless of
    the dtype of ``ary``.
    """
    summaries = (('mean', np.mean), ('stdev', np.std))
    for label, reducer in summaries:
        print(label, reducer(ary,dtype=np.float64))

In [9]:
def do_all(min_rci,max_rci):
    """Run the full pipeline: load inputs, then report distance statistics.

    RCI values are loaded for cell line 0. Two transcript groups are then
    read from the counts file: the "left" group keeps genes with
    ``rci <= max_rci`` (no lower bound) and the "right" group keeps genes
    with ``rci > min_rci`` (no upper bound). For left-vs-left,
    right-vs-right and left-vs-right, in that order, all pairwise distances
    are computed, stored as float16, and summarized with show_stats().
    Timestamped progress messages are printed throughout.
    """
    print(datetime.now(), 'Load RCI values')
    gene_rci = load_rci(ATLAS_DIR+RCI_FILE,0)

    print(datetime.now(), 'Load K-mer counts')
    print(COUNTS_FILE)
    print('min_rci=',min_rci,'max_rci=',max_rci)
    left_data = load_counts(GENCODE_DIR+COUNTS_FILE,None,max_rci,gene_rci)
    print('Left transcripts:', len(left_data))
    right_data = load_counts(GENCODE_DIR+COUNTS_FILE,min_rci,None,gene_rci)
    print('Right transcripts:', len(right_data))

    # Each entry: (progress message, first group, second group).
    comparisons = (
        ('Compute distances for left vs left', left_data, left_data),
        ('Compute distances for right vs right', right_data, right_data),
        ('Compute distances for left vs right', left_data, right_data),
    )
    for message, first_group, second_group in comparisons:
        print(datetime.now(), message)
        values = compute_distances(first_group,second_group)
        print(datetime.now(), 'Compute statistics')
        # Stored as float16; show_stats() accumulates in float64.
        ary = np.asarray(values,dtype=np.float16)
        show_stats(ary)

In [10]:
# RCI thresholds for the two transcript groups that do_all() builds:
# the "right" group keeps genes with RCI > MIN_RCI and
# the "left" group keeps genes with RCI <= MAX_RCI.
MIN_RCI=1
MAX_RCI=-1
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt during
# the left-vs-left pass, so no statistics were recorded for this run.
do_all(MIN_RCI,MAX_RCI)

2022-10-25 17:00:12.414478 Load RCI values
2022-10-25 17:00:12.434570 Load K-mer counts
CNRCI_coding_train_counts.K3.2byte.csv
min_rci= 1 max_rci= -1
Left transcripts: 10992
Right transcripts: 8738
2022-10-25 17:00:13.288094 Compute distances for left vs left


KeyboardInterrupt: 