# Distance
Verify that np.linalg.norm() gives the same result as my hand-written distance_slow(), only faster.
Compare the numbers here to those from Distance 02.

In [1]:
# Record when the notebook was run and which Python / scikit-learn versions
# were in use, so the outputs below can be tied to an environment.
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2022-10-25 13:54:08.426320
Python 3.10.6
sklearn 1.1.2


In [2]:
# Build the K-mer counter and pick the cell line that the rest of the notebook analyzes.
from KmerCounter import KmerCounter
K=4   # passed to KmerCounter; presumably the K-mer length -- confirm against KmerCounter
counter=KmerCounter(K)
VOCABULARY_SIZE = counter.get_vocabulary_size()  # not referenced again in the cells visible here
from cell_lines import Cell_Lines
CELL_LINE_NUMBER = 0   # index into the ordered cell-line list fetched below
all_cell_lines = Cell_Lines.get_ordered_list()
cell_line_name = all_cell_lines[CELL_LINE_NUMBER]
print('Cell line for today:',CELL_LINE_NUMBER,'=',cell_line_name)

Cell line for today: 0 = A549


In [3]:
# Input file locations; directory and file name are concatenated in the loading cell.
# NOTE(review): these are absolute paths on one machine's D: drive -- adjust
# them before running the notebook anywhere else.
ATLAS_DIR = 'D:/Adjeroh/Localization/LncAtlas/'
RCI_FILE = 'CNRCI_noncoding_train_genes.csv'
GENCODE_DIR = 'D:/Adjeroh/Localization/GenCode/'
COUNTS_FILE='CNRCI_noncoding_train_counts.K4.csv'

In [4]:
# Load the two inputs through the project's Splitter2 helper:
#   gene_rci       -- dict, iterated later as (gene id, RCI value) pairs
#   gid_tid        -- list, unpacked later as (gene id, transcript id) pairs
#   ordered_counts -- list of K-mer count rows, index-aligned with gid_tid
# (container types are confirmed by the type() print in the next cell).
from TrainValidSplit2 import Splitter2
splitter = Splitter2()
ATLAS_PATH = ATLAS_DIR+RCI_FILE
gene_rci = splitter.get_gene_universe(ATLAS_PATH,CELL_LINE_NUMBER)
COUNTS_PATH = GENCODE_DIR+COUNTS_FILE
gid_tid,ordered_counts = splitter.load_counts_universe(COUNTS_PATH)
splitter = None   # drop the reference; the loader is not used again in the cells visible here

Loaded RCI values for cell line 0
Selected 1422 values out of 4622 genes.
Loaded 18430 gid+tid combinations.
Loaded 18430 rows of K-mer counts.


In [5]:
# Sanity check: show the container types returned by the loader.
print(type(gene_rci),type(gid_tid),type(ordered_counts))

<class 'dict'> <class 'list'> <class 'list'>


In [6]:
def split_genes(pairs,t1,t2):
    """Partition gene ids into three lists by their RCI value.

    Parameters
    ----------
    pairs : dict
        Maps gene id -> RCI value.
    t1, t2 : number
        Thresholds. A gene goes to ``low`` when its value is < t1, to
        ``middle`` when it is not < t1 but is < t2, and to ``high`` otherwise.

    Returns
    -------
    tuple of three lists
        ``(low, middle, high)``; each list keeps the dict's iteration order.
    """
    buckets = ([], [], [])
    for gene_id, value in pairs.items():
        # 0 = low, 1 = middle, 2 = high (anything that fails both comparisons)
        slot = 0 if value < t1 else (1 if value < t2 else 2)
        buckets[slot].append(gene_id)
    low, middle, high = buckets
    return low, middle, high
def split_counts(ordered_counts,ordered_ids,select_genes):
    """Return the count rows that belong to the selected genes.

    Parameters
    ----------
    ordered_counts : sequence
        Count rows, index-aligned with ``ordered_ids``.
    ordered_ids : sequence of (gid, tid) pairs
        Gene id / transcript id of each row; only the gene id is used.
    select_genes : iterable of hashable gene ids
        Genes whose rows should be kept. Any iterable is accepted,
        including a one-shot iterator.

    Returns
    -------
    list
        The matching rows of ``ordered_counts`` (the same objects, not
        copies), in their original order.
    """
    # Hash the wanted ids once. Testing `gid in <list>` for every row rescans
    # the list each time, and on a one-shot iterator it would consume it.
    wanted = frozenset(select_genes)
    select_counts=[]
    for i,(gid,_tid) in enumerate(ordered_ids):
        if gid in wanted:
            # Index instead of zip() so a too-short ordered_counts still raises IndexError.
            select_counts.append(ordered_counts[i])
    return select_counts

In [7]:
# Partition the genes at RCI thresholds -1 and 1, then gather each group's
# K-mer count rows. Each print shows (number of genes, number of count rows).
# The two timestamps bracket the cell so its run time can be read off the output.
print(datetime.now())
low_genes,middle_genes,high_genes = split_genes(gene_rci,-1,1)
low_counts    = split_counts(ordered_counts,gid_tid,low_genes)
print('Genes/Transcripts Low   :',len(low_genes),len(low_counts))
middle_counts = split_counts(ordered_counts,gid_tid,middle_genes)
print('Genes/Transcripts Middle:',len(middle_genes),len(middle_counts))
high_counts   = split_counts(ordered_counts,gid_tid,high_genes)
print('Genes/Transcripts High  :',len(high_genes),len(high_counts))
print(datetime.now())

2022-10-25 13:54:10.464319
Genes/Transcripts Low   : 486 1631
Genes/Transcripts Middle: 683 3714
Genes/Transcripts High  : 253 1412
2022-10-25 13:54:10.735621


In [8]:
def length_filter(counts,minimum,maximum):
    """Keep the count vectors whose element sum lies in [minimum, maximum].

    Parameters
    ----------
    counts : iterable
        Count vectors; each one is totalled with ``np.sum``.
    minimum, maximum : number
        Inclusive bounds on that total.

    Returns
    -------
    list
        The vectors that pass, in input order (same objects, not copies).
    """
    def within_bounds(vector):
        total = np.sum(vector)
        return minimum <= total <= maximum

    return [vector for vector in counts if within_bounds(vector)]

In [9]:
# Keep only the count rows whose total is between 2000 and 4000 inclusive
# (presumably a proxy for transcript length, given the helper's name -- confirm).
# The *_counts lists are overwritten with their filtered versions.
# NOTE(review): the first number in each print is still the unfiltered gene
# count from the split cell; only the second number reflects the filter.
print(datetime.now())
low_counts    = length_filter(low_counts,2000,4000)
print('Genes/Transcripts Low   :',len(low_genes),len(low_counts))
middle_counts = length_filter(middle_counts,2000,4000)
print('Genes/Transcripts Middle:',len(middle_genes),len(middle_counts))
high_counts   = length_filter(high_counts,2000,4000)
print('Genes/Transcripts High  :',len(high_genes),len(high_counts))
print(datetime.now())

2022-10-25 13:54:10.767900
Genes/Transcripts Low   : 486 275
Genes/Transcripts Middle: 683 681
Genes/Transcripts High  : 253 256
2022-10-25 13:54:10.788114


In [10]:
def compute_average_vector(counts):
    """Return the element-wise mean of the given count vectors.

    ``counts`` is a sequence of equal-length vectors (one per row); the mean
    is taken down the rows (``axis=0``), giving one value per position.
    """
    return np.mean(counts,axis=0)
# Mean K-mer count vector (centroid) of each RCI group, computed from whatever
# the *_counts lists currently hold.
low_avg = compute_average_vector(low_counts)
middle_avg = compute_average_vector(middle_counts)
high_avg = compute_average_vector(high_counts)

In [11]:
def distance_slow(a,b):
    """Euclidean distance between ``a`` and ``b``, one coordinate at a time.

    Reference implementation written as a plain Python loop. Positions
    ``0 .. len(a)-1`` are read from both arguments by index, so ``b`` must be
    at least as long as ``a``.
    """
    # sum() starts at 0 and adds left to right, like an explicit accumulator.
    squared_diffs = ((a[k]-b[k])**2 for k in range(len(a)))
    return np.sqrt(sum(squared_diffs))
def distance(a,b):
    """Euclidean distance between ``a`` and ``b`` using NumPy.

    Parameters
    ----------
    a, b : array-like
        Vectors of the same shape (or broadcastable). Plain lists and tuples
        are accepted as well as arrays, matching what ``distance_slow`` takes.

    Returns
    -------
    float
        The L2 norm of ``a - b``.
    """
    # This works because the Euclidean distance is the l2 norm, and the default value of the ord parameter in numpy.linalg.norm is 2. 
    # https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
    # Coerce first: `a-b` on two plain lists or tuples raises TypeError.
    diff = np.asarray(a) - np.asarray(b)
    return np.linalg.norm(diff)

In [12]:
# Pairwise Euclidean distances between the three group mean vectors.
print(datetime.now())
print('dist(low,middle) =',distance(low_avg,middle_avg))
print('dist(middle,high)=',distance(middle_avg,high_avg))
print('dist(low,high)   =',distance(low_avg,high_avg))
print(datetime.now())

2022-10-25 13:54:10.847475
dist(low,middle) = 30.21686362266514
dist(middle,high)= 21.019557692241836
dist(low,high)   = 48.2060721825071
2022-10-25 13:54:10.847475


In [13]:
def average_distance(data,center):
    """Mean of ``distance(row, center)`` over every row in ``data``.

    ``data`` must support ``len()`` and be non-empty; an empty ``data``
    divides by zero.
    """
    total = sum(distance(row,center) for row in data)
    return total / len(data)

In [14]:
# Within-group spread: average distance from each group's rows to that
# group's own mean vector.
print(datetime.now())
low_avg_dist = average_distance(low_counts,low_avg)
print('avg(low - avg(low))  ',low_avg_dist)
high_avg_dist = average_distance(high_counts,high_avg)
print('avg(high - avg(high))',high_avg_dist)
print(datetime.now())

2022-10-25 13:54:10.878720
avg(low - avg(low))   95.374431709019
avg(high - avg(high)) 73.78839414852989
2022-10-25 13:54:10.878720


In [15]:
# Cross-group comparison: average distance from each group's rows to the
# OTHER group's mean vector, for comparison with the within-group values.
print(datetime.now())
low_high_avg_dist = average_distance(low_counts,high_avg)
print('avg(low - avg(high))',low_high_avg_dist)
high_low_avg_dist = average_distance(high_counts,low_avg)
print('avg(high - avg(low))',high_low_avg_dist)
print(datetime.now())

2022-10-25 13:54:10.894340
avg(low - avg(high)) 104.03863246113501
avg(high - avg(low)) 88.49741222919877
2022-10-25 13:54:10.894340


Ideas:

Narrow the RNA set to just extremes, Gudenas-style.

The following vectors each have 256 counts (one per 4-mer, since K=4 and 4^4 = 256).

Compute the mean vector for high and for low RCI.

Compute vector of presence/absence i.e. sequences with this K-mer.

Compute mean & stdev distance to the mean vector.

Are nuclear RNAs more like the nuclear mean vector?

How to estimate significance? Is the difference significant if any one K-mer count is significantly different (after correction for multiple testing)?


In [16]:
# Confirm the type of the mean vector returned by compute_average_vector.
type(high_avg)

numpy.ndarray