# Analyze RCI correlations between cell lines
lncAtlas, exclude cell line 1 (H1.hESC), training data only.

Include only genes also represented in GenCode 43.
We can only train and test on genes for which we have sequence.
Some gene IDs from lncAtlas (2017) are no longer represented in GenCode 43 (2023).
However, the exclusion makes little difference, if any.

In [1]:
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import scipy.stats as ss
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2023-05-02 09:28:22.004759
Python 3.10.0
sklearn 1.1.2


In [2]:
try:
    from google.colab import drive
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/Localization/TrainTest/'  # must end in "/"
except:
    IN_COLAB = False
    DATA_DIR = 'D:/Adjeroh/Localization/TrainTest/'   # Windows
    DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/'    # Mac
print(DATA_DIR)

/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/


In [3]:
PC_RCI_FILE  = 'all.pc_RCI.csv'
PC_GENE_FILE = 'all.canon_pc_transcripts.csv'
NC_RCI_FILE  = 'all.lncRNA_RCI.csv'
NC_GENE_FILE = 'all.canon_lncRNA_transcripts.csv'
CELL_LINES   = 15
EXCLUDE      = [] # [] or [1]

## Genes with GenCode sequence

In [4]:
def load_gencode_genes(filepath):
    gene_set = set()
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            if header is None:
                header = row 
            else:
                fields = row.strip().split(',')
                tran_id = fields[0]
                gene_id = fields[1]
                gene_set.add(gene_id)
    return gene_set
PC_GENES = load_gencode_genes(DATA_DIR + PC_GENE_FILE)
NC_GENES = load_gencode_genes(DATA_DIR + NC_GENE_FILE)
GC_GENES = PC_GENES.union(NC_GENES)
print('GenCode gene counts (pc,nc,both):',len(PC_GENES),len(NC_GENES),len(GC_GENES))

GenCode gene counts (pc,nc,both): 17668 6423 24091


## CNRCI values per gene

In [5]:
def values_per_gene(filepath,exclude_lines=[]):
    all_counts=[]
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            if header is None:
                header = row 
            else:
                count_rci = 0
                fields = row.strip().split(',')
                gene_id = fields.pop(0)
                if gene_id in GC_GENES:
                    for i in range(len(fields)):
                        if fields[i] != 'nan' and i not in exclude_lines:
                            count_rci += 1
                    all_counts.append(count_rci)
    all_counts = np.asarray(all_counts)
    return all_counts
def analyze_values_per_gene(all_counts):
    print(len(all_counts), 'genes examined')
    print(np.mean(all_counts), 'average number of CNRCI for one gene')
    print(min(all_counts), max(all_counts), 'min and max CNRCI for any gene')
    print()
    print('Threshold T, Genes with T CNRCI, Genes with more CNRCI, Genes with fewer CNRCI')
    for threshold in range(CELL_LINES+1):
        equal = np.count_nonzero(all_counts==threshold)
        more  = np.count_nonzero(all_counts>threshold)
        less  = np.count_nonzero(all_counts<threshold)
        print("%2d %6d %6d %6d" % (threshold, equal, more, less))

In [6]:
print('Protein-coding genes with some number of CNRCI values')
filepath = DATA_DIR + PC_RCI_FILE
counts = values_per_gene(filepath,exclude_lines=EXCLUDE)
analyze_values_per_gene(counts)

Protein-coding genes with some number of CNRCI values
17668 genes examined
9.59310618066561 average number of CNRCI for one gene
1 15 min and max CNRCI for any gene

Threshold T, Genes with T CNRCI, Genes with more CNRCI, Genes with fewer CNRCI
 0      0  17668      0
 1   1137  16531      0
 2   1010  15521   1137
 3    850  14671   2147
 4    716  13955   2997
 5    662  13293   3713
 6    653  12640   4375
 7    600  12040   5028
 8    686  11354   5628
 9    747  10607   6314
10    922   9685   7061
11   1246   8439   7983
12   1654   6785   9229
13   2256   4529  10883
14   2756   1773  13139
15   1773      0  15895


In [7]:
print('Non-coding genes with some number of CNRCI values')
filepath = DATA_DIR + NC_RCI_FILE
counts = values_per_gene(filepath)
analyze_values_per_gene(counts)

Non-coding genes with some number of CNRCI values
6423 genes examined
4.154756344387358 average number of CNRCI for one gene
1 15 min and max CNRCI for any gene

Threshold T, Genes with T CNRCI, Genes with more CNRCI, Genes with fewer CNRCI
 0      0   6423      0
 1   2004   4419      0
 2    992   3427   2004
 3    670   2757   2996
 4    504   2253   3666
 5    417   1836   4170
 6    342   1494   4587
 7    300   1194   4929
 8    256    938   5229
 9    202    736   5485
10    223    513   5687
11    161    352   5910
12    138    214   6071
13    110    104   6209
14     78     26   6319
15     26      0   6397


## Values per cell line

In [8]:
def values_per_cell_line(filepath):
    names = np.zeros(CELL_LINES)
    values = []
    for i in range(CELL_LINES):
        values.append([])
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            row = row.strip()
            if header is None:
                header = row 
                # ignore col 0 = 'gene_id'
                names = row.split(',')
                names.pop(0)
            else:
                fields = row.split(',')
                gene_id = fields.pop(0)
                if gene_id in GC_GENES:
                    for i in range(len(fields)):
                        if fields[i] != 'nan':
                            value = float(fields[i])
                            values[i].append(value)
    return names,values
def analyze_values_per_cell_line(names,counts,exclude_lines=[]):
    print('Cell line, count, mean, stdev, min, max')
    for i in range(len(names)):
        if i not in exclude_lines:
            #print("%8s %6d %6.3f %6.3f %6.3f %6.3f " %
            #    (names[i], len(counts[i]), np.mean(counts[i]), np.std(counts[i]),
            #    min(counts[i]), max(counts[i])))
            print(names[i], len(counts[i]), np.mean(counts[i]), np.std(counts[i]),
                min(counts[i]), max(counts[i]))

In [9]:
print('Protein-coding CNRCI values per cell line')
filepath = DATA_DIR + PC_RCI_FILE
names,counts = values_per_cell_line(filepath)
analyze_values_per_cell_line(names,counts,exclude_lines=EXCLUDE)

Protein-coding CNRCI values per cell line
Cell line, count, mean, stdev, min, max
A549 13011 -0.056876129296902625 1.1947761160368826 -6.45121 10.5541
H1.hESC 16387 -0.03566786461030085 1.2803003889292162 -6.61208 6.76404
HeLa.S3 8778 -0.04764011320369104 1.5176974528727853 -7.52878 4.40693
HepG2 11844 -0.2679212828378082 1.700094207133356 -8.02377 4.48684
HT1080 11996 0.03182446937829277 1.2241419443773964 -7.33092 3.90689
HUVEC 12889 -0.1938931582919544 1.4508190643351646 -7.3837 5.47449
MCF.7 13908 -0.22639405178623814 1.692214888091745 -8.19311 6.63662
NCI.H460 10566 -0.32241901070565965 1.2743598039017643 -8.93799 6.48113
NHEK 10976 -0.24475720787591107 1.3570799541571847 -7.38627 3.61471
SK.MEL.5 10500 -0.03390146483590477 1.656405451671528 -8.47407 5.6231
SK.N.DZ 10644 0.03877351828391583 1.1421726466128805 -6.58496 3.97728
SK.N.SH 12141 -0.33746653690783296 1.7121380352115043 -8.13443 6.16065
GM12878 12511 -0.22194632432595318 1.30557280957447 -6.93074 5.1883
K562 8333 -0.12050

In [10]:
print('Non-coding CNRCI values per cell line')
filepath = DATA_DIR + NC_RCI_FILE
names,counts = values_per_cell_line(filepath)
analyze_values_per_cell_line(names,counts,exclude_lines=EXCLUDE)

Non-coding CNRCI values per cell line
Cell line, count, mean, stdev, min, max
A549 2018 -0.5502883531764123 1.6358036485839058 -5.59991 4.49338
H1.hESC 4669 -0.4594495390511887 1.549763348670568 -6.47018 5.58139
HeLa.S3 1248 -1.5686552029647436 1.8659041625283603 -7.5727 4.14975
HepG2 1900 -1.4432894830263159 1.9852199108638904 -7.57193 3.64697
HT1080 1288 -0.5655285576863355 1.8511188761594475 -7.74147 4.24793
HUVEC 2089 -1.3906608333221635 2.053116678411728 -8.23002 3.64386
MCF.7 2951 -1.576154710223653 2.1831108992870694 -9.1779 4.16993
NCI.H460 861 -1.5200252442508713 1.8878196622987138 -7.61176 4.58496
NHEK 1497 -1.2024822903139614 1.979496429550201 -8.40027 3.46877
SK.MEL.5 763 -1.785872140065531 2.1795594855117044 -10.255 3.97389
SK.N.DZ 848 -0.6133945319339623 1.5763191114750537 -8.33023 3.20511
SK.N.SH 2291 -1.384835979113924 2.1358692865483238 -9.53877 4.05156
GM12878 2379 -1.17181106790248 1.616109226810934 -8.32125 3.36075
K562 1334 -1.0011880146476761 1.6835658537003675 -6

## Correlations

In [11]:
def load_RCI_data(filepath,exclude_lines=[]):
    cell_line_names = []  # list of 15 names like A549
    cell_line_maps  = []  # 15 maps for 15 cell lines, each maps one gene name to one RCI value
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            line = row.strip()
            fields = line.split(',')
            if header is None:
                header = row 
                # exclude column 0 = gene name
                for col in range(1,len(fields)): 
                    cell_line_name = fields[col]
                    cell_line_names.append(cell_line_name)
                    # start a new empty map for each column
                    cell_line_genes = {}
                    cell_line_maps.append(cell_line_genes)
            else:
                gene_id = fields[0]
                if gene_id in GC_GENES:
                    for col in range(1,len(fields)): 
                        rci_val = fields[col]
                        if rci_val != "nan":
                            real_value = float(rci_val)
                            position = col-1 # 1st cell line in position 0
                            cell_line_name = cell_line_names[position] 
                            cell_line_map  = cell_line_maps[position]
                            cell_line_map[gene_id] = real_value 
    for i in exclude_lines:
        print('Exclude cell line', i, cell_line_names[i])
        del cell_line_names[i]
        del cell_line_maps[i]
    if len(exclude_lines)==0:
        print('No cell lines excluded.')
    print('Remaining cell lines:',cell_line_names)
    return cell_line_names, cell_line_maps

In [12]:
def all_vs_all_intersection(names,maps):
    NUM = len(maps)
    SHOW = 0
    for i in range(NUM):
        print(names[i],end=',')
    print()
    for i in range(NUM):
        i_genes = set(maps[i].keys())
        for j in range(NUM):
            j_genes = set(maps[j].keys())
            section = i_genes.intersection(j_genes)
            SHOW = len(section)
            #if i==0:
            #    SHOW = len(j_genes)
            #elif j==0:
            #    SHOW = len(i_genes)
            print(SHOW,end=',')
        print()

In [13]:
def all_vs_all_correlation(names,maps):
    NUM = len(maps)
    # heading
    for i in range(NUM):
        print(names[i],end=',')
    print()
    # table
    for i in range(NUM):
        i_genes = set(maps[i].keys())
        for j in range(NUM):
            j_genes = set(maps[j].keys())
            common_genes = i_genes.intersection(j_genes)
            i_values = np.array([maps[i][k] for k in common_genes])
            j_values = np.array([maps[j][k] for k in common_genes])
            r,p = ss.pearsonr(i_values,j_values)
            print('%.2f' % r, end=',')
        print()

In [14]:
def get_union(maps):
    union = set()
    for i in range(len(maps)):
        one_map = maps[i]
        genes = one_map.keys()
        union = union.union(genes)
    return union
def get_intersection(maps):
    section = set()
    for i in range(len(maps)):
        one_map = maps[i]
        genes = set(one_map.keys())
        if i==0:
            pass
        elif i==1:
            section = genes
        else:
            section = section.intersection(genes)
    return section
def show_rci_counts_table(names,maps):
    section = get_intersection(maps)
    print('The intersection of RCI values contains',len(section),'genes.')
    print('All vs all intersection')
    all_vs_all_intersection(names,maps)

## Coding

In [15]:
filepath = DATA_DIR + PC_RCI_FILE
cl_names,cl_maps = load_RCI_data(filepath,exclude_lines=EXCLUDE)
#print('Coding Genes: number of RCI values per cell line')
#for i in range(len(cl_names)):
#    print('%2d %10s %5d' % (i, cl_names[i], len(cl_maps[i].keys())))
union = get_union(cl_maps)
print('The union of RCI values contains',len(union),'genes.')

No cell lines excluded.
Remaining cell lines: ['A549', 'H1.hESC', 'HeLa.S3', 'HepG2', 'HT1080', 'HUVEC', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH', 'GM12878', 'K562', 'IMR.90']
The union of RCI values contains 17668 genes.


In [16]:
show_rci_counts_table(cl_names,cl_maps)

The intersection of RCI values contains 1787 genes.
All vs all intersection
A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
13011,12706,8021,10490,10803,11345,11930,9734,9551,9552,9483,10596,10696,7610,4638,
12706,16387,8614,11572,11762,12622,13546,10442,10657,10368,10559,11851,12074,8202,4941,
8021,8614,8778,7696,7822,8085,8346,7134,7269,7062,6901,7586,7746,5992,3386,
10490,11572,7696,11844,10028,10552,10960,9097,9082,9037,8937,9830,10127,7432,4245,
10803,11762,7822,10028,11996,11067,11218,9595,9346,9600,9425,10241,10392,7421,4344,
11345,12622,8085,10552,11067,12889,11878,9862,9774,9775,9709,10794,10902,7721,4556,
11930,13546,8346,10960,11218,11878,13908,10107,10067,9995,9953,11023,11261,7888,4623,
9734,10442,7134,9097,9595,9862,10107,10566,8346,8759,8594,9088,9283,6751,3821,
9551,10657,7269,9082,9346,9774,10067,8346,10976,8263,8086,9219,9245,6893,4151,
9552,10368,7062,9037,9600,9775,9995,8759,8263,10500,8597,9031,9305,6766,381

In [17]:
print('All vs all correlation')
all_vs_all_correlation(cl_names,cl_maps)

All vs all correlation
A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
1.00,0.73,0.82,0.78,0.74,0.72,0.76,0.56,0.79,0.56,0.66,0.86,0.70,0.69,0.78,
0.73,1.00,0.71,0.68,0.51,0.55,0.58,0.38,0.58,0.38,0.55,0.72,0.58,0.65,0.63,
0.82,0.71,1.00,0.87,0.76,0.80,0.81,0.59,0.79,0.72,0.74,0.84,0.83,0.85,0.80,
0.78,0.68,0.87,1.00,0.73,0.83,0.83,0.55,0.74,0.72,0.77,0.84,0.84,0.86,0.83,
0.74,0.51,0.76,0.73,1.00,0.81,0.76,0.59,0.83,0.74,0.66,0.73,0.70,0.75,0.78,
0.72,0.55,0.80,0.83,0.81,1.00,0.86,0.58,0.78,0.81,0.73,0.78,0.80,0.85,0.91,
0.76,0.58,0.81,0.83,0.76,0.86,1.00,0.61,0.75,0.81,0.74,0.82,0.77,0.81,0.86,
0.56,0.38,0.59,0.55,0.59,0.58,0.61,1.00,0.53,0.63,0.62,0.59,0.55,0.56,0.53,
0.79,0.58,0.79,0.74,0.83,0.78,0.75,0.53,1.00,0.63,0.60,0.76,0.68,0.72,0.74,
0.56,0.38,0.72,0.72,0.74,0.81,0.81,0.63,0.63,1.00,0.74,0.69,0.70,0.77,0.82,
0.66,0.55,0.74,0.77,0.66,0.73,0.74,0.62,0.60,0.74,1.00,0.74,0.76,0.79,0.74,
0.86,0.72,0.84,0.84,0.73,0.78,0.82,

## Non-coding

In [18]:
filepath = DATA_DIR + NC_RCI_FILE
cl_names,cl_maps = load_RCI_data(filepath,exclude_lines=EXCLUDE)
#print('Noncoding Genes: number of RCI values per cell line')
#for i in range(len(cl_names)):
#    print('%2d %10s %5d' % (i, cl_names[i], len(cl_maps[i].keys())))
union = get_union(cl_maps)
print('The union of RCI values contains',len(union),'genes.')

No cell lines excluded.
Remaining cell lines: ['A549', 'H1.hESC', 'HeLa.S3', 'HepG2', 'HT1080', 'HUVEC', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH', 'GM12878', 'K562', 'IMR.90']
The union of RCI values contains 6423 genes.


In [19]:
show_rci_counts_table(cl_names,cl_maps)

The intersection of RCI values contains 29 genes.
All vs all intersection
A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
2018,1740,706,1053,828,1137,1457,601,845,516,536,1232,1166,739,408,
1740,4669,1070,1639,1111,1782,2419,809,1273,686,786,1936,1953,1162,498,
706,1070,1248,677,494,690,909,364,551,321,340,732,737,491,210,
1053,1639,677,1900,779,1099,1346,594,864,512,555,1147,1163,718,359,
828,1111,494,779,1288,881,986,532,693,481,492,909,891,547,312,
1137,1782,690,1099,881,2089,1451,616,945,548,596,1295,1262,772,397,
1457,2419,909,1346,986,1451,2951,704,1100,611,670,1559,1549,924,445,
601,809,364,594,532,616,704,861,494,367,386,607,638,389,200,
845,1273,551,864,693,945,1100,494,1497,460,445,968,916,570,323,
516,686,321,512,481,548,611,367,460,763,340,561,552,362,198,
536,786,340,555,492,596,670,386,445,340,848,612,605,381,209,
1232,1936,732,1147,909,1295,1559,607,968,561,612,2291,1267,804,427,
1166,1953,737,1163,891,1262,1549,6

In [20]:
print('All vs all correlation')
all_vs_all_correlation(cl_names,cl_maps)

All vs all correlation
A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
1.00,0.57,0.73,0.81,0.76,0.81,0.80,0.59,0.79,0.66,0.73,0.82,0.73,0.71,0.83,
0.57,1.00,0.55,0.54,0.39,0.49,0.39,0.33,0.41,0.25,0.46,0.53,0.45,0.59,0.60,
0.73,0.55,1.00,0.79,0.77,0.74,0.73,0.66,0.79,0.77,0.71,0.80,0.72,0.77,0.75,
0.81,0.54,0.79,1.00,0.81,0.87,0.82,0.63,0.78,0.72,0.75,0.81,0.82,0.81,0.83,
0.76,0.39,0.77,0.81,1.00,0.83,0.83,0.67,0.81,0.81,0.73,0.83,0.68,0.73,0.85,
0.81,0.49,0.74,0.87,0.83,1.00,0.86,0.67,0.82,0.77,0.73,0.84,0.78,0.81,0.89,
0.80,0.39,0.73,0.82,0.83,0.86,1.00,0.64,0.80,0.80,0.73,0.82,0.71,0.75,0.86,
0.59,0.33,0.66,0.63,0.67,0.67,0.64,1.00,0.60,0.71,0.68,0.66,0.57,0.54,0.59,
0.79,0.41,0.79,0.78,0.81,0.82,0.80,0.60,1.00,0.74,0.67,0.83,0.69,0.67,0.81,
0.66,0.25,0.77,0.72,0.81,0.77,0.80,0.71,0.74,1.00,0.72,0.74,0.66,0.71,0.79,
0.73,0.46,0.71,0.75,0.73,0.73,0.73,0.68,0.67,0.72,1.00,0.70,0.74,0.78,0.75,
0.82,0.53,0.80,0.81,0.83,0.84,0.82,