# Analyze RCI correlations between cell lines
lncAtlas, exclude cell line 1 (H1.hESC), training data only.

Include only genes also represented in GenCode 43.
We can only train and test on genes for which we have sequence.
Some gene IDs from lncAtlas (2017) are no longer represented in GenCode 43 (2023).
However, the exclusion makes little difference, if any.

In [1]:
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import scipy.stats as ss
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2023-03-26 13:55:25.896943
Python 3.10.0
sklearn 1.1.2


In [2]:
try:
    from google.colab import drive
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/Localization/TrainTest/'  # must end in "/"
except:
    IN_COLAB = False
    DATA_DIR = 'D:/Adjeroh/Localization/TrainTest/'   # Windows
    DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/'    # Mac
print(DATA_DIR)

/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/


In [3]:
PC_RCI_FILE  = 'train.pc_RCI.csv'
PC_GENE_FILE = 'train.canon_pc_transcripts.csv'
NC_RCI_FILE  = 'train.lncRNA_RCI.csv'
NC_GENE_FILE = 'train.canon_lncRNA_transcripts.csv'
CELL_LINES   = 15
EXCLUDE      = [1]

## Genes with GenCode sequence

In [4]:
def load_gencode_genes(filepath):
    gene_set = set()
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            if header is None:
                header = row 
            else:
                fields = row.strip().split(',')
                tran_id = fields[0]
                gene_id = fields[1]
                gene_set.add(gene_id)
    return gene_set
PC_GENES = load_gencode_genes(DATA_DIR + PC_GENE_FILE)
NC_GENES = load_gencode_genes(DATA_DIR + NC_GENE_FILE)
GC_GENES = PC_GENES.union(NC_GENES)
print('GenCode gene counts (pc,nc,both):',len(PC_GENES),len(NC_GENES),len(GC_GENES))

GenCode gene counts (pc,nc,both): 14135 5139 19274


## CNRCI values per gene

In [5]:
def values_per_gene(filepath,exclude_lines=[]):
    all_counts=[]
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            if header is None:
                header = row 
            else:
                count_rci = 0
                fields = row.strip().split(',')
                gene_id = fields.pop(0)
                if gene_id in GC_GENES:
                    for i in range(len(fields)):
                        if fields[i] != 'nan' and i not in exclude_lines:
                            count_rci += 1
                    all_counts.append(count_rci)
    all_counts = np.asarray(all_counts)
    return all_counts
def analyze_values_per_gene(all_counts):
    print(len(all_counts), 'genes examined')
    print(np.mean(all_counts), 'average number of CNRCI for one gene')
    print(min(all_counts), max(all_counts), 'min and max CNRCI for any gene')
    print()
    print('Threshold T, Genes with T CNRCI, Genes with more CNRCI, Genes with fewer CNRCI')
    for threshold in range(CELL_LINES+1):
        equal = np.count_nonzero(all_counts==threshold)
        more  = np.count_nonzero(all_counts>threshold)
        less  = np.count_nonzero(all_counts<threshold)
        print("%2d %6d %6d %6d" % (threshold, equal, more, less))

In [6]:
print('Protein-coding genes with some number of CNRCI values')
filepath = DATA_DIR + PC_RCI_FILE
counts = values_per_gene(filepath,exclude_lines=EXCLUDE)
analyze_values_per_gene(counts)

Protein-coding genes with some number of CNRCI values
14135 genes examined
8.721896002829855 average number of CNRCI for one gene
0 14 min and max CNRCI for any gene

Threshold T, Genes with T CNRCI, Genes with more CNRCI, Genes with fewer CNRCI
 0    466  13669      0
 1    969  12700    466
 2    796  11904   1435
 3    615  11289   2231
 4    535  10754   2846
 5    533  10221   3381
 6    492   9729   3914
 7    546   9183   4406
 8    617   8566   4952
 9    741   7825   5569
10    985   6840   6310
11   1364   5476   7295
12   1812   3664   8659
13   2200   1464  10471
14   1464      0  12671
15      0      0  14135


In [7]:
print('Non-coding genes with some number of CNRCI values')
filepath = DATA_DIR + NC_RCI_FILE
counts = values_per_gene(filepath)
analyze_values_per_gene(counts)

Non-coding genes with some number of CNRCI values
5139 genes examined
4.205292858532789 average number of CNRCI for one gene
1 15 min and max CNRCI for any gene

Threshold T, Genes with T CNRCI, Genes with more CNRCI, Genes with fewer CNRCI
 0      0   5139      0
 1   1597   3542      0
 2    768   2774   1597
 3    524   2250   2365
 4    407   1843   2889
 5    334   1509   3296
 6    274   1235   3630
 7    261    974   3904
 8    217    757   4165
 9    163    594   4382
10    171    423   4545
11    134    289   4716
12    114    175   4850
13     87     88   4964
14     69     19   5051
15     19      0   5120


## Values per cell line

In [8]:
def values_per_cell_line(filepath):
    names = np.zeros(CELL_LINES)
    values = []
    for i in range(CELL_LINES):
        values.append([])
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            row = row.strip()
            if header is None:
                header = row 
                # ignore col 0 = 'gene_id'
                names = row.split(',')
                names.pop(0)
            else:
                fields = row.split(',')
                gene_id = fields.pop(0)
                if gene_id in GC_GENES:
                    for i in range(len(fields)):
                        if fields[i] != 'nan':
                            value = float(fields[i])
                            values[i].append(value)
    return names,values
def analyze_values_per_cell_line(names,counts,exclude_lines=[]):
    print('Cell line, count, mean, stdev, min, max')
    for i in range(len(names)):
        if i not in exclude_lines:
            #print("%8s %6d %6.3f %6.3f %6.3f %6.3f " %
            #    (names[i], len(counts[i]), np.mean(counts[i]), np.std(counts[i]),
            #    min(counts[i]), max(counts[i])))
            print(names[i], len(counts[i]), np.mean(counts[i]), np.std(counts[i]),
                min(counts[i]), max(counts[i]))

In [9]:
print('Protein-coding CNRCI values per cell line')
filepath = DATA_DIR + PC_RCI_FILE
names,counts = values_per_cell_line(filepath)
analyze_values_per_cell_line(names,counts,exclude_lines=EXCLUDE)

Protein-coding CNRCI values per cell line
Cell line, count, mean, stdev, min, max
A549 10474 -0.05976983478537331 1.1981512707728252 -6.45121 10.5541
HeLa.S3 7055 -0.05204840213182141 1.5081758665287959 -6.89372 4.40693
HepG2 9549 -0.28159588836223687 1.697331904911219 -7.68568 4.4075
HT1080 9667 0.028601782410468613 1.2182034378268005 -6.45921 3.90689
HUVEC 10393 -0.19753256528480712 1.4503234673517502 -7.3837 5.47449
MCF.7 11147 -0.2306045686635866 1.690233969810388 -8.19311 6.63662
NCI.H460 8541 -0.3187753048713266 1.2739545077228769 -8.93799 6.48113
NHEK 8859 -0.2458269187210746 1.3490845772383626 -7.38627 3.61471
SK.MEL.5 8472 -0.03255823837783286 1.6583024352563216 -8.47407 5.6231
SK.N.DZ 8540 0.03112836065374708 1.149277386338518 -6.58496 3.97728
SK.N.SH 9764 -0.34718655745575583 1.705001843534662 -8.13443 6.16065
GM12878 10038 -0.2311822760021917 1.3109922002552217 -6.93074 5.1883
K562 6727 -0.1312907043020663 1.4424511219062668 -5.8419 3.9542
IMR.90 4058 -0.06950355284869393 1

In [10]:
print('Non-coding CNRCI values per cell line')
filepath = DATA_DIR + NC_RCI_FILE
names,counts = values_per_cell_line(filepath)
analyze_values_per_cell_line(names,counts,exclude_lines=EXCLUDE)

Non-coding CNRCI values per cell line
Cell line, count, mean, stdev, min, max
A549 1635 -0.543231414409786 1.6423750408739248 -5.59991 4.49338
HeLa.S3 999 -1.5328410593593595 1.8489583039982518 -7.5727 4.14975
HepG2 1560 -1.4347548832820511 1.9767969260079241 -7.57193 3.64697
HT1080 1050 -0.5911357883047619 1.8555808753482688 -7.74147 4.24793
HUVEC 1674 -1.4093593955854242 2.0546907078051406 -8.23002 3.64386
MCF.7 2377 -1.56400102796382 2.1838592553463188 -9.1779 3.88264
NCI.H460 700 -1.495083041 1.8548862772960437 -7.61176 4.20387
NHEK 1214 -1.225780156589786 1.9808793877267008 -8.40027 3.12653
SK.MEL.5 618 -1.7226884745469255 2.1480533868631126 -8.66356 3.81823
SK.N.DZ 683 -0.6119149324743778 1.5994391693300594 -8.33023 3.20511
SK.N.SH 1865 -1.4181864412064344 2.124505876393391 -9.53877 4.05156
GM12878 1931 -1.1746448151424134 1.6076889900159934 -8.32125 3.36075
K562 1090 -1.0155525348990828 1.6916048915065078 -6.66607 4.35815
IMR.90 441 -0.7462931029478458 2.1038014196731583 -7.8926

## Correlations

In [11]:
def load_RCI_data(filepath,exclude_lines=[]):
    cell_line_names = []  # list of 15 names like A549
    cell_line_maps  = []  # 15 maps for 15 cell lines, each maps one gene name to one RCI value
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            line = row.strip()
            fields = line.split(',')
            if header is None:
                header = row 
                # exclude column 0 = gene name
                for col in range(1,len(fields)): 
                    cell_line_name = fields[col]
                    cell_line_names.append(cell_line_name)
                    # start a new empty map for each column
                    cell_line_genes = {}
                    cell_line_maps.append(cell_line_genes)
            else:
                gene_id = fields[0]
                if gene_id in GC_GENES:
                    for col in range(1,len(fields)): 
                        rci_val = fields[col]
                        if rci_val != "nan":
                            real_value = float(rci_val)
                            position = col-1 # 1st cell line in position 0
                            cell_line_name = cell_line_names[position] 
                            cell_line_map  = cell_line_maps[position]
                            cell_line_map[gene_id] = real_value 
    for i in exclude_lines:
        print('Exclude cell line', i, cell_line_names[i])
        del cell_line_names[i]
        del cell_line_maps[i]
    if len(exclude_lines)==0:
        print('No cell lines excluded.')
    print('Remaining cell lines:',cell_line_names)
    return cell_line_names, cell_line_maps

In [12]:
def all_vs_all_intersection(names,maps):
    NUM = len(maps)
    SHOW = 0
    for i in range(NUM):
        print(names[i],end=',')
    print()
    for i in range(NUM):
        i_genes = set(maps[i].keys())
        for j in range(NUM):
            j_genes = set(maps[j].keys())
            section = i_genes.intersection(j_genes)
            SHOW = len(section)
            #if i==0:
            #    SHOW = len(j_genes)
            #elif j==0:
            #    SHOW = len(i_genes)
            print(SHOW,end=',')
        print()

In [13]:
def all_vs_all_correlation(names,maps):
    NUM = len(maps)
    # heading
    for i in range(NUM):
        print(names[i],end=',')
    print()
    # table
    for i in range(NUM):
        i_genes = set(maps[i].keys())
        for j in range(NUM):
            j_genes = set(maps[j].keys())
            common_genes = i_genes.intersection(j_genes)
            i_values = np.array([maps[i][k] for k in common_genes])
            j_values = np.array([maps[j][k] for k in common_genes])
            r,p = ss.pearsonr(i_values,j_values)
            print('%.2f' % r, end=',')
        print()

In [14]:
def get_union(maps):
    union = set()
    for i in range(len(maps)):
        one_map = maps[i]
        genes = one_map.keys()
        union = union.union(genes)
    return union
def get_intersection(maps):
    section = set()
    for i in range(len(maps)):
        one_map = maps[i]
        genes = set(one_map.keys())
        if i==0:
            pass
        elif i==1:
            section = genes
        else:
            section = section.intersection(genes)
    return section
def show_rci_counts_table(names,maps):
    section = get_intersection(maps)
    print('The intersection of RCI values contains',len(section),'genes.')
    print('All vs all intersection')
    all_vs_all_intersection(names,maps)

## Coding

In [15]:
filepath = DATA_DIR + PC_RCI_FILE
cl_names,cl_maps = load_RCI_data(filepath,exclude_lines=[1])
#print('Coding Genes: number of RCI values per cell line')
#for i in range(len(cl_names)):
#    print('%2d %10s %5d' % (i, cl_names[i], len(cl_maps[i].keys())))
union = get_union(cl_maps)
print('The union of RCI values contains',len(union),'genes.')

Exclude cell line 1 H1.hESC
Remaining cell lines: ['A549', 'HeLa.S3', 'HepG2', 'HT1080', 'HUVEC', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH', 'GM12878', 'K562', 'IMR.90']
The union of RCI values contains 13669 genes.


In [16]:
show_rci_counts_table(cl_names,cl_maps)

The intersection of RCI values contains 1474 genes.
All vs all intersection
A549,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
10474,6461,8470,8733,9159,9613,7883,7737,7738,7642,8545,8612,6157,3765,
6461,7055,6202,6313,6522,6711,5752,5847,5708,5560,6104,6227,4841,2753,
8470,6202,9549,8112,8524,8841,7360,7357,7324,7212,7937,8170,6010,3447,
8733,6313,8112,9667,8933,9042,7751,7564,7764,7581,8255,8367,5992,3530,
9159,6522,8524,8933,10393,9568,7968,7897,7898,7825,8685,8781,6241,3698,
9613,6711,8841,9042,9568,11147,8163,8129,8068,8008,8857,9055,6360,3749,
7883,5752,7360,7751,7968,8163,8541,6760,7081,6929,7333,7490,5459,3120,
7737,5847,7357,7564,7897,8129,6760,8859,6693,6526,7442,7467,5570,3378,
7738,5708,7324,7764,7898,8068,7081,6693,8472,6928,7290,7494,5472,3103,
7642,5560,7212,7581,7825,8008,6929,6526,6928,8540,7311,7420,5382,3042,
8545,6104,7937,8255,8685,8857,7333,7442,7290,7311,9764,8135,5882,3672,
8612,6227,8170,8367,8781,9055,7490,7467,74

In [17]:
print('All vs all correlation')
all_vs_all_correlation(cl_names,cl_maps)

All vs all correlation
A549,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
1.00,0.82,0.78,0.74,0.72,0.75,0.55,0.79,0.56,0.66,0.86,0.71,0.69,0.78,
0.82,1.00,0.87,0.76,0.80,0.81,0.58,0.79,0.71,0.74,0.84,0.83,0.85,0.80,
0.78,0.87,1.00,0.73,0.83,0.83,0.55,0.74,0.72,0.77,0.84,0.84,0.86,0.83,
0.74,0.76,0.73,1.00,0.81,0.76,0.60,0.82,0.74,0.66,0.73,0.71,0.75,0.78,
0.72,0.80,0.83,0.81,1.00,0.86,0.58,0.78,0.81,0.73,0.78,0.80,0.85,0.91,
0.75,0.81,0.83,0.76,0.86,1.00,0.60,0.75,0.81,0.74,0.83,0.77,0.81,0.86,
0.55,0.58,0.55,0.60,0.58,0.60,1.00,0.53,0.63,0.62,0.58,0.55,0.56,0.53,
0.79,0.79,0.74,0.82,0.78,0.75,0.53,1.00,0.63,0.60,0.76,0.69,0.72,0.74,
0.56,0.71,0.72,0.74,0.81,0.81,0.63,0.63,1.00,0.74,0.69,0.70,0.77,0.83,
0.66,0.74,0.77,0.66,0.73,0.74,0.62,0.60,0.74,1.00,0.75,0.76,0.79,0.74,
0.86,0.84,0.84,0.73,0.78,0.83,0.58,0.76,0.69,0.75,1.00,0.72,0.75,0.87,
0.71,0.83,0.84,0.71,0.80,0.77,0.55,0.69,0.70,0.76,0.72,1.00,0.89,0.78,
0.69,0.85,0.86,0.75,0.85,0.

## Non-coding

In [18]:
filepath = DATA_DIR + NC_RCI_FILE
cl_names,cl_maps = load_RCI_data(filepath,exclude_lines=[1])
#print('Noncoding Genes: number of RCI values per cell line')
#for i in range(len(cl_names)):
#    print('%2d %10s %5d' % (i, cl_names[i], len(cl_maps[i].keys())))
union = get_union(cl_maps)
print('The union of RCI values contains',len(union),'genes.')

Exclude cell line 1 H1.hESC
Remaining cell lines: ['A549', 'HeLa.S3', 'HepG2', 'HT1080', 'HUVEC', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH', 'GM12878', 'K562', 'IMR.90']
The union of RCI values contains 4369 genes.


In [19]:
show_rci_counts_table(cl_names,cl_maps)

The intersection of RCI values contains 22 genes.
All vs all intersection
A549,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
1635,559,862,669,917,1185,484,693,414,431,1012,955,609,321,
559,999,547,397,556,733,299,455,265,266,593,598,390,163,
862,547,1560,640,897,1101,480,712,408,448,945,954,593,287,
669,397,640,1050,722,811,437,575,389,389,743,725,453,247,
917,556,897,722,1674,1182,507,779,439,475,1057,1029,635,315,
1185,733,1101,811,1182,2377,573,902,503,541,1282,1277,763,357,
484,299,480,437,507,573,700,412,299,314,497,524,328,162,
693,455,712,575,779,902,412,1214,375,364,804,764,476,265,
414,265,408,389,439,503,299,375,618,273,458,450,292,156,
431,266,448,389,475,541,314,364,273,683,494,489,309,166,
1012,593,945,743,1057,1282,497,804,458,494,1865,1044,665,345,
955,598,954,725,1029,1277,524,764,450,489,1044,1931,708,299,
609,390,593,453,635,763,328,476,292,309,665,708,1090,203,
321,163,287,247,315,357,162,265,156,166,345,299,203,441,


In [20]:
print('All vs all correlation')
all_vs_all_correlation(cl_names,cl_maps)

All vs all correlation
A549,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
1.00,0.71,0.80,0.78,0.81,0.80,0.59,0.80,0.65,0.72,0.81,0.72,0.69,0.83,
0.71,1.00,0.79,0.77,0.74,0.73,0.66,0.79,0.76,0.70,0.79,0.70,0.78,0.72,
0.80,0.79,1.00,0.82,0.87,0.82,0.63,0.78,0.71,0.75,0.81,0.81,0.81,0.83,
0.78,0.77,0.82,1.00,0.84,0.84,0.68,0.81,0.81,0.73,0.84,0.69,0.72,0.85,
0.81,0.74,0.87,0.84,1.00,0.86,0.66,0.83,0.76,0.73,0.83,0.78,0.80,0.89,
0.80,0.73,0.82,0.84,0.86,1.00,0.63,0.80,0.79,0.72,0.82,0.70,0.75,0.87,
0.59,0.66,0.63,0.68,0.66,0.63,1.00,0.59,0.70,0.66,0.65,0.56,0.54,0.60,
0.80,0.79,0.78,0.81,0.83,0.80,0.59,1.00,0.74,0.67,0.83,0.68,0.68,0.81,
0.65,0.76,0.71,0.81,0.76,0.79,0.70,0.74,1.00,0.70,0.74,0.63,0.70,0.80,
0.72,0.70,0.75,0.73,0.73,0.72,0.66,0.67,0.70,1.00,0.69,0.73,0.76,0.75,
0.81,0.79,0.81,0.84,0.83,0.82,0.65,0.83,0.74,0.69,1.00,0.66,0.69,0.87,
0.72,0.70,0.81,0.69,0.78,0.70,0.56,0.68,0.63,0.73,0.66,1.00,0.83,0.71,
0.69,0.78,0.81,0.72,0.80,0.