# Analyze RCI correlations between cell lines
Here we compute the correlations after removing the two outlier cell lines in GenCode 43 data.

In [1]:
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import scipy.stats as ss
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2023-03-04 09:43:50.456512
Python 3.10.0
sklearn 1.1.2


In [2]:
try:
    from google.colab import drive
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/Localization/TrainTest/'  # must end in "/"
except:
    IN_COLAB = False
    DATA_DIR = 'D:/Adjeroh/Localization/TrainTest/'   # Windows
    DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/'    # Mac
print(DATA_DIR)

/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/


In [3]:
PC_RCI_FILE =      'train.pc_RCI.csv'
NC_RCI_FILE =      'train.lncRNA_RCI.csv'

## CNRCI values per gene

In [27]:
def values_per_gene(filepath):
    all_counts=[]
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            if header is None:
                header = row 
            else:
                fields = row.strip().split(',')
                count_nan=0
                for i in range(1,len(fields)):
                    if fields[i]=='nan':
                        count_nan += 1
                count_rci = len(fields) -1 -count_nan
                all_counts.append(count_rci)
    all_counts = np.asarray(all_counts)
    print(len(all_counts), 'genes examined')
    print(np.mean(all_counts), 'average number of CNRCI for one gene')
    print(min(all_counts), max(all_counts), 'min and max CNRCI for one gene')
    print(np.count_nonzero(all_counts==1), 'genes have only one CNRCI')
    print(np.count_nonzero(all_counts==2), 'genes have only two CNRCI')
    print(np.count_nonzero(all_counts==3), 'genes have only three CNRCI')
    print(np.count_nonzero(all_counts>1), 'genes have more than one CNRCI')
    print(np.count_nonzero(all_counts>2), 'genes have more than two CNRCI')
    print(np.count_nonzero(all_counts==15), 'genes have 15 CNRCI -- one for each cell line')

In [28]:
print('Protein-coding CNRCI values')
filepath = DATA_DIR + PC_RCI_FILE
values_per_gene(filepath)

Protein-coding CNRCI values
14135 genes examined
9.651220374955784 average number of CNRCI for one gene
1 15 min and max CNRCI for one gene
887 genes have only one CNRCI
791 genes have only two CNRCI
677 genes have only three CNRCI
13248 genes have more than one CNRCI
12457 genes have more than two CNRCI
1463 genes have 15 CNRCI -- one for each cell line


In [29]:
print('Non-coding CNRCI values')
filepath = DATA_DIR + NC_RCI_FILE
values_per_gene(filepath)

Non-coding CNRCI values
5139 genes examined
4.205292858532789 average number of CNRCI for one gene
1 15 min and max CNRCI for one gene
1597 genes have only one CNRCI
768 genes have only two CNRCI
524 genes have only three CNRCI
3542 genes have more than one CNRCI
2774 genes have more than two CNRCI
19 genes have 15 CNRCI -- one for each cell line


## Correlations

In [4]:
def load_RCI_data(filepath,exclude_lines=[]):
    cell_line_names = []  # list of 15 names like A549
    cell_line_maps  = []  # 15 maps for 15 cell lines, each maps one gene name to one RCI value
    with open (filepath,'r') as handle:
        header = None
        for row in handle:
            line = row.strip()
            fields = line.split(',')
            if header is None:
                header = row 
                # exclude column 0 = gene name
                for col in range(1,len(fields)): 
                    cell_line_name = fields[col]
                    cell_line_names.append(cell_line_name)
                    # start a new empty map for each column
                    cell_line_genes = {}
                    cell_line_maps.append(cell_line_genes)
            else:
                gene_id = fields[0]
                for col in range(1,len(fields)): 
                    rci_val = fields[col]
                    if rci_val != "nan":
                        real_value = float(rci_val)
                        position = col-1 # 1st cell line in position 0
                        cell_line_name = cell_line_names[position] 
                        cell_line_map  = cell_line_maps[position]
                        cell_line_map[gene_id] = real_value 
    for i in exclude_lines:
        print('Exclude cell line', i, cell_line_names[i])
        del cell_line_names[i]
        del cell_line_maps[i]
    if len(exclude_lines)==0:
        print('No cell lines excluded.')
    print('Remaining cell lines:',cell_line_names)
    return cell_line_names, cell_line_maps

In [5]:
def all_vs_all_intersection(names,maps):
    NUM = len(maps)
    SHOW = 0
    for i in range(NUM):
        print(names[i],end=',')
    print()
    for i in range(NUM):
        i_genes = set(maps[i].keys())
        for j in range(NUM):
            j_genes = set(maps[j].keys())
            section = i_genes.intersection(j_genes)
            SHOW = len(section)
            if i==0:
                SHOW = len(j_genes)
            elif j==0:
                SHOW = len(i_genes)
            print(SHOW,end=',')
        print()

In [6]:
def all_vs_all_correlation(names,maps):
    NUM = len(maps)
    # heading
    for i in range(NUM):
        print(names[i],end=',')
    print()
    # table
    for i in range(NUM):
        i_genes = set(maps[i].keys())
        for j in range(NUM):
            j_genes = set(maps[j].keys())
            common_genes = i_genes.intersection(j_genes)
            i_values = np.array([maps[i][k] for k in common_genes])
            j_values = np.array([maps[j][k] for k in common_genes])
            r,p = ss.pearsonr(i_values,j_values)
            print('%.2f' % r, end=',')
        print()

In [7]:
def get_union(maps):
    union = set()
    for i in range(len(maps)):
        one_map = maps[i]
        genes = one_map.keys()
        union = union.union(genes)
    return union
def get_intersection(maps):
    section = set()
    for i in range(len(maps)):
        one_map = maps[i]
        genes = set(one_map.keys())
        if i==0:
            pass
        elif i==1:
            section = genes
        else:
            section = section.intersection(genes)
    return section
def show_rci_counts_table(names,maps):
    section = get_intersection(maps)
    print('The intersection of RCI values contains',len(section),'genes.')
    print('All vs all intersection')
    all_vs_all_intersection(names,maps)

## Coding

In [8]:
filepath = DATA_DIR + PC_RCI_FILE
cl_names,cl_maps = load_RCI_data(filepath)
print('Coding Genes: number of RCI values per cell line')
for i in range(len(cl_names)):
    print('%2d %10s %5d' % (i, cl_names[i], len(cl_maps[i].keys())))
union = get_union(cl_maps)
print('The union of RCI values contains',len(union),'genes.')

No cell lines excluded.
Remaining cell lines: ['A549', 'H1.hESC', 'HeLa.S3', 'HepG2', 'HT1080', 'HUVEC', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH', 'GM12878', 'K562', 'IMR.90']
Coding Genes: number of RCI values per cell line
 0       A549 10474
 1    H1.hESC 13136
 2    HeLa.S3  7055
 3      HepG2  9549
 4     HT1080  9667
 5      HUVEC 10393
 6      MCF.7 11147
 7   NCI.H460  8541
 8       NHEK  8859
 9   SK.MEL.5  8472
10    SK.N.DZ  8540
11    SK.N.SH  9764
12    GM12878 10038
13       K562  6727
14     IMR.90  4058
The union of RCI values contains 14135 genes.


In [9]:
show_rci_counts_table(cl_names,cl_maps)

The intersection of RCI values contains 1473 genes.
All vs all intersection
A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
10474,13136,7055,9549,9667,10393,11147,8541,8859,8472,8540,9764,10038,6727,4058,
13136,13136,6933,9339,9486,10172,10863,8437,8605,8367,8481,9539,9696,6624,4008,
7055,6933,7055,6202,6313,6522,6711,5752,5847,5708,5560,6104,6227,4841,2753,
9549,9339,6202,9549,8112,8524,8841,7360,7357,7324,7212,7937,8170,6010,3447,
9667,9486,6313,8112,9667,8933,9042,7751,7564,7764,7581,8255,8367,5992,3530,
10393,10172,6522,8524,8933,10393,9568,7968,7897,7898,7825,8685,8781,6241,3698,
11147,10863,6711,8841,9042,9568,11147,8163,8129,8068,8008,8857,9055,6360,3749,
8541,8437,5752,7360,7751,7968,8163,8541,6760,7081,6929,7333,7490,5459,3120,
8859,8605,5847,7357,7564,7897,8129,6760,8859,6693,6526,7442,7467,5570,3378,
8472,8367,5708,7324,7764,7898,8068,7081,6693,8472,6928,7290,7494,5472,3103,
8540,8481,5560,7212,7581,7825,8008,6929,652

In [10]:
print('All vs all correlation')
all_vs_all_correlation(cl_names,cl_maps)

All vs all correlation
A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
1.00,0.73,0.82,0.78,0.74,0.72,0.75,0.55,0.79,0.56,0.66,0.86,0.71,0.69,0.78,
0.73,1.00,0.71,0.68,0.51,0.55,0.58,0.38,0.58,0.38,0.55,0.71,0.58,0.65,0.63,
0.82,0.71,1.00,0.87,0.76,0.80,0.81,0.58,0.79,0.71,0.74,0.84,0.83,0.85,0.80,
0.78,0.68,0.87,1.00,0.73,0.83,0.83,0.55,0.74,0.72,0.77,0.84,0.84,0.86,0.83,
0.74,0.51,0.76,0.73,1.00,0.81,0.76,0.60,0.82,0.74,0.66,0.73,0.71,0.75,0.78,
0.72,0.55,0.80,0.83,0.81,1.00,0.86,0.58,0.78,0.81,0.73,0.78,0.80,0.85,0.91,
0.75,0.58,0.81,0.83,0.76,0.86,1.00,0.60,0.75,0.81,0.74,0.83,0.77,0.81,0.86,
0.55,0.38,0.58,0.55,0.60,0.58,0.60,1.00,0.53,0.63,0.62,0.58,0.55,0.56,0.53,
0.79,0.58,0.79,0.74,0.82,0.78,0.75,0.53,1.00,0.63,0.60,0.76,0.69,0.72,0.74,
0.56,0.38,0.71,0.72,0.74,0.81,0.81,0.63,0.63,1.00,0.74,0.69,0.70,0.77,0.83,
0.66,0.55,0.74,0.77,0.66,0.73,0.74,0.62,0.60,0.74,1.00,0.75,0.76,0.79,0.74,
0.86,0.71,0.84,0.84,0.73,0.78,0.83,

## Non-coding

In [11]:
filepath = DATA_DIR + NC_RCI_FILE
cl_names,cl_maps = load_RCI_data(filepath)
print('Noncoding Genes: number of RCI values per cell line')
for i in range(len(cl_names)):
    print('%2d %10s %5d' % (i, cl_names[i], len(cl_maps[i].keys())))
union = get_union(cl_maps)
print('The union of RCI values contains',len(union),'genes.')

No cell lines excluded.
Remaining cell lines: ['A549', 'H1.hESC', 'HeLa.S3', 'HepG2', 'HT1080', 'HUVEC', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH', 'GM12878', 'K562', 'IMR.90']
Noncoding Genes: number of RCI values per cell line
 0       A549  1635
 1    H1.hESC  3774
 2    HeLa.S3   999
 3      HepG2  1560
 4     HT1080  1050
 5      HUVEC  1674
 6      MCF.7  2377
 7   NCI.H460   700
 8       NHEK  1214
 9   SK.MEL.5   618
10    SK.N.DZ   683
11    SK.N.SH  1865
12    GM12878  1931
13       K562  1090
14     IMR.90   441
The union of RCI values contains 5139 genes.


In [12]:
show_rci_counts_table(cl_names,cl_maps)

The intersection of RCI values contains 22 genes.
All vs all intersection
A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
1635,3774,999,1560,1050,1674,2377,700,1214,618,683,1865,1931,1090,441,
3774,3774,867,1347,910,1443,1966,660,1054,556,631,1602,1605,962,402,
999,867,999,547,397,556,733,299,455,265,266,593,598,390,163,
1560,1347,547,1560,640,897,1101,480,712,408,448,945,954,593,287,
1050,910,397,640,1050,722,811,437,575,389,389,743,725,453,247,
1674,1443,556,897,722,1674,1182,507,779,439,475,1057,1029,635,315,
2377,1966,733,1101,811,1182,2377,573,902,503,541,1282,1277,763,357,
700,660,299,480,437,507,573,700,412,299,314,497,524,328,162,
1214,1054,455,712,575,779,902,412,1214,375,364,804,764,476,265,
618,556,265,408,389,439,503,299,375,618,273,458,450,292,156,
683,631,266,448,389,475,541,314,364,273,683,494,489,309,166,
1865,1602,593,945,743,1057,1282,497,804,458,494,1865,1044,665,345,
1931,1605,598,954,725,1029,1277,524,764,45

In [13]:
print('All vs all correlation')
all_vs_all_correlation(cl_names,cl_maps)

All vs all correlation
A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90,
1.00,0.56,0.71,0.80,0.78,0.81,0.80,0.59,0.80,0.65,0.72,0.81,0.72,0.69,0.83,
0.56,1.00,0.56,0.54,0.39,0.49,0.39,0.33,0.41,0.25,0.48,0.53,0.47,0.58,0.58,
0.71,0.56,1.00,0.79,0.77,0.74,0.73,0.66,0.79,0.76,0.70,0.79,0.70,0.78,0.72,
0.80,0.54,0.79,1.00,0.82,0.87,0.82,0.63,0.78,0.71,0.75,0.81,0.81,0.81,0.83,
0.78,0.39,0.77,0.82,1.00,0.84,0.84,0.68,0.81,0.81,0.73,0.84,0.69,0.72,0.85,
0.81,0.49,0.74,0.87,0.84,1.00,0.86,0.66,0.83,0.76,0.73,0.83,0.78,0.80,0.89,
0.80,0.39,0.73,0.82,0.84,0.86,1.00,0.63,0.80,0.79,0.72,0.82,0.70,0.75,0.87,
0.59,0.33,0.66,0.63,0.68,0.66,0.63,1.00,0.59,0.70,0.66,0.65,0.56,0.54,0.60,
0.80,0.41,0.79,0.78,0.81,0.83,0.80,0.59,1.00,0.74,0.67,0.83,0.68,0.68,0.81,
0.65,0.25,0.76,0.71,0.81,0.76,0.79,0.70,0.74,1.00,0.70,0.74,0.63,0.70,0.80,
0.72,0.48,0.70,0.75,0.73,0.73,0.72,0.66,0.67,0.70,1.00,0.69,0.73,0.76,0.75,
0.81,0.53,0.79,0.81,0.84,0.83,0.82,