# Analyze RCI correlations between cell lines

In [1]:
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2023-01-08 12:35:10.337949
Python 3.10.0
sklearn 1.1.2


In [2]:
# Pick the data directory: Google Drive when running on Colab, local disk otherwise.
# Only the import is guarded, so a genuine failure while mounting the drive
# is reported instead of silently falling back to a local path.
try:
    from google.colab import drive
except ImportError:
    # google.colab could not be imported, so treat this as a local run.
    IN_COLAB = False
    # Swap the two lines below when running on the Windows machine.
    # DATA_DIR = 'D:/Adjeroh/Localization/TrainTest/'   # Windows
    DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/'    # Mac
else:
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/Localization/TrainTest/'  # must end in "/"
print(DATA_DIR)

/Users/jasonmiller/WVU/Localization/TrainTest/


In [3]:
# File names of the RCI tables; each is appended to DATA_DIR before loading.
PC_RCI_FILE = 'CNRCI_coding_train_RCI.gc42.csv'      # reported as "Coding Genes"
NC_RCI_FILE = 'CNRCI_noncoding_train_RCI.gc42.csv'   # reported as "Noncoding Genes"

In [8]:
def load_RCI_data(filepath):
    """Load a comma-separated RCI table into one dict per column.

    The first non-blank line is the header. Every header field, including
    column 0, contributes one entry to the returned name list and one
    (initially empty) dict to the returned map list, so both lists are
    indexed by column number. Column 0 holds the gene id, so its dict
    always stays empty.

    For each following row, every field after column 0 is parsed as a float
    and stored as ``maps[column][gene_id]``. Fields that are empty or spell
    "nan" in any letter case are treated as missing and skipped.

    Parameters:
        filepath: path of the CSV file to read.

    Returns:
        (cell_line_names, cell_line_maps): two parallel lists, one entry per
        header column.

    Raises:
        ValueError: if a non-missing field cannot be parsed as a float.
        IndexError: if a row holds a value in a column the header lacks.
    """
    cell_line_names = []
    cell_line_maps = []
    with open(filepath, 'r') as handle:
        header_seen = False
        for row in handle:
            line = row.strip()
            if not line:
                # Blank lines carry no data and must not be taken for the header.
                continue
            fields = line.split(',')
            if not header_seen:
                header_seen = True
                cell_line_names.extend(fields)
                cell_line_maps.extend({} for _ in fields)
                continue
            gene_id = fields[0]
            for i in range(1, len(fields)):  # column 0 is gene_id
                rci_val = fields[i].strip()
                if rci_val == '' or rci_val.lower() == 'nan':
                    continue  # missing measurement for this cell line
                cell_line_maps[i][gene_id] = float(rci_val)
    return cell_line_names, cell_line_maps

In [31]:
def all_vs_all_intersection(maps):
    """Print a square table of shared-key counts for every pair of maps.

    One output line is printed per map, with one comma-terminated number per
    map on that line. Row 0 and column 0 act as a margin: they show the key
    count of the other map in the pair rather than an intersection size.
    Every other cell is the number of keys common to the two maps.
    """
    key_sets = [set(one_map.keys()) for one_map in maps]
    for row_idx, row_keys in enumerate(key_sets):
        for col_idx, col_keys in enumerate(key_sets):
            if row_idx == 0:
                count = len(col_keys)
            elif col_idx == 0:
                count = len(row_keys)
            else:
                count = len(row_keys & col_keys)
            print(count, end=',')
        print()

In [32]:
def get_union(maps):
    """Return the set of keys that appear in at least one of the maps.

    An empty list of maps yields an empty set.
    """
    key_views = [one_map.keys() for one_map in maps]
    return set().union(*key_views)
def get_intersection(maps):
    """Return the keys common to every map except the one at index 0.

    The map at index 0 is ignored. The map at index 1 seeds the result and
    each later map narrows it. With fewer than two maps the result is an
    empty set.
    """
    shared = set()
    for position, gene_map in enumerate(maps):
        if position == 0:
            continue  # index 0 never takes part in the intersection
        gene_ids = set(gene_map.keys())
        shared = gene_ids if position == 1 else shared & gene_ids
    return shared
def show_rci_counts_table(names,maps):
    """Print a per-column count table followed by set summaries.

    For each name, prints the name and the number of entries in the map at
    the same index. Then prints the size of the union of all map keys, the
    size of the intersection as computed by get_intersection, and the
    all-vs-all intersection table.

    Everything is derived from the ``maps`` argument. No notebook-level
    variable is read, so the function reports on whatever the caller passes.

    Parameters:
        names: column names, parallel to ``maps``.
        maps: list of dicts, one per name.
    """
    for i, name in enumerate(names):
        print('%10s %5d' % (name, len(maps[i])))
    union = get_union(maps)
    print('The union of RCI values contains',len(union),'genes.')
    section = get_intersection(maps)
    print('The intersection of RCI values contains',len(section),'genes.')
    print('All vs all intersection')
    all_vs_all_intersection(maps)

In [33]:
# Coding genes: load the RCI table from DATA_DIR, then print the per-column
# counts, the union/intersection sizes, and the all-vs-all table.
filepath = DATA_DIR + PC_RCI_FILE
cl_names,cl_maps = load_RCI_data(filepath)
print('Coding Genes: number of RCI values per cell line')
show_rci_counts_table(cl_names,cl_maps)

Coding Genes: number of RCI values per cell line
   gene_id     0
      A549 10338
   H1.hESC 13000
   HeLa.S3  6983
     HepG2  9424
    HT1080  9575
     HUVEC 10257
     MCF.7 11060
  NCI.H460  8441
      NHEK  8738
  SK.MEL.5  8407
   SK.N.DZ  8510
   SK.N.SH  9689
   GM12878  9931
      K562  6676
    IMR.90  4001
The union of RCI values contains 13978 genes.
The intersection of RCI values contains 1425 genes.
All vs all intersection
0,10338,13000,6983,9424,9575,10257,11060,8441,8738,8407,8510,9689,9931,6676,4001,
10338,10338,10111,6397,8366,8617,9048,9494,7775,7615,7652,7582,8466,8524,6116,3718,
13000,10111,13000,6852,9210,9387,10047,10779,8339,8489,8298,8441,9457,9601,6568,3951,
6983,6397,6852,6983,6116,6241,6447,6642,5707,5782,5627,5510,6060,6174,4794,2729,
9424,8366,9210,6116,9424,8004,8426,8737,7265,7242,7224,7161,7858,8059,5948,3411,
9575,8617,9387,6241,8004,9575,8850,8942,7689,7467,7688,7549,8198,8305,5967,3488,
10257,9048,10047,6447,8426,8850,10257,9457,7892,7795,7814,7778

In [34]:
# Noncoding genes: same report as for the coding file.
# NOTE: this rebinds filepath, cl_names and cl_maps, so the values loaded
# from the coding-gene file are no longer available after this cell runs.
filepath = DATA_DIR + NC_RCI_FILE
cl_names,cl_maps = load_RCI_data(filepath)
print('Noncoding Genes: number of RCI values per cell line')
show_rci_counts_table(cl_names,cl_maps)

Noncoding Genes: number of RCI values per cell line
   gene_id     0
      A549  1447
   H1.hESC  3327
   HeLa.S3   919
     HepG2  1362
    HT1080   936
     HUVEC  1488
     MCF.7  2156
  NCI.H460   610
      NHEK  1080
  SK.MEL.5   534
   SK.N.DZ   603
   SK.N.SH  1647
   GM12878  1698
      K562   958
    IMR.90   390
The union of RCI values contains 4662 genes.
The intersection of RCI values contains 19 genes.
All vs all intersection
0,1447,3327,919,1362,936,1488,2156,610,1080,534,603,1647,1698,958,390,
1447,1447,1240,504,750,590,815,1057,433,624,366,377,883,827,524,287,
3327,1240,3327,770,1158,806,1249,1749,572,914,477,557,1382,1375,827,347,
919,504,770,919,487,356,500,664,256,393,228,248,531,534,353,146,
1362,750,1158,487,1362,560,771,964,426,613,367,398,826,835,521,250,
936,590,806,356,560,936,634,721,395,497,338,355,661,641,397,218,
1488,815,1249,500,771,634,1488,1030,441,676,391,428,928,892,558,275,
2156,1057,1749,664,964,721,1030,2156,510,801,426,484,1120,1123,673,319,
610,4