# Analyze RCI correlations between cell lines
Do the same thing as notebook 101, but use numpy or pandas instead of loops.

In [1]:
# Print a run timestamp and the Python version.
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
# Third-party libraries imported for this notebook.
import numpy as np
import pandas as pd
import scipy.stats as ss
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
# Print the scikit-learn version alongside the Python version above.
print('sklearn',sklearn.__version__)

2023-01-10 14:48:38.131895
Python 3.10.0
sklearn 1.1.2


In [2]:
# Detect Google Colab by attempting its import. Only a failed import means
# "not on Colab"; an error raised while mounting Drive now propagates instead
# of silently falling back to a local path that cannot exist on Colab.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    # Local fallbacks: the later assignment wins, so the Mac path is the active one.
    DATA_DIR = 'D:/Adjeroh/Localization/TrainTest/'   # Windows
    DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/'    # Mac
else:
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/Localization/TrainTest/'  # must end in "/"
print(DATA_DIR)

/Users/jasonmiller/WVU/Localization/TrainTest/


In [3]:
# RCI tables for coding (PC) and non-coding (NC) genes.
# These are bare file names; they are appended to DATA_DIR when loaded.
PC_RCI_FILE = 'CNRCI_coding_train_RCI.gc42.csv'
NC_RCI_FILE = 'CNRCI_noncoding_train_RCI.gc42.csv'

In [4]:
def load_RCI_data(filepath):
    """Read the CSV at `filepath` into a pandas DataFrame and return it.

    The file is parsed with the default `pd.read_csv` settings.
    """
    return pd.read_csv(filepath)

In [5]:
def average_correlation_per_cell_line(df,exclusions=None):
    """Average each column's correlation with every other non-excluded column.

    Column 0 of `df` is skipped; every remaining column is treated as one
    cell line. Correlations come from `Series.corr` with its default settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns at positions 1..N-1 hold the values to correlate.
    exclusions : iterable of int, optional
        Column positions to leave out, both as the reported line and as a
        partner in the average. Defaults to no exclusions.

    Returns
    -------
    list of (float, column name, int)
        One `(average, name, position)` tuple per included column, in column
        order. The average is NaN when the column has no partner left
        (previously this case raised ZeroDivisionError).
    """
    # `None` replaces the former mutable default `[]`; a set gives O(1) lookups.
    excluded = set(exclusions) if exclusions is not None else set()
    names = df.columns
    included = [i for i in range(1,len(names)) if i not in excluded]
    averages = []
    for i in included:
        coli = names[i]
        total = 0
        count = 0
        # Partners are visited in ascending column order, so the floating-point
        # sum is accumulated exactly as before.
        for j in included:
            if i != j:
                r = df[names[j]].corr(df[coli])
                total += r
                count += 1
        # With no partner column there is nothing to average.
        average = total / count if count else float('nan')
        averages.append((average,coli,i))
    return averages

In [6]:
# The most correlated cell lines are 6=HUVEC and 15=IMR.90.
# What is the range of differences in RCI values?
def compare_lines(df,LINE1,LINE2,cutoff=0):
    """Histogram the rounded absolute difference between two columns of `df`.

    For every row where both `df[LINE1]` and `df[LINE2]` are present, the
    absolute difference is rounded to an integer via `int(0.5 + |a - b|)`.
    Rows whose rounded difference is greater than `cutoff` are printed as
    'Extreme' together with the row's `gene_id` and both values.

    Returns a dict mapping each rounded difference to the number of rows
    that produced it. Rows with a missing value in either column are skipped.
    """
    histogram = {}
    for row in range(len(df)):
        first = df[LINE1].iloc[row]
        second = df[LINE2].iloc[row]
        # Skip rows that lack a value in either column.
        if pd.isna(first) or pd.isna(second):
            continue
        rounded = int(0.5+abs(first-second))
        if rounded > cutoff:
            print('Extreme',df['gene_id'].iloc[row],first,second)
        histogram[rounded] = histogram.get(rounded, 0) + 1
    return histogram

# Coding

In [7]:
# Load the coding-gene RCI table from DATA_DIR.
# This correctly converts "nan" to NaN = Python float(nan).
filepath = DATA_DIR + PC_RCI_FILE
df = load_RCI_data(filepath)
# A bare expression on the last line makes the notebook display the table.
df

Unnamed: 0,gene_id,A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90
0,ENSG00000000003,1.080680,1.857340,1.868390,2.29436,0.866395,1.284940,1.726960,-0.266510,0.460806,1.837530,-1.119010,1.834430,,,
1,ENSG00000000005,,5.882640,,,,,,,,,,,,,
2,ENSG00000000419,1.326790,2.589540,1.275560,1.43865,0.771867,1.275460,1.155230,-0.496772,0.266253,0.364575,0.958075,2.194600,0.439030,,
3,ENSG00000000457,0.434284,-0.377326,-0.267569,-0.47502,-1.216230,-0.853779,-1.343170,-0.250651,-0.778011,,-0.811809,-0.780640,-0.561655,-0.974177,
4,ENSG00000000460,-0.154524,0.163728,-1.649930,-1.59481,-0.973326,-1.350700,-1.506060,-0.634401,0.016657,-2.349650,-0.492205,-0.610424,-1.597690,-0.615989,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13973,ENSG00000282419,,0.000000,,,,,,,,,,,1.215510,,
13974,ENSG00000282815,,0.736966,,,,,,,,,,0.415038,,,
13975,ENSG00000282881,,-2.321930,,,,,,,,,,,,,
13976,ENSG00000282988,,0.126710,,,,1.041000,0.418061,,-0.680577,,,-1.123170,,,


In [8]:
# Average correlation for each cell line.
# Note that two cell lines are outliers: 2 and 8.
averages = average_correlation_per_cell_line(df)
print('Average correlation, Cell line name, Cell line number')
# Tuples sort on their first field, so rows print from lowest to highest average.
print(*sorted(averages),sep='\n')

Avgerage correlation, Cell line name, Cell line number
(0.5574057999232218, 'NCI.H460', 8)
(0.5847601779079602, 'H1.hESC', 2)
(0.6936143894737279, 'SK.MEL.5', 10)
(0.6994803754254235, 'SK.N.DZ', 11)
(0.7049238716146469, 'NHEK', 9)
(0.7180740086429938, 'HT1080', 5)
(0.7241143318749105, 'A549', 1)
(0.7337605907996098, 'GM12878', 13)
(0.760507867719563, 'SK.N.SH', 12)
(0.7647253395095782, 'K562', 14)
(0.7669433676609013, 'MCF.7', 7)
(0.7692875306602314, 'HUVEC', 6)
(0.7743148199441252, 'IMR.90', 15)
(0.7747476145367019, 'HepG2', 4)
(0.7797903431841074, 'HeLa.S3', 3)


In [9]:
# Average correlation for each cell line, excluding the outliers.
# The exclusions are column positions: 2 = H1.hESC and 8 = NCI.H460.
averages = average_correlation_per_cell_line(df,[2,8])
print('Average correlation, Cell line name, Cell line number')
# Tuples sort on their first field, so rows print from lowest to highest average.
print(*sorted(averages),sep='\n')

Avgerage correlation, Cell line name, Cell line number
(0.7194501406520993, 'SK.N.DZ', 11)
(0.725011391136344, 'SK.MEL.5', 10)
(0.7306946409754279, 'NHEK', 9)
(0.7378350617352898, 'A549', 1)
(0.7466188378417412, 'HT1080', 5)
(0.7624685278905451, 'GM12878', 13)
(0.7789591079134003, 'SK.N.SH', 12)
(0.7925193511012832, 'K562', 14)
(0.7963590938806474, 'MCF.7', 7)
(0.8012702766410182, 'HepG2', 4)
(0.8018054766852066, 'HeLa.S3', 3)
(0.8038558303646638, 'HUVEC', 6)
(0.8081996574316969, 'IMR.90', 15)


In [10]:
# Compare HUVEC against IMR.90. compare_lines prints an 'Extreme' line for each
# gene whose rounded absolute difference exceeds the cutoff, so 3.5 reports
# rounded differences of 4 or more.
differences = compare_lines(df,'HUVEC','IMR.90',3.5)
# Blank line between the 'Extreme' lines and the histogram.
print()
# Report the histogram in ascending order of rounded difference.
for key in sorted(differences.keys()):
    print('rounded difference=',key,'num genes=',differences[key])

Extreme ENSG00000108511 -2.53864 1.86602
Extreme ENSG00000120075 -3.02956 1.5656
Extreme ENSG00000149564 0.703362 -3.66676
Extreme ENSG00000167874 2.18233 -4.08746
Extreme ENSG00000187553 -5.82273 -0.160465

rounded difference= 0 num genes= 2299
rounded difference= 1 num genes= 1257
rounded difference= 2 num genes= 88
rounded difference= 3 num genes= 14
rounded difference= 4 num genes= 2
rounded difference= 5 num genes= 1
rounded difference= 6 num genes= 2


The extreme genes above are 
HOXB6 (Homeobox B6), HOXB5 (Homeobox B5), ESAM (Endothelial cell adhesion molecule), TMEM88 (Transmembrane protein 88), and CYP26C1 (Cytochrome P450 family 26 subfamily C member 1). 


## Non-coding

In [11]:
# Load the non-coding RCI table from DATA_DIR. This rebinds the global `df`,
# so the cells below operate on non-coding genes.
filepath = DATA_DIR + NC_RCI_FILE
df = load_RCI_data(filepath)
# A bare expression on the last line makes the notebook display the table.
df

Unnamed: 0,gene_id,A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90
0,ENSG00000099869,,1.000000,,0.008462,,,,,,,,,,,
1,ENSG00000105501,,,,,-0.415038,,,,,,,,0.363348,,
2,ENSG00000116652,-1.848000,1.652080,,-4.426260,,,-4.887530,-4.988680,-2.97982,,,,-3.428240,,
3,ENSG00000117242,-0.256730,-0.793877,-0.070389,0.464641,-0.059643,0.787885,-0.892197,0.229026,-1.27700,,-0.147342,0.067574,0.828520,0.316474,0.245683
4,ENSG00000120664,,0.321928,,,,-1.760170,,,,,,,-1.697440,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,ENSG00000283078,-1.000000,-1.584960,-3.738640,-2.300660,,-1.454570,-2.961750,,,,-0.941897,-2.432960,,-2.568280,
4658,ENSG00000283083,,,,,,,,,,,,,-4.010270,,
4659,ENSG00000283095,,-1.716210,,,,,,,,,,,,1.369230,
4660,ENSG00000283103,1.408840,0.747641,1.506770,1.164660,,2.198040,1.929300,,1.71022,,0.898216,1.660580,1.968570,2.147580,


In [12]:
# Average correlation for each cell line.
# Note that the same two cell lines are outliers: 2 and 8.
averages = average_correlation_per_cell_line(df)
print('Average correlation, Cell line name, Cell line number')
# Tuples sort on their first field, so rows print from lowest to highest average.
print(*sorted(averages),sep='\n')

Avgerage correlation, Cell line name, Cell line number
(0.46176986006274284, 'H1.hESC', 2)
(0.6169986311965373, 'NCI.H460', 8)
(0.697495425259526, 'GM12878', 13)
(0.706744363300593, 'SK.MEL.5', 10)
(0.7144991464331191, 'SK.N.DZ', 11)
(0.7154554251271633, 'K562', 14)
(0.7265597483534967, 'HeLa.S3', 3)
(0.7281427854246523, 'A549', 1)
(0.7322968273927554, 'NHEK', 9)
(0.7485242991297104, 'MCF.7', 7)
(0.7594345699720454, 'SK.N.SH', 12)
(0.7621607101920785, 'HepG2', 4)
(0.76401102942951, 'HT1080', 5)
(0.7729523945220494, 'HUVEC', 6)
(0.7751619697991341, 'IMR.90', 15)


In [13]:
# Average correlation for each cell line, excluding the outliers.
# The exclusions are column positions: 2 = H1.hESC and 8 = NCI.H460.
averages = average_correlation_per_cell_line(df,[2,8])
print('Average correlation, Cell line name, Cell line number')
# Tuples sort on their first field, so rows print from lowest to highest average.
print(*sorted(averages),sep='\n')

Avgerage correlation, Cell line name, Cell line number
(0.7298380520358431, 'GM12878', 13)
(0.7399647560156374, 'SK.N.DZ', 11)
(0.7408934268497993, 'SK.MEL.5', 10)
(0.7411601732076041, 'K562', 14)
(0.7503663044531068, 'HeLa.S3', 3)
(0.7565913346095169, 'A549', 1)
(0.7704735553990657, 'NHEK', 9)
(0.7859823010250144, 'SK.N.SH', 12)
(0.7873721941309327, 'MCF.7', 7)
(0.7945853750530817, 'HepG2', 4)
(0.7997160694926141, 'IMR.90', 15)
(0.7998737148896572, 'HT1080', 5)
(0.8052742354990018, 'HUVEC', 6)


In [14]:
# Compare HUVEC against IMR.90. compare_lines prints an 'Extreme' line for each
# gene whose rounded absolute difference exceeds the cutoff, so 2.5 reports
# rounded differences of 3 or more.
differences = compare_lines(df,'HUVEC','IMR.90',2.5)
# Blank line between the 'Extreme' lines and the histogram.
print()
# Report the histogram in ascending order of rounded difference.
for key in sorted(differences.keys()):
    print('rounded difference=',key,'num genes=',differences[key])

Extreme ENSG00000248932 -1.88662 1.17911
Extreme ENSG00000264772 -3.67032 -6.30844
Extreme ENSG00000271855 -0.369234 -3.32193
Extreme ENSG00000282164 -3.04439 -0.321928

rounded difference= 0 num genes= 116
rounded difference= 1 num genes= 132
rounded difference= 2 num genes= 23
rounded difference= 3 num genes= 4


The extreme genes listed above are
COPB2-DT (COPB2 Divergent Transcript, antisense to RBP2), a novel transcript (no info, possibly small nucleolar RNA), lnc-IAH1-2 (antisense), and PEG13 (paternally expressed gene 13, intronic).
