# Analyze RCI correlations between cell lines
Do the same thing as notebook 101, but use numpy or pandas instead of loops.

In [1]:
# Imports for the notebook. Alongside them, print the run time and the
# Python / scikit-learn versions so the saved output records the environment.
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import scipy.stats as ss
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2023-01-10 11:24:21.566732
Python 3.10.0
sklearn 1.1.2


In [2]:
# Choose the data directory: Google Drive when running on Colab, otherwise a
# local path. Only a failed import means "not on Colab"; any other error
# (for example a failed drive.mount) is allowed to surface instead of
# silently falling back to the local path.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    # Local run. For Windows use 'D:/Adjeroh/Localization/TrainTest/' instead.
    DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/'    # Mac
else:
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/Localization/TrainTest/'  # must end in "/"
print(DATA_DIR)

/Users/jasonmiller/WVU/Localization/TrainTest/


In [3]:
# CSV file names, relative to DATA_DIR.
# Going by the file names, these are the training RCI tables for coding and
# noncoding genes respectively.
PC_RCI_FILE = 'CNRCI_coding_train_RCI.gc42.csv'
NC_RCI_FILE = 'CNRCI_noncoding_train_RCI.gc42.csv'

In [4]:
def load_RCI_data(filepath):
    """Read the CSV table at `filepath` into a DataFrame.

    Parsing is left entirely to the defaults of `pandas.read_csv`.

    Args:
        filepath: Location of the CSV file (anything `pd.read_csv` accepts).

    Returns:
        pandas.DataFrame with the contents of the file.
    """
    return pd.read_csv(filepath)

In [5]:
# Load the coding-gene RCI table into `df`, the frame the later cells read.
# pandas' CSV reader converts the text "nan" to NaN (a Python float nan), so
# missing values arrive as real NaNs rather than strings.
filepath = DATA_DIR + PC_RCI_FILE
df = load_RCI_data(filepath)
df

Unnamed: 0,gene_id,A549,H1.hESC,HeLa.S3,HepG2,HT1080,HUVEC,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,GM12878,K562,IMR.90
0,ENSG00000000003,1.080680,1.857340,1.868390,2.29436,0.866395,1.284940,1.726960,-0.266510,0.460806,1.837530,-1.119010,1.834430,,,
1,ENSG00000000005,,5.882640,,,,,,,,,,,,,
2,ENSG00000000419,1.326790,2.589540,1.275560,1.43865,0.771867,1.275460,1.155230,-0.496772,0.266253,0.364575,0.958075,2.194600,0.439030,,
3,ENSG00000000457,0.434284,-0.377326,-0.267569,-0.47502,-1.216230,-0.853779,-1.343170,-0.250651,-0.778011,,-0.811809,-0.780640,-0.561655,-0.974177,
4,ENSG00000000460,-0.154524,0.163728,-1.649930,-1.59481,-0.973326,-1.350700,-1.506060,-0.634401,0.016657,-2.349650,-0.492205,-0.610424,-1.597690,-0.615989,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13973,ENSG00000282419,,0.000000,,,,,,,,,,,1.215510,,
13974,ENSG00000282815,,0.736966,,,,,,,,,,0.415038,,,
13975,ENSG00000282881,,-2.321930,,,,,,,,,,,,,
13976,ENSG00000282988,,0.126710,,,,1.041000,0.418061,,-0.680577,,,-1.123170,,,


In [13]:
# Correlate the first cell line with every cell line, itself included.
# Series.corr skips any gene where either value is NaN, so each pair is
# scored on the genes the two cell lines have in common.
# The values match the ones computed longhand in notebook 101.
names = df.columns
colj = names[1]
for i, coli in enumerate(names[1:], start=1):
    r = df[colj].corr(df[coli])
    print('%10s %10s %.3f' % (colj,coli,r))

      A549       A549 1.000
      A549    H1.hESC 0.730
      A549    HeLa.S3 0.821
      A549      HepG2 0.781
      A549     HT1080 0.738
      A549      HUVEC 0.718
      A549      MCF.7 0.757
      A549   NCI.H460 0.553
      A549       NHEK 0.792
      A549   SK.MEL.5 0.558
      A549    SK.N.DZ 0.661
      A549    SK.N.SH 0.861
      A549    GM12878 0.702
      A549       K562 0.685
      A549     IMR.90 0.779


In [25]:
def average_correlation_per_cell_line(exclusions=(), data=None):
    """Average each cell line's Pearson correlation with all other cell lines.

    Column 0 of the table is skipped (it is `gene_id` in the RCI tables);
    every other column is treated as one cell line. The full correlation
    matrix is computed once with `DataFrame.corr`, which scores each pair on
    the rows where both columns are non-NaN. Each row of that matrix is then
    averaged with the self-correlation on the diagonal left out.

    Args:
        exclusions: Iterable of integer column positions to leave out, both
            as entries of the result and as partners in the averages.
            Empty (the default) or None keeps every cell line.
        data: DataFrame to analyze. Defaults to the notebook-level `df`.

    Returns:
        List of (average correlation, column name, column position) tuples,
        in column order. The average is NaN when any contributing
        correlation is NaN, or when a cell line has no partner left to
        compare against (a single remaining cell line). The list is empty
        when every cell line is excluded.
    """
    if data is None:
        data = df  # notebook-level table loaded in an earlier cell
    excluded = set(exclusions) if exclusions is not None else set()
    names = data.columns
    kept = [i for i in range(1, len(names)) if i not in excluded]
    if not kept:
        return []
    partners = len(kept) - 1
    if partners == 0:
        # Nothing to correlate against; report NaN rather than dividing by zero.
        return [(float('nan'), names[kept[0]], kept[0])]
    # One symmetric matrix replaces the nested per-pair loops.
    corr = data.iloc[:, kept].corr(method='pearson', min_periods=1).to_numpy()
    # Zero out only the diagonal so self-correlation is not counted, while a
    # NaN in any off-diagonal entry still propagates into that row's sum.
    off_diagonal = ~np.eye(len(kept), dtype=bool)
    row_totals = np.where(off_diagonal, corr, 0.0).sum(axis=1)
    row_averages = row_totals / partners
    # Plain Python floats keep the printed tuples independent of numpy's repr.
    return [(float(avg), names[i], i) for avg, i in zip(row_averages, kept)]

In [26]:
# Mean correlation of each cell line with all the others, listed from
# lowest to highest.
# Cell lines 2 and 8 stand out as outliers with the lowest averages.
averages = average_correlation_per_cell_line()
print('Avgerage correlation, Cell line name, Cell line number')
print('\n'.join(str(entry) for entry in sorted(averages)))

Avgerage correlation, Cell line name, Cell line number
(0.5574057999232218, 'NCI.H460', 8)
(0.5847601779079602, 'H1.hESC', 2)
(0.6936143894737279, 'SK.MEL.5', 10)
(0.6994803754254235, 'SK.N.DZ', 11)
(0.7049238716146469, 'NHEK', 9)
(0.7180740086429938, 'HT1080', 5)
(0.7241143318749105, 'A549', 1)
(0.7337605907996098, 'GM12878', 13)
(0.760507867719563, 'SK.N.SH', 12)
(0.7647253395095782, 'K562', 14)
(0.7669433676609013, 'MCF.7', 7)
(0.7692875306602314, 'HUVEC', 6)
(0.7743148199441252, 'IMR.90', 15)
(0.7747476145367019, 'HepG2', 4)
(0.7797903431841074, 'HeLa.S3', 3)


In [27]:
# Same report with the two outlier cell lines (columns 2 and 8) removed,
# listed from lowest to highest.
averages = average_correlation_per_cell_line([2,8])
print('Avgerage correlation, Cell line name, Cell line number')
print('\n'.join(str(entry) for entry in sorted(averages)))

Avgerage correlation, Cell line name, Cell line number
(0.7194501406520993, 'SK.N.DZ', 11)
(0.725011391136344, 'SK.MEL.5', 10)
(0.7306946409754279, 'NHEK', 9)
(0.7378350617352898, 'A549', 1)
(0.7466188378417412, 'HT1080', 5)
(0.7624685278905451, 'GM12878', 13)
(0.7789591079134003, 'SK.N.SH', 12)
(0.7925193511012832, 'K562', 14)
(0.7963590938806474, 'MCF.7', 7)
(0.8012702766410182, 'HepG2', 4)
(0.8018054766852066, 'HeLa.S3', 3)
(0.8038558303646638, 'HUVEC', 6)
(0.8081996574316969, 'IMR.90', 15)
