# Analyze Composition

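Analyze the composition of the train and test sets produced by stratification.   
Not all genes are expressed in every cell line, so each table below counts, per cell line, the genes with a missing (NaN) value.   
We must avoid a test set with zero genes for some cell line.  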

In [1]:
import pandas as pd
from cell_lines import Cell_Lines

# Presumably the fraction of genes held out for the test split; it is not
# referenced by the cells shown in this notebook. TODO confirm.
TEST_PORTION = 0.2

# Input directories and file names. The directory strings end with '/'
# because file paths are built by plain string concatenation (DIR + NAME).
ATLAS_DIR = '/Users/jasonmiller/WVU/Localization/LncAtlas/'
ATLAS_CODING = 'CNRCI_coding_genes.csv'
ATLAS_NONCODING = 'CNRCI_noncoding_genes.csv'
GENCODE_DIR = '/Users/jasonmiller/WVU/Localization/GenCode/'
GENCODE_CODING = 'Homo_sapiens.GRCh38.cds.csv'
GENCODE_NONCODING = 'Homo_sapiens.GRCh38.ncrna.csv'

# Train/test split files; the cells below read these from ATLAS_DIR.
CODING_TEST = 'CNRCI_coding_test_genes.csv'
CODING_TRAIN = 'CNRCI_coding_train_genes.csv'
NONCODING_TEST = 'CNRCI_noncoding_test_genes.csv'
NONCODING_TRAIN = 'CNRCI_noncoding_train_genes.csv'

In [2]:
# Cell line names in the order returned by Cell_Lines.get_ordered_list().
# Later cells read this module-level list both by position (for the numeric
# column headers) and by value (as DataFrame column names).
cell_line_names = Cell_Lines.get_ordered_list()
print(cell_line_names)

['A549', 'H1.hESC', 'HeLa.S3', 'HepG2', 'HT1080', 'HUVEC', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH', 'GM12878', 'K562', 'IMR.90']


In [3]:
def show_names(names=None):
    """Print each cell line name next to its positional index.

    The indices printed here are the same ones analyze_nan() uses as its
    column headers, so this listing serves as the legend for those tables.

    Args:
        names: Optional sequence of names to list. Defaults to the
            module-level ``cell_line_names`` so existing ``show_names()``
            calls behave exactly as before.
    """
    if names is None:
        names = cell_line_names
    for index, name in enumerate(names):
        print(index, name)
# Print the index -> cell line legend; analyze_nan() labels its columns with these indices.
show_names()

0 A549
1 H1.hESC
2 HeLa.S3
3 HepG2
4 HT1080
5 HUVEC
6 MCF.7
7 NCI.H460
8 NHEK
9 SK.MEL.5
10 SK.N.DZ
11 SK.N.SH
12 GM12878
13 K562
14 IMR.90


In [4]:
def analyze_nan(dataset_name, df, names=None):
    """Print how many rows of ``df`` are NaN in each cell line column.

    Output is three lines: the dataset name followed by the row count, a
    header row of positional column indices, and a row with the NaN count
    for the column at each index. Every field is right-aligned to width 4
    and followed by a single space.

    Args:
        dataset_name: Label printed on the first line.
        df: pandas DataFrame that has one column per entry of ``names``.
        names: Optional sequence of column names to report on, in display
            order. Defaults to the module-level ``cell_line_names`` so
            existing two-argument calls behave exactly as before.

    Raises:
        KeyError: If a name is not a column of ``df`` (raised by pandas).
    """
    if names is None:
        names = cell_line_names
    print(dataset_name, len(df))
    # Header row: positional indices, matching the legend from show_names().
    print(''.join(f'{index:4} ' for index in range(len(names))))
    # Data row: number of missing values per column, in the same order.
    print(''.join(f'{df[col].isna().sum():4} ' for col in names))

In [5]:
# Load the coding-gene training split and print, per cell line,
# how many of its rows are NaN in that cell line's column.
filename = ATLAS_DIR + CODING_TRAIN
df = pd.read_csv(filename)
analyze_nan("Coding Train Set", df)

Coding Train Set 14216
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
3824 1047 7216 4735 4624 3892 3067 5787 5443 5832 5663 4516 4204 7553 10216 


In [6]:
# Load the coding-gene test split and print, per cell line,
# how many of its rows are NaN in that cell line's column.
# Note: this rebinds `filename` and `df` from the previous cell.
filename = ATLAS_DIR + CODING_TEST
df = pd.read_csv(filename)
analyze_nan("Coding Test Set", df)

Coding Test Set 3554
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
 895  264 1751 1155 1122  947  744 1400 1318 1422 1451 1080 1013 1866 2537 


In [7]:
# Load the noncoding-gene training split and print, per cell line,
# how many of its rows are NaN in that cell line's column.
# Note: this rebinds `filename` and `df` from the previous cell.
filename = ATLAS_DIR + NONCODING_TRAIN
df = pd.read_csv(filename)
analyze_nan("Noncoding Train Set", df)

Noncoding Train Set 5415
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
3730 1435 4350 3819 4347 3634 2918 4678 4186 4808 4709 3512 3409 4294 4972 


In [8]:
# Load the noncoding-gene test split and print, per cell line,
# how many of its rows are NaN in that cell line's column.
# Note: this rebinds `filename` and `df` from the previous cell.
filename = ATLAS_DIR + NONCODING_TEST
df = pd.read_csv(filename)
analyze_nan("Noncoding Test Set", df)

Noncoding Test Set 1353
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
 909  410 1101  935 1060  920  736 1160  994 1152 1152  839  848 1072 1214 
