# Quality Assurance

## Q/A the FASTA to CSV process

In [1]:
def count_fasta(fn,coding):
    """Count the sequences of one biotype in a FASTA file and sum their lengths.

    Args:
        fn: Path of the FASTA file to read.
        coding: If true, select records whose defline carries the token
            'transcript_biotype:protein_coding'; otherwise select records
            carrying 'transcript_biotype:lncRNA'.

    Returns:
        A tuple (valid_seqs, valid_letters): the number of selected records
        and the total number of sequence characters on their sequence lines.
    """
    # The defline is split on whitespace and the signal must equal a whole
    # token. That handles both gotchas:
    #   #1: 'transcript_biotype:protein_coding_LoF' is a different token.
    #   #2: a signal at the end of the line has its newline stripped first.
    if coding:
        signal = 'transcript_biotype:protein_coding'
    else:
        signal = 'transcript_biotype:lncRNA'
    valid_seqs = 0
    valid_letters = 0
    alive = False   # True while inside a record whose defline matched
    with open(fn, 'r') as fasta:
        for line in fasta:
            line = line.strip()
            # startswith() is safe on blank lines, where line[0] would raise.
            if line.startswith('>'):
                alive = signal in line.split()
                if alive:
                    valid_seqs += 1
            elif alive:
                # Blank lines land here too and contribute zero letters.
                valid_letters += len(line)
    return (valid_seqs, valid_letters)

In [2]:
def count_csv(fn,coding):
    """Count the data rows of a sequence CSV and sum their sequence lengths.

    The CSV headers are:
    transcript_id,gene_id,biotype,length,sequence

    Args:
        fn: Path of the CSV file to read. The first line is treated as the
            header and is not counted.
        coding: Not used by this function; kept so the call signature
            parallels count_fasta().

    Returns:
        A tuple (valid_seqs, valid_letters): the number of data rows and the
        total length of the fifth (sequence) field across those rows.
    """
    valid_seqs = 0
    valid_letters = 0
    # Name the handle something other than 'csv' so it does not shadow the
    # standard-library module of that name.
    with open(fn, 'r') as handle:
        next(handle, None)   # skip the header line (no-op on an empty file)
        for line in handle:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a record; previously it
                # was counted and then crashed on fields[4].
                continue
            fields = line.split(',')
            sequence = fields[4]
            valid_seqs += 1
            valid_letters += len(sequence)
    return (valid_seqs, valid_letters)

### Expect non-coding CSV == FASTA
The number of lncRNA sequences and the total nucleotide count should be identical in the FASTA file and the CSV file.

In [3]:
# Non-coding check: the lncRNA counts from the FASTA must equal the CSV counts.
DATA_DIR = '/Users/jasonmiller/WVU/Localization/GenCode/'
FASTA_FILENAME = 'Homo_sapiens.GRCh38.ncrna.fa'
CSV_FILENAME = 'Homo_sapiens.GRCh38.ncrna.csv'

fasta_counts = count_fasta(DATA_DIR + FASTA_FILENAME, False)
num_seqs1, num_letters1 = fasta_counts
print('FASTA', num_seqs1, num_letters1)

csv_counts = count_csv(DATA_DIR + CSV_FILENAME, False)
num_seqs2, num_letters2 = csv_counts
print('  CSV', num_seqs2, num_letters2)

# Both the sequence count and the letter count have to agree.
if (num_seqs1, num_letters1) == (num_seqs2, num_letters2):
    print("same!")

FASTA 55625 71589477
  CSV 55625 71589477
same!


### Expect coding CSV == FASTA
The number of mRNA (protein-coding) sequences and the total nucleotide count should be identical in the FASTA file and the CSV file.

In [4]:
# Coding check: the protein-coding counts from the FASTA must equal the CSV counts.
DATA_DIR = '/Users/jasonmiller/WVU/Localization/GenCode/'
FASTA_FILENAME = 'Homo_sapiens.GRCh38.cds.all.fa'
CSV_FILENAME = 'Homo_sapiens.GRCh38.cds.csv'

fasta_counts = count_fasta(DATA_DIR + FASTA_FILENAME, True)
num_seqs1, num_letters1 = fasta_counts
print('FASTA', num_seqs1, num_letters1)

csv_counts = count_csv(DATA_DIR + CSV_FILENAME, True)
num_seqs2, num_letters2 = csv_counts
print('  CSV', num_seqs2, num_letters2)

# Both the sequence count and the letter count have to agree.
if (num_seqs1, num_letters1) == (num_seqs2, num_letters2):
    print("same!")

FASTA 98078 127885784
  CSV 98078 127885784
same!


## Q/A the LncATLAS to CSV process

In [5]:
# Input locations for the LncATLAS Q/A below.
# NOTE(review): DATA_DIR is an absolute, machine-specific path and it rebinds
# the DATA_DIR used by the FASTA/CSV cells above; adjust before running elsewhere.
DATA_DIR = '/Users/jasonmiller/WVU/Localization/LncAtlas/'
# File name of the LncATLAS export; joined with DATA_DIR at the call site.
ATLAS_FILE='lncATLAS_all_data_RCI.csv'
# Full paths of the four per-gene CSV files (coding/noncoding x test/train)
# whose row counts are compared against the atlas gene counts.
CODING_TEST=DATA_DIR+'CNRCI_coding_test_genes.csv'
CODING_TRAIN=DATA_DIR+'CNRCI_coding_train_genes.csv'
NONCODING_TEST=DATA_DIR+'CNRCI_noncoding_test_genes.csv'
NONCODING_TRAIN=DATA_DIR+'CNRCI_noncoding_train_genes.csv'

In [6]:
def count_data(file):
    """Count the data rows of a CSV file, excluding its header line.

    Blank lines (for example a trailing empty line) are not counted as rows,
    and an empty file yields 0 rather than -1.

    Args:
        file: Path of the CSV file to read. The first non-blank line is
            treated as the header.

    Returns:
        The number of non-blank lines after the header, never negative.
    """
    with open(file, 'r') as handle:
        non_blank = sum(1 for line in handle if line.strip())
    # Subtract the header line, but never report a negative count.
    return max(non_blank - 1, 0)

In [7]:
def count_LncAtlas(fn):
    """Count usable CNRCI measurements per gene in a LncATLAS CSV export.

    The expected column order is:
    ENSEMBL ID,Data Source,Data Type,Value,Gene Name,Coding Type,Biotype

    A row is counted when its Data Type (field 2) is 'CNRCI' and its Value
    (field 3) is not 'NA'. It is attributed to the coding map when field 6 is
    'coding', to the noncoding map when field 6 is 'nc', and ignored otherwise.

    Args:
        fn: Path of the CSV file to read. The first line is treated as the
            header and skipped.

    Returns:
        A tuple (coding_rci_per_gene, noncoding_rci_per_gene) of dicts mapping
        the gene ID (field 0) to the number of counted rows for that gene.

    Raises:
        IndexError: If a non-blank data row has fewer than seven fields.
    """
    coding_rci_per_gene = {}
    noncoding_rci_per_gene = {}
    with open(fn, 'r') as handle:
        next(handle, None)   # skip the header line (no-op on an empty file)
        for line in handle:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines carry no data; previously they
                # raised IndexError on fields[2].
                continue
            fields = line.split(',')
            gene = fields[0]
            measure = fields[2]
            value = fields[3]
            biotype = fields[6]
            if measure != 'CNRCI' or value == 'NA':
                continue
            if biotype == 'coding':
                tally = coding_rci_per_gene
            elif biotype == 'nc':
                tally = noncoding_rci_per_gene
            else:
                continue
            tally[gene] = tally.get(gene, 0) + 1
    return coding_rci_per_gene,noncoding_rci_per_gene

In [8]:
# Number of distinct genes with at least one usable CNRCI value, per class.
coding_map, noncoding_map = count_LncAtlas(DATA_DIR + ATLAS_FILE)
coding_genes, noncoding_genes = len(coding_map), len(noncoding_map)
print('ATLAS genes', coding_genes, noncoding_genes)
# TODO: count RCI values

ATLAS genes 17770 6768


In [9]:
# Coding genes: the test and train row counts should add up to the atlas total.
coding_test_genes, coding_train_genes = count_data(CODING_TEST), count_data(CODING_TRAIN)
print('CODING test train:', coding_test_genes, coding_train_genes)
if coding_genes == coding_test_genes + coding_train_genes:
    print("same total")

# Noncoding genes: same check against the noncoding atlas total.
noncoding_test_genes, noncoding_train_genes = count_data(NONCODING_TEST), count_data(NONCODING_TRAIN)
print('NONCODING test train:', noncoding_test_genes, noncoding_train_genes)
if noncoding_genes == noncoding_test_genes + noncoding_train_genes:
    print("same total")

CODING test train: 3554 14216
same total
NONCODING test train: 1353 5415
same total
