## Real Coverage
This project calculates the number of seeds and the seed coverage for every species.

NOTE: The All.txt genome file has been commented out (see Random Coverage Plot for more details). 

In [18]:
from numpy import loadtxt
import os
import pandas as pd 

In [19]:
# Base-pairing table used to complement a seed. Both RNA 'U' and DNA 'T' pair
# with 'A', so seeds written in either alphabet produce the same DNA motif.
_SEED_COMPLEMENT = str.maketrans('AUTCG', 'TAAGC')


def _seed_to_motif(seed, canon_site):
    """Return the reverse complement of ``seed``, adjusted for ``canon_site``."""
    motif = seed.translate(_SEED_COMPLEMENT)[::-1]
    if canon_site == 'A':
        # Site A: the last base of the motif is replaced by 'A'.
        return motif[:-1] + 'A'
    if canon_site == 'C':
        # Site C: an extra 'A' is appended to the motif.
        return motif + 'A'
    # Any other site (e.g. 'B') uses the plain reverse complement.
    return motif


def _load_sequences(genome_filename):
    """Load the sequences of one genome file, or of every species for 'All'."""
    if genome_filename != 'All':
        # ndmin=1 keeps a file with a single sequence iterable; without it
        # loadtxt returns a 0-d array and the coverage loop raises TypeError.
        return loadtxt("Genome Data/"+genome_filename+".txt", comments=">", dtype="str", ndmin=1)

    # 'All' pools every species listed in the module-level species_list.
    allowed_char = set('AGCT')
    data = []
    for species in species_list:
        if species[0] == 'All':
            continue
        temp_data = loadtxt("Genome Data/"+species[0]+".txt", comments=">", dtype="str", ndmin=1)
        # Remove all the unavailable sequences and sequences that are too short.
        temp_data = [seq for seq in temp_data
                     if len(seq) > 7 and allowed_char.issuperset(seq)]
        print(species[0]+": "+str(len(temp_data)))
        data.extend(temp_data)
    return data


def simulate_true_number(genome_filename, mature_filename, v_subset, canon_site):
    """Compute the seed count and seed coverage for one species and save them.

    Reads the 'Seed' column of "Mature Data/<mature_filename>.csv", turns each
    seed into its reverse-complement motif, and counts how many distinct
    sequences of "Genome Data/<genome_filename>.txt" contain at least one
    motif. The result is written as ``[seed_number, coverage_number]`` to
    "Real Coverage/<mature_filename>/Canon Site <canon_site>/<v_subset>_v.txt".
    Nothing is recomputed when that output file already exists.

    Args:
        genome_filename: Name (without ".txt") of the genome file, or 'All' to
            pool every species in the module-level ``species_list``.
        mature_filename: Name (without ".csv") of the mature data file; also
            used as the output directory name.
        v_subset: 'Pre' keeps seeds whose 'Node of origin (family)' is in the
            ancestral node list, 'Post' keeps the others, 'Norm' keeps all.
            Any other value prints an error and returns without writing.
        canon_site: 'A' replaces the motif's last base with 'A', 'C' appends
            an 'A'; any other value leaves the motif unchanged.

    Returns:
        None. The result is delivered through the output file.
    """
    new_file_path = "Real Coverage/"+mature_filename+"/Canon Site "+canon_site
    output_path = new_file_path+"/"+v_subset+"_v.txt"
    if os.path.exists(output_path):
        print(output_path+" exists")
        return

    # Returns real coverage and seed number from mature data.
    df = pd.read_csv("Mature Data/"+mature_filename+'.csv')

    # Filter based on pre_v or post_v.
    library = ['Olfactores', 'Chordata', 'Deuterostomia', 'Bilateria', 'Eumetazoa']

    if v_subset == 'Pre':
        print("Generating data for Pre_V")
        df = df[df['Node of origin (family)'].isin(library)]
    elif v_subset == 'Post':
        print("Generating data for Post_V")
        df = df[~df['Node of origin (family)'].isin(library)]
    elif v_subset != 'Norm':
        print('ERROR: Invalid v_subset')
        return

    # Create reverse complements; the set removes repeated motifs.
    motifs = {_seed_to_motif(seed, canon_site) for seed in df['Seed']}

    data = _load_sequences(genome_filename)

    # Calculate seed coverage: distinct sequences containing at least one motif.
    covered = {gene for gene in data if any(motif in gene for motif in motifs)}

    # Final data.
    seed_number = len(motifs)
    coverage_number = len(covered)

    # Save data to file.
    os.makedirs(new_file_path, exist_ok=True)
    with open(output_path, "w") as output:
        output.write(str([seed_number, coverage_number]))

    print("Done!")

In [22]:
# Each entry is (genome file name, mature data file name). The driver loop
# passes the first element to simulate_true_number as genome_filename and the
# second as mature_filename. The final ('All', 'All') entry selects the pooled
# genome branch of simulate_true_number.
species_list = [
    ('Lamprey genes (Pmarinus_7.0)',
     'Sea Lamprey (Petromyzon marinus)'),
    ('Hagfish genes (Eburgeri_3.2)',
     'Inshore hagfish (Eptatretus burgeri)'),
    ('Elephant shark genes (Callorhinchus_milii-6.1.3)',
     'Australian ghostshark (Callorhinchus milii)'),
    ('Spotted gar genes (LepOcu1)',
     'Spotted gar (Lepisosteus oculatus)'),
    ('Zebrafish genes (GRCz11)',
     'Zebrafish (Danio rerio)'),
    ('Atlantic cod genes (gadMor3.0)',
     'Cod (Gadus morhua)'),
    ('Coelacanth genes (LatCha1)',
     'Coelacanth (Latimeria chalumnae)'),
    ('Tropical clawed frog genes (Xenopus_tropicalis_v9.1)',
     'Tropical clawed frog (Xenopus tropicalis)'),
    ('Tuatara genes (ASM311381v1)',
     'Tuatara (Sphenodon punctatus)'),
    ('Green anole genes (AnoCar2.0v2)',
     'Green anole lizard (Anolis carolinensis)'),
    ('Painted turtle genes (Chrysemys_picta_bellii-3.0.3)',
     'Western painted turtle (Chrysemys picta bellii)'),
    ('Zebra finch genes (bTaeGut1_v1.p)',
     'Zebra finch (Taeniopygia guttata)'),
    ('Chicken (maternal Broiler) genes (bGalGal1.mat.broiler.GRCg7b)',
     'Chicken (Gallus gallus)'),
    ('Platypus genes (mOrnAna1.p.v1)',
     'Platypus (Ornithorhynchus anatinus)'),
    ('Opossum genes (ASM229v1)',
     'Gray short-tailed opossum (Monodelphis domestica)'),
    ('Armadillo genes (Dasnov3.0)',
     'Nine-banded armadillo (Dasypus novemcinctus)'),
    ('Cow genes (ARS-UCD1.2)',
     'Cow (Bos taurus)'),
    ('Dog genes (ROS_Cfam_1.0)',
     'Dog (Canis familiaris)'),
    ('Rabbit genes (OryCun2.0)',
     'Rabbit (Oryctolagus cuniculus)'),
    ('Guinea Pig genes (Cavpor3.0)',
     'Guinea pig (Cavia porcellus)'),
    ('Rat genes (mRatBN7.2)',
     'Norway rat (Rattus norvegicus)'),
    ('Mouse genes (GRCm39)',
     'House mouse (Mus musculus)'),
    ('Human genes (GRCh38.p13)',
     'Human (Homo sapiens)'),
    ('Human TargetScan',
     'Human (Homo sapiens) TargetScan'),
    ('All',
     'All'),
]

In [24]:
# Canon sites and seed subsets to evaluate.
sites = ['A', 'B', 'C']
v_subsets = ['Pre', 'Post', 'Norm']

# Run every (site, subset, species) combination; simulate_true_number skips
# combinations whose output file already exists.
for site in sites:
    for v_subset in v_subsets:
        for genome_name, mature_name in species_list:
            simulate_true_number(genome_name, mature_name, v_subset, site)

Real Coverage/Sea Lamprey (Petromyzon marinus)/Canon Site A/Pre_v.txt exists
Real Coverage/Inshore hagfish (Eptatretus burgeri)/Canon Site A/Pre_v.txt exists
Real Coverage/Australian ghostshark (Callorhinchus milii)/Canon Site A/Pre_v.txt exists
Real Coverage/Spotted gar (Lepisosteus oculatus)/Canon Site A/Pre_v.txt exists
Real Coverage/Zebrafish (Danio rerio)/Canon Site A/Pre_v.txt exists
Real Coverage/Cod (Gadus morhua)/Canon Site A/Pre_v.txt exists
Real Coverage/Coelacanth (Latimeria chalumnae)/Canon Site A/Pre_v.txt exists
Real Coverage/Tropical clawed frog (Xenopus tropicalis)/Canon Site A/Pre_v.txt exists
Real Coverage/Tuatara (Sphenodon punctatus)/Canon Site A/Pre_v.txt exists
Real Coverage/Green anole lizard (Anolis carolinensis)/Canon Site A/Pre_v.txt exists
Real Coverage/Western painted turtle (Chrysemys picta bellii)/Canon Site A/Pre_v.txt exists
Real Coverage/Zebra finch (Taeniopygia guttata)/Canon Site A/Pre_v.txt exists
Real Coverage/Chicken (Gallus gallus)/Canon Site A/P

Hagfish genes (Eburgeri_3.2): 18874
Elephant shark genes (Callorhinchus_milii-6.1.3): 26758
Spotted gar genes (LepOcu1): 10559
Zebrafish genes (GRCz11): 30407
Atlantic cod genes (gadMor3.0): 30960
Coelacanth genes (LatCha1): 7395
Tropical clawed frog genes (Xenopus_tropicalis_v9.1): 41840
Tuatara genes (ASM311381v1): 7049
Green anole genes (AnoCar2.0v2): 14755
Painted turtle genes (Chrysemys_picta_bellii-3.0.3): 17145
Zebra finch genes (bTaeGut1_v1.p): 26724
Chicken (maternal Broiler) genes (bGalGal1.mat.broiler.GRCg7b): 41831
Platypus genes (mOrnAna1.p.v1): 18367
Opossum genes (ASM229v1): 18299
Armadillo genes (Dasnov3.0): 10355
Cow genes (ARS-UCD1.2): 21201
Dog genes (ROS_Cfam_1.0): 33467
Rabbit genes (OryCun2.0): 20484
Guinea Pig genes (Cavpor3.0): 12981
Rat genes (mRatBN7.2): 38817
Mouse genes (GRCm39): 52312
Human genes (GRCh38.p13): 94830
Human TargetScan: 1177
Done!
Real Coverage/Sea Lamprey (Petromyzon marinus)/Canon Site C/Norm_v.txt exists
Real Coverage/Inshore hagfish (Eptat