# Final Cover Plot
The goal of this project is to identify the “Goldilocks” zone for miRNA cover. To do this, we compare the previously developed “random cover” simulation against a newly generated “optimized cover” produced by a greedy algorithm. The two curves are plotted together, with an X marking each species' actual seed cover.

For organizational purposes, the random cover and the optimized cover are generated separately, and their final data are written to folders that are then consolidated by the “final cover plot” program.
Additionally, we will be using three datasets: TargetScan data (which selects only the most represented 3’UTR), Biomart+consolidation (which considers all 3’UTRs and removes repeats), and Biomart+generation (which generates a gene that contains all versions of the transcriptID). Each of these datasets represents a different way miRNA interacts in the cell; therefore, comparing the results from each dataset will allow us to determine which method is most effective.

This script takes the data generated by "random cover" and "optimized cover" and creates a plot containing the 'goldilocks zone'. 

In [99]:
from numpy import loadtxt
import os
import glob
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import ast
import itertools as it  
import scipy.stats as st
import seaborn as sns

In [158]:
def _read_literal(path):
    """Return the Python literal stored as text in the file at ``path``."""
    with open(path, 'r') as f:
        return ast.literal_eval(f.read())


def _load_simulation_data(directory):
    """Load every ``<int>.txt`` file in ``directory`` into a dict ordered by that integer."""
    data = {}
    for path in glob.glob(os.path.join(directory, '*.txt')):
        # splitext removes exactly the extension; str.strip(".tx") would strip
        # any leading/trailing '.', 't' or 'x' characters instead.
        stem, _ext = os.path.splitext(os.path.basename(path))
        data[int(stem)] = _read_literal(path)
    # Rebuild the dict so iteration follows ascending key order.
    return {key: data[key] for key in sorted(data)}


def _flatten_samples(data):
    """Expand ``{key: values}`` into parallel x/y lists, one pair per value."""
    xs = []
    ys = []
    for key, values in data.items():
        for value in values:
            xs.append(key)
            ys.append(value)
    return xs, ys


def simulate_plot(genome_filename, mature_filename, canon_site):
    """Plot random vs. optimal simulated coverage with the real coverage markers.

    Reads the per-key simulation results from
    ``Simulated Random Coverage/<genome_filename>_simulation/Canon Site <canon_site>``
    and the matching ``Simulated Optimal Coverage/...`` directory, draws one
    seaborn line plot for each, overlays three "x" markers read from
    ``Real Coverage/<mature_filename>/Canon Site <canon_site>`` (``Norm_v.txt``
    in red, ``Pre_v.txt`` in blue, ``Post_v.txt`` in green), saves the figure
    under ``Simulated Plots/`` and then shows it.

    Parameters
    ----------
    genome_filename : str
        Used to build the simulation directory names, the plot title and the
        name of the saved PNG.
    mature_filename : str
        Name of the sub-directory of ``Real Coverage/`` to read from.
    canon_site : str
        Suffix appended after ``"Canon Site "`` in every path and in the
        title. It is converted with ``str()``, so an int is accepted as well.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If one of the three real-coverage files cannot be opened.
    ValueError
        If a ``.txt`` file in a simulation directory does not have an integer
        name, or a file's contents are not a valid Python literal.

    Notes
    -----
    Each simulation file must contain a literal that can be iterated (one y
    value per item); each real-coverage file must contain a literal whose
    items 0 and 1 are used as the marker's x and y.
    """
    # Allow callers to pass the site as an int; str inputs are unchanged.
    canon_site = str(canon_site)

    # Load random and optimal simulation data, keyed by the integer file name.
    random_dir = "Simulated Random Coverage/"+genome_filename+"_simulation/Canon Site "+canon_site
    optimal_dir = "Simulated Optimal Coverage/"+genome_filename+"_simulation/Canon Site "+canon_site
    data_r = _load_simulation_data(random_dir)
    data_o = _load_simulation_data(optimal_dir)

    # One (x, y) pair per simulated value so seaborn can aggregate per x.
    x_r, y_r = _flatten_samples(data_r)
    x_o, y_o = _flatten_samples(data_o)

    # Line plots of both simulations.
    # NOTE(review): `ci` is kept as in the original call; newer seaborn
    # releases appear to deprecate it in favour of `errorbar` -- confirm
    # against the installed version before changing.
    plt.figure(figsize=(10,10))
    sns.lineplot(x=x_r, y=y_r, ci=100, label='Random Seed')
    sns.lineplot(x=x_o, y=y_o, ci=100, label='Optimal Seed')

    # Load real coverage of Norm_v, Pre_v, and Post_v and mark each with an X.
    real_dir = "Real Coverage/"+mature_filename+"/Canon Site "+canon_site
    markers = (
        ("/Norm_v.txt", "red"),
        ("/Pre_v.txt", "blue"),
        ("/Post_v.txt", "green"),
    )
    # Read all three files before drawing, matching the original order of work.
    real_points = [(_read_literal(real_dir+name), colour) for name, colour in markers]
    for point, colour in real_points:
        plt.plot(point[0], point[1], marker="x", markersize=10, markeredgecolor=colour)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('Simulated Plots', exist_ok=True)
    plt.title(genome_filename + " Canon Site " + canon_site +" Range Plot")
    plt.savefig("Simulated Plots/"+genome_filename + " Canon Site " + canon_site +" Range Plot.png")
    plt.show()

In [1]:
#run simulate_plot() here