In [1]:
import tools
import visualization_2D as vis2D
import visualization_3D as vis3D
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import random as rd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import os
from intermine.webservice import Service

# I- TRN download

In [2]:
#Sun et al., 2019
#https://wiley.figshare.com/ndownloader/files/14670236
Yeast_TRN = pd.read_csv('../results/Yeast_TRN.csv', sep='\t')
Yeast_TRN

Unnamed: 0,TF,TG,Direction,Distance
0,YLR403W,YLR048W,+,118.474
1,YDR423C,YKL002W,-,129.335
2,YMR021C,YDL234C,-,66.365
3,YLR403W,YDR050C,?,156.673
4,YDR253C,YLR438W,bound,109.636
...,...,...,...,...
28255,YIL101C,YOR070C,binding enriched,62.813
28256,YER159C,YBR203W,binding enriched,70.405
28257,YDR392W,YLR258W,binding enriched,124.902
28258,YML027W,YKR093W,+,46.923


# II- Targets lists creation

## 1) All targets

In [None]:
list_TF = Yeast_TRN.TF.unique()
#len(list_TF) = 186

for TF in list_TF :
    
    TG = Yeast_TRN[Yeast_TRN.TF == TF]
    TG = TG.drop(["TF", "Distance"], axis = 1)
    TG.to_csv("../results/TF_target_TRN/" + str(TF) + "_" + str(len(TG)) + "_targets.csv", 
              index = False, 
              columns = ["TG", "Direction"])

## 2) Targets filtered by type of regulation (+/-)

In [None]:
list_TF = Yeast_TRN.TF.unique()
#len(list_TF) = 186

for TF in list_TF :
    
    TG = Yeast_TRN[Yeast_TRN.TF == TF]
    TG = TG.drop(["TF", "Distance"], axis = 1)
    TG = TG[TG.Direction == "+"]
    TG.to_csv("../results/TF_target_TRN_pos/" + str(TF) + "_" + str(len(TG)) + "_pos_targets.csv", 
              index = False, 
              columns = ["TG", "Direction"])

for TF in list_TF :
    
    TG = Yeast_TRN[Yeast_TRN.TF == TF]
    TG = TG.drop(["TF", "Distance"], axis = 1)
    TG = TG[TG.Direction == "-"]
    TG.to_csv("../results/TF_target_TRN_neg/" + str(TF) + "_" + str(len(TG)) + "_neg_targets.csv", 
              index = False, 
              columns = ["TG", "Direction"])

## 3) Activated targets filtered by GO terms

In [None]:
files_names = os.listdir("../results/TF_target_TRN_pos")

for genes in files_names:
    genes_list = pd.read_csv('../results/TF_target_TRN_pos/' + genes, sep=',', header = [0])
        
    if len(genes_list.TG) >= 50 :
        genes_list.to_csv("../results/TF_target_TRN_pos_for_SGD_GOterms/" + genes, 
                          index = False, header = False, columns = ["TG"])


# III- Figures generation functions

## 1) Targets repartition on the chromosomes

In [3]:
def chrom_repartition_hist(genes_list):
    
    sql_query = \
"""SELECT Primary_SGDID, Feature_name, Start_coordinate, Stop_coordinate, Chromosome, Strand
FROM SGD_features
ORDER BY Start_coordinate
"""
    
    loci = tools.get_locus_info("../SCERE.db", sql_query)
    loci = loci.assign(FT_target = loci.Feature_name.isin(genes_list))
       
    loci = loci[loci.FT_target == True].drop(["FT_target"], axis = 1)
       
    fig = px.histogram(loci, x="Chromosome", nbins=30, range_x=[1, 17], color_discrete_sequence=['#A0E8AF'])
    fig.update_layout(plot_bgcolor = "white",
                      xaxis_showgrid = False,
                      yaxis_showgrid = False, 
                      showlegend = True)
       
    return fig

## 2) 3D distances between targets

In [4]:
def distances_hist(genes_list):
    
    sql_query = \
"""SELECT Primary_SGDID, Chromosome, Feature_name, Strand, Stop_coordinate, Start_coordinate
FROM SGD_features
"""

    Feature_name = tools.get_locus_info("../SCERE.db", sql_query)
    Feature_name = Feature_name.merge(genes_list, left_on = "Feature_name", right_on = genes_list.columns[0])
    
    adjacency_matrix_select = adjacency_matrix.loc[Feature_name.Primary_SGDID, Feature_name.Primary_SGDID]
    adjacency_matrix_select.index.names = ["Primary_SGDID_bis"]
    
    edges_list = adjacency_matrix_select.stack().dropna().reset_index()
    edges_list = edges_list.sort_values(by = "Primary_SGDID_bis")
    edges_list.rename(columns = {0: "3D_distances"}, inplace = True)
    edges_list = edges_list.sort_values(by = "3D_distances")
    edges_list.index = range(1, len(edges_list) + 1)
    
    fig = px.histogram(edges_list, x="3D_distances", range_x=[-10, 210], nbins= 70, color_discrete_sequence=['#A0E8AF'])
    fig.update_layout(plot_bgcolor = "white", 
                      xaxis_showgrid = False, 
                      yaxis_showgrid = False, 
                      showlegend = True)
    
    return fig

# IV- Results

## 1) All targets together

In [None]:
adjacency_matrix = pd.read_parquet("../dashboard/static/adjacency_matrix.parquet.gzip", engine='pyarrow')
files_names = os.listdir("../results/TF_target_TRN")

#removed mitochondrial targets genes :
#Q0050, Q0080 and Q0120 from YIR018W_451_targets
#Q0115 and Q0130 from YJL127C_1176_targets
#Q0080 and Q0297 from 'YEL009C_303_targets.csv'


for genes in files_names:
    genes_list = pd.read_csv('../results/TF_target_TRN/' + genes, sep=',', header = [0])
        
    chrom_repartition = chrom_repartition_hist(genes_list.TG)
    distances = distances_hist(genes_list)
    
    chrom_repartition.write_image("../results/chrom_repartition_hist/" + genes +".jpeg")
    distances.write_image("../results/3Ddistances_hist/" + genes +".jpeg")

## 2) Activated targets

In [None]:
adjacency_matrix = pd.read_parquet("../dashboard/static/adjacency_matrix.parquet.gzip", engine='pyarrow')
files_names = os.listdir("../results/TF_target_TRN_pos")

for genes in files_names:
    genes_list = pd.read_csv('../results/TF_target_TRN_pos/' + genes, sep=',', header = [0])
        
    if len(genes_list.TG) > 0 :    
        chrom_repartition = chrom_repartition_hist(genes_list.TG)
        distances = distances_hist(genes_list)
        
        chrom_repartition.write_image("../results/chrom_repartition_hist_pos/" + genes +".jpeg")
        distances.write_image("../results/3Ddistances_hist_pos/" + genes +".jpeg")

# V- Random targets lists creation

In [None]:
list_all_TG = Yeast_TRN.TG.unique()
list_all_TG = list(list_all_TG)
for mito_gene in ["Q0050", "Q0080","Q0120", "Q0115", "Q0130", "Q0297"] :
    print(mito_gene)
    list_all_TG.remove(mito_gene)

list_TF = Yeast_TRN.TF.unique()

for TF in list_TF :
    
    TG = Yeast_TRN[Yeast_TRN.TF == TF]
    TG = TG.drop(["TF", "Distance"], axis = 1)
    
    random_TG = pd.DataFrame(data = {"random_TG": rd.choices(list_all_TG, k=len(TG))})
    random_TG.to_csv("../results/TF_random_target_TRN/" + str(TF) + "_" + str(len(TG)) + "_random_targets.csv", 
                     index = False, 
                     columns = ["random_TG"])


In [None]:
adjacency_matrix = pd.read_parquet("../dashboard/static/adjacency_matrix.parquet.gzip", engine='pyarrow')
files_names = os.listdir("../results/TF_random_target_TRN")

for genes in files_names:
    genes_list = pd.read_csv('../results/TF_random_target_TRN/' + genes, sep=',', header = [0])
        
    chrom_repartition = chrom_repartition_hist(genes_list.random_TG)
    distances = distances_hist(genes_list)
        
    chrom_repartition.write_image("../results/random/" + genes +".jpeg")
    distances.write_image("../results/random/" + genes +".jpeg")

# V- QQ plots

In [12]:
def distances_QQplot(genes_list, adjacency_matrix, genes):
    
    edges_list = get_edges_list(genes_list, adjacency_matrix)
    x = np.array(edges_list["3D_distances"])
    
    y = []
    for i in range(1, 5) :
        random_genes_list = pd.DataFrame(data = {"Primary_SGDID": rd.choices(adjacency_matrix.index, k=len(edges_list))})
        random_edges_list = get_random_edges_list(random_genes_list, adjacency_matrix)
        y += list(random_edges_list["3D_distances"])
    
    y = np.array(y)
    pp_x = sm.ProbPlot(x)
    pp_y = sm.ProbPlot(y)
    qqplot_2samples(pp_x, pp_y)
    plt.xlabel('genes list 3D distances')
    plt.ylabel('random list 3D distances')
    
    genes = genes[:-4]
    plt.savefig("../results/QQplots/" + genes + ".jpeg")
    plt.close()

    return

def get_edges_list(gene_list, adjacency_matrix):
    
    sql_query = \
"""SELECT Primary_SGDID, Chromosome, Feature_name, Strand, Stop_coordinate, Start_coordinate
FROM SGD_features
"""

    Feature_name = tools.get_locus_info("../SCERE.db", sql_query)
    Feature_name = Feature_name.merge(genes_list, left_on = "Feature_name", right_on = genes_list.columns[0])
    
    adjacency_matrix_select = adjacency_matrix.loc[Feature_name.Primary_SGDID, Feature_name.Primary_SGDID]
    adjacency_matrix_select.index.names = ["Primary_SGDID_bis"]
    
    edges_list = adjacency_matrix_select.stack().dropna().reset_index()
    edges_list = edges_list.sort_values(by = "Primary_SGDID_bis")
    edges_list.rename(columns = {0: "3D_distances"}, inplace = True)
    edges_list = edges_list.sort_values(by = "3D_distances")
    edges_list.index = range(1, len(edges_list) + 1)
    
    return edges_list

def get_random_edges_list(random_gene_list, adjacency_matrix):
    
    adjacency_matrix_select = adjacency_matrix.loc[random_gene_list.Primary_SGDID, random_gene_list.Primary_SGDID]
    adjacency_matrix_select.index.names = ["Primary_SGDID_bis"]
    
    random_edges_list = adjacency_matrix_select.stack().dropna().reset_index()
    random_edges_list = random_edges_list.sort_values(by = "Primary_SGDID_bis")
    random_edges_list.rename(columns = {0: "3D_distances"}, inplace = True)
    random_edges_list = random_edges_list.sort_values(by = "3D_distances")
    random_edges_list.index = range(1, len(random_edges_list) + 1)
    
    return random_edges_list

In [10]:
adjacency_matrix = pd.read_parquet("../dashboard/static/adjacency_matrix.parquet.gzip", engine='pyarrow')
genes_list = pd.read_csv('../results/TF_target_TRN/YBR279W_53_targets.csv', sep=',', header = [0])
    
distances_QQplot(genes_list, adjacency_matrix, genes)

In [11]:
%%time

adjacency_matrix = pd.read_parquet("../dashboard/static/adjacency_matrix.parquet.gzip", engine='pyarrow')
files_names = os.listdir("../results/TF_target_TRN")

for genes in files_names:
    print(genes)
    genes_list = pd.read_csv('../results/TF_target_TRN/' + genes, sep=',', header = [0])
    distances_QQplot(genes_list, adjacency_matrix, genes)

YDL005C_980_targets.csv


MemoryError: Unable to allocate 32.8 GiB for an array with shape (478796, 9185) and data type float64

# VI- Distributions comparaison

In [None]:
def ks_2samples(sample, random_sample, name):
    
    sample_chrom_repartition = chrom_repartition(sample.TG)
    random_chrom_repartition = chrom_repartition(random_sample.random_TG)
    
    ks_result = ks_2samp(sample_chrom_repartition, random_chrom_repartition)
    
    if ks_result.pvalue < 0.05:
        print(name)
        print(ks_result)
    
    return



def chrom_repartition(genes_list):
    
    sql_query = \
"""SELECT Primary_SGDID, Feature_name, Start_coordinate, Stop_coordinate, Chromosome, Strand
FROM SGD_features
ORDER BY Start_coordinate
"""
    
    loci = tools.get_locus_info("../SCERE.db", sql_query)
    loci = loci.assign(FT_target = loci.Feature_name.isin(genes_list))
       
    loci = loci[loci.FT_target == True].drop(["FT_target"], axis = 1)
       
    return loci.Chromosome

In [None]:
files_names = os.listdir("../results/TF_target_TRN")
files_names_random = os.listdir("../results/TF_random_target_TRN")


for genes, random_genes in zip(files_names, files_names_random):
    random_genes_list = pd.read_csv('../results/TF_random_target_TRN/' + random_genes, sep=',', header = [0])
    genes_list = pd.read_csv('../results/TF_target_TRN/' + genes, sep=',', header = [0])
    
    ks_2samples(genes_list, random_genes_list, genes)