In [1]:
import tools
import visualization_2D as vis2D
import visualization_3D as vis3D
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import random as rd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import os
from intermine.webservice import Service

# I- TRN download

In [2]:
service = Service("https://yeastmine.yeastgenome.org/yeastmine/service")

# Get a new query on the class (table) you will be querying:
query = service.new_query("Gene")

# Type constraints should come early - before all mentions of the paths they constrain
query.add_constraint("regulatoryRegions", "TFBindingSite")

# The view specifies the output columns
query.add_view(
    "regulatoryRegions.regulator.symbol",
    "regulatoryRegions.regulator.secondaryIdentifier", "symbol",
    "secondaryIdentifier", "regulatoryRegions.regEvidence.ontologyTerm.name",
    "regulatoryRegions.regEvidence.ontologyTerm.identifier",
    "regulatoryRegions.experimentCondition",
    "regulatoryRegions.strainBackground",
    "regulatoryRegions.regulationDirection",
    "regulatoryRegions.publications.pubMedId", "regulatoryRegions.datasource",
    "regulatoryRegions.annotationType"
)

# Uncomment and edit the line below (the default) to select a custom sort order:
# query.add_sort_order("Gene.regulatoryRegions.regulator.symbol", "ASC")

# You can edit the constraint values below
query.add_constraint("regulatoryRegions.regulator", "IN", "Verified_ORFs", code="A")
query.add_constraint("regulatoryRegions.strainBackground", "=", "S288c", code="B")
query.add_constraint("regulatoryRegions.experimentCondition", "=", "cellular response to heat", code="C")

# Uncomment and edit the code below to specify your own custom logic:
# query.set_logic("B and A and C and C")

<BinaryConstraint: Gene.regulatoryRegions.experimentCondition = cellular response to heat>

In [3]:
TRN = pd.DataFrame(columns=['TF', 'TG'])

for row in query.rows():
    TRN = TRN.append({'TF': row["regulatoryRegions.regulator.secondaryIdentifier"],
                      'TG': row["secondaryIdentifier"]}, ignore_index=True)

display(TRN)

Unnamed: 0,TF,TG
0,YGR252W,YLR343W
1,YLR176C,YLR343W
2,YDL005C,YER177W
3,YLR071C,YER177W
4,YIL101C,YER177W
...,...,...
15520,YDL005C,YGL162W
15521,YIL101C,YGL162W
15522,YER111C,YBL071W-A
15523,YIL101C,YBL071W-A


# II- Targets lists creation

In [None]:
list_TF = TRN.TF.unique()
#len(list_TF) = 76

for TF in list_TF :
    
    TG = TRN[TRN.TF == TF]
    TG = TG.drop(["TF"], axis = 1)
    TG.to_csv("../results/V2/TF_target_TRN/" + str(TF) + "_" + str(len(TG)) + "_targets.csv", 
              index = False, 
              columns = ["TG"])

# III- Repartition and 3D distances histograms

## 1) Targets repartition on the chromosomes function

In [4]:
def chrom_repartition_hist(genes_list):
    
    sql_query = \
"""SELECT Primary_SGDID, Feature_name, Start_coordinate, Stop_coordinate, Chromosome, Strand
FROM SGD_features
ORDER BY Start_coordinate
"""
    
    loci = tools.get_locus_info("../SCERE.db", sql_query)
    loci = loci.assign(FT_target = loci.Feature_name.isin(genes_list))
       
    loci = loci[loci.FT_target == True].drop(["FT_target"], axis = 1)
       
    fig = px.histogram(loci, x="Chromosome", nbins=30, range_x=[1, 17], color_discrete_sequence=['#A0E8AF'])
    fig.update_layout(plot_bgcolor = "white",
                      xaxis_showgrid = False,
                      yaxis_showgrid = False, 
                      showlegend = True)
       
    return fig

## 2) 3D distances between targets function

In [5]:
def distances_hist(genes_list):
    
    sql_query = \
"""SELECT Primary_SGDID, Chromosome, Feature_name, Strand, Stop_coordinate, Start_coordinate
FROM SGD_features
"""

    Feature_name = tools.get_locus_info("../SCERE.db", sql_query)
    Feature_name = Feature_name.merge(genes_list, left_on = "Feature_name", right_on = genes_list.columns[0])
    
    adjacency_matrix_select = adjacency_matrix.loc[Feature_name.Primary_SGDID, Feature_name.Primary_SGDID]
    adjacency_matrix_select.index.names = ["Primary_SGDID_bis"]
    
    edges_list = adjacency_matrix_select.stack().dropna().reset_index()
    edges_list = edges_list.sort_values(by = "Primary_SGDID_bis")
    edges_list.rename(columns = {0: "3D_distances"}, inplace = True)
    edges_list = edges_list.sort_values(by = "3D_distances")
    edges_list.index = range(1, len(edges_list) + 1)
    
    fig = px.histogram(edges_list, x="3D_distances", range_x=[-10, 210], nbins= 70, color_discrete_sequence=['#A0E8AF'])
    fig.update_layout(plot_bgcolor = "white", 
                      xaxis_showgrid = False, 
                      yaxis_showgrid = False, 
                      showlegend = True)
    
    return fig

## 3) Results

In [None]:
adjacency_matrix = pd.read_parquet("../dashboard/static/adjacency_matrix.parquet.gzip", engine='pyarrow')
files_names = os.listdir("../results/V2/TF_target_TRN")

#removed mitochondrial targets genes :
#Q0050, Q0080 and Q0120 from YIR018W_451_targets
#Q0115 and Q0130 from YJL127C_1176_targets
#Q0080 and Q0297 from 'YEL009C_303_targets.csv'


for genes in files_names:
    genes_list = pd.read_csv('../results/V2/TF_target_TRN/' + genes, sep=',', header = [0])
        
    chrom_repartition = chrom_repartition_hist(genes_list.TG)
    distances = distances_hist(genes_list)
    
    chrom_repartition.write_image("../results/V2/chrom_repartition_hist/" + genes +".jpeg")
    distances.write_image("../results/V2/3Ddistances_hist/" + genes +".jpeg")

# IV- Distri

In [6]:
def get_edges_list(gene_list, adjacency_matrix):
    
    sql_query = \
"""SELECT Primary_SGDID, Chromosome, Feature_name, Strand, Stop_coordinate, Start_coordinate
FROM SGD_features
"""

    Feature_name = tools.get_locus_info("../SCERE.db", sql_query)
    Feature_name = Feature_name.merge(genes_list, left_on = "Feature_name", right_on = genes_list.columns[0])
    
    adjacency_matrix_select = adjacency_matrix.loc[Feature_name.Primary_SGDID, Feature_name.Primary_SGDID]
    
    edges_list = adjacency_matrix_select.stack().dropna().reset_index()
    edges_list = edges_list.sort_values(by = "Primary_SGDID_bis")
    edges_list.rename(columns = {0: "3D_distances"}, inplace = True)
    edges_list = edges_list.sort_values(by = "3D_distances")
    edges_list.index = range(1, len(edges_list) + 1)
    
    return edges_list

def get_random_edges_list(random_gene_list, adjacency_matrix):
    
    adjacency_matrix_select = adjacency_matrix.loc[random_gene_list.Primary_SGDID, random_gene_list.Primary_SGDID]
    random_edges_list = adjacency_matrix_select.stack().dropna().reset_index()
    random_edges_list.rename(columns = {0: "3D_distances"}, inplace = True)
    random_edges_list = random_edges_list.sort_values(by = "3D_distances")
    random_edges_list.index = range(1, len(random_edges_list) + 1)
    
    return random_edges_list

def distri(genes_list, adjacency_matrix, genes):
    
    edges_list = get_edges_list(genes_list, adjacency_matrix)
    x = list(edges_list["3D_distances"])
    
    rd_x = list(range(0, 30))
    for i in range(0, 30):
        random_genes_list = pd.DataFrame(data = {"Primary_SGDID": rd.sample(list(adjacency_matrix.index),
                                                                             k=len(genes_list))})
        random_edges_list = get_random_edges_list(random_genes_list, adjacency_matrix)
        rd_x[i] = list(random_edges_list["3D_distances"])
    print(len(rd_x))
    rd_X = pd.DataFrame(rd_x)
    rd_X = list(rd_X.mean(axis = 0))
    
    
    H, X1 = np.histogram(x, bins = 100)
    #dx = X1[1] - X1[0]
    F1 = np.cumsum(H)
    
    H2, X2 = np.histogram(rd_X, bins = 100)
    #rd_dx = X2[1] - X2[0]
    F2 = np.cumsum(H2)
    
    fig = go.Figure()
    fig.add_trace(go.Histogram(x = rd_X, name = "Random 3D distances"))
    fig.add_trace(go.Histogram(x = x, name = "3D distances between targets"))
    fig.add_trace(go.Scatter(x = X1, y = F1, mode = 'lines', 
                             name = 'Cumulative distribution function',
                             yaxis="y2", 
                             line_color = "#FA3824"))
    fig.add_trace(go.Scatter(x = X2, y = F2, mode = 'lines', 
                             name = 'Cumulative distribution function RD', 
                             yaxis="y2", 
                             line_color = "#5767FF"))

    
    # Overlay both histograms
    fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
    fig.update_traces(opacity=0.75)
    fig.update_layout(plot_bgcolor = "white", 
                      xaxis_showgrid = False, 
                      yaxis_showgrid = False, 
                      yaxis2 = dict(title="yaxis2 title", anchor="x",
                                    overlaying="y", side="right"))
    return fig

In [8]:
adjacency_matrix = pd.read_parquet("../dashboard/static/adjacency_matrix_V4.parquet.gzip", engine='pyarrow')
files_names = os.listdir("../results/V2/TF_target_TRN")

for genes in files_names:
    genes_list = pd.read_csv('../results/V2/TF_target_TRN/' + genes, sep=',', header = [0])
    fig = distri(genes_list, adjacency_matrix, "YPR104C_287_targets")
    fig.write_image("../results/V2/distri/" + genes + ".jpeg")

30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30


# V- Distributions comparaison, Kolmogorov-Smirnov test

In [None]:
def ks_2samples(sample, random_sample, name):
    
    sample_chrom_repartition = chrom_repartition(sample.TG)
    random_chrom_repartition = chrom_repartition(random_sample.random_TG)
    
    ks_result = ks_2samp(sample_chrom_repartition, random_chrom_repartition)
    
    if ks_result.pvalue < 0.05:
        print(name)
        print(ks_result)
    
    return



def chrom_repartition(genes_list):
    
    sql_query = \
"""SELECT Primary_SGDID, Feature_name, Start_coordinate, Stop_coordinate, Chromosome, Strand
FROM SGD_features
ORDER BY Start_coordinate
"""
    
    loci = tools.get_locus_info("../SCERE.db", sql_query)
    loci = loci.assign(FT_target = loci.Feature_name.isin(genes_list))
       
    loci = loci[loci.FT_target == True].drop(["FT_target"], axis = 1)
       
    return loci.Chromosome

In [None]:
files_names = os.listdir("../results/TF_target_TRN")
files_names_random = os.listdir("../results/TF_random_target_TRN")


for genes, random_genes in zip(files_names, files_names_random):
    random_genes_list = pd.read_csv('../results/TF_random_target_TRN/' + random_genes, sep=',', header = [0])
    genes_list = pd.read_csv('../results/TF_target_TRN/' + genes, sep=',', header = [0])
    
    ks_2samples(genes_list, random_genes_list, genes)