In [1]:
import tools
import visualization_2D as vis2D
import visualization_3D as vis3D
import pandas as pd
import numpy as np
import pyarrow as pa
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.express as px
from plotly.tools import mpl_to_plotly
import math
import random

In [2]:
import networkx as nx
import matplotlib.pyplot as plt
import ipycytoscape
import matplotlib.pyplot as plt
import ipywidgets as widgets

import sqlite3

## 3D coordinates

In [3]:
atoms_coordinates = pd.read_csv('../data/3D_coordinates/3dmodel.csv',
                          names = ("Atom","Atom_nb", "O", "EDG","chrom", "x", "y", "z", "1", "75","none"),
                          sep=',')

atoms_coordinates = atoms_coordinates.drop(["O","EDG","1","75","Atom_nb","Atom","none"],axis = 1)

atoms_coordinates = atoms_coordinates.set_index([pd.Index(list(range(1, 26539)))])

atoms_coordinates["chrom"] = atoms_coordinates["chrom"].replace(["A","B","C","D","E","F","G","H",
                                                                 "J","I","K","L","N","M","O","P"],
                                                                [1,2,3,4,5,6,7,8,
                                                                 9,10,11,12,13,14,15,16])

atoms_coordinates["chrom"] = pd.to_numeric(atoms_coordinates.chrom)

## 3D segments and loci connexion

In [4]:
%%time

loci_series_list = []

for c in range(1, 17):
    
    chrom_number = str(c)
    
    loci = vis3D.get_chrom_info('../SCERE.db',
                          chrom_number,
                          """Primary_SGDID, Start_coordinate, Stop_coordinate, Feature_type""",
                          "SGD_Features",
                          """""",
                          """""",
                          """ Start_coordinate""")
    
    segments = vis3D.format_segments_dataframe(c, atoms_coordinates)
    
    loci_series_list = loci_series_list + [vis3D.connect(segments, loci)]

segments_loci = pd.concat(loci_series_list)
segments_loci.index = range(1, len(segments_loci) + 1)


CPU times: user 3min 26s, sys: 610 ms, total: 3min 27s
Wall time: 3min 27s


In [5]:
list_segments_loci = segments_loci

for i in range(1, len(list_segments_loci) + 1):
    list_segments_loci.at[i, 1] = list(list_segments_loci.loc[i].dropna())

list_segments_loci.index = range(1, len(list_segments_loci) + 1)
list_segments_loci = list_segments_loci[1]
list_segments_loci = list(list_segments_loci)

## 3D segment subdivision

In [6]:
%%time


#Récupération des coordonnées des 26522 segments 3D
segments_coordinates = vis3D.calcul_segments_coordinates(1, atoms_coordinates)

for c in range(2, 17):
    
    chrom_segments_coordinates = vis3D.calcul_segments_coordinates(c, atoms_coordinates)
    
    segments_coordinates = segments_coordinates.append(chrom_segments_coordinates)

# segments_coordinates.index = range(0, len(segments_coordinates))
segments_coordinates = segments_coordinates.reset_index()

#new_segments stocke dans l'ordre les segments, dont ceux qui sont subdivisés
new_segments = pd.DataFrame(columns = ["x_start", "y_start", "z_start", "x_stop", "y_stop", "z_stop", "Primary_SGDID"])

#loci_3Dloc stocke les loci et leur position dans l'espace (le start du segments associé au loci)
#loci_3Dloc = pd.DataFrame(columns = ["Primary_SGDID", "x_start", "y_start", "z_start"])

#on parcourt la liste des segments, avec leurs loci associés
for s in range(0, len(list_segments_loci)):
    
    #liste des loci qui n'ont pas encore été associés à un segment
    new_loci = list(set(list_segments_loci[s]) - set(list(new_segments["Primary_SGDID"].values)))
    
    #dans le cas où il y a plus d'un loci associé au segment, on le subdivise par le nombre de loci.
    if len(new_loci) > 0:
        segment_vect_x = segments_coordinates.loc[s, "x_stop"] - segments_coordinates.loc[s, "x_start"]
        segment_vect_y = segments_coordinates.loc[s, "y_stop"] - segments_coordinates.loc[s, "y_start"]
        segment_vect_z = segments_coordinates.loc[s, "z_stop"] - segments_coordinates.loc[s, "z_start"]
        for new_loci_idx in range(0, len(new_loci)):
            # New start point
            x_start = segments_coordinates.loc[s, "x_start"] + (new_loci_idx/len(new_loci)) * segment_vect_x
            y_start = segments_coordinates.loc[s, "y_start"] + (new_loci_idx/len(new_loci)) * segment_vect_y
            z_start = segments_coordinates.loc[s, "z_start"] + (new_loci_idx/len(new_loci)) * segment_vect_z
            # New stop point
            x_stop = segments_coordinates.loc[s, "x_start"] + ((new_loci_idx+1)/len(new_loci)) * segment_vect_x
            y_stop = segments_coordinates.loc[s, "y_start"] + ((new_loci_idx+1)/len(new_loci)) * segment_vect_y
            z_stop = segments_coordinates.loc[s, "z_start"] + ((new_loci_idx+1)/len(new_loci)) * segment_vect_z
            # Store new segment
            new_segments = new_segments.append({"Primary_SGDID": new_loci[new_loci_idx],
                                                "x_start": x_start,
                                                "y_start": y_start,
                                                "z_start": z_start,
                                                "x_stop": x_stop,
                                                "y_stop": y_stop,
                                                "z_stop": z_stop},
                                                ignore_index=True)    
        
    else:
        new_segments = new_segments.append({"Primary_SGDID": None,
                                        "x_start": segments_coordinates.loc[s, "x_start"],
                                        "y_start": segments_coordinates.loc[s, "y_start"],
                                        "z_start": segments_coordinates.loc[s, "z_start"],
                                        "x_stop": segments_coordinates.loc[s, "x_stop"],
                                        "y_stop": segments_coordinates.loc[s, "y_stop"],
                                        "z_stop": segments_coordinates.loc[s, "z_stop"]},
                                        ignore_index=True)

CPU times: user 2min 59s, sys: 418 ms, total: 2min 59s
Wall time: 3min


In [7]:
#Mettre ces valeurs au propre

len(new_segments)
len(segments_coordinates)

new_segments["Primary_SGDID"].isna().sum()

28790-19605

9185

## Loci 3D localisation

In [8]:
loci_3Dloc = new_segments.drop(["x_stop", "y_stop", "z_stop"], axis = 1)
loci_3Dloc = loci_3Dloc[- loci_3Dloc["Primary_SGDID"].isna()]
loci_3Dloc.index = range(1, len(loci_3Dloc) + 1)

loci_3Dloc

Unnamed: 0,x_start,y_start,z_start,Primary_SGDID
1,77.365000,61.609000,15.448000,S000028862
2,77.501714,61.581143,15.478714,S000028594
3,77.638429,61.553286,15.509429,S000002143
4,77.775143,61.525429,15.540143,S000028864
5,77.911857,61.497571,15.570857,S000028865
...,...,...,...,...
9181,155.540000,34.061000,75.917000,S000034728
9182,155.262000,33.940333,75.778333,S000006407
9183,154.984000,33.819667,75.639667,S000006406
9184,154.706000,33.699000,75.501000,S000006408


In [9]:
#FIXING THE MATRIX
loci_3Dloc = loci_3Dloc.sort_values("Primary_SGDID")
loci_3Dloc.index = range(1, len(loci_3Dloc) + 1)
loci_3Dloc

Unnamed: 0,x_start,y_start,z_start,Primary_SGDID
1,113.1850,110.39700,4.1200,S000000001
2,112.6980,107.26100,9.4060,S000000002
3,112.5975,106.84375,10.0755,S000000003
4,112.4600,106.30400,10.9340,S000000004
5,112.1230,105.09200,12.8280,S000000005
...,...,...,...,...
9181,156.3465,54.58450,74.5690,S000178200
9182,152.2950,39.11200,65.2220,S000178201
9183,150.5970,26.04300,65.1290,S000178202
9184,142.4700,28.79000,77.1160,S000178203


In [10]:
loci_3Dloc.to_csv("../results/loci_3Dloc.csv", 
               sep = "\t",
               index = False,
               columns = ["x_start", "y_start", "z_start", "Primary_SGDID"])

## Distances matrix

In [None]:
%%time
#1h35 min

adjacency_matrix = pd.DataFrame(columns = range(1, len(loci_3Dloc) + 1), index = range(1,len(loci_3Dloc) + 1))

for i in adjacency_matrix.index:
    for j in range(i + 1, len(adjacency_matrix.index) + 1):
        adjacency_matrix[i][j] = math.sqrt((loci_3Dloc.x_start[i] - loci_3Dloc.x_start[j]) ** 2 +
                                           (loci_3Dloc.y_start[i] - loci_3Dloc.y_start[j]) ** 2 +
                                           (loci_3Dloc.z_start[i] - loci_3Dloc.z_start[j]) ** 2)

In [None]:
adjacency_matrix.index = loci_3Dloc["Primary_SGDID"]
loci_3Dloc = loci_3Dloc.rename(columns={"Primary_SGDID": "Primary_SGDID_bis"})
adjacency_matrix.columns = loci_3Dloc["Primary_SGDID_bis"]

In [None]:
adjacency_matrix

In [None]:
adjacency_matrix.to_parquet('../data/adjacency_matrix_V4.parquet.gzip', engine = "pyarrow")

In [None]:
#adjacency_matrix.to_csv("../data/adjacency_matrix_V2.csv")

In [None]:
#adjacency_matrix.to_csv("../data/adjacency_matrix_V3.csv", header = True)
adjacency_matrix = pd.read_csv("../data/adjacency_matrix_V3.csv")

adjacency_matrix = adjacency_matrix.drop("Primary_SGDID", axis = 1)
adjacency_matrix.index = loci_3Dloc["Primary_SGDID"]

In [None]:
print(adjacency_matrix.loc["S000004395", "S000004038"]) #118,474
print(adjacency_matrix.loc["S000006148", "S000001515"]) #106,962
print(adjacency_matrix.loc["S000006208", "S000005996"]) #62,021
print(adjacency_matrix.loc["S000001315", "S000003041"]) #28,407

## Selection of the genes of interest

In [None]:
genes_list = pd.read_csv('../data/regulation_factors_target/sgd/STE12_targets_response_to_chimical.csv',
                          header = 0,
                          names = ["genes"])

sql_query = \
"""SELECT Primary_SGDID, Chromosome, Feature_name, Strand, Stop_coordinate, Start_coordinate
FROM SGD_features
"""

sql_query_bis = \
"""SELECT Primary_SGDID, Chromosome, Standard_gene_name, Strand, Stop_coordinate, Start_coordinate
FROM SGD_features
"""

Feature_name = tools.get_locus_info("../SCERE.db", sql_query)
Feature_name = Feature_name.merge(genes_list, left_on = "Feature_name", right_on = "genes")

Standard_gene_name = tools.get_locus_info("../SCERE.db", sql_query_bis)
Standard_gene_name = Standard_gene_name.merge(genes_list, left_on = "Standard_gene_name", right_on = "genes")

features_ID = Standard_gene_name.append(Feature_name)

#why "S000007266" is not in the matrix ?
features_ID = features_ID[features_ID["Primary_SGDID"] != "S000007266"]

adjacency_matrix_select = adjacency_matrix.loc[ features_ID.Primary_SGDID, features_ID.Primary_SGDID]
adjacency_matrix_select.index.names = ["Primary_SGDID_bis"]

## Connexions list

In [None]:
edges_list = adjacency_matrix_select.stack().dropna().reset_index()
edges_list = edges_list.sort_values(by = "Primary_SGDID_bis")
edges_list.rename(columns = {0: "3D_distance"}, inplace = True)
edges_list = edges_list.sort_values(by = "3D_distance")
edges_list.index = range(1, len(edges_list) + 1)

## Genes of interest distances histogram

In [None]:
#fig = go.Figure(data = go.Scattergl(x = edges_list.index, y = edges_list["3D_distance"], mode = 'markers'))

fig = edges_list["3D_distance"].plot(kind='hist',
                                     bins=50,
                                     title='Histogram Of 3D distances',
                                     figsize=(12,8),
                                     fontsize=15, 
                                     color=['#A0E8AF'])

In [None]:
fig = px.histogram(edges_list, x="3D_distance", nbins= 50)
fig.show()

## All distances histogram

In [None]:
adjacency_matrix.index.names = ["Primary_SGDID_bis"]
edges_list_all = adjacency_matrix.stack().dropna().reset_index()
edges_list_all = edges_list_all.sort_values(by = "Primary_SGDID_bis")
edges_list_all.rename(columns = {0: "3D_distance"}, inplace = True)
edges_list_all = edges_list_all.sort_values(by = "3D_distance")


all_distance_hist = edges_list_all["3D_distance"].plot(kind='hist',
                                                       bins=50,
                                                       title='Histogram Of 3D distances',
                                                       figsize=(12,8),
                                                       fontsize=15, 
                                                       color=['#A0E8AF'])

## Random selecion of genes on chromosome 2 histogram

In [None]:

sql_query_ter = \
"""SELECT Primary_SGDID, Chromosome, Strand, Stop_coordinate, Start_coordinate
FROM SGD_features
WHERE (Chromosome == 12) AND (Feature_type != "CDS") AND (Start_coordinate != "")
"""

Feature_name = tools.get_locus_info("../SCERE.db", sql_query_ter)


adjacency_matrix_chrom = adjacency_matrix.loc[ Feature_name.Primary_SGDID, Feature_name.Primary_SGDID]
adjacency_matrix_chrom.index.names = ["Primary_SGDID_bis"]

edges_list_chrom = adjacency_matrix_chrom.stack().dropna().reset_index()
edges_list_chrom = edges_list_chrom.sort_values(by = "Primary_SGDID_bis")
edges_list_chrom.rename(columns = {0: "3D_distance"}, inplace = True)
edges_list_chrom = edges_list_chrom.sort_values(by = "3D_distance")


edges_list_chrom["3D_distance"].hist(bins = 50)

In [None]:
fig = px.histogram(edges_list_chrom, x="3D_distance", nbins= 100)
fig.show()

## Genes of interest genomic positions

In [None]:
chroms = tools.get_locus_info("../SCERE.db", sql_query)

chroms = chroms.assign(FT_target = chroms.Primary_SGDID.isin(features_ID.Primary_SGDID))

chroms_format = vis2D.format_coordinates(chroms, 6)

vis2D.genome_drawing(chroms_format, "discreet", "FT_target", [True], ["mediumblue"])

In [None]:
chroms[chroms.FT_target == True].drop(["FT_target"], axis = 1)

In [None]:
test = chroms[chroms.FT_target == True]
fig = px.histogram(test, x="Chromosome", nbins=30, range_x=[1, 17])
fig.show()

## Genes of interest 3D positions

In [None]:
def format_atoms_coordinates_V3(new_segments):
    
    plotly_segments_V2 = pd.DataFrame(columns = ["x", "y", "z", "Primary_SGDID"])
    
    for i in range(0, len(new_segments)):
        plotly_segments_V2 = plotly_segments_V2.append({"x": new_segments["x_start"][i], 
                                                       "y": new_segments["y_start"][i], 
                                                       "z": new_segments["z_start"][i], 
                                                       "Primary_SGDID": new_segments["Primary_SGDID"][i]}, 
                                                       ignore_index=True)
        
        plotly_segments_V2 = plotly_segments_V2.append({"x": new_segments["x_stop"][i], 
                                                       "y": new_segments["y_stop"][i], 
                                                       "z": new_segments["z_stop"][i], 
                                                       "Primary_SGDID": new_segments["Primary_SGDID"][i]}, 
                                                       ignore_index=True)
        
        plotly_segments_V2 = plotly_segments_V2.append({"x": None, 
                                                        "y": None, 
                                                        "z": None, 
                                                        "Primary_SGDID": new_segments["Primary_SGDID"][i]},
                                                        ignore_index=True)
    
    return plotly_segments_V2

In [None]:
%%time

#plotly_segments = vis3D.format_atoms_coordinates_V2(atoms_coordinates, segments_loci)
plotly_segments = format_atoms_coordinates_V3(new_segments)

#plotly_segments.to_csv("../data/plotly_segments.csv", index = False, columns = ["x", "y", "z", "Primary_SGDID"])

In [None]:
whole_genome = tools.get_locus_info("../SCERE.db", sql_query)

whole_genome = whole_genome.assign(FT_target = whole_genome.Primary_SGDID.isin(features_ID.Primary_SGDID))

whole_genome_segments = plotly_segments.merge(whole_genome, on = "Primary_SGDID", how = "left", copy = False)
whole_genome_segments.index = range(1, len(whole_genome_segments) + 1)

whole_genome_segments = vis3D.get_color_discreet_3D(whole_genome_segments,
                                                    "FT_target",
                                                    [True], ["mediumblue"])

In [None]:
%%time

fig = go.Figure(data=[go.Scatter3d(x = whole_genome_segments.x,
                                   y = whole_genome_segments.y,
                                   z = whole_genome_segments.z,
                                   mode = "lines",
                                   name = "",
                                   line = {"color": whole_genome_segments["colors"],
                                           "width": 12},
                                   customdata = whole_genome_segments.Primary_SGDID,
                                   hovertemplate = ("<b>SGDID :</b> %{customdata} <br>"
                                                    "<b>x :</b> %{x} <br>"),
                                   hoverlabel = dict(bgcolor = "white", font_size = 16))])

fig.update_layout(scene=dict(xaxis = dict(showgrid = False, backgroundcolor = "white"),
                             yaxis = dict(showgrid = False, backgroundcolor = "white"),
                             zaxis = dict(showgrid = False, backgroundcolor = "white")))


fig.show()

In [None]:
edges_list

## Network of genes of interest

In [None]:
edges_list_ts = edges_list[edges_list["3D_distance"] < 20]
edges_list_ts

In [None]:
G = nx.from_pandas_edgelist(
    edges_list_ts,
    source="Primary_SGDID_bis",
    target="level_1")

g = ipycytoscape.CytoscapeWidget()
g.graph.add_graph_from_networkx(G)
display(g)

In [None]:
print('nombre de noeuds :',G.number_of_nodes())
#print(G.nodes(data = True))

print('nombre de connexions :',G.number_of_edges(),'\n')
#print(G.edges(data = True))

#Liste des degrés.
#G.degree()

#Liste des 'betweenness centrality'
#pd.Series(nx.betweenness_centrality(G))

In [None]:
degrees = [val for (node, val) in G.degree()]

fig = px.histogram(degrees, nbins= 50)
fig.show()

In [None]:
list(nx.enumerate_all_cliques(G))

In [None]:
sql_query = \
"""SELECT DISTINCT GO_slim_term
FROM go_slim_mapping
"""

db_connexion = sqlite3.connect("../SCERE.db")
cursor = db_connexion.cursor()
GO_terms = cursor.execute(sql_query)
GO_terms = pd.DataFrame(GO_terms.fetchall(), columns=["GO_terms"])
GO_terms = GO_terms.sort_values("GO_terms")

GO_terms.to_csv("../data/GO_terms.csv", sep = ",", columns = ["GO_terms"], index = False)
options = [{'label': GO, 'value': GO} for GO in GO_terms["GO_terms"]]

In [None]:
top_TF = pd.read_csv('../data/Yeast_TRN.csv', sep='\t')
top_TF["top"] = top_TF["TF"].isin(top_TF["TG"])
top_TF = top_TF[top_TF["top"] == False]

In [None]:
top_TF.to_csv("../data/top_TF.csv", index = False, header = False)

In [None]:
top_TF