## Module import

In [1]:
import sqlite3
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import plotly
import plotly.graph_objects as go
import plotly.express as px
import ipywidgets as widgets

## Module version

In [2]:
print("sqlite3 version:", sqlite3.version)
print("pandas version:", pd.__version__)
print("matplotlib version:", matplotlib.__version__)
print("numpy version:", np.__version__)
print("plotly version:", plotly.__version__)
print("ipywidgets version:", widgets.__version__)

sqlite3 version: 2.6.0
pandas version: 1.2.2
matplotlib version: 3.3.4
numpy version: 1.19.2
plotly version: 4.14.3
ipywidgets version: 7.6.3


# Chromosome visualization

## Functions

Chromosomes data recuperation.

In [3]:
def string_to_list(string):
    return list(string.split(", "))

def get_locus_info(database, info, source, condition, group_by, order_by):
    
    #SQL request
    db_connexion = sqlite3.connect(database)
    
    cursor = db_connexion.cursor()
    
    chrom_sense = cursor.execute("""
    SELECT """ + info + """
    FROM """ + source + """ 
    WHERE Strand == 'W' """ + condition + group_by + """
    ORDER BY """ + order_by)
    
    cursor = db_connexion.cursor()

    chrom_antisense = cursor.execute("""
    SELECT """ + info + """
    FROM """ + source + """ 
    WHERE Strand == 'C'""" + condition + group_by + """
    ORDER BY """ + order_by)
    
    List_info = string_to_list(info)
    
    #pandas dataframe formatting
    chrom_sense = chrom_sense.fetchall()
    chrom_sense = pd.DataFrame(chrom_sense, columns=List_info)
    
    chrom_antisense = chrom_antisense.fetchall()
    chrom_antisense = pd.DataFrame(chrom_antisense, columns=List_info)
    
    return chrom_sense, chrom_antisense

Coordinates formatting.

In [4]:
def format_coordinates(coordinates, y):
    
    genome_data = pd.DataFrame(columns = coordinates.columns)
    row_null = {"Start_coordinate": "none", "Stop_coordinate": "none", "Feature_type": "0", "Chromosome": "0"}
    
    for c in range(1, 18):
        chrom = coordinates[coordinates["Chromosome"] == str(c)]
        row_one = chrom.copy()
        row_one.index = range(0, len(chrom)*3, 3)
        row_one = row_one.drop("Stop_coordinate", axis = 1)
        row_one = row_one.transpose()
        
        row_two = chrom.copy()
        row_two.index = range(1, len(chrom)*3, 3)
        row_two["Start_coordinate"] = row_two["Stop_coordinate"]
        row_two = row_two.drop("Stop_coordinate", axis = 1)
        row_two = row_two.transpose()
        
        row_three = chrom.assign(Start_coordinate = "none")
        row_three.index = range(2, len(chrom)*3, 3)
        row_three = row_three.drop("Stop_coordinate", axis = 1)
        row_three = row_three.transpose()
        
        chrom_data = pd.merge(row_one, row_two, how = "outer", left_index = True, right_index = True)
        chrom_data = pd.merge(chrom_data, row_three, how = "outer", left_index = True, right_index = True)
        chrom_data = chrom_data.transpose()
        chrom_data = chrom_data.sort_index()
        chrom_data = chrom_data.assign(Stop_coordinate = y[c-1])    
        
        chrom_data = chrom_data.append(row_null, ignore_index = True)
        
        genome_data = genome_data.append(chrom_data)
        
    return genome_data

Chromosome shapes.

In [5]:
def get_chromosome_lenght(chrom_number):
    #SQL request
    db_connexion = sqlite3.connect('../SCERE.db')

    cursor = db_connexion.cursor()

    chromosome_length = cursor.execute("""
    SELECT length
    FROM chromosome_length
    """)
    
    chromosome_length = chromosome_length.fetchall()
    chromosome_length = pd.DataFrame(chromosome_length, columns = ["length"], index = list(range(1,18)))
    
    return chromosome_length.loc[chrom_number][0]

def format_chromosomes(y1, y2):
    
    chromosomes = pd.DataFrame(columns = ["Start_coordinate", "Stop_coordinate", "Chromosome"])
    
    for c in range(1,18):
        chrom_lenght = get_chromosome_lenght(c)
        chromosomes = chromosomes.append({"Start_coordinate": 0, 
                                          "Stop_coordinate": y1[c-1], 
                                          "Chromosome": 0, 
                                          "Feature_type": "0"}, ignore_index = True)
        chromosomes = chromosomes.append({"Start_coordinate": chrom_lenght, 
                                          "Stop_coordinate": y1[c-1], 
                                          "Chromosome": 0, 
                                          "Feature_type": "0"}, ignore_index = True)
        chromosomes = chromosomes.append({"Start_coordinate": 
                                          "none", 
                                          "Stop_coordinate": "none", 
                                          "Chromosome": 0, 
                                          "Feature_type": "0"}, ignore_index = True)
        
        chromosomes = chromosomes.append({"Start_coordinate": 0, 
                                          "Stop_coordinate": y2[c-1], 
                                          "Chromosome": 0, 
                                          "Feature_type": "0"}, ignore_index = True)
        chromosomes = chromosomes.append({"Start_coordinate": chrom_lenght, 
                                          "Stop_coordinate": y2[c-1],
                                          "Chromosome": 0, 
                                          "Feature_type": "0"}, ignore_index = True)
        chromosomes = chromosomes.append({"Start_coordinate": "none", 
                                          "Stop_coordinate": "none", 
                                          "Chromosome": 0, 
                                          "Feature_type": "0"}, ignore_index = True)
    
    return chromosomes

Genome drawing.

In [6]:
def genome_drawing(genome_data, mode, parameter, 
                   values = "null", values_colors = "null", threshold = 10**40, hover = []):
    
    chromosomes = format_chromosomes(list(range(0,108,6)), list(i - 0.4 for i in range(0,108,6)))
    
    genome_data = chromosomes.append(genome_data)
    genome_data.index = range(1, len(genome_data) + 1)
    
    
    if mode == "continuous":
        colors = get_color_continuous(genome_data[parameter])
        colors.index = range(1, len(colors) + 1)
        
        genome_data[parameter] = colors
        
        fig = px.line(genome_data,
                      x = "Start_coordinate",
                      y = "Stop_coordinate",
                      color = parameter,
                      color_discrete_map = "identity", 
                      hover_name = "Primary_SGDID")
    
    if mode == "semi_continuous":
        
        colors_and_intervals = get_color_semi_continuous(genome_data[parameter], threshold)
        colors = colors_and_intervals[0]
        colors.index = range(1, len(colors) + 1)
        genome_data[parameter] = colors
        
        intervals = colors_and_intervals[1]
        color_discrete_map = zip(intervals, px.colors.sequential.Viridis_r)
        color_discrete_map = dict(color_discrete_map)
        color_discrete_map = { "null": "lightgrey", **color_discrete_map}
        
        hover_formating = [True] * len(hover)
        hover_data = dict(zip(hover, hover_formating))
        
        fig = px.line(genome_data,
                      x = "Start_coordinate",
                      y = "Stop_coordinate",
                      color = parameter,
                      color_discrete_map = color_discrete_map, 
                      hover_name = "Primary_SGDID", 
                      hover_data = {**hover_data, "Stop_coordinate": False})
                    #no order in legend because locus are not drawed when there is an order
    
    if mode == "discreet":
        colors = get_color_discreet(genome_data[parameter], values, values_colors)
        
        genome_data[parameter] = colors
        
        fig = px.line(genome_data,
                      x = "Start_coordinate",
                      y = "Stop_coordinate",
                      color = parameter, 
                      color_discrete_map = {"Other": "lightgrey", values: values_colors},
                      hover_name = "Primary_SGDID")

    fig.update_traces(line = dict(width = 9))
    
    fig.update_layout(plot_bgcolor = "white", 
                      xaxis_showgrid = False, 
                      yaxis_showgrid = False, 
                      showlegend = True)
    
    fig.update_yaxes(tickmode = "array",
                     tickvals = list(range(0,102,6)),
                     ticktext = ["1", "2", "3", "4", "5", "6", "7", "8", "9",
                                 "10", "11", "12", "13", "14", "15", "16", "Plasmid"],
                     title = "Chromosomes number")
    fig.update_xaxes(title = "Coordinates (bp)")
    
    fig.update_layout(hoverlabel = dict(bgcolor="white",
                                        font_size=16))
    
    fig.show()

Adding color.

In [7]:
def get_color_discreet(parameter, values, values_colors):
    
    parameter.index = range(1, len(parameter) + 1)
    
    test = [values] * len(parameter)
    test = pd.Series(test, index = range(1, len(parameter) + 1))
    
    tchecking_df = pd.DataFrame({"parameter": parameter, "test": test})
    
    conditions = [(tchecking_df.parameter == tchecking_df.test)]
    choices = [values]
    
    tchecking_df = tchecking_df.assign(right_parameter = np.select(conditions, choices, default = "Other"))
    
    return tchecking_df["right_parameter"]


In [8]:
def get_color_semi_continuous(parameter, threshold):
    
    parameter = parameter.apply(float)
    parameter.index = range(1, len(parameter) + 1)
    limit = min(parameter.max(), threshold)
    STEP = (limit/9)
    
    conditions = [(parameter <= STEP), (parameter <= STEP * 2), (parameter <= STEP * 3),
                  (parameter <= STEP * 4), (parameter <= STEP * 5), (parameter <= STEP * 6), 
                  (parameter <= STEP * 7), (parameter <= STEP * 8), (parameter <= STEP * 9), 
                  (parameter > STEP * 9)]
    
    choices = ["0-" + str(round(STEP)), 
               str(round(STEP)) + "-" + str(round(STEP * 2)), 
               str(round(STEP * 2)) + "-" + str(round(STEP * 3)), 
               str(round(STEP * 3)) + "-" + str(round(STEP * 4)), 
               str(round(STEP * 4)) + "-" + str(round(STEP * 5)), 
               str(round(STEP * 5)) + "-" + str(round(STEP * 6)), 
               str(round(STEP * 6)) + "-" + str(round(STEP * 7)), 
               str(round(STEP * 7)) + "-" + str(round(STEP * 8)), 
               str(round(STEP * 8)) + "-" + str(round(STEP * 9)), 
               str(round(STEP * 9)) + "<"]
    
    
    right_parameter = np.select(conditions, choices, default = "null")
    
    return [pd.Series(right_parameter), choices]

In [9]:
def get_color_continuous(parameter):
    cmap = matplotlib.cm.get_cmap('viridis')
    parameter = parameter.apply(float)
    MIN = min(parameter)
    MAX = max(parameter)
    colors = []

    for i in range(1, len(parameter)+1):
        
        if parameter[i] == 0 or parameter[i] == "" or parameter[i] == "NaN" :
            color = "lightgrey"
            colors = colors + [color]
        
        else :
            color = cmap((parameter[i] - MIN) / (MAX - MIN))
            color = "rgb" + str(color[:3])
            
            colors = colors + [color]
    
    return pd.Series(colors)

## Applications

In [13]:
%%time

chrom_sense = get_locus_info("../SCERE.db",
                             """Primary_SGDID, count(SGDID), Start_coordinate, Stop_coordinate, Chromosome, Feature_type""",
                            "gene_literature, SGD_features",
                             """AND (Strand == 'W') AND (SGDID == Primary_SGDID) """, """ GROUP BY SGDID """,
                             """ Start_coordinate"""
                            )[0]

#display(chrom_sense)
chrom_sense = format_coordinates(chrom_sense, list(range(0,108,6)))
#display(chrom_sense)

chrom_antisense = get_locus_info("../SCERE.db",
                                 """Primary_SGDID, count(SGDID), Start_coordinate, Stop_coordinate, Chromosome, Feature_type""",
                                 "gene_literature, SGD_features",
                                 """AND (Strand == 'C') AND (SGDID == Primary_SGDID) """, """ GROUP BY SGDID """,
                                 """ Start_coordinate"""
                                )[1]

chrom_antisense = format_coordinates(chrom_antisense, list(i - 0.4 for i in range(0,108,6)))

whole_genome = chrom_sense.append(chrom_antisense)
display(whole_genome)

genome_drawing(whole_genome, "continuous", "Chromosome", "null", "null")

genome_drawing(whole_genome, "discreet", "Feature_type", "rRNA_gene", "green")

Unnamed: 0,Primary_SGDID,count(SGDID),Start_coordinate,Stop_coordinate,Chromosome,Feature_type
0,S000002143,1,335,0,1,ORF
1,S000002143,1,649,0,1,ORF
2,S000002143,1,none,0,1,ORF
3,S000028594,1,538,0,1,ORF
4,S000028594,1,792,0,1,ORF
...,...,...,...,...,...,...
11,S000029670,2,none,95.6,17,origin_of_replication
12,S000007335,18,78162,95.6,17,tRNA_gene
13,S000007335,18,78089,95.6,17,tRNA_gene
14,S000007335,18,none,95.6,17,tRNA_gene


CPU times: user 7.3 s, sys: 336 ms, total: 7.64 s
Wall time: 7.66 s


In [11]:
%%time

chrom_sense = get_locus_info("../SCERE.db",
                             """Primary_SGDID, count(SGDID), Start_coordinate, Stop_coordinate, Chromosome, Phenotype""",
                            "SGD_features, phenotypes",
                             """AND (Strand == 'W') AND (SGDID == Primary_SGDID) AND (Strain_background == 'S288C') """, """ GROUP BY SGDID """,
                             """ Start_coordinate"""
                            )[0]

#display(chrom_sense)
chrom_sense = format_coordinates(chrom_sense, list(range(0,108,6)))
#display(chrom_sense)

chrom_antisense = get_locus_info("../SCERE.db",
                                 """Primary_SGDID, count(SGDID), Start_coordinate, Stop_coordinate, Chromosome, Phenotype""",
                                 "SGD_features, phenotypes",
                                 """AND (Strand == 'C') AND (SGDID == Primary_SGDID) AND (Strain_background == 'S288C') """, """ GROUP BY SGDID """,
                                 """ Start_coordinate"""
                                )[1]

chrom_antisense = format_coordinates(chrom_antisense, list(i - 0.4 for i in range(0,108,6)))

whole_genome = chrom_sense.append(chrom_antisense)
display(whole_genome)

genome_drawing(whole_genome, "discreet", "Phenotype", "respiratory growth: absent", "green")

Unnamed: 0,Primary_SGDID,count(SGDID),Start_coordinate,Stop_coordinate,Chromosome,Phenotype,Feature_type
0,S000028593,1,2480,0,1,viable,
1,S000028593,1,2707,0,1,viable,
2,S000028593,1,none,0,1,viable,
3,S000000061,7,10091,0,1,competitive fitness: normal,
4,S000000061,7,10399,0,1,competitive fitness: normal,
...,...,...,...,...,...,...,...
705,S000006404,4,939671,89.6,16,competitive fitness: increased,
706,S000006404,4,939279,89.6,16,competitive fitness: increased,
707,S000006404,4,none,89.6,16,competitive fitness: increased,
708,,,none,none,0,,0


CPU times: user 1.7 s, sys: 23.8 ms, total: 1.73 s
Wall time: 1.73 s


In [12]:
%%time

chrom_sense = get_locus_info("../SCERE.db",
                             """Primary_SGDID, count(SGDID), Start_coordinate, Stop_coordinate, Chromosome, Feature_type""",
                            "gene_literature, SGD_features",
                             """AND (Strand == 'W') AND (SGDID == Primary_SGDID) """, """ GROUP BY SGDID """,
                             """ Start_coordinate"""
                            )[0]

#display(chrom_sense)
chrom_sense = format_coordinates(chrom_sense, list(range(0,108,6)))
#display(chrom_sense)

chrom_antisense = get_locus_info("../SCERE.db",
                                 """Primary_SGDID, count(SGDID), Start_coordinate, Stop_coordinate, Chromosome, Feature_type""",
                                 "gene_literature, SGD_features",
                                 """AND (Strand == 'C') AND (SGDID == Primary_SGDID) """, """ GROUP BY SGDID """,
                                 """ Start_coordinate"""
                                )[1]

chrom_antisense = format_coordinates(chrom_antisense, list(i - 0.4 for i in range(0,108,6)))

whole_genome = chrom_sense.append(chrom_antisense)
whole_genome = whole_genome.rename(columns = {"count(SGDID)": "Number of articles"})
display(whole_genome)

genome_drawing(whole_genome, 
               "semi_continuous", 
               "Number of articles", 
               threshold = 300, 
               hover = ["Feature_type"])


Unnamed: 0,Primary_SGDID,Number of articles,Start_coordinate,Stop_coordinate,Chromosome,Feature_type
0,S000002143,1,335,0,1,ORF
1,S000002143,1,649,0,1,ORF
2,S000002143,1,none,0,1,ORF
3,S000028594,1,538,0,1,ORF
4,S000028594,1,792,0,1,ORF
...,...,...,...,...,...,...
11,S000029670,2,none,95.6,17,origin_of_replication
12,S000007335,18,78162,95.6,17,tRNA_gene
13,S000007335,18,78089,95.6,17,tRNA_gene
14,S000007335,18,none,95.6,17,tRNA_gene


CPU times: user 3.43 s, sys: 412 ms, total: 3.84 s
Wall time: 3.84 s
