In [None]:
# Librairie & paramètres
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import plotly.express as px
from scipy.cluster import hierarchy
import spacy

# Project root: two directory levels above the current working directory.
chemin_actuel = os.getcwd()
dossier_parent = os.path.abspath(os.path.join(chemin_actuel, os.pardir, os.pardir))

# Central configuration read by the cells below.
param = {

    # Paths to the GeoSRA input and output files
    "GeoSRA DIR IN": os.path.join(dossier_parent, "data", "raw", "geo-sra-ecological.parquet.gzip"),
    "GeoSRA DIR OUT": os.path.join(dossier_parent, "data", "processed", "geosra_cluster.parquet.gzip"),

    # Width and height passed to every figure layout
    "Width Figure": 800,
    "Height Figure": 600,

    # Thresholds for the cumulative distribution and the dendrogram
    "Repart Thr": 0.95,
    "Semant Thr": 3.5,

    # Main assay type classes (every other value is grouped as 'OTHER')
    "Assay Class": ['AMPLICON', 'WGS', 'RNA-Seq', 'WGA'],

    # Colours, keyed by the cluster labels found in the dendrogram's
    # leaves_color_list. "C0" is the label SciPy gives by default to links
    # above the colour threshold, so leaves hanging from such links carry it;
    # "NC" marks organisms that were not clustered.
    "Clusters Color": {
        "(?)": 'lightgrey',
        "C0": 'steelblue',
        "C1": 'darkorange',
        "C2": 'limegreen',
        "C3": 'darkred',
        "C4": 'rebeccapurple',
        "C5": 'saddlebrown',
        "C6": 'pink',
        "C7": "gray",
        "C8": "lightgreen",
        "C9": "cyan",
        "NC": "black",
    },

    # Colours keyed by assay type
    "Assays Color": {
        "(?)": 'lightgrey',
        "AMPLICON": "skyblue",
        "WGS": "salmon",
        "RNA-Seq": "lemonchiffon",
        "WGA": "palegreen",
        "OTHER": "black",
    },

}

In [None]:
# Load the GeoSRA table from the configured input path and preview its first rows
geosra_path = param["GeoSRA DIR IN"]
geosra_pd = pd.read_parquet(path=geosra_path)
geosra_pd.head(n=5)

In [None]:
# Treemap of the organism values

# Count the rows of every organism in one grouped pass (groups kept in order
# of first appearance) instead of re-filtering the whole frame once per
# distinct value. Rows with a missing organism are skipped by groupby.
organism_counts = geosra_pd.groupby("organism", sort=False).size()
organism = list(organism_counts.index)
sizes = organism_counts.tolist()

treemap_pd = pd.DataFrame(
    dict(organism=organism, sizes=sizes)
)
# An empty organism string is displayed as 'Unspecified'
treemap_pd.loc[treemap_pd["organism"] == '', 'organism'] = 'Unspecified'

# Constant column used as the single root of the treemap hierarchy
treemap_pd["all"] = "all"
fig = px.treemap(treemap_pd,
                path=['all', 'organism'],
                values='sizes',
)
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.update_layout(
    width=param["Width Figure"],
    height=param["Height Figure"]
)
fig.show()

In [None]:
# Cumulative distribution of the organism column

# value_counts() orders organisms from most to least frequent
value_count = dict(geosra_pd["organism"].value_counts())
keys   = list(range(len(value_count)))
values = list(value_count.values())

# Total number of counted rows
tot = sum(values)

# repart[k] is the share of rows covered by the k most frequent organisms
# (repart[0] == 0). A running total keeps this linear instead of re-summing
# a growing prefix at every step.
repart = [0]
running_total = 0
for count in values[:-1]:
    running_total += count
    repart.append(running_total / tot)

df = pd.DataFrame(
    dict(keys=keys, values=repart)
)

# Smallest number of organisms whose cumulative share exceeds the threshold.
# When no prefix exceeds it (e.g. a single organism), every organism is
# needed, so fall back to the full count instead of raising on an empty min().
thr = next(
    (i for i, val in enumerate(repart) if val > param["Repart Thr"]),
    len(values),
)

fig = px.line(df, x="keys", y="values", title='Fonction de répartition')
fig.add_vline(x=thr, line_width=3, line_dash="dash", line_color="lightcoral")
fig.update_layout(
    width=param["Width Figure"],
    height=param["Height Figure"]
)
fig.show()


In [None]:
# Dendrogram (semantic) of the main organism values

# Keep the thr most frequent organisms and remove every occurrence of the
# substring 'metagenome' from their names.
chains = list(value_count.keys())[:thr]
chains = [chain.replace('metagenome', '') for chain in chains]

nlp = spacy.load("en_core_web_md")

# Parse each chain once; the pairwise loop below only compares the parsed docs
# instead of re-running the pipeline on both strings for every matrix cell.
docs = [nlp(chain) for chain in chains]

# NOTE(review): despite its name, dist_matrix holds spaCy *similarity* scores,
# and it is handed to linkage() as a square 2-D array, which SciPy documents
# as a set of observation vectors rather than a distance matrix. Left as-is
# because "Semant Thr" is tuned to this behaviour - confirm it is intended.
dist_matrix = np.zeros((len(docs), len(docs)))
for i, doc_i in enumerate(docs):
    for j, doc_j in enumerate(docs):
        dist_matrix[i, j] = doc_i.similarity(doc_j)
Z = hierarchy.linkage(dist_matrix, method='ward')


plt.figure(figsize=(10, 5))
# Links below "Semant Thr" are coloured per cluster; dn is reused by later cells
dn = hierarchy.dendrogram(Z, labels=chains, color_threshold=param["Semant Thr"])
plt.xticks(rotation='vertical')
plt.ylabel('Semantix distance')
plt.show()

In [None]:
# Histogram of the clusters

# Recover the original organism name of every dendrogram leaf. dn["leaves"]
# gives, in plotting order, the position of each leaf in the list that was
# clustered, i.e. the thr most frequent organisms. Looking names up by position
# avoids rebuilding them as label + 'metagenome', which is wrong for any
# organism whose name did not end with that suffix.
top_organisms = list(value_count.keys())[:thr]
assert len(top_organisms) == len(dn["leaves"]), \
    "dendrogram leaves do not match the organisms selected by thr"
keys = [top_organisms[leaf] for leaf in dn["leaves"]]
class_dict = dict(zip(keys, dn["leaves_color_list"]))

# Organisms outside the dendrogram are labelled 'NC'
geosra_pd["cluster_organism"] = geosra_pd["organism"].map(class_dict).fillna('NC')

# Work on an explicit copy so that regrouping assay types never writes to
# (or warns about) a slice of geosra_pd.
df = geosra_pd[["cluster_organism", "assay_type"]].copy()
is_main_assay = df["assay_type"].isin(param["Assay Class"])
df.loc[~is_main_assay, "assay_type"] = 'OTHER'

fig = px.histogram(df, x="cluster_organism", color="assay_type",
                color_discrete_map=param["Assays Color"])
fig.update_layout(
    width=param["Width Figure"],
    height=param["Height Figure"]
)
fig.show()

In [None]:
# Histogram of the assay types, coloured by organism cluster

figure_size = dict(
    width=param["Width Figure"],
    height=param["Height Figure"],
)
fig = px.histogram(
    df,
    x="assay_type",
    color="cluster_organism",
    color_discrete_map=param["Clusters Color"],
)
fig.update_layout(**figure_size)
fig.show()

In [None]:
# Treemap of the organism clusters

# One grouped pass yields, per organism (in order of first appearance), its
# cluster label and its row count, instead of filtering the whole frame twice
# per distinct organism. cluster_organism is derived from organism alone, so
# the first label of each group is the label of the whole group.
organism_summary = (
    geosra_pd.groupby("organism", sort=False)["cluster_organism"]
    .agg(["first", "size"])
)
organism = list(organism_summary.index)
cluster = organism_summary["first"].tolist()
sizes = organism_summary["size"].tolist()

treemap_pd = pd.DataFrame(
    dict(organism=organism, cluster=cluster, sizes=sizes)
)
# An empty organism string is displayed as 'Unspecified'
treemap_pd.loc[treemap_pd["organism"] == '', 'organism'] = 'Unspecified'

# Constant column used as the single root of the treemap hierarchy
treemap_pd["all"] = "all"
fig = px.treemap(treemap_pd,
                path=['all', 'cluster', 'organism'],
                values='sizes',
                color='cluster',
                color_discrete_map=param["Clusters Color"]
)
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.update_layout(
    width=param["Width Figure"],
    height=param["Height Figure"]
)
fig.show()

In [None]:
# Write the table, now carrying the cluster_organism column, to the configured output path
output_path = param["GeoSRA DIR OUT"]
geosra_pd.to_parquet(output_path)