In [None]:
# Librairie & paramètres
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import plotly.express as px
from scipy.cluster import hierarchy
import spacy

import sys
sys.path.append('../representations')
import graph_lib as graph

# Base directory: three levels above the current working directory.
# The data/ paths in `param` are built on top of it.
chemin_actuel = os.getcwd()
dossier_parent = os.path.abspath(
    os.path.join(chemin_actuel, os.pardir, os.pardir, os.pardir)
)

# Notebook-wide settings, looked up by key in the cells below.
param = {
    # Locations of the GeoSRA files (input read by this notebook, output written by it)
    "GeoSRA DIR IN": dossier_parent + "/data/raw/geo-sra-ecological.parquet.gzip",
    "GeoSRA DIR OUT": dossier_parent + "/data/processed/geosra_cluster.parquet.gzip",

    # Figure size
    "Width Figure": 800,
    "Height Figure": 600,

    # Thresholds for the repartition function and for the dendrogram
    "Repart Thr": 0.95,
    "Semant Thr": 3.5,

    # Main assay-type classes
    "Assay Class": ['AMPLICON', 'WGS', 'RNA-Seq', 'WGA'],

    # Colours (none defined yet)
}

In [None]:
# Load the GeoSRA table from the configured input path and preview its first rows
geosra_pd = pd.read_parquet(path=param["GeoSRA DIR IN"])
geosra_pd.head(n=5)

In [None]:
# TreeMap of the organism column
# Work on an explicit copy: relabelling a slice of geosra_pd raises
# SettingWithCopyWarning, and geosra_pd itself must stay untouched here.
df_treemap = geosra_pd[["organism"]].copy()
# Give empty organism names a readable label.
df_treemap.loc[df_treemap["organism"] == '', 'organism'] = 'Unspecified'
# Plot the relabelled frame (the original passed geosra_pd, so the
# 'Unspecified' relabelling never reached the figure).
graph.TreeMap(df_treemap, 'organism')

In [None]:
# Repartition function of the organism column.
# The returned value is kept in `thr`; the dendrogram cell uses it as the
# number of most frequent organisms to keep.
thr = graph.RepartitionFunction(
    geosra_pd,
    'organism',
    y_thr=param['Repart Thr'],
)

In [None]:
# Dendrogram (semantic) of the most frequent organism values
# Keep the `thr` most frequent organisms (value_counts is sorted by frequency)
# and drop the word 'metagenome' so it does not dominate the comparison.
value_count = dict(geosra_pd["organism"].value_counts())
chains = list(value_count.keys())[:thr]
chains = [chain.replace('metagenome', '') for chain in chains]

nlp = spacy.load("en_core_web_md")
# Parse every label once; re-running the pipeline inside the double loop
# costs 2*n^2 parses instead of n.
docs = [nlp(chain) for chain in chains]

# NOTE: despite its name, dist_matrix holds pairwise *similarities*.
# It is passed to linkage() as a 2-D array, so each row is treated as an
# observation vector (the similarity profile of one label), not as a
# precomputed distance matrix.
dist_matrix = np.zeros((len(chains), len(chains)))
for i, doc_i in enumerate(docs):
    for j, doc_j in enumerate(docs):
        dist_matrix[i, j] = doc_i.similarity(doc_j)
Z = hierarchy.linkage(dist_matrix, method='ward')


plt.figure(figsize=(10, 5))
# Leaves below the colour threshold share a colour; the histogram cell reads
# dn['ivl'] and dn['leaves_color_list'] to turn those colours into clusters.
dn = hierarchy.dendrogram(Z, labels=chains, color_threshold=param["Semant Thr"])
plt.xticks(rotation='vertical')
plt.ylabel('Semantic distance')
plt.show()

In [None]:
# Histogram of the clusters
# Colour assigned by the dendrogram to each leaf label.
label_color = dict(zip(dn['ivl'], dn["leaves_color_list"]))

# Map the ORIGINAL organism names to their cluster colour. Re-appending
# 'metagenome' to the leaf label only rebuilds names that ended with that
# word; any other top organism would never match and would fall into 'NC'.
top_organisms = list(value_count)[:thr]
class_dict = {}
for organism in top_organisms:
    color = label_color.get(organism.replace('metagenome', ''))
    if color is not None:
        class_dict[organism] = color

# Organisms outside the dendrogram get the 'NC' cluster.
geosra_pd["cluster_organism"] = geosra_pd["organism"].map(class_dict).fillna('NC')

# Collapse every assay type outside the main classes into 'OTHER'.
# A boolean mask is used so this stays correct even with a non-unique index.
# NOTE(review): this overwrites assay_type in geosra_pd itself, so the
# collapsed values are what gets saved later; confirm that is intended.
is_other_assay = ~geosra_pd['assay_type'].isin(param["Assay Class"])
geosra_pd.loc[is_other_assay, "assay_type"] = 'OTHER'

df = geosra_pd[["cluster_organism", "assay_type"]]

graph.Histogram(df, 'cluster_organism', color_col='assay_type')

In [None]:
# TreeMap of organisms nested under their cluster_organism value.
# r_lim / g_lim / b_lim presumably bound the colour channels; confirm in graph_lib.
graph.TreeMap(
    geosra_pd,
    'organism',
    parent_col='cluster_organism',
    r_lim=(200, 250),
    g_lim=(200, 250),
    b_lim=(200, 250),
)

In [None]:
# Save the table, which now carries the cluster_organism column.
# NOTE(review): the original comment also mentions release_year, which this
# notebook does not create; presumably it is already in the input. Confirm.
# NOTE(review): the file name ends in .gzip but no compression argument is
# passed, so pandas' default codec is used; confirm this is intended.
# Create the output directory first so the write also works on a fresh checkout.
os.makedirs(os.path.dirname(param["GeoSRA DIR OUT"]), exist_ok=True)
geosra_pd.to_parquet(param["GeoSRA DIR OUT"])