In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
from scipy.spatial.distance import pdist, squareform

# LOAD THE FILTERED GENOTYPIC DATA
ATLAS_Dataset = pd.read_csv('atlas_2024_genes.csv')


def _continent_rows(continent_name):
    """Return the rows of ATLAS_Dataset whose 'Continents' value equals continent_name."""
    return ATLAS_Dataset[ATLAS_Dataset['Continents'] == continent_name]


# ONE SUBSET PER CONTINENT FOR CONTINENT-BASED ANALYSES
Africa = _continent_rows('Africa')
Europe = _continent_rows('Europe')
North_America = _continent_rows('North America')
South_America = _continent_rows('South America')
Asia = _continent_rows('Asia')
Oceania = _continent_rows('Oceania')


In [23]:
# HEATMAP FOR SPECIES-SPECIFIC PREVALENCE OF GENOTYPES

# Function to create a clustered heatmap of the species-specific prevalence of genotypes
def species_genotype_groups_heatmap(df, continent):
    """Draw a clustered species-by-genotype count heatmap for one continent.

    Rows (species) are clustered with average linkage on the Euclidean
    distance between their gene-count profiles. Columns (genes) are clustered
    the same way, except that 1 is added to the distance between two genes
    whose 'Gene Class' differs, which nudges genes of the same class together.
    A colour strip above the heatmap and a legend identify each gene's class.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'Continents', 'Species', 'Gene' and 'Gene Class' columns.
        The gene-to-class lookup and the class colours are built from the
        whole table, not only from the rows of the selected continent.
    continent : str
        Value of the 'Continents' column whose rows are plotted.

    Raises
    ------
    ValueError
        If the continent has no rows, or has fewer than two species or fewer
        than two genes (hierarchical clustering needs two observations).
    """
    # Keep only the rows recorded for the requested continent
    df_filtered = df[df['Continents'] == continent]
    if df_filtered.empty:
        raise ValueError(f"No rows found for continent {continent!r}")

    # Count every (species, gene) pair: species as rows, genes as columns
    pivot_table = df_filtered.pivot_table(index='Species', columns='Gene', aggfunc='size', fill_value=0)
    n_species, n_genes = pivot_table.shape
    if n_species < 2 or n_genes < 2:
        raise ValueError(
            f"Continent {continent!r} has {n_species} species and {n_genes} gene(s); "
            "at least two of each are required for hierarchical clustering"
        )

    # Cluster species on their gene-count profiles (Euclidean distance, average linkage)
    row_linkage = linkage(pdist(pivot_table, metric='euclidean'), method='average')

    # Gene -> class and class -> colour lookups, both built from the full table
    gene_classes = df.set_index('Gene')['Gene Class'].to_dict()
    unique_classes = df['Gene Class'].unique()
    class_colors = dict(zip(unique_classes, sns.color_palette(n_colors=len(unique_classes))))

    # Class label of each heatmap column, in column order
    gene_class_labels = pivot_table.columns.map(gene_classes)

    # Condensed 0/1 distance: 1 where two genes belong to different classes
    class_mismatch = pdist(
        gene_class_labels.to_numpy(dtype=object).reshape(-1, 1),
        metric=lambda u, v: float(u[0] != v[0]),
    )

    # Both pdist results are condensed vectors over the same gene pairs in the
    # same order, so they can be summed without a round trip through squareform
    gene_distances = class_mismatch + pdist(pivot_table.T, metric='euclidean')

    # Cluster genes on the combined (class + count-profile) distance
    col_linkage = linkage(gene_distances, method='average')

    # Index.map with tuple values yields a MultiIndex; pass seaborn a plain list of RGB tuples
    col_colors = list(gene_class_labels.map(class_colors))

    # Generate the heatmap with dendrograms
    grid = sns.clustermap(
        pivot_table,
        row_linkage=row_linkage,
        col_linkage=col_linkage,
        cmap='Blues',
        figsize=(15, 10),
        col_colors=col_colors,
        linewidths=0.1,
        linecolor='lightgrey',
    )

    # Zero-size bars draw nothing but give the legend one coloured handle per class
    for label in unique_classes:
        grid.ax_col_dendrogram.bar(0, 0, color=class_colors[label], label=label, linewidth=0)
    legend = grid.ax_col_dendrogram.legend(
        loc='center', ncol=5, bbox_to_anchor=(0.5, 1.1), title='Genotype Subclasses'
    )

    # plt.title() would label the *current* axes, which after clustermap is the
    # small colour-bar axes. Use a figure-level title instead, placed just above
    # the legend so the two do not overlap.
    figure = grid.ax_heatmap.figure
    figure.canvas.draw()  # lays out the legend so its extent can be measured
    legend_top = legend.get_window_extent().transformed(figure.transFigure.inverted()).y1
    figure.suptitle(
        f'Species vs Genotype Heatmap {continent}',
        y=max(legend_top, 0.98) + 0.01,
        va='bottom',
    )

    # Display heatmap
    plt.show()

