In [3]:
#IMPORTING LIBRARIES AND DATASETS

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



# LOAD THE FILTERED GENOTYPIC DATA
ATLAS_Dataset = pd.read_csv('atlas_2024_genes.csv')

# SPLIT THE DATA BY CONTINENT FOR CONTINENT-BASED ANALYSES
# Each name below holds only the rows whose 'Continents' value equals the
# matching label; the order of the names and of the labels must stay in sync.
Africa, Europe, North_America, South_America, Asia, Oceania = (
    ATLAS_Dataset.loc[ATLAS_Dataset['Continents'].eq(continent_label)]
    for continent_label in (
        'Africa',
        'Europe',
        'North America',
        'South America',
        'Asia',
        'Oceania',
    )
)


In [9]:
# HEATMAPS DEPICTING THE CO-OCCURRENCE OF GENOTYPES FOR SELECTED ISOLATES (JACCARD SIMILARITY INDEX HEATMAP)


def genotype_couccurence_heatmap(df, species, region='Africa',
                                 output_path='Heatmap_Africa_e_coli.png'):
    """Plot and save a gene-by-gene Jaccard similarity heatmap for one species.

    For every pair of genes the Jaccard index is the number of isolates that
    carry both genes divided by the number of isolates that carry at least
    one of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Species', 'Isolate Id' and 'Gene'.
    species : str
        Value of the 'Species' column selecting the rows to analyse.
    region : str, optional
        Label inserted in the plot title. It does not filter ``df``; pass the
        subset that matches the label. Defaults to 'Africa'.
    output_path : str, optional
        File the figure is written to. Defaults to
        'Heatmap_Africa_e_coli.png'; pass a different path for each
        species/region so earlier figures are not overwritten.

    Raises
    ------
    ValueError
        If ``df`` holds no (isolate, gene) pair for ``species``.
    """
    # One row per distinct (isolate, gene) pair. Rows missing either key
    # cannot contribute to a co-occurrence count.
    pairs = (
        df.loc[df['Species'] == species, ['Isolate Id', 'Gene']]
        .dropna()
        .drop_duplicates()
    )
    if pairs.empty:
        # Fail here with a readable message instead of deep inside seaborn.
        raise ValueError(f'No gene records found for species {species!r}')

    # Presence/absence matrix: rows are isolates, columns are genes, cells 0/1.
    presence = pd.crosstab(pairs['Isolate Id'], pairs['Gene']).gt(0).astype(int)
    absence = 1 - presence

    # presence.T . presence -> isolates carrying BOTH genes of each pair.
    carried_by_both = presence.T.dot(presence)
    # absence.T . absence -> isolates carrying NEITHER gene of each pair, so
    # the union size is simply every remaining isolate.
    carried_by_neither = absence.T.dot(absence)
    carried_by_either = len(presence) - carried_by_neither

    # A pair with an empty union (0 / 0) is scored 0 rather than NaN.
    jaccard_matrix = carried_by_both.div(carried_by_either).fillna(0.0)

    # Plot the Jaccard similarity index as a heatmap
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(jaccard_matrix, cmap='YlGnBu', ax=ax)
    ax.set_title(f'Jaccard Similarity Index Heatmap for {species} in {region}')
    ax.set_xlabel('Resistance Genes')
    ax.set_ylabel('Resistance Genes')
    fig.savefig(output_path)

Test run:

# Build the co-occurrence heatmap for the Escherichia coli rows of the Africa subset.
genotype_couccurence_heatmap(df=Africa, species='Escherichia coli')