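Hierarchical clustering is implemented using three different genetic distances: Manhattan, Binary (Hamming) and Jaccard. The Manhattan and Binary distances are calculated using the scikit-learn package, and the Jaccard distance is calculated using the SciPy package. The dendrograms are created using the SciPy package.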

Hierarchical clustering using Manhattan genetic distance:

In [None]:
# Load libraries
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import pairwise_distances


# Compute the square matrix of pairwise Manhattan distances for final_df
manhattan_distance = pairwise_distances(final_df, metric='manhattan')
# linkage() takes the condensed (upper-triangle vector) form of the matrix
condensed_manhattan_distance = squareform(manhattan_distance)

# Perform hierarchical clustering using 'ward' method
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; with Manhattan input the merge heights are heuristic. Confirm this
# is intended, or consider 'average' / 'complete' linkage.
linkage_manhattan = linkage(condensed_manhattan_distance, method='ward')


# Convert pandas Series to list
location_labels = location_data.tolist()
# Get unique locations in a stable order: plain set iteration order depends on
# string-hash randomisation, so colours would change between kernel sessions.
unique_locations = sorted(set(location_labels), key=str)
# Assign one tab10 colour per location. tab10 has only 10 entries and clips
# out-of-range integer indices to its last colour, so wrap the index to cycle
# through the palette instead when there are more than 10 locations.
location_colors = {
    region: plt.cm.tab10(i % plt.cm.tab10.N)
    for i, region in enumerate(unique_locations)
}


def assign_colors(labels, region_colors):
    ''' Look up the color of every label.

    Each label is used directly as the region key into ``region_colors``.
    Returns a list of colors in the same order as ``labels``; a label that
    is missing from ``region_colors`` raises ``KeyError``.
    '''
    return [region_colors[region] for region in labels]


# Draw the Manhattan dendrogram on a new 12x8 inch figure
plt.figure(figsize=(12, 8))
dendrogram(
    linkage_manhattan,
    labels=location_labels,  # leaves are labelled with their location
    orientation='top',
    leaf_rotation=90,
    leaf_font_size=8,
)

# Title and axis captions, set on the axes the dendrogram was drawn into
ax = plt.gca()
ax.set_title("Hierarchical Clustering Dendrogram - Manhattan")
ax.set_xlabel("Samples")
ax.set_ylabel("Distance")

# Recolor each x tick label whose text is a known location
x_labels = ax.get_xticklabels()
for label in x_labels:
    text = label.get_text()
    color = location_colors.get(text)
    if color is not None:
        label.set_color(color)

# Write the figure to disk
image1_path = '/content/drive/MyDrive/dendrogram_manhattan.png'
plt.savefig(image1_path, dpi=300, bbox_inches='tight')

Hierarchical clustering using Binary genetic distance:

In [None]:
# Compute the square matrix of pairwise binary (Hamming) distances for final_df
binary_distance = pairwise_distances(final_df, metric='hamming')
# linkage() takes the condensed (upper-triangle vector) form of the matrix
condensed_binary_distance = squareform(binary_distance)
# Perform hierarchical clustering using 'ward' method
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; with Hamming input the merge heights are heuristic. Confirm this
# is intended, or consider 'average' / 'complete' linkage.
linkage_hamming = linkage(condensed_binary_distance, method='ward')

# Convert pandas Series to list
location_labels = location_data.tolist()
# Get unique locations in a stable order: plain set iteration order depends on
# string-hash randomisation, so colours would change between kernel sessions.
unique_locations = sorted(set(location_labels), key=str)
# Assign one tab10 colour per location. tab10 has only 10 entries and clips
# out-of-range integer indices to its last colour, so wrap the index to cycle
# through the palette instead when there are more than 10 locations.
location_colors = {
    region: plt.cm.tab10(i % plt.cm.tab10.N)
    for i, region in enumerate(unique_locations)
}


def assign_colors(labels, region_colors):
    ''' Look up the color of every label.

    Each label is used directly as the region key into ``region_colors``.
    Returns a list of colors in the same order as ``labels``; a label that
    is missing from ``region_colors`` raises ``KeyError``.
    '''
    return [region_colors[region] for region in labels]


# Draw the Binary (Hamming) dendrogram on a new 12x8 inch figure
plt.figure(figsize=(12, 8))
dendrogram(
    linkage_hamming,
    labels=location_labels,  # leaves are labelled with their location
    orientation='top',
    leaf_rotation=90,
    leaf_font_size=8,
)

# Title and axis captions, set on the axes the dendrogram was drawn into
ax = plt.gca()
ax.set_title("Hierarchical Clustering Dendrogram - Binary")
ax.set_xlabel("Samples")
ax.set_ylabel("Distance")

# Recolor each x tick label whose text is a known location
x_labels = ax.get_xticklabels()
for label in x_labels:
    text = label.get_text()
    color = location_colors.get(text)
    if color is not None:
        label.set_color(color)

# Write the figure to disk
image2_path = '/content/drive/MyDrive/dendrogram_binary.png'
plt.savefig(image2_path, dpi=300, bbox_inches='tight')

Hierarchical clustering using Jaccard genetic distance:

In [None]:
# Compute Jaccard distances. pdist() already returns the condensed
# (upper-triangle vector) form that linkage() takes, so use it directly
# instead of converting to a square matrix and back again.
condensed_jaccard_distance = pdist(final_df, metric='jaccard')
# Square matrix form of the same distances
jaccard_distance = squareform(condensed_jaccard_distance)
# Perform hierarchical clustering using 'ward' method
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; with Jaccard input the merge heights are heuristic. Confirm this
# is intended, or consider 'average' / 'complete' linkage.
linkage_jaccard = linkage(condensed_jaccard_distance, method='ward')

# Convert pandas Series to list
location_labels = location_data.tolist()
# Get unique locations in a stable order: plain set iteration order depends on
# string-hash randomisation, so colours would change between kernel sessions.
unique_locations = sorted(set(location_labels), key=str)
# Assign one tab10 colour per location. tab10 has only 10 entries and clips
# out-of-range integer indices to its last colour, so wrap the index to cycle
# through the palette instead when there are more than 10 locations.
location_colors = {
    region: plt.cm.tab10(i % plt.cm.tab10.N)
    for i, region in enumerate(unique_locations)
}

def assign_colors(labels, region_colors):
    ''' Look up the color of every label.

    Each label is used directly as the region key into ``region_colors``.
    Returns a list of colors in the same order as ``labels``; a label that
    is missing from ``region_colors`` raises ``KeyError``.
    '''
    return [region_colors[region] for region in labels]


# Create the Jaccard dendrogram on a new 12x8 inch figure
plt.figure(figsize=(12, 8))
dendrogram(linkage_jaccard,
           orientation='top',
           labels=location_labels,  # Use location labels for labels
           leaf_rotation=90,
           leaf_font_size=8)


# Get the axes the dendrogram was drawn into
ax = plt.gca()
x_labels = ax.get_xticklabels()  # Retrieve the labels on x-axis
# Iterate through the x labels and change their colors if they match a region
for label in x_labels:
    text = label.get_text()
    if text in location_colors:
        label.set_color(location_colors[text])

plt.title("Hierarchical Clustering Dendrogram - Jaccard")
plt.xlabel("Samples")
plt.ylabel("Distance")

# Save the figure. Use a dedicated variable for this path: reusing
# image1_path here would overwrite the Manhattan figure's path variable.
image3_path = '/content/drive/MyDrive/dendrogram_jaccard.png'
plt.savefig(image3_path, dpi=300, bbox_inches='tight')