In [71]:
import pandas as pd
import numpy as np
import random
import operator
import math
import plotly.express as px

In [73]:
# Name of the CSV export to analyse; a relative path, so it is resolved
# against the notebook's current working directory.
file_path = 'HEALTH_STAT_02112023181604179.csv'
# Parse the file into a DataFrame using pandas' default CSV options.
data = pd.read_csv(filepath_or_buffer=file_path)

In [64]:
# Initialize membership matrix with random probabilities for each cluster
def initialize_membership_matrix(n, k):
    """Build a random fuzzy membership matrix.

    Every row is produced by drawing ``k`` values with ``random.random()``
    and dividing each by the row total, so a row sums to 1 (up to
    floating-point rounding).

    Args:
        n: Number of rows to generate (one per data point).
        k: Number of values per row (one per cluster).

    Returns:
        A list of ``n`` lists, each holding ``k`` floats.
    """
    def _normalized_row():
        # Draw first, then scale, so the k draws of a row are consecutive.
        draws = [random.random() for _ in range(k)]
        total = sum(draws)
        return [draw / total for draw in draws]

    return [_normalized_row() for _ in range(n)]


# Calculate the centers of clusters using the membership matrix
def calculate_cluster_centers(membership_mat, df, m, k):
    """Compute the membership-weighted centre of each cluster.

    For cluster ``j`` the centre is ``sum_i(u_ij**m * x_i) / sum_i(u_ij**m)``,
    where ``u_ij`` is ``membership_mat[i][j]`` and ``x_i`` is row ``i`` of
    ``df``.

    The number of data points is taken from the inputs themselves rather
    than from a notebook-level variable, so the function works on a fresh
    kernel and with any data set.

    Args:
        membership_mat: Sequence of rows, one per data point, each holding
            at least ``k`` membership values.
        df: DataFrame of numeric features, one row per data point.
        m: Fuzzy parameter; memberships are raised to this power.
        k: Number of clusters (leading columns of ``membership_mat`` used).

    Returns:
        A list of ``k`` centres, each a list of floats with one entry per
        column of ``df``.

    Raises:
        ValueError: If ``membership_mat`` is not two-dimensional, has fewer
            than ``k`` columns, or its row count differs from ``df``.
    """
    memberships = np.asarray(membership_mat, dtype=float)
    points = df.to_numpy(dtype=float)

    if memberships.ndim != 2 or memberships.shape[1] < k:
        raise ValueError("membership_mat must be 2-D with at least k columns")
    if memberships.shape[0] != points.shape[0]:
        raise ValueError("membership_mat and df must have the same number of rows")

    # (n, k) weights u_ij ** m for the k clusters being used.
    weights = memberships[:, :k] ** m
    # (k, n) @ (n, d) -> weighted feature sums per cluster; dividing by the
    # per-cluster weight totals turns them into weighted means.
    centers = (weights.T @ points) / weights.sum(axis=0)[:, np.newaxis]
    return centers.tolist()



# Update membership values based on the cluster centers
def update_membership_values(membership_mat, cluster_centers, df, m, k):
    """Recompute fuzzy memberships from the current cluster centres.

    For point ``i`` and cluster ``j`` the new membership is
    ``1 / sum_c((d_ij / d_ic) ** (2 / (m - 1)))`` where ``d_ij`` is the
    Euclidean distance from the point to centre ``j``.

    When a point lies exactly on one or more centres the formula would
    divide zero by zero; in that case the point's membership is shared
    equally among the coinciding centres and set to 0 for the others.

    The rows of ``membership_mat`` are updated in place and the same object
    is returned.

    Args:
        membership_mat: List of mutable rows, one per data point, each with
            at least ``k`` entries.
        cluster_centers: Sequence of centres, one list of coordinates each.
        df: DataFrame of numeric features, one row per data point.
        m: Fuzzy parameter. ``m == 1`` makes the exponent undefined (a plain
            Python number raises ``ZeroDivisionError``).
        k: Number of clusters (leading centres used).

    Returns:
        ``membership_mat`` with its first ``k`` columns overwritten.

    Raises:
        ValueError: If ``membership_mat`` and ``df`` differ in row count.
    """
    exponent = 2.0 / (m - 1)
    points = df.to_numpy(dtype=float)
    centers = np.asarray(cluster_centers, dtype=float)[:k]

    if len(membership_mat) != len(points):
        raise ValueError("membership_mat and df must have the same number of rows")

    for row, point in zip(membership_mat, points):
        # Distance from this point to every centre at once.
        distances = np.linalg.norm(centers - point, axis=1)
        coincident = distances == 0

        if coincident.any():
            # Point sits on a centre: avoid 0/0 and assign it there outright.
            share = 1.0 / int(coincident.sum())
            new_values = [share if hit else 0.0 for hit in coincident]
        else:
            # ratios[j, c] = d_j / d_c; summing over c gives the denominator.
            ratios = distances[:, np.newaxis] / distances[np.newaxis, :]
            new_values = (1.0 / (ratios ** exponent).sum(axis=1)).tolist()

        for j, value in enumerate(new_values):
            row[j] = value
    return membership_mat


# Assign data points to clusters based on maximum membership value
def get_clusters(membership_mat):
    """Assign every data point to the cluster with its largest membership.

    The number of points is taken from ``membership_mat`` itself instead of
    a notebook-level variable, so the function is self-contained.

    Args:
        membership_mat: Sequence of rows, one per data point, each holding
            one membership value per cluster.

    Returns:
        A list with one cluster index per row. When several clusters tie
        for the maximum, the highest index among them is returned.

    Raises:
        ValueError: If a row is empty.
    """
    return [
        # Key on (value, index) so ties resolve to the highest index.
        max(range(len(row)), key=lambda j: (row[j], j))
        for row in membership_mat
    ]


# Main function to perform Fuzzy C-Means clustering
def fuzzy_c_means_clustering(df, n, k, m, MAX_ITER):
    """Run Fuzzy C-Means for a fixed number of iterations.

    Starting from a random membership matrix, the routine alternates
    between updating memberships and recomputing centres. There is no
    convergence test: exactly ``MAX_ITER`` membership updates are performed.
    The returned centres are always the ones computed from the final
    membership matrix, so labels and centres describe the same state.

    Args:
        df: DataFrame of numeric features, one row per data point.
        n: Number of data points; forwarded to
            ``initialize_membership_matrix``.
        k: Number of clusters.
        m: Fuzzy parameter passed to the centre and membership updates.
        MAX_ITER: Number of update iterations. Values ``<= 0`` skip the
            loop and return the labels/centres of the random initialisation.

    Returns:
        A ``(cluster_labels, cluster_centers)`` tuple as produced by
        ``get_clusters`` and ``calculate_cluster_centers``.
    """
    # Step 1: random starting memberships and the centres they imply.
    membership_mat = initialize_membership_matrix(n, k)
    cluster_centers = calculate_cluster_centers(membership_mat, df, m, k)

    completed = 0
    while completed < MAX_ITER:
        # Step 2: move memberships towards the current centres.
        membership_mat = update_membership_values(membership_mat, cluster_centers, df, m, k)
        # Step 3: recompute centres from the refreshed memberships.
        cluster_centers = calculate_cluster_centers(membership_mat, df, m, k)
        completed += 1

    # Step 4: hard labels are only needed once, from the final memberships.
    cluster_labels = get_clusters(membership_mat)
    return cluster_labels, cluster_centers


In [65]:
# Build the per-country average first: everything below depends on it, so it
# must exist before the clustering parameters are derived (otherwise the cell
# fails on a fresh kernel).
# Keep rows where 'Variable' is 'Total population at birth' and 'Measure' is
# 'Years', then average 'Value' per 'Country'.
life_expectancy_data = data[(data['Variable'] == 'Total population at birth') & (data['Measure'] == 'Years')]
average_life_expectancy = life_expectancy_data.groupby('Country')['Value'].mean().reset_index()

# Clustering parameters
num_attr = 1  # Average Life Expectancy
k = 4  # Number of clusters
MAX_ITER = 100
n = len(average_life_expectancy)  # Number of data points
m = 2.00  # Fuzzy parameter
df = average_life_expectancy[['Value']]  # Keeping only the life expectancy values

# NOTE: the membership matrix is initialised with the unseeded `random`
# module, so cluster ids can differ between runs.
labels, _ = fuzzy_c_means_clustering(df, n, k, m, MAX_ITER)



In [77]:
# Create a DataFrame for plotting that includes the country, average life expectancy, and cluster assignment
plot_df = average_life_expectancy.copy()
plot_df['Cluster'] = labels

# One hover string per row, built without the slow row-wise iterrows() loop
hover_data = [
    f"Country: {country}<br>Life Expectancy: {value}<br>Cluster: {cluster}"
    for country, value, cluster in zip(plot_df['Country'], plot_df['Value'], plot_df['Cluster'])
]

# Plot from a derived frame so plot_df keeps its integer cluster ids.
# - Cluster is cast to str so Plotly treats it as a category (discrete colours
#   and a real legend) instead of a continuous colour scale.
# - The hover text travels with each row as custom data, so it stays aligned
#   even though discrete colours split the figure into one trace per cluster.
scatter_df = plot_df.assign(
    Cluster=plot_df['Cluster'].astype(str),
    HoverText=hover_data,
)
cluster_order = [str(c) for c in sorted(plot_df['Cluster'].unique())]

# Create the scatter plot
fig = px.scatter(scatter_df, x=np.arange(len(scatter_df)), y='Value',
                 color='Cluster', custom_data=['HoverText'],
                 category_orders={'Cluster': cluster_order},
                 labels={'x': 'Country Index', 'Value': 'Life Expectancy'})

# Customize the layout
fig.update_layout(
    title="Fuzzy Clustering of Life Expectancy by Country",
    xaxis_title="Country Index",
    yaxis_title="Life Expectancy",
    legend_title="Cluster"
)

# Customize the hover information; <extra></extra> hides the secondary
# trace-name box so only the prepared text is shown.
fig.update_traces(marker=dict(size=12),
                  hovertemplate='%{customdata[0]}<extra></extra>')

# Show the plot
fig.show()