In [7]:
import pandas as pd
import numpy as np
import random
import operator
import math
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [8]:
# Read the dataset
file_path = 'RWB_02112023195802159.csv'
data = pd.read_csv(file_path)

# The two indicators used as clustering features (also the pivoted column names)
FEATURE_COLUMNS = ['Disposable income per capita', 'Life expectancy at birth']

# Keep only the rows that belong to one of the two indicators
income_life_exp_data = data[data['Indicator'].isin(FEATURE_COLUMNS)]

# Pivot the data to have one row per region/time with both indicators as columns.
# pivot_table keeps rows for which only one indicator is available (the other
# becomes NaN) and MinMaxScaler passes NaN through unchanged, so incomplete rows
# are dropped here; otherwise a single NaN would propagate into every cluster
# center. The index is rebuilt so pivot_data stays positionally aligned with
# normalized_df below.
pivot_data = (
    income_life_exp_data
    .pivot_table(index=['REG_ID', 'Regions', 'TIME', 'Time'],
                 columns='Indicator',
                 values='Value')
    .reset_index()
    .dropna(subset=FEATURE_COLUMNS)
    .reset_index(drop=True)
)

# Normalize the features to the [0, 1] range
scaler = MinMaxScaler()
features_to_cluster = pivot_data[FEATURE_COLUMNS]
normalized_features = scaler.fit_transform(features_to_cluster)
normalized_df = pd.DataFrame(normalized_features, columns=features_to_cluster.columns)


In [9]:
# Initialize membership matrix with random probabilities for each cluster
def initialize_membership_matrix(n, k):
    """Build an ``n`` x ``k`` matrix of random membership weights.

    Every row is made of ``k`` values drawn with ``random.random()`` and then
    divided by their sum, so each row adds up to 1 (up to float rounding).

    Args:
        n: Number of rows to generate (one per data point).
        k: Number of columns in each row (one per cluster).

    Returns:
        A list of ``n`` lists, each holding ``k`` floats.
    """
    def normalized_row():
        # Draw first, then scale by the total of the draws.
        draws = [random.random() for _ in range(k)]
        total = sum(draws)
        return [draw / total for draw in draws]

    return [normalized_row() for _ in range(n)]

# Calculate the centers of clusters using the membership matrix
def calculate_cluster_centers(membership_mat, df, m, k):
    """Compute the membership-weighted center of each of the ``k`` clusters.

    For cluster ``j`` every row of ``df`` is weighted by its membership in
    ``j`` raised to the power ``m``; the center is the weighted sum of the rows
    divided by the sum of the weights.

    Args:
        membership_mat: Sequence of rows, one per data point, each holding the
            point's membership value for every cluster.
        df: pandas DataFrame whose rows are the data points.
        m: Exponent applied to each membership value.
        k: Number of clusters to compute centers for.

    Returns:
        A list of ``k`` centers, each a list with one value per column of ``df``.

    Raises:
        ZeroDivisionError: If the weights of a cluster sum to zero.
    """
    # The rows do not depend on the cluster, so extract them once instead of
    # calling df.iloc[i] again for every cluster.
    points = df.values.tolist()
    memberships_by_cluster = list(zip(*membership_mat))

    cluster_centers = []
    for j in range(k):
        weights = [membership ** m for membership in memberships_by_cluster[j]]
        weight_total = sum(weights)
        # Index into weights (rather than zip) so a membership matrix that is
        # shorter than df still fails loudly instead of being truncated.
        weighted_points = [
            [weights[i] * value for value in point]
            for i, point in enumerate(points)
        ]
        # Column-wise totals of the weighted points.
        weighted_sums = [sum(column) for column in zip(*weighted_points)]
        cluster_centers.append([total / weight_total for total in weighted_sums])
    return cluster_centers

# Update membership values based on the cluster centers
def update_membership_values(membership_mat, cluster_centers, df, m, k):
    """Recompute every point's cluster memberships from the current centers.

    For a point at Euclidean distance ``d_j`` from center ``j`` the membership
    is ``1 / sum_c (d_j / d_c) ** (2 / (m - 1))``. A point that lies exactly on
    one or more centers is handled separately: it gets its whole membership
    split equally between those centers and 0 for all others. Without that
    case the formula evaluates 0/0 and stores NaN.

    Args:
        membership_mat: List of per-point membership lists; updated in place.
        cluster_centers: Sequence of ``k`` center coordinates.
        df: pandas DataFrame whose rows are the data points.
        m: Fuzziness exponent; ``m == 1`` raises ZeroDivisionError.
        k: Number of clusters.

    Returns:
        The same ``membership_mat`` object, after the in-place update.
    """
    exponent = float(2 / (m - 1))
    for i in range(len(df)):
        point = list(df.iloc[i])
        distances = [
            np.linalg.norm(list(map(operator.sub, point, cluster_centers[j])))
            for j in range(k)
        ]

        # Point sits exactly on at least one center: assign it there directly.
        coincident = [j for j in range(k) if distances[j] == 0]
        if coincident:
            share = 1.0 / len(coincident)
            for j in range(k):
                membership_mat[i][j] = share if distances[j] == 0 else 0.0
            continue

        for j in range(k):
            # All distances are non-zero here and the c == j term equals 1,
            # so the denominator is never zero.
            den = sum(math.pow(distances[j] / distances[c], exponent) for c in range(k))
            membership_mat[i][j] = float(1 / den)
    return membership_mat

# Assign data points to clusters based on maximum membership value
def get_clusters(membership_mat):
    """Pick, for every row, the index of its largest membership value.

    Candidates are compared as ``(value, index)`` pairs, so when several
    clusters share the largest value the highest index is chosen.

    Args:
        membership_mat: Sequence of rows of membership values.

    Returns:
        A list with one cluster index per row of ``membership_mat``.
    """
    return [
        max((weight, cluster) for cluster, weight in enumerate(row))[1]
        for row in membership_mat
    ]

# Fuzzy C-Means clustering function
def fuzzy_c_means_clustering(df, n, k, m, MAX_ITER):
    """Run Fuzzy C-Means on ``df`` and return hard labels plus cluster centers.

    Starting from a random membership matrix, each pass recomputes the cluster
    centers from the memberships and then the memberships from those centers.
    Exactly ``MAX_ITER`` passes are performed. The previous
    ``while curr <= MAX_ITER`` loop ran one pass too many. At least one pass
    always runs, so the returned values are defined even for ``MAX_ITER <= 0``.

    Args:
        df: pandas DataFrame whose rows are the data points.
        n: Number of rows of the initial membership matrix (number of points).
        k: Number of clusters.
        m: Fuzziness exponent forwarded to the center and membership updates.
        MAX_ITER: Number of update passes to perform.

    Returns:
        A tuple ``(cluster_labels, cluster_centers)``: the label list comes
        from ``get_clusters`` on the final membership matrix, and the centers
        are those computed in the last pass (before that pass's membership
        update).
    """
    membership_mat = initialize_membership_matrix(n, k)
    completed = 0
    while True:
        cluster_centers = calculate_cluster_centers(membership_mat, df, m, k)
        membership_mat = update_membership_values(membership_mat, cluster_centers, df, m, k)
        completed += 1
        if completed >= MAX_ITER:
            break
    # Only the final memberships matter for the labels, so derive them once
    # after the loop instead of on every pass.
    cluster_labels = get_clusters(membership_mat)
    return cluster_labels, cluster_centers

In [10]:
# Perform fuzzy C-Means clustering
SEED = 42  # Fixed seed: the initial membership matrix is drawn with random.random()
random.seed(SEED)  # Makes the cluster ids (and the plot colors) reproducible across runs

k = 4  # Number of clusters
MAX_ITER = 100  # Number of update passes
n = len(normalized_df)  # Number of data points
m = 2.00  # Fuzzy parameter
labels, cluster_centers = fuzzy_c_means_clustering(normalized_df, n, k, m, MAX_ITER)


In [11]:
# Create a DataFrame for plotting: the pivoted data (regions plus the actual,
# un-normalized disposable income and life expectancy) and the cluster assignment.
plot_df = pivot_data.copy()
# Cluster ids are nominal labels, not quantities. Storing them as strings makes
# Plotly draw discrete colors with a legend instead of a continuous colorbar.
plot_df['Cluster'] = [str(label) for label in labels]

# Build one hover string per row
hover_data = [
    f"Region: {region}<br>Disposable Income: {income}<br>Life Expectancy: {life_expectancy}<br>Cluster: {cluster}"
    for region, income, life_expectancy, cluster in zip(
        plot_df['Regions'],
        plot_df['Disposable income per capita'],
        plot_df['Life expectancy at birth'],
        plot_df['Cluster'],
    )
]
# Keep the hover text in the frame so it travels with its row. Discrete colors
# split the figure into one trace per cluster, and a single positional
# hovertext list would no longer line up with the points of each trace.
plot_df['Hover'] = hover_data

# Create the scatter plot using Plotly
fig = px.scatter(
    plot_df,
    x='Disposable income per capita',
    y='Life expectancy at birth',
    color='Cluster',
    custom_data=['Hover'],
    # Show the clusters in numeric order in the legend
    category_orders={'Cluster': sorted(plot_df['Cluster'].unique(), key=int)},
    labels={'Disposable income per capita': 'Disposable Income',
            'Life expectancy at birth': 'Life Expectancy'},
)

# Customize the layout
fig.update_layout(
    title="Fuzzy Clustering of Regions Based on Disposable Income and Life Expectancy",
    xaxis_title="Disposable Income",
    yaxis_title="Life Expectancy",
    legend_title="Cluster"
)

# Customize the hover information: show only the prepared per-row text
# (<extra></extra> hides the secondary box that would repeat the trace name).
fig.update_traces(marker=dict(size=12),
                  hovertemplate='%{customdata[0]}<extra></extra>')

# Show the plot
fig.show()
