In [None]:
import math
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyclustering.cluster import cluster_visualizer, cluster_visualizer_multidim
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.xmeans import xmeans
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import clustering_utils

In [None]:
# Load the user-level dataset and discard every row that has a missing value.
# The drop is chained onto the read (rather than done with ``inplace=True``) so
# ``users_df`` is bound exactly once and never exists in a half-cleaned state.
users_df = pd.read_csv("dataset/users_df_dataset_cleaned_with_indicators.csv").dropna()
users_df.info(verbose=True)
users_df.describe()

# X-means

X-means aims to address three major shortcomings of K-means: it scales poorly computationally, the number of clusters K must be supplied by the user, and the search is prone to getting stuck in local minima.

In [None]:
# Names of the per-user indicator columns used as clustering features.
indicator_columns_users = [
    'account_average_tweets_per_day',
    'avg_tweets_per_actual_day',
    'covid19_num_of_tweets',
    'avg_hashtags',
    'avg_text_length',
    'avg_mentions',
    'avg_special_char_in_text',
    'avt_favorite_count',
    'total_replies',
    'avt_reply_count',
    'total_retweet_count',
    'account_discussion_creation_ratio',
    'tweet_num_likes_ratio',
    'tweet_num_replies_ratio',
]

# Every unordered pair of indicators, as a list of 2-tuples.
all_possible_combinations = [pair for pair in combinations(indicator_columns_users, 2)]

# Features fed to the clustering below: currently the full indicator list.
# Alternative kept for reference (a single pair of indicators):
#   combination = ('account_average_tweets_per_day', 'avg_mentions')
combination = indicator_columns_users

In [None]:
# Standardise the selected feature columns before clustering.
subset_df = users_df[list(combination)]
scaler = StandardScaler()
# fit_transform() fits the scaler and transforms in one call, so the separate
# fit() that used to precede it only repeated the same work.
transform_result = scaler.fit_transform(subset_df.values)

In [None]:
# Seed both the k-means++ initialisation and X-means itself so that
# Restart & Run All reproduces the same clusters.
# NOTE(review): ``random_state`` is a keyword argument of the pyclustering
# 0.10.x constructors -- confirm the installed version accepts it.
XMEANS_RANDOM_STATE = 42

# Start from 2 centres; X-means may split clusters up to ``kmax`` of them.
initial_centers = kmeans_plusplus_initializer(
    transform_result, 2, random_state=XMEANS_RANDOM_STATE
).initialize()
xmeans_instance = xmeans(
    transform_result, initial_centers, kmax=20, random_state=XMEANS_RANDOM_STATE
)
xmeans_instance.process()

In [None]:
# Cluster assignment from the fitted X-means instance. The cells below treat it
# as a sequence of clusters, each a collection of positional row indices.
clusters = xmeans_instance.get_clusters()

## Studying the composition of our clusters 

### Number of real bots and users in the cluster

In [None]:
def is_bot(index):
    """Return True when the row at positional ``index`` of ``users_df`` has ``bot == 1``."""
    return users_df.iloc[index].bot == 1


# Evaluate the bot flag once for the whole frame; building one Series per row
# through ``iloc`` inside the loop is far slower and yields the same counts.
bot_flags = users_df['bot'].to_numpy() == 1

clusters_ids = []
number_users_per_cluster = []
number_bots_per_cluster = []
for cluster_index, indexes_in_cluster in enumerate(clusters):
    # Explicit integer dtype keeps the indexing valid even for an empty cluster.
    member_flags = bot_flags[np.asarray(indexes_in_cluster, dtype=int)]
    number_bots = int(member_flags.sum())
    clusters_ids.append(cluster_index)
    # Every member that is not flagged as a bot counts as a real user.
    number_users_per_cluster.append(len(indexes_in_cluster) - number_bots)
    number_bots_per_cluster.append(number_bots)

In [None]:
def plot_clusters_compositions(clusters_ids, number_users_per_cluster, number_bots_per_cluster):
    """
    Plots, for every cluster, the number of real users next to the number of bots

    :param clusters_ids: list of int, the ids of the clusters; used as x-axis tick labels
    :param number_users_per_cluster: list of int, the i-th element is the number of real users in the i-th cluster
    :param number_bots_per_cluster: list of int, the i-th element is the number of bots in the i-th cluster
    """
    bar_width = 0.35
    half_width = bar_width / 2
    positions = np.arange(len(clusters_ids))

    figure, axes = plt.subplots()
    # Two bars per cluster, side by side, on a logarithmic y-axis.
    users_bars = axes.bar(positions - half_width, number_users_per_cluster, bar_width,
                          label='Real users', log=True)
    bots_bars = axes.bar(positions + half_width, number_bots_per_cluster, bar_width,
                         label='Bots', log=True)

    axes.set_ylabel('Number of users')
    axes.set_title('Composition of clusters found by X-means')
    axes.set_xticks(positions, clusters_ids)
    axes.legend()

    # Write the count above each bar.
    for bars in (users_bars, bots_bars):
        axes.bar_label(bars, padding=3)

    figure.tight_layout()
    plt.show()


plot_clusters_compositions(clusters_ids, number_users_per_cluster, number_bots_per_cluster)

### Entropy for the clusters

In [None]:
def visualize_clusters_entropy(clusters_ids: list, clusters_entropy: list):
    """
    Draws one bar per cluster whose height is the entropy of that cluster

    :param clusters_ids: list of int, the ids of the clusters; used as x-axis tick labels
    :param clusters_entropy: list of float, the i-th element is the entropy of the i-th cluster
    """
    positions = np.arange(len(clusters_ids))

    figure, axes = plt.subplots()
    axes.bar(positions, clusters_entropy)

    axes.set_ylabel('Entropy')
    axes.set_title('Entropy of clusters found by X-means')
    axes.set_xticks(positions, clusters_ids)

    plt.show()


def compute_entropy(p):
    """
    Computes the entropy of a two-class distribution, using the natural logarithm

    :param p: float, the frequency of one of the two classes
    :return: the entropy -(p*ln(p) + (1-p)*ln(1-p)); 0.0 when p is exactly 0 or 1
    """
    # math.log(0) raises, so the pure-cluster cases are answered directly.
    if p == 0 or p == 1:
        return 0.0
    return -(p*math.log(p) + (1-p)*math.log(1-p))

# Entropy of each cluster, computed from the share of real users among its members.
clusters_entropy = [
    compute_entropy(
        number_users_per_cluster[cid]
        / (number_users_per_cluster[cid] + number_bots_per_cluster[cid])
    )
    for cid in clusters_ids
]

visualize_clusters_entropy(clusters_ids, clusters_entropy)

In [None]:
def get_element_cluster(element, clusters):
    """
    Finds the cluster an element belongs to

    :param element: the element to look for
    :param clusters: sequence of collections, the i-th collection holds the elements of the i-th cluster
    :return: the position of the first cluster containing the element, None if no cluster contains it
    """
    matching_positions = (
        position
        for position, members in enumerate(clusters)
        if element in members
    )
    return next(matching_positions, None)

# Build the row-position -> cluster-index lookup in one pass over the clusters,
# instead of scanning every cluster list once per row.
cluster_by_position = {}
for cluster_index, member_positions in enumerate(clusters):
    for position in member_positions:
        # setdefault keeps the first cluster that contains the position.
        cluster_by_position.setdefault(position, cluster_index)

# One label per row of users_df, in row order; None for a row in no cluster.
labels = [cluster_by_position.get(position) for position in range(len(users_df))]

In [None]:
# Scatter of the first two selected features, coloured by cluster label.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(data=users_df, x=combination[1], y=combination[0], c=labels)
ax.set_xlabel(combination[1])
ax.set_ylabel(combination[0])
ax.set_title('Clusters found by X-means')
plt.show()

In [None]:
# cluster_visualizer only draws 1D, 2D or 3D data. When more features were
# clustered, use the multi-dimensional visualizer, which draws pairwise plots.
# NOTE(review): confirm cluster_visualizer_multidim exists in the installed
# pyclustering version.
n_features = transform_result.shape[1]
if n_features <= 3:
    visualizer = cluster_visualizer()
else:
    visualizer = cluster_visualizer_multidim()
visualizer.append_clusters(clusters, transform_result)
visualizer.show()

In [None]:
# Report the total within-cluster error (WCE) of the fitted X-means instance.
print("Total WCE:", xmeans_instance.get_total_wce())

In [None]:
%%time
def _total_wce_for(feature_columns):
    """
    Runs X-means on the standardised values of the given columns of users_df

    :param feature_columns: iterable of str, the names of the columns to cluster on
    :return: the total WCE reported by the fitted X-means instance
    """
    # All intermediates are local, so the notebook-level `combination`,
    # `subset_df`, `scaler`, `transform_result`, `initial_centers` and
    # `xmeans_instance` used by earlier cells are not overwritten.
    scaled_values = StandardScaler().fit_transform(users_df[list(feature_columns)].values)
    starting_centers = kmeans_plusplus_initializer(scaled_values, 2).initialize()
    model = xmeans(scaled_values, starting_centers, 20)
    model.process()
    return model.get_total_wce()


# One (feature pair, total WCE) tuple per pair of indicators.
sse = [(pair, _total_wce_for(pair)) for pair in all_possible_combinations]

In [None]:
# Order the (feature pair, total WCE) tuples in place by ascending WCE.
sse.sort(key=lambda x: x[1])

In [None]:
# First entry of ``sse``: the feature pair with the lowest total WCE once the list is sorted ascending.
sse[0]