In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Directory containing the annotation TSV files, relative to the notebook's working directory.
# The trailing slash matters: the file name is appended by plain string concatenation.
crisismmd = "../data/annotations/"

In [3]:
# Full path of the California wildfires annotation file (directory prefix + file name).
path = crisismmd + "california_wildfires_final_data.tsv"

In [4]:
# The annotation file is tab-separated, hence sep='\t'.
data = pd.read_csv(path, sep='\t')

In [6]:
# Narrow the frame to the tweet id, the tweet text and the four annotation columns.
# Rebinding `data` drops every other column for the rest of the notebook.
data = data[['tweet_id', 'tweet_text', 'text_info', 'image_info', 'text_human', 'image_human']]

In [7]:
# Preview the first rows to sanity-check the selected columns.
data.head()

Unnamed: 0,tweet_id,tweet_text,text_info,image_info,text_human,image_human
0,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,informative,informative,other_relevant_information,other_relevant_information
1,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,informative,informative,infrastructure_and_utility_damage,affected_individuals
2,917791291823591425,RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...,informative,informative,other_relevant_information,infrastructure_and_utility_damage
3,917791291823591425,RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...,informative,not_informative,other_relevant_information,not_relevant_or_cant_judge
4,917792092100988929,RT @TIME: California's raging wildfires as you...,informative,informative,other_relevant_information,infrastructure_and_utility_damage


In [9]:
# Bag-of-words features: one row per tweet, one column per vocabulary term (raw counts).
vectorizer = CountVectorizer()
# .toarray() turns the vectorizer's sparse output into a dense ndarray; the ensemble's fit()
# calls X.copy() and hands the same array to every base clusterer.
X = vectorizer.fit_transform(data['tweet_text']).toarray()
X

In [11]:
from scipy.spatial.distance import cdist
import numpy as np

class ClusterSimilarityMatrix():
    """Accumulate a co-association matrix over successive clusterings.

    Every call to :meth:`fit` adds a binary (n_samples, n_samples) matrix whose
    entry ``[i, j]`` is 1 when samples ``i`` and ``j`` received the same label in
    that clustering. After ``k`` calls, ``similarity[i, j]`` is the number of
    clusterings (out of ``k``) that put ``i`` and ``j`` in the same cluster.

    Attributes set by ``fit``:
        similarity: integer ndarray of shape (n_samples, n_samples) with the
            accumulated co-membership counts.
    """

    def __init__(self) -> None:
        # No clustering has been accumulated yet; `similarity` is created on first fit.
        self._is_fitted = False

    def fit(self, y_clusters):
        """Add one clustering to the running co-association counts.

        Args:
            y_clusters: 1-D sequence of cluster labels, one per sample. Every
                call must use the same number of samples in the same order.

        Returns:
            self, on every call, so calls can be chained.
        """
        binary = self.to_binary_matrix(y_clusters)
        if not self._is_fitted:
            self._is_fitted = True
            self.similarity = binary
        else:
            self.similarity += binary
        return self

    def to_binary_matrix(self, y_clusters):
        """Return the (n, n) 0/1 matrix marking pairs of samples with equal labels.

        Args:
            y_clusters: 1-D sequence of cluster labels.

        Returns:
            Integer ndarray where entry ``[i, j]`` is 1 iff
            ``y_clusters[i] == y_clusters[j]``.
        """
        labels = np.asarray(y_clusters)
        # Direct pairwise equality via broadcasting: same result as a zero
        # city-block distance between labels, without building a float
        # distance matrix first.
        return (labels[:, None] == labels[None, :]).astype(int)


class EnsembleCustering():
    """Consensus clustering built from several base clusterers.

    Each base estimator clusters the data; the fraction of estimators that put
    two samples in the same cluster forms a co-association matrix, which the
    aggregator then clusters to produce the final labels.

    Args:
        base_estimators: iterable of clusterers exposing ``fit_predict(X=...)``.
        aggregator: clusterer exposing ``fit_predict(matrix)``; it receives the
            (n_samples, n_samples) matrix stored in ``cluster_matrix``.
        distances: when False (default) ``cluster_matrix`` holds similarities in
            [0, 1]; when True it holds ``|log(similarity + 1e-8)|`` so that
            larger values mean "less often clustered together".
    """

    def __init__(self, base_estimators, aggregator, distances=False):
        self.base_estimators = base_estimators
        self.aggregator = aggregator
        self.distances = distances

    def fit(self, X):
        """Run every base estimator on ``X`` and build ``self.cluster_matrix``.

        Args:
            X: feature matrix; must provide ``.copy()``.

        Returns:
            self.

        Raises:
            ValueError: if ``base_estimators`` yields no estimator.
        """
        # The base estimators work on a copy, not on the caller's object.
        X_ = X.copy()

        clt_sim_matrix = ClusterSimilarityMatrix()
        n_estimators = 0
        for model in self.base_estimators:
            clt_sim_matrix.fit(model.fit_predict(X=X_))
            n_estimators += 1

        if n_estimators == 0:
            raise ValueError("base_estimators must contain at least one clustering model")

        # Each sample is always co-clustered with itself, so the diagonal of the
        # accumulated counts equals the number of estimators; dividing by that
        # count directly turns counts into co-clustering fractions.
        self.cluster_matrix = clt_sim_matrix.similarity / n_estimators

        if self.distances:
            self.cluster_matrix = np.abs(np.log(self.cluster_matrix + 1e-8))  # Avoid log(0)

        return self

    def fit_predict(self, X):
        """Fit the ensemble on ``X`` and return the aggregator's labels.

        Args:
            X: feature matrix, passed to :meth:`fit`.

        Returns:
            Whatever ``aggregator.fit_predict`` returns for ``cluster_matrix``.
        """
        self.fit(X)
        return self.aggregator.fit_predict(self.cluster_matrix)

In [12]:
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering

In [19]:
# Annotation column whose distinct values set the number of consensus clusters and
# whose distribution is compared with the cluster sizes at the end.
target_label = 'text_human'
# Base clusterers are appended to this list by later cells. Re-run this cell before
# re-running those, otherwise the list accumulates duplicate estimators.
clustering_models = []

In [20]:
# Number of distinct values in the target column. dropna=False keeps NaN as a value of
# its own, which is exactly what len(Series.unique()) counts.
num_clusters = data[target_label].nunique(dropna=False)
num_clusters

9

In [21]:
# Base clusterer 1: KMeans, over-clustered to 2*num_clusters. KMeans initialisation is
# random, so the seed is fixed to make the ensemble reproducible under Restart & Run All.
clustering_models.append(KMeans(n_clusters=2*num_clusters, random_state=42))

In [22]:
# Base clusterer 2: Birch, also asked for 2*num_clusters clusters.
clustering_models.append(Birch(n_clusters=2*num_clusters))

In [23]:
# Base clusterer 3: agglomerative clustering, also asked for 2*num_clusters clusters.
clustering_models.append(AgglomerativeClustering(n_clusters=2*num_clusters))

In [24]:
# Consensus step: spectral clustering on the co-association matrix, passed in as a
# precomputed affinity. The seed is fixed because SpectralClustering is stochastic.
aggregator_clt = SpectralClustering(n_clusters=num_clusters, affinity="precomputed", random_state=42)

In [25]:
# Wire the base clusterers and the consensus aggregator together; `distances` stays at
# its default (False), so the aggregator is given similarities.
ens_clt = EnsembleCustering(
    base_estimators=clustering_models,
    aggregator=aggregator_clt,
)

In [26]:
# Fit every base clusterer on X, build the co-association matrix and cluster it;
# y_ensemble holds one consensus label per row of X.
y_ensemble = ens_clt.fit_predict(X)



In [28]:
# Consensus cluster ids and how many tweets fall in each.
np.unique(y_ensemble, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int32),
 array([225,  49, 311, 190, 120, 464,  73, 113,  44]))

In [29]:
# Human-annotated label distribution, for comparison with the cluster sizes above.
# NOTE(review): value_counts() omits NaN by default. The counts below sum to 1307 while
# 1589 rows were clustered, so presumably the remaining rows have no text_human label,
# which would also explain num_clusters being 9 with only 8 labels listed. Confirm.
data[target_label].value_counts()

other_relevant_information                683
rescue_volunteering_or_donation_effort    195
infrastructure_and_utility_damage         166
injured_or_dead_people                    108
affected_individuals                       83
not_relevant_or_cant_judge                 62
missing_or_found_people                     8
vehicle_damage                              2
Name: text_human, dtype: int64