In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
import os
from sklearn.preprocessing import OneHotEncoder


In [2]:
directory = "./data"
data = pd.DataFrame(columns=['tweet_id', 'tweet_text', 'class_label'])

In [10]:
for path, subdir, files in os.walk(directory):
    if len(files) == 0:
        continue
    for file in files:
        df = pd.read_csv(os.path.join(path, file), sep='\t')
        data = pd.concat([data, df])
    break

In [11]:

data.head()

Unnamed: 0,tweet_id,tweet_text,class_label
0,798262465234542592,RT @MissEarth: New Zealand need our prayers af...,sympathy_and_support
1,771464543796985856,"@johnaglass65 @gordonluke Ah, woke up to a nig...",caution_and_advice
2,797835622471733248,RT @terremotocentro: #eqnz if you need a tool ...,requests_or_urgent_needs
3,798021801540321280,RT @BarristerNZ: My son (4) has drawn a pictur...,other_relevant_information
4,798727277794033664,Due to earthquake damage our Defence Force is ...,infrastructure_and_utility_damage


In [12]:
data = data.dropna(axis=0)

In [13]:
data.head()

Unnamed: 0,tweet_id,tweet_text,class_label
0,798262465234542592,RT @MissEarth: New Zealand need our prayers af...,sympathy_and_support
1,771464543796985856,"@johnaglass65 @gordonluke Ah, woke up to a nig...",caution_and_advice
2,797835622471733248,RT @terremotocentro: #eqnz if you need a tool ...,requests_or_urgent_needs
3,798021801540321280,RT @BarristerNZ: My son (4) has drawn a pictur...,other_relevant_information
4,798727277794033664,Due to earthquake damage our Defence Force is ...,infrastructure_and_utility_damage


In [14]:
vectorizer = CountVectorizer()

In [15]:
X = vectorizer.fit_transform(data['tweet_text']).toarray()

In [16]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
from scipy.spatial.distance import cdist
import numpy as np

class ClusterSimilarityMatrix():
    
    def __init__(self) -> None:
        self._is_fitted = False

    def fit(self, y_clusters):
        if not self._is_fitted:
            self._is_fitted = True
            self.similarity = self.to_binary_matrix(y_clusters)
            return self

        self.similarity += self.to_binary_matrix(y_clusters)

    def to_binary_matrix(self, y_clusters):
        y_reshaped = np.expand_dims(y_clusters, axis=-1)
        return (cdist(y_reshaped, y_reshaped, 'cityblock')==0).astype(int)


class EnsembleCustering():
    def __init__(self, base_estimators, aggregator, distances=False):
        self.base_estimators = base_estimators
        self.aggregator = aggregator
        self.distances = distances

    def fit(self, X):
        X_ = X.copy()

        clt_sim_matrix = ClusterSimilarityMatrix()
        for model in self.base_estimators:
            clt_sim_matrix.fit(model.fit_predict(X=X_))
        
        sim_matrix = clt_sim_matrix.similarity
        self.cluster_matrix = sim_matrix/sim_matrix.diagonal()

        if self.distances:
            self.cluster_matrix = np.abs(np.log(self.cluster_matrix + 1e-8)) # Avoid log(0)

    def fit_predict(self, X):
        self.fit(X)
        y = self.aggregator.fit_predict(self.cluster_matrix)
        return y

In [18]:
clustering_models = []

In [19]:
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering

In [20]:
num_clusters = len(data['class_label'].unique())

In [21]:
clustering_models.append(KMeans(n_clusters=num_clusters))

In [22]:
clustering_models.append(Birch(n_clusters=num_clusters))

In [23]:
clustering_models.append(AgglomerativeClustering(n_clusters=num_clusters))

In [24]:
# clustering_models.append(MeanShift())

In [25]:
aggregator_clt = SpectralClustering(n_clusters=num_clusters, affinity="precomputed")

In [26]:
ens_clt=EnsembleCustering(clustering_models, aggregator_clt)

In [27]:
y_ensemble = ens_clt.fit_predict(X)



In [28]:
y_ensemble.shape

(2195,)

In [22]:
# data['ensemble'] = y_ensemble

In [29]:
data

Unnamed: 0,tweet_id,tweet_text,class_label
0,798262465234542592,RT @MissEarth: New Zealand need our prayers af...,sympathy_and_support
1,771464543796985856,"@johnaglass65 @gordonluke Ah, woke up to a nig...",caution_and_advice
2,797835622471733248,RT @terremotocentro: #eqnz if you need a tool ...,requests_or_urgent_needs
3,798021801540321280,RT @BarristerNZ: My son (4) has drawn a pictur...,other_relevant_information
4,798727277794033664,Due to earthquake damage our Defence Force is ...,infrastructure_and_utility_damage
...,...,...,...
430,797862022595497988,What a night that was. Thankfully no damage to...,other_relevant_information
431,798425325923401728,@We3forDemocracy This is since Extreme M7.5 ea...,caution_and_advice
432,797808377191600128,RT @caity_petersonn: UPDATE: Hanmer Springs -h...,infrastructure_and_utility_damage
433,797961196364124160,RT @Halcyon_Knights: Our thoughts go out to th...,sympathy_and_support


In [30]:
# np.unique(y_ensemble, return_counts=True)

In [31]:
y_ensemble

array([5, 7, 1, ..., 1, 3, 1], dtype=int32)

In [37]:
# clustering_models.append(ens_clt)
np.unique(y_ensemble, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int32),
 array([ 168, 1069,  110,  100,   76,  386,  101,  118,   67]))

In [40]:
data['class_label'].value_counts()

caution_and_advice                        493
sympathy_and_support                      432
infrastructure_and_utility_damage         312
other_relevant_information                311
not_humanitarian                          224
rescue_volunteering_or_donation_effort    207
injured_or_dead_people                    105
displaced_people_and_evacuations           87
requests_or_urgent_needs                   24
Name: class_label, dtype: int64

## Vis

In [38]:
data

Unnamed: 0,tweet_id,tweet_text,class_label
0,798262465234542592,RT @MissEarth: New Zealand need our prayers af...,sympathy_and_support
1,771464543796985856,"@johnaglass65 @gordonluke Ah, woke up to a nig...",caution_and_advice
2,797835622471733248,RT @terremotocentro: #eqnz if you need a tool ...,requests_or_urgent_needs
3,798021801540321280,RT @BarristerNZ: My son (4) has drawn a pictur...,other_relevant_information
4,798727277794033664,Due to earthquake damage our Defence Force is ...,infrastructure_and_utility_damage
...,...,...,...
430,797862022595497988,What a night that was. Thankfully no damage to...,other_relevant_information
431,798425325923401728,@We3forDemocracy This is since Extreme M7.5 ea...,caution_and_advice
432,797808377191600128,RT @caity_petersonn: UPDATE: Hanmer Springs -h...,infrastructure_and_utility_damage
433,797961196364124160,RT @Halcyon_Knights: Our thoughts go out to th...,sympathy_and_support


In [32]:
for i in range(len(clustering_models)):
    column_name = str(clustering_models[i])
    print(column_name)
    output_df = pd.DataFrame(clustering_models[i].fit_predict(X), columns=[column_name])
    # print(output_df[column_name].value_counts())
    data = data.join(output_df)
    matched = data[data["text_info"]==data[column_name]]
    unmatched = data[data["text_info"]!=data[column_name]]
    print("Matching", matched.shape[0]/data.shape[0])
    match_percentage = matched.shape[0]/data.shape[0]
    print("Unmatched", unmatched.shape[0]/data.shape[0])
    unmatch_percentage = unmatched.shape[0]/data.shape[0]
    if unmatch_percentage > match_percentage:
        data[column_name] = data[column_name].apply(lambda x: 1 if x==0 else 0)

KMeans(n_clusters=2)




0 corresponding to 0 0.6788413098236776
0 not corresponding to 0 0.3211586901763224
Birch(n_clusters=2)
0 corresponding to 0 0.3211586901763224
0 not corresponding to 0 0.6788413098236776
AgglomerativeClustering()
0 corresponding to 0 0.31801007556675065
0 not corresponding to 0 0.6819899244332494


In [35]:
data['ensemble'] = y_ensemble
data.head()

Unnamed: 0,tweet_id,tweet_text,text_info,KMeans(n_clusters=2),Birch(n_clusters=2),AgglomerativeClustering(),ensemble
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,1,1.0,1,1,0
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,1,1.0,1,1,0
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1,1.0,1,1,0
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1,1.0,1,1,0
4,917792092100988929,RT @TIME: California's raging wildfires as you...,1,1.0,1,1,0


In [36]:
matched = data[data["text_info"]==data['ensemble']]
unmatched = data[data["text_info"]!=data['ensemble']]
print("0 corresponding to 0", matched.shape[0]/data.shape[0])
match_percentage = matched.shape[0]/data.shape[0]
print("0 not corresponding to 0", unmatched.shape[0]/data.shape[0])
unmatch_percentage = unmatched.shape[0]/data.shape[0]
if unmatch_percentage > match_percentage:
    data['ensemble'] = data['ensemble'].apply(lambda x: 1 if x==0 else 0)

0 corresponding to 0 0.2795969773299748
0 not corresponding to 0 0.7204030226700252


In [37]:
data

Unnamed: 0,tweet_id,tweet_text,text_info,KMeans(n_clusters=2),Birch(n_clusters=2),AgglomerativeClustering(),ensemble
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,1,1.0,1,1,1
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,1,1.0,1,1,1
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1,1.0,1,1,1
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1,1.0,1,1,1
4,917792092100988929,RT @TIME: California's raging wildfires as you...,1,1.0,1,1,1
...,...,...,...,...,...,...,...
1585,923761170967797761,GOP targets environmental rules after Californ...,1,1.0,1,1,1
1586,923796193670336512,Motorcycle crash sparked 350-acre fire in Clev...,1,1.0,1,1,1
1587,923821955568013313,Carl’s Jr. #SantaRosa catches #Fire while maki...,0,1.0,1,1,1
1588,923844053426348032,Inside the List: The Romance Writer Who Almost...,1,,0,0,1


In [39]:
data.to_csv("model_clustering_data.csv")

In [None]:
# outputs = []

In [None]:
# for model in clustering_models:
#     outputs.append(model.fit_predict(X))

In [None]:
# output_df = pd.DataFrame(outputs[2], columns=['cluster'])
# new_data = data
# new_data = new_data.join(output_df)

In [None]:
# for output in outputs:
#     output_df = pd.DataFrame(output, columns=['cluster'])
#     new_data = data
#     new_data = new_data.join(output_df)
#     # print(new_data[new_data["text_info"]==new_data["cluster"]])
#     matched = new_data[new_data["text_info"]==new_data["cluster"]]
#     unmatched = new_data[new_data["text_info"]!=new_data["cluster"]]
#     print("0 corresponding to 0", matched.shape[0]/data.shape[0])
#     match_percentage = matched.shape[0]/data.shape[0]
#     print("0 not corresponding to 0", unmatched.shape[0]/data.shape[0])
#     unmatch_percentage = unmatched.shape[0]/data.shape[0]
#     if unmatch_percentage > match_percentage:
#         new_data['cluster'] = new_data["cluster"].apply(lambda x: 1 if x==0 else 0)