In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import VotingClassifier

In [3]:
data = pd.read_csv('classification_data.csv')
data.rename(columns={'tweet_text_x': 'tweet_text'}, inplace=True)
data.head()

Unnamed: 0,tweet_id,tweet_text,text_info
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,informative
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,informative
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,informative
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,informative
4,917792092100988929,RT @TIME: California's raging wildfires as you...,informative


In [4]:
data = data.dropna(axis=0)

In [5]:
data['text_info'] = data['text_info'].apply(lambda x: 1 if (x == "informative")  else 0)

In [6]:
data.head()

Unnamed: 0,tweet_id,tweet_text,text_info
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,1
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,1
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
4,917792092100988929,RT @TIME: California's raging wildfires as you...,1


In [7]:
vectorizer = CountVectorizer()

In [8]:
X = vectorizer.fit_transform(data['tweet_text']).toarray()

In [9]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
from scipy.spatial.distance import cdist
import numpy as np

class ClusterSimilarityMatrix():
    
    def __init__(self) -> None:
        self._is_fitted = False

    def fit(self, y_clusters):
        if not self._is_fitted:
            self._is_fitted = True
            self.similarity = self.to_binary_matrix(y_clusters)
            return self

        self.similarity += self.to_binary_matrix(y_clusters)

    def to_binary_matrix(self, y_clusters):
        y_reshaped = np.expand_dims(y_clusters, axis=-1)
        return (cdist(y_reshaped, y_reshaped, 'cityblock')==0).astype(int)


class EnsembleCustering():
    def __init__(self, base_estimators, aggregator, distances=False):
        self.base_estimators = base_estimators
        self.aggregator = aggregator
        self.distances = distances

    def fit(self, X):
        X_ = X.copy()

        clt_sim_matrix = ClusterSimilarityMatrix()
        for model in self.base_estimators:
            clt_sim_matrix.fit(model.fit_predict(X=X_))
        
        sim_matrix = clt_sim_matrix.similarity
        self.cluster_matrix = sim_matrix/sim_matrix.diagonal()

        if self.distances:
            self.cluster_matrix = np.abs(np.log(self.cluster_matrix + 1e-8)) # Avoid log(0)

    def fit_predict(self, X):
        self.fit(X)
        y = self.aggregator.fit_predict(self.cluster_matrix)
        return y

In [11]:
clustering_models = []

In [12]:
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering

In [13]:
num_clusters = 2

In [14]:
clustering_models.append(KMeans(n_clusters=num_clusters))

In [15]:
clustering_models.append(Birch(n_clusters=num_clusters))

In [16]:
clustering_models.append(AgglomerativeClustering(n_clusters=num_clusters))

In [17]:
# clustering_models.append(MeanShift())

In [18]:
aggregator_clt = SpectralClustering(n_clusters=2, affinity="precomputed")

In [19]:
ens_clt=EnsembleCustering(clustering_models, aggregator_clt)

In [20]:
y_ensemble = ens_clt.fit_predict(X)



In [21]:
y_ensemble.shape

(1588,)

In [22]:
# data['ensemble'] = y_ensemble

In [23]:
data

Unnamed: 0,tweet_id,tweet_text,text_info
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,1
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,1
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
4,917792092100988929,RT @TIME: California's raging wildfires as you...,1
...,...,...,...
1585,923761170967797761,GOP targets environmental rules after Californ...,1
1586,923796193670336512,Motorcycle crash sparked 350-acre fire in Clev...,1
1587,923821955568013313,Carl’s Jr. #SantaRosa catches #Fire while maki...,0
1588,923844053426348032,Inside the List: The Romance Writer Who Almost...,1


In [24]:
# np.unique(y_ensemble, return_counts=True)

In [25]:
data['text_info'].value_counts()

1    1245
0     343
Name: text_info, dtype: int64

In [26]:
# pred = ensemble.predict(vectorizer.transform(X_test).toarray())
# accuracy = metrics.accuracy_score(y_test, pred)
# precisions, recall, f1_score, _ = metrics.precision_recall_fscore_support(y_test, pred)
   

In [27]:
# accuracy

In [28]:
# precisions, recall

In [29]:
# f1_score

## Vis

In [30]:
data

Unnamed: 0,tweet_id,tweet_text,text_info
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,1
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,1
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
4,917792092100988929,RT @TIME: California's raging wildfires as you...,1
...,...,...,...
1585,923761170967797761,GOP targets environmental rules after Californ...,1
1586,923796193670336512,Motorcycle crash sparked 350-acre fire in Clev...,1
1587,923821955568013313,Carl’s Jr. #SantaRosa catches #Fire while maki...,0
1588,923844053426348032,Inside the List: The Romance Writer Who Almost...,1


In [31]:
# clustering_models.append(ens_clt)

In [32]:
for i in range(len(clustering_models)):
    column_name = str(clustering_models[i])
    print(column_name)
    output_df = pd.DataFrame(clustering_models[i].fit_predict(X), columns=[column_name])
    # print(output_df[column_name].value_counts())
    data = data.join(output_df)
    matched = data[data["text_info"]==data[column_name]]
    unmatched = data[data["text_info"]!=data[column_name]]
    print("0 corresponding to 0", matched.shape[0]/data.shape[0])
    match_percentage = matched.shape[0]/data.shape[0]
    print("0 not corresponding to 0", unmatched.shape[0]/data.shape[0])
    unmatch_percentage = unmatched.shape[0]/data.shape[0]
    if unmatch_percentage > match_percentage:
        data[column_name] = data[column_name].apply(lambda x: 1 if x==0 else 0)

KMeans(n_clusters=2)




0 corresponding to 0 0.6788413098236776
0 not corresponding to 0 0.3211586901763224
Birch(n_clusters=2)
0 corresponding to 0 0.3211586901763224
0 not corresponding to 0 0.6788413098236776
AgglomerativeClustering()
0 corresponding to 0 0.31801007556675065
0 not corresponding to 0 0.6819899244332494


In [35]:
data['ensemble'] = y_ensemble
data.head()

Unnamed: 0,tweet_id,tweet_text,text_info,KMeans(n_clusters=2),Birch(n_clusters=2),AgglomerativeClustering(),ensemble
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,1,1.0,1,1,0
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,1,1.0,1,1,0
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1,1.0,1,1,0
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1,1.0,1,1,0
4,917792092100988929,RT @TIME: California's raging wildfires as you...,1,1.0,1,1,0


In [36]:
matched = data[data["text_info"]==data['ensemble']]
unmatched = data[data["text_info"]!=data['ensemble']]
print("0 corresponding to 0", matched.shape[0]/data.shape[0])
match_percentage = matched.shape[0]/data.shape[0]
print("0 not corresponding to 0", unmatched.shape[0]/data.shape[0])
unmatch_percentage = unmatched.shape[0]/data.shape[0]
if unmatch_percentage > match_percentage:
    data['ensemble'] = data['ensemble'].apply(lambda x: 1 if x==0 else 0)

0 corresponding to 0 0.2795969773299748
0 not corresponding to 0 0.7204030226700252


In [37]:
data

Unnamed: 0,tweet_id,tweet_text,text_info,KMeans(n_clusters=2),Birch(n_clusters=2),AgglomerativeClustering(),ensemble
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,1,1.0,1,1,1
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,1,1.0,1,1,1
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1,1.0,1,1,1
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1,1.0,1,1,1
4,917792092100988929,RT @TIME: California's raging wildfires as you...,1,1.0,1,1,1
...,...,...,...,...,...,...,...
1585,923761170967797761,GOP targets environmental rules after Californ...,1,1.0,1,1,1
1586,923796193670336512,Motorcycle crash sparked 350-acre fire in Clev...,1,1.0,1,1,1
1587,923821955568013313,Carl’s Jr. #SantaRosa catches #Fire while maki...,0,1.0,1,1,1
1588,923844053426348032,Inside the List: The Romance Writer Who Almost...,1,,0,0,1


In [39]:
data.to_csv("model_clustering_data.csv")

In [None]:
# outputs = []

In [None]:
# for model in clustering_models:
#     outputs.append(model.fit_predict(X))

In [None]:
# output_df = pd.DataFrame(outputs[2], columns=['cluster'])
# new_data = data
# new_data = new_data.join(output_df)

In [None]:
# for output in outputs:
#     output_df = pd.DataFrame(output, columns=['cluster'])
#     new_data = data
#     new_data = new_data.join(output_df)
#     # print(new_data[new_data["text_info"]==new_data["cluster"]])
#     matched = new_data[new_data["text_info"]==new_data["cluster"]]
#     unmatched = new_data[new_data["text_info"]!=new_data["cluster"]]
#     print("0 corresponding to 0", matched.shape[0]/data.shape[0])
#     match_percentage = matched.shape[0]/data.shape[0]
#     print("0 not corresponding to 0", unmatched.shape[0]/data.shape[0])
#     unmatch_percentage = unmatched.shape[0]/data.shape[0]
#     if unmatch_percentage > match_percentage:
#         new_data['cluster'] = new_data["cluster"].apply(lambda x: 1 if x==0 else 0)