In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import VotingClassifier

In [2]:
# Load the labelled tweets and normalise the text column's name in a single
# chained expression; rename() returns a new frame, so nothing is mutated in place.
data = (
    pd.read_csv('classification_data.csv')
    .rename(columns={'tweet_text_x': 'tweet_text'})
)
data.head()

Unnamed: 0,tweet_id,tweet_text,text_info
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,informative
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,informative
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,informative
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,informative
4,917792092100988929,RT @TIME: California's raging wildfires as you...,informative


In [3]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis="index", how="any")

In [4]:
# Encode the label as an integer: 1 for "informative", 0 for anything else.
# The comparison is vectorised, so no Python-level lambda runs per row.
# NOTE: this overwrites the string labels; re-running the cell afterwards would
# turn every label into 0, so reload `data` before executing it again.
data['text_info'] = data['text_info'].eq("informative").astype("int64")

In [5]:
# Preview the frame again to confirm text_info now holds 0/1 integers.
data.head()

Unnamed: 0,tweet_id,tweet_text,text_info
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,1
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,1
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
4,917792092100988929,RT @TIME: California's raging wildfires as you...,1


In [6]:
# Bag-of-words vectoriser created with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [7]:
# Learn the vocabulary from the tweet texts and build the document-term count
# matrix (one row per tweet); .toarray() converts the sparse result to a dense
# NumPy array.
X = vectorizer.fit_transform(data['tweet_text']).toarray()

In [8]:
# Display the dense count matrix (NumPy truncates the repr for large arrays).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
from scipy.spatial.distance import cdist
import numpy as np

class ClusterSimilarityMatrix():
    """Accumulate pairwise co-membership counts over several clusterings.

    Every call to :meth:`fit` adds a binary matrix whose entry ``(i, j)`` is 1
    when samples ``i`` and ``j`` received the same cluster label. After ``k``
    calls, ``similarity[i, j]`` is therefore the number of clusterings that put
    the two samples in the same cluster.
    """

    def __init__(self) -> None:
        # Becomes True once the first clustering has been accumulated.
        self._is_fitted = False

    def fit(self, y_clusters):
        """Add one clustering's co-membership matrix to the running total.

        Parameters
        ----------
        y_clusters : array-like of shape (n_samples,)
            Numeric cluster label of each sample. Every call must supply the
            same number of samples, otherwise the accumulation fails.

        Returns
        -------
        ClusterSimilarityMatrix
            ``self`` on every call (not only the first), so calls can be chained.
        """
        co_membership = self.to_binary_matrix(y_clusters)
        if not self._is_fitted:
            # First clustering: it becomes the initial accumulator.
            self.similarity = co_membership
            self._is_fitted = True
        else:
            self.similarity += co_membership
        return self

    def to_binary_matrix(self, y_clusters):
        """Return an (n_samples, n_samples) 0/1 matrix of label equality.

        Entry ``(i, j)`` is 1 when ``y_clusters[i]`` and ``y_clusters[j]`` are
        at city-block distance 0, i.e. carry the same label, and 0 otherwise.
        """
        # cdist expects 2-D input, so turn the label vector into a column.
        y_reshaped = np.expand_dims(y_clusters, axis=-1)
        return (cdist(y_reshaped, y_reshaped, 'cityblock')==0).astype(int)


class EnsembleCustering():
    """Consensus clustering driven by a co-membership matrix.

    Each base estimator clusters the data independently. The fraction of
    estimators that place two samples in the same cluster forms a pairwise
    matrix, which the aggregator then clusters to produce the final labels.

    Parameters
    ----------
    base_estimators : iterable
        Clustering models exposing ``fit_predict(X=...)``.
    aggregator : object
        Clustering model exposing ``fit_predict(matrix)``; it receives the
        matrix stored in ``cluster_matrix``.
    distances : bool, default=False
        When False the aggregator is given the similarity matrix. When True the
        matrix is transformed with ``abs(log(similarity + 1e-8))`` first, so
        larger values mean the samples were grouped together less often.
    """

    def __init__(self, base_estimators, aggregator, distances=False):
        self.base_estimators = base_estimators
        self.aggregator = aggregator
        self.distances = distances

    def fit(self, X):
        """Fit every base estimator on ``X`` and build ``cluster_matrix``.

        Parameters
        ----------
        X : array-like exposing ``.copy()``
            Feature matrix handed to each base estimator.

        Returns
        -------
        EnsembleCustering
            ``self``, with ``cluster_matrix`` set.

        Raises
        ------
        ValueError
            If ``base_estimators`` yields no model.
        """
        # The estimators are given a copy rather than the caller's object.
        X_ = X.copy()

        clt_sim_matrix = ClusterSimilarityMatrix()
        n_fitted = 0
        for model in self.base_estimators:
            clt_sim_matrix.fit(model.fit_predict(X=X_))
            n_fitted += 1

        if n_fitted == 0:
            # Without this guard the attribute access below fails with an
            # opaque AttributeError.
            raise ValueError(
                "base_estimators must contain at least one clustering model"
            )

        sim_matrix = clt_sim_matrix.similarity
        # Each sample always shares a cluster with itself, so the diagonal
        # holds the per-sample maximum count; dividing by it rescales the
        # co-membership counts.
        self.cluster_matrix = sim_matrix/sim_matrix.diagonal()

        if self.distances:
            self.cluster_matrix = np.abs(np.log(self.cluster_matrix + 1e-8)) # Avoid log(0)

        return self

    def fit_predict(self, X):
        """Fit the ensemble on ``X`` and return the aggregator's labels.

        Returns whatever ``aggregator.fit_predict(cluster_matrix)`` returns.
        """
        self.fit(X)
        y = self.aggregator.fit_predict(self.cluster_matrix)
        return y

In [10]:
# Base clusterers for the ensemble; the following cells append to this list.
# Re-running an append cell adds a duplicate model, so re-run this cell first
# to reset the list.
clustering_models = []

In [11]:
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering

In [12]:
# Cluster count passed to the KMeans, Birch and AgglomerativeClustering base models.
num_clusters = 4

In [13]:
# KMeans starts from random centroids; fixing random_state makes the base
# clustering (and therefore the ensemble result) repeatable across runs.
clustering_models.append(KMeans(n_clusters=num_clusters, random_state=42))

In [14]:
# Add a Birch base model configured for num_clusters clusters.
clustering_models.append(Birch(n_clusters=num_clusters))

In [15]:
# Add a hierarchical (agglomerative) base model configured for num_clusters clusters.
clustering_models.append(AgglomerativeClustering(n_clusters=num_clusters))

In [16]:
# Add a MeanShift base model with default settings. No cluster count is passed
# here, so it may produce a different number of clusters than num_clusters.
clustering_models.append(MeanShift())

In [17]:
# Final two-way split of the consensus matrix. affinity="precomputed" makes
# SpectralClustering treat its input as a ready-made similarity matrix.
# random_state is fixed so the final labels are repeatable across runs.
aggregator_clt = SpectralClustering(n_clusters=2, affinity="precomputed", random_state=42)

In [18]:
# Combine the base clusterers with the spectral aggregator. distances is left
# at its default (False), so the aggregator receives a similarity matrix.
ens_clt=EnsembleCustering(clustering_models, aggregator_clt)

In [19]:
# Fit every base model on X, build the consensus matrix and cluster it with
# the aggregator to get one label per tweet.
y_ensemble = ens_clt.fit_predict(X)



In [21]:
# Size of each consensus cluster.
np.unique(y_ensemble, return_counts=True)

(array([0, 1], dtype=int32), array([1280,  308]))

In [22]:
# Class balance of the true labels, for comparison with the cluster sizes.
# NOTE(review): cluster ids are arbitrary, so similar counts alone do not show
# that a cluster corresponds to a label — compare per sample to verify.
data['text_info'].value_counts()

1    1245
0     343
Name: text_info, dtype: int64

In [None]:
# NOTE(review): `ensemble`, `X_test` and `y_test` are not assigned in any cell
# visible in this notebook (the clustering wrapper is named `ens_clt`, and no
# train/test split is performed) — confirm where they come from before running.
# Vectorise X_test with the fitted vectorizer and predict with `ensemble`.
pred = ensemble.predict(vectorizer.transform(X_test).toarray())
# Accuracy of the predictions against y_test.
accuracy = metrics.accuracy_score(y_test, pred)
# Precision, recall and F-score; the fourth return value (support) is discarded.
precisions, recall, f1_score, _ = metrics.precision_recall_fscore_support(y_test, pred)
   

In [None]:
# Display the accuracy computed in the preceding metrics cell.
accuracy

In [None]:
# Display the precision and recall values from the metrics cell as a tuple.
precisions, recall

In [None]:
# Display the F-score values from the metrics cell.
f1_score

In [None]:
# Fit a supervised classifier on the bag-of-words matrix.
# NOTE(review): `y_train` is not assigned in any visible cell, and `X` is the
# matrix built from the whole dataset rather than a training split — confirm
# that the rows of X and y_train line up before running this cell.
clf = LogisticRegression()
# Alternative classifiers that were tried (left disabled):
# clf = RandomForestClassifier(n_estimators=500, max_features=0.25, criterion="entropy", class_weight="balanced")
# clf = DecisionTreeClassifier()
# clf = SVC()
clf.fit(X, y_train)

In [None]:
# Vectorise X_test with the fitted vectorizer, predict with clf and show the
# accuracy against y_test.
# NOTE(review): `X_test` and `y_test` are not assigned in any visible cell —
# confirm where they come from before running.
lgr_pred = clf.predict(vectorizer.transform(X_test).toarray())
metrics.accuracy_score(y_test, lgr_pred)