# Fine-tuning BERT for Sentiment Analysis

# importing, downloading and setting consts

In [None]:
#!g1.1
%pip install datasets
%pip install livelossplot
%pip install pymystem3 umap-learn pyarrow==6.0.0
%pip install pymystem3 --upgrade
%pip install razdel

In [66]:
#!g1.1
import numpy as np
from transformers import BertForMaskedLM, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import random
from torch.utils import data
import faiss
import gc
import warnings
from tqdm.notebook import tqdm

from livelossplot import PlotLosses
import os.path
from datasets import load_dataset
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sklearn.model_selection import train_test_split
from scipy.optimize import linear_sum_assignment as linear_assignment
from pprint import pprint
from IPython.display import clear_output
import json
from collections import OrderedDict
import random
import pandas as pd
import string
import umap
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [67]:
#!g1.1
from utils import Preprocessor, BertKMeansModel, Metrics, NewEncoder, Plot_data, KeyWords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# TRAIN

In [76]:
#!g1.1
class BertKMeansExecutor:
    def __init__(self, tokenizer=None, max_len=128, batch_size=64, lr=3e-5, model_name="bert-base-uncased", num_clusters=8):
        if tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = tokenizer
        self.model_name = model_name
        self.max_len = max_len
        self.batch_size = batch_size
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.lr=lr
        self.num_clusters = num_clusters
        self.keywords = KeyWords(language="english")
        self.umapper = Plot_data()
    
    def train(self, dataloader, epochs=1):    
        dataloader_len = len(dataloader)
        self.optimizer = AdamW(self.model.parameters(), lr=self.lr)
        total_steps = dataloader_len * epochs
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=70,
                                                         num_training_steps=total_steps)
        self.model.bert.train()
        self.model.mlm_head.train()    

        for epoch in range(epochs):
            old_centers = self.model.centers.clone().detach()
            print('====', epoch)
            tq = tqdm(dataloader, desc=f'Еpoch {epoch}')

            for batch in tq:
                self.model.zero_grad()
                batch = {i: j.to(self.device) for i, j in batch.items()}

                Z, bert_loss = self.model(**batch, do_masking=True)
                bert_loss = bert_loss

                real_Q = self.model.calc_Q(Z)
                old_Q = self.model.calc_Q(Z, centers=old_centers)
                old_P = self.model.calc_P(old_Q)
#                 cluster_loss = self.model.kl_loss(torch.log(real_Q), old_P)
                cluster_loss = self.model.kl_loss(torch.log(real_Q + 1e-08), old_P)


                loss = cluster_loss + bert_loss
#                 loss = cluster_loss
                loss.backward()

                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.1)
                self.optimizer.step()
                self.scheduler.step()
    
    def predict(self, dataloader):
        self.model.bert.eval()
        self.model.mlm_head.eval()
        ids = []
        embedings = []
        y_pred = []
        with torch.no_grad():
            for step, batch in tqdm(enumerate(dataloader)):
                batch = {i: j.to(self.device) for i, j in batch.items()}
                # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
                ids.append(batch["label"])
                output = self.model(**batch)
                Z, _ = output
                real_Q = self.model.calc_Q(Z)
                y_pred.append(torch.argmax(real_Q, dim=1))
                embedings.append(Z)
        self.model.bert.train()
        self.model.mlm_head.train()
        return torch.cat(y_pred).cpu().detach().numpy(), torch.cat(ids).cpu().detach().numpy(), \
               torch.cat(embedings).cpu().detach().numpy()
    
    def tokenize_data(self, data, ids=None):
        token_function = lambda x: self.tokenizer(x, padding="max_length", truncation=True, 
                                                  max_length=self.max_len, return_tensors='pt')
        data = list(map(token_function, data))
        if ids: ids = list(map(int, ids))
        for i in range(len(data)):
            data[i]["input_ids"] = data[i]["input_ids"].squeeze(0)
            data[i]["token_type_ids"] = data[i]["token_type_ids"].squeeze(0)
            data[i]["attention_mask"] = data[i]["attention_mask"].squeeze(0)
            if ids: data[i]["id"] = torch.tensor(ids[i])
        
        return data
    
    def load_dataset(self, filename):
        df = pd.read_excel(filename)
        data = df["text"].tolist()
        ids = df["id"].tolist()
        data = self.tokenize_data(data, ids)
        dataloader = torch.utils.data.DataLoader(data, batch_size=self.batch_size, shuffle=True)
        self.df = df
        return dataloader
    
    def init_model(self, dataloder):
        self.model = BertKMeansModel(device=self.device, num_clusters=self.num_clusters, 
                                     dataloader_to_init_kmeans=dataloder, model_name=self.model_name)
        self.model.to(self.device)
        
    def get_payload(self, filename, do_train=False, epochs=1):
        response = {"status": "success", "payload": {"intertopic_map": [],"documents": []}}
        dataloader = self.load_dataset(filename)
        self.init_model(dataloader)
        if do_train:
            self.train(dataloader, epochs=epochs)
            
        y_pred, ids, embedings = self.predict(dataloader)
        coords = self.umapper.get_2d_embeddings(embedings)
        cluster_coords = self.umapper.get_cluster_centers(coords, y_pred)
        key_words = self.keywords.get_ctfidf_keywords(self.df["text"].tolist(), y_pred)
        for cluster, values in cluster_coords.items():
            key_words_of_doc = key_words[cluster]
            if len(key_words_of_doc) > 10: key_words_of_doc = key_words_of_doc[:10]
            response["payload"]["intertopic_map"].append({
                                                            "id": cluster,
                                                            "keywords": list(map(lambda x: x[0], key_words_of_doc)),
                                                            "size": values["size"],
                                                            "cord_x": values["x"],
                                                            "cord_y": values["y"]
                                                         })
        self.df["cluster"] = 0
        self.df["x"] = .0
        self.df["y"] = .0

        for i, id_ in enumerate(ids):
            self.df["cluster"][id_] = int(y_pred[i])
            self.df["x"][id_] = int(coords[i][0])
            self.df["y"][id_] = int(coords[i][1])
        for i in range(self.df.shape[0]):
            response["payload"]["documents"].append({"id": int(self.df["id"][i]),
                                                             "cord_x": float(self.df["x"][i]),
                                                             "cord_y": float(self.df["y"][i]),
                                                             "cluster_id": int(self.df["cluster"][i]),
                                                             "description": self.df["text"][i]})
        return json.dumps(response, cls=NewEncoder, ensure_ascii=False)
        
    def get_topics_by_doc_id(self, id_):
        document = self.df[self.df["id"] == id_]["text"].tolist()
        data = self.tokenize_data(document)[0]
        self.model.bert.eval()
        self.model.mlm_head.eval()
        with torch.no_grad():
            data = {i: j.to(self.device).unsqueeze(0) for i, j in data.items()}
            output = self.model(**data)
            Q = self.model.calc_Q(output[0])
        self.model.bert.train()
        self.model.mlm_head.train()

        distribution = nn.functional.softmax(Q, dim=1)
        response = {
                        "content": document,
                         "distribution": [{"label": i, "value": value} for i, value in enumerate(distribution[0])]
                    }
        return json.dumps(response, cls=NewEncoder, ensure_ascii=False)

In [77]:
#!g1.1
preprocessor = Preprocessor(batch_size=64, max_len=128)
shuffled_datloader, unshuffled_dataloader, NUM_CLUSTERS = preprocessor.init_trec()

Using custom data configuration default
Reusing dataset trec (/tmp/xdg_cache/huggingface/datasets/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9)


  0%|          | 0/2 [00:00<?, ?it/s]

In [78]:
#!g1.1
b_kmeans = BertKMeansExecutor(num_clusters=NUM_CLUSTERS)
b_kmeans.init_model(shuffled_datloader)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/86 [00:00<?, ?it/s]

In [71]:
#!g1.1
y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)

0it [00:00, ?it/s]

In [72]:
#!g1.1
metrics = Metrics()
print("trec")

print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))
print("PURITY:", metrics.calculate_purity(y_pred, y_true))

trec
NMI: 0.14369906379502373
AR: 0.11337071453405732
ACC: 0.38462949376375644
PURITY: 0.42956713132795304


In [73]:
#!g1.1
b_kmeans.train(shuffled_datloader, epochs=20)

Еpoch 0:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 1:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 2:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 3:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 4:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 5:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 6:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 7:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 8:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 9:   0%|          | 0/86 [00:00<?, ?it/s]

Еpoch 10:   0%|          | 0/86 [00:00<?, ?it/s]

==== 0
==== 1
==== 2
==== 3
==== 4
==== 5
==== 6
==== 7
==== 8
==== 9
==== 10
==== 11


Еpoch 11:   0%|          | 0/86 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [74]:
#!g1.1
y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)

0it [00:00, ?it/s]

In [75]:
#!g1.1
metrics = Metrics()
print("trec")

print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))
print("PURITY:", metrics.calculate_purity(y_pred, y_true))

trec
NMI: 0.15229398820009243
AR: 0.11968746829403153
ACC: 0.3805942773294204
PURITY: 0.4165443873807777


In [79]:
#!g1.1
from collections import Counter
color_pallet = sns.color_palette('Spectral', n_colors=10)
for i in range(20):
    print("epoch", i)
    y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)
    emb_shape = embedings.shape
    cen_shape = b_kmeans.model.centers.shape
    print(Counter(list(y_pred)))
    all_embs = np.concatenate((embedings, b_kmeans.model.centers.cpu().detach().numpy()), 0)
    coords = b_kmeans.umapper.get_2d_embeddings(all_embs)
    plt.figure(figsize=(20,10))
    sizes = torch.ones(all_embs.shape[0])
    sizes[-cen_shape[0]:] *= 500
    plt.scatter(coords[:, 0], coords[:, 1], c=[color_pallet[x] for x in y_pred] + [color_pallet[-1]]*5, s=sizes)
    plt.savefig(f'pictures/{i}.png', dpi=600)
    b_kmeans.train(shuffled_datloader, epochs=1)

0it [00:00, ?it/s]

epoch 0
Counter({0: 1280, 1: 1076, 3: 924, 4: 889, 5: 868, 2: 415})


KeyboardInterrupt: 

Exception ignored on calling ctypes callback function: <function ExecutionEngine._raw_object_cache_notify at 0x7fd3158ab790>
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/llvmlite/binding/executionengine.py", line 171, in _raw_object_cache_notify
    def _raw_object_cache_notify(self, data):
KeyboardInterrupt: 


In [None]:
#!g1.1
metrics = Metrics()
print("tweets")

print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))

In [None]:
#!g1.1
b_kmeans.train(shuffled_datloader, epochs=20)

In [None]:
#!g1.1
y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)

In [None]:
#!g1.1
metrics = Metrics()
print("tweets")

print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))

In [None]:
#!g1.1
coords = b_kmeans.umapper.get_2d_embeddings(embedings)
plt.figure(figsize=(20,10))
color_pallet = sns.color_palette('Spectral', n_colors=89)
plt.scatter(coords[:, 0], info[:, 0], c=[color_pallet[x] for x in y_pred])

In [None]:
#!g1.1


In [86]:
import pandas as pd
df = pd.DataFrame([["0.010", "0.213", "0.024", "0.020"],               
                   ["0.021", "0.015", "0.011", "0.017"],
                   ["0.034", "0.086", "0.075", "0.021"],
                   ["0.064", "0.087", "0.098", "0.020"],
                   ["0.027", "0.051", "0.021", "0.011"],
                   ["0.011", "0.015", "0.013", "0.077"]])
df.index = ["dbpedia-14", "trec-6", "google news", "tweet", "agnews-8000", "yelp"]
df.columns = ["NMI", "ARI", "ACC", "Purity"]
df

Unnamed: 0,ΔNMI,ΔARI,ΔACC,ΔPurity
dbpedia-14,0.01,0.213,0.024,0.02
trec-6,0.021,0.015,0.011,0.017
google news,0.034,0.086,0.075,0.021
tweet,0.064,0.087,0.098,0.02
agnews-8000,0.027,0.051,0.021,0.011
yelp,0.011,0.015,0.013,0.077


In [81]:
df = pd.DataFrame([[3309, 1504, 1289, 1686, 2212],
                  [3236, 1705, 1270, 1879, 1910]])
df.index = ["Итерация 0", "Итерация 30"]
df

Unnamed: 0,0,1,2,3,4
Итерация 0,3309,1504,1289,1686,2212
Итерация 30,3236,1705,1270,1879,1910


In [None]:
#!g1.1


# DBPEDIA

In [19]:
#!g1.1
preprocessor = Preprocessor(batch_size=64, max_len=128)
shuffled_datloader, unshuffled_dataloader, NUM_CLUSTERS = preprocessor.init_dbpedia()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/70000 [00:00<?, ? examples/s]

Downloading and preparing dataset dbpedia_14/dbpedia_14 (download: 65.18 MiB, generated: 191.44 MiB, post-processed: Unknown size, total: 256.62 MiB) to /tmp/xdg_cache/huggingface/datasets/dbpedia_14/dbpedia_14/2.0.0/01dab9e10d969eadcdbc918be5a09c9190a24caeae33b10eee8f367a1e3f1f0c...
Dataset dbpedia_14 downloaded and prepared to /tmp/xdg_cache/huggingface/datasets/dbpedia_14/dbpedia_14/2.0.0/01dab9e10d969eadcdbc918be5a09c9190a24caeae33b10eee8f367a1e3f1f0c. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
#!g1.1
b_kmeans = BertKMeansExecutor(num_clusters=NUM_CLUSTERS, lr=3e-6)
b_kmeans.init_model(shuffled_datloader)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/157 [00:00<?, ?it/s]

In [21]:
#!g1.1
y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)

0it [00:00, ?it/s]

In [22]:
#!g1.1
metrics = Metrics()
print("short")

print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))
print("PURITY:", metrics.calculate_purity(y_pred, y_true))

short
NMI: 0.6826028142097458
AR: 0.44534243761209347
ACC: 0.6267
PURITY: 0.6719


In [23]:
#!g1.1
b_kmeans.train(shuffled_datloader, epochs=20)

Еpoch 0:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 1:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 2:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 3:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 4:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 5:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 6:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 7:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 8:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 9:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 10:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 11:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 12:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 13:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 14:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 15:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 16:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 17:   0%|          | 0/157 [00:00<?, ?it/s]

Еpoch 18:   0%|          | 0/157 [00:00<?, ?it/s]

==== 0
==== 1
==== 2
==== 3
==== 4
==== 5
==== 6
==== 7
==== 8
==== 9
==== 10
==== 11
==== 12
==== 13
==== 14
==== 15
==== 16
==== 17
==== 18
==== 19


Еpoch 19:   0%|          | 0/157 [00:00<?, ?it/s]

In [24]:
#!g1.1
y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)

0it [00:00, ?it/s]

In [25]:
#!g1.1
metrics = Metrics()
print("short")

print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))
print("PURITY:", metrics.calculate_purity(y_pred, y_true))

short
NMI: 0.6205360520217091
AR: 0.26555272406026176
ACC: 0.5503
PURITY: 0.5918


# NEWS


In [21]:
#!g1.1


In [26]:
#!g1.1
preprocessor = Preprocessor(batch_size=64, max_len=128)
shuffled_datloader, unshuffled_dataloader, NUM_CLUSTERS = preprocessor.init_news()

In [27]:
#!g1.1
b_kmeans = BertKMeansExecutor(num_clusters=NUM_CLUSTERS)
b_kmeans.init_model(shuffled_datloader)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/174 [00:00<?, ?it/s]

In [28]:
#!g1.1
y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)

0it [00:00, ?it/s]

In [29]:
#!g1.1
metrics = Metrics()
print("news")
print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))
print("PURITY:", metrics.calculate_purity(y_pred, y_true))

news
NMI: 0.8476952786920904
AR: 0.5515762603639739
ACC: 0.6232784229003511
PURITY: 0.822576289495004


In [30]:
#!g1.1
b_kmeans.train(shuffled_datloader, epochs=20)

Еpoch 0:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 1:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 2:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 3:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 4:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 5:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 6:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 7:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 8:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 9:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 10:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 11:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 12:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 13:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 14:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 15:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 16:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 17:   0%|          | 0/174 [00:00<?, ?it/s]

Еpoch 18:   0%|          | 0/174 [00:00<?, ?it/s]

==== 0
==== 1
==== 2
==== 3
==== 4
==== 5
==== 6
==== 7
==== 8
==== 9
==== 10
==== 11
==== 12
==== 13
==== 14
==== 15
==== 16
==== 17
==== 18
==== 19


Еpoch 19:   0%|          | 0/174 [00:00<?, ?it/s]

In [31]:
#!g1.1
y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)

0it [00:00, ?it/s]

In [32]:
#!g1.1
metrics = Metrics()
print("news")

print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))
print("PURITY:", metrics.calculate_purity(y_pred, y_true))

news
NMI: 0.8775051715539885
AR: 0.6304078260715236
ACC: 0.6981726528040327
PURITY: 0.843280223242416


In [29]:
#!g1.1


# SHORT


In [48]:
#!g1.1
preprocessor = Preprocessor(batch_size=64, max_len=128)
shuffled_datloader, unshuffled_dataloader, NUM_CLUSTERS = preprocessor.init_tweets()

In [49]:
#!g1.1
b_kmeans = BertKMeansExecutor(num_clusters=NUM_CLUSTERS)
b_kmeans.init_model(shuffled_datloader)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/39 [00:00<?, ?it/s]

In [50]:
#!g1.1
y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)

0it [00:00, ?it/s]

In [51]:
#!g1.1
metrics = Metrics()
print("yelp")

print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))

yelp
NMI: 0.7317252527303122
AR: 0.33778916283922017
ACC: 0.45671521035598706


In [52]:
#!g1.1
metrics = Metrics()

print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))
print("PURITY:", metrics.calculate_purity(y_pred, y_true))

NMI: 0.7317252527303122
AR: 0.33778916283922017
ACC: 0.45671521035598706
PURITY: 0.758495145631068


In [53]:
#!g1.1
for i in range(10):
    b_kmeans.train(shuffled_datloader, epochs=1)
    y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)
    print("epoch", 1)

    print("NMI:", metrics.NMI(y_true, y_pred))
    print("AR:", metrics.AR(y_true, y_pred))
    print("ACC:", metrics.cluster_accuracy(y_true, y_pred))

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Еpoch 0:   0%|          | 0/39 [00:00<?, ?it/s]

0it [00:00, ?it/s]

==== 0
epoch 1
NMI: 0.7375490600654301
AR: 0.4080129208005109
ACC: 0.5
==== 0
epoch 1
NMI: 0.7338880949289144
AR: 0.4110061317413801
ACC: 0.5048543689320388
==== 0
epoch 1
NMI: 0.7343652694614837
AR: 0.42148254423001685
ACC: 0.5117313915857605
==== 0
epoch 1
NMI: 0.7409730105443854
AR: 0.42376141017999625
ACC: 0.5157766990291263
==== 0
epoch 1
NMI: 0.739192740565946
AR: 0.43826932378166683
ACC: 0.5182038834951457
==== 0
epoch 1
NMI: 0.7369161563115406
AR: 0.4383087151949432
ACC: 0.5137540453074434
==== 0
epoch 1
NMI: 0.7423188333079901
AR: 0.428728411651956
ACC: 0.5157766990291263
==== 0
epoch 1
NMI: 0.7454255588885478
AR: 0.42851033890131135
ACC: 0.5157766990291263
==== 0
epoch 1
NMI: 0.7417344911032502
AR: 0.43327856802588627
ACC: 0.5194174757281553
==== 0
epoch 1
NMI: 0.7461118905588179
AR: 0.43556841665222146
ACC: 0.5206310679611651


In [54]:
#!g1.1
y_pred, y_true, embedings = b_kmeans.predict(unshuffled_dataloader)

0it [00:00, ?it/s]

In [55]:
#!g1.1
metrics = Metrics()
print("NMI:", metrics.NMI(y_true, y_pred))
print("AR:", metrics.AR(y_true, y_pred))
print("ACC:", metrics.cluster_accuracy(y_true, y_pred))
print("PURITY:", metrics.calculate_purity(y_pred, y_true))

NMI: 0.7461118905588179
AR: 0.43556841665222146
ACC: 0.5206310679611651
PURITY: 0.7593042071197411


In [56]:
#!g1.1


In [57]:
#!g1.1
plt.figure(figsize=(20,10))
plt.scatter(info[4][:,0], info[4][:,1], s=info[4][:,2]*10, c=[(sns.color_palette('Spectral', n_colors=len(info[4])))[x] for x in range(len(info[4]))])

NameError: name 'info' is not defined

<Figure size 1440x720 with 0 Axes>

In [None]:
#!g1.1
