In [20]:
import numpy as np
import pandas as pd
import torch

# Load the post data from the CSV file (path is relative to the working directory).
posts_info = pd.read_csv(filepath_or_buffer='post_info.csv')
print('Post data has been loaded successfully')

Post data has been loaded successfully


In [3]:
! pip3 install datasets transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.1.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.11-cp312-cp312-win_amd64.whl.metadata (8.0 kB)
Collecting huggingface-hub>=0.23.0 (from datasets)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.13.2 which is incompatible.


In [5]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   ---------------------- ----------------- 1.3/2.3 MB 7.5 MB/s eta 0:00:01
   ---------------------------------------- 2.3/2.3 MB 9.5 MB/s eta 0:00:00
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13


In [6]:
### Сделаем эмбеддинги текстов постов с помощью DistilBertModel

from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel

def get_model(model_name):
    """Load the pretrained tokenizer and encoder for one of the supported models.

    Args:
        model_name: one of 'bert', 'roberta' or 'distilbert'.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the same checkpoint.

    Raises:
        AssertionError: if ``model_name`` is not one of the supported names.
    """
    supported_names = ['bert', 'roberta', 'distilbert']
    assert model_name in supported_names

    # model name -> (checkpoint id on the Hugging Face hub, model class)
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_cls = registry[model_name]

    # The tokenizer is loaded first, then the model weights.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = model_cls.from_pretrained(checkpoint)
    return tokenizer, encoder

In [7]:
# Load the tokenizer and encoder registered under 'distilbert' in get_model
# (the 'distilbert-base-cased' checkpoint).
tokenizer, model = get_model('distilbert')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [8]:
### Сделаем датасет для постов

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

class PostDataset(Dataset):
    """Tokenized post texts, served one example at a time.

    All texts are tokenized once in the constructor (special tokens added,
    truncation enabled) but are deliberately NOT padded: every item keeps its
    own length, so a padding collator such as ``DataCollatorWithPadding`` pads
    each batch only up to the longest sequence in that batch rather than the
    longest sequence in the whole corpus.
    """

    def __init__(self, texts, tokenizer):
        """Tokenize ``texts`` with ``tokenizer``.

        Args:
            texts: list of strings to tokenize.
            tokenizer: object exposing ``batch_encode_plus`` (e.g. a Hugging
                Face tokenizer); it is also kept on ``self.tokenizer``.
        """
        super().__init__()

        # No `return_tensors` / padding here: sequences of different lengths
        # cannot be stacked into one tensor, and padding is left to the collator.
        self.texts = tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            return_token_type_ids=False,
            truncation=True,
            padding=False
        )
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` of example ``idx`` as 1-D int64 tensors."""
        return {
            'input_ids': torch.tensor(self.texts['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.texts['attention_mask'][idx], dtype=torch.long)
        }

    def __len__(self):
        """Number of tokenized texts."""
        return len(self.texts['input_ids'])
    
    
# Build the dataset from the `text` column of the posts table.
dataset = PostDataset(posts_info['text'].values.tolist(), tokenizer)

# Collator that pads the examples of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Unshuffled loader, so batches follow the row order of `posts_info`.
loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    pin_memory=True,
    collate_fn=data_collator,
)

In [9]:
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader, device=None):
    """Embed every batch of ``loader`` with ``model`` and stack the results.

    For each batch the hidden state at sequence position 0 is taken as the
    embedding of the example.

    Args:
        model: ``torch.nn.Module`` called as
            ``model(input_ids=..., attention_mask=...)`` whose output can be
            indexed with ``'last_hidden_state'``.
        loader: iterable of batches indexable by ``'input_ids'`` and
            ``'attention_mask'``.
        device: device the batch tensors are moved to. Defaults to the device
            of the model's first parameter (CPU if the model has no
            parameters), so the function does not depend on a global
            ``device`` variable being defined beforehand.

    Returns:
        CPU tensor with the per-batch embeddings concatenated along dim 0.
    """
    model.eval()

    if device is None:
        # Follow the model: inputs must live on the same device as the weights.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []

    for batch in tqdm(loader):
        # Forward only the two tensors the model needs.
        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Move results off the device right away so they do not accumulate there.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [10]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# Only query the GPU name when a GPU was actually selected:
# torch.cuda.get_device_name() raises on a machine without CUDA.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(device))

model = model.to(device)

cuda:0
Quadro RTX 4000


In [11]:
# Run the encoder over every batch of the loader and convert the stacked
# result to a NumPy array.
embeddings = get_embeddings_labels(model, loader).numpy()

# Show the array as the cell output.
embeddings

100%|████████████████████████████████████████████████████████████████████████████████| 220/220 [01:09<00:00,  3.14it/s]


array([[ 3.63150895e-01,  4.89375368e-02, -2.64081061e-01, ...,
        -1.41593322e-01,  1.59182139e-02,  9.20454768e-05],
       [ 2.36416250e-01, -1.59500927e-01, -3.27798247e-01, ...,
        -2.89936215e-01,  1.19365320e-01, -1.62333541e-03],
       [ 3.75191480e-01, -1.13943964e-01, -2.40547031e-01, ...,
        -3.38919371e-01,  5.86942211e-02, -2.12655663e-02],
       ...,
       [ 3.40382665e-01,  6.64923415e-02, -1.63184628e-01, ...,
        -8.65629688e-02,  2.03403845e-01,  3.20907943e-02],
       [ 4.32092220e-01,  1.10914288e-02, -1.17306098e-01, ...,
         7.54014999e-02,  1.02739543e-01,  1.52745005e-02],
       [ 3.04277658e-01, -7.62155876e-02, -6.77587539e-02, ...,
        -5.43488339e-02,  2.44383425e-01, -1.41483517e-02]], dtype=float32)

In [15]:
# Dimensions of the embedding matrix.
embeddings.shape

(7023, 768)

In [16]:
### Теперь кластеризуем пространство признаков в виде эмбеддингов текстов постов

from sklearn.decomposition import PCA

# Centre every feature (column) on its own mean. `embeddings.mean()` without an
# axis is one scalar over the whole matrix and does not centre the columns.
centered = embeddings - embeddings.mean(axis=0)

# Use PCA to reduce the number of features to 50. The fixed seed keeps the
# projection reproducible when scikit-learn picks its randomized SVD solver.
pca = PCA(n_components=50, random_state=0)
pca_decomp = pca.fit_transform(centered)

In [21]:
from sklearn.cluster import KMeans

# Number of text clusters; also the number of distance features built below.
n_clusters = 15

# Fixed seed so the clustering is repeatable for the same input.
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(pca_decomp)

# Cluster label assigned to every post.
posts_info['TextCluster'] = kmeans.labels_

dists_columns = [f'DistanceToCluster_{i}' for i in range(n_clusters)]

# KMeans.transform maps every sample to its distances to the cluster centres.
# The frame reuses the index of posts_info so that its rows stay aligned with
# the posts when the two frames are joined column-wise (pandas aligns on index).
dists_df = pd.DataFrame(
    data=kmeans.transform(pca_decomp),
    index=posts_info.index,
    columns=dists_columns
)

dists_df.head()

Unnamed: 0,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,3.265039,3.440374,3.396928,1.866927,3.39876,3.418766,2.352611,3.002679,3.46926,3.62203,2.833773,1.905098,2.217937,3.462479,3.440123
1,3.177069,3.155884,3.377641,1.405306,3.221546,3.32126,2.32279,2.859811,3.244966,3.359796,2.555706,2.182762,2.23694,3.385573,2.978166
2,2.962932,3.177997,3.489826,1.716033,3.292228,3.337284,2.391665,3.08999,3.391826,3.357538,2.883836,1.802533,3.034516,3.422431,2.965651
3,3.188558,3.779545,3.614577,2.448267,3.693933,3.729766,2.812101,3.317147,4.06148,3.815181,3.374131,2.424181,3.392677,3.649387,3.713351
4,2.967554,2.76732,2.840126,2.135114,2.834027,2.815793,2.028386,2.662076,3.240814,3.043798,2.142556,1.47938,2.935708,3.079051,2.643215


In [22]:
# Attach the distance features to the posts table and drop the raw text column.
# The cell is safe to re-run: distance columns left over from a previous run are
# removed before concatenating and a missing "text" column is ignored, so a second
# execution neither duplicates columns nor raises KeyError.
posts_info = pd.concat(
    (posts_info.drop(columns=dists_df.columns, errors="ignore"), dists_df),
    axis=1
).drop(columns=["text"], errors="ignore")

posts_info

Unnamed: 0,post_id,topic,TextCluster,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,1,business,3,3.265039,3.440374,3.396928,1.866927,3.398760,3.418766,2.352611,3.002679,3.469260,3.622030,2.833773,1.905098,2.217937,3.462479,3.440123
1,2,business,3,3.177069,3.155884,3.377641,1.405306,3.221546,3.321260,2.322790,2.859811,3.244966,3.359796,2.555706,2.182762,2.236940,3.385573,2.978166
2,3,business,3,2.962932,3.177997,3.489826,1.716033,3.292228,3.337284,2.391665,3.089990,3.391826,3.357538,2.883836,1.802533,3.034516,3.422431,2.965651
3,4,business,11,3.188558,3.779545,3.614577,2.448267,3.693933,3.729766,2.812101,3.317147,4.061480,3.815181,3.374131,2.424181,3.392677,3.649387,3.713351
4,5,business,11,2.967554,2.767320,2.840126,2.135114,2.834027,2.815793,2.028386,2.662076,3.240814,3.043798,2.142556,1.479380,2.935708,3.079051,2.643215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,movie,5,3.077984,1.794818,1.798245,3.000850,1.983489,1.269867,2.739766,2.818786,3.396672,2.951973,2.339939,3.001296,3.354130,3.153287,3.050743
7019,7316,movie,5,2.881163,1.800488,1.396291,3.012703,1.739630,0.944839,2.450563,2.497095,3.383455,2.597047,2.239629,2.965856,3.189683,2.935872,3.196685
7020,7317,movie,5,2.686712,1.968014,2.047667,3.258316,2.151720,1.475123,2.811653,2.544725,3.463248,2.378140,2.453856,3.186467,3.403342,2.892240,3.155552
7021,7318,movie,2,3.353119,1.423109,1.068736,3.283081,1.798801,1.498591,2.992455,3.109643,3.410108,3.306195,2.310479,3.194858,3.443137,3.461116,3.217078


Добавили новые фичи на основе близости эмбеддинга поста к центрам кластеров.