In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import BertModel
from transformers import RobertaModel
from transformers import DistilBertModel
import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer
from transformers import BertModel
from transformers import RobertaModel
from transformers import DistilBertModel
import torch
import torch.nn as nn

In [3]:
import os
import pandas as pd
from sqlalchemy import create_engine


# Connection settings are read from the environment so that no credentials are
# stored in the notebook. The " " fallbacks are placeholders only: with any of
# these variables unset the resulting URL is unlikely to be usable.
pg_user = os.getenv("POSTGRES_USER", " ")
pg_password = os.getenv("POSTGRES_PASSWORD", " ")
pg_host = os.getenv("POSTGRES_HOST", " ")
pg_port = os.getenv("POSTGRES_PORT", " ")
pg_db = os.getenv("POSTGRES_DATABASE", " ")

# URL layout: postgresql://user:password@host:port/database
# The "@" separates the credentials from the host; without it the password and
# the host are concatenated into one malformed authority component.
engine = create_engine(
    f"postgresql://{pg_user}:{pg_password}"
    f"@{pg_host}:{pg_port}/{pg_db}"
)

post_text_df = pd.read_sql('SELECT * FROM public.post_text_df;', con=engine)  # read the posts table
post_text_df

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


In [4]:
# Keep only the raw post text column; this Series is what the dataset cell
# further down converts to a list and tokenizes.
texts = post_text_df["text"]
texts

0       UK economy facing major risks\n\nThe UK manufa...
1       Aids and climate top Davos agenda\n\nClimate c...
2       Asian quake hits European shares\n\nShares in ...
3       India power shares jump on debut\n\nShares in ...
4       Lacroix label bought by US firm\n\nLuxury good...
                              ...                        
7018    OK, I would not normally watch a Farrelly brot...
7019    I give this movie 2 stars purely because of it...
7020    I cant believe this film was allowed to be mad...
7021    The version I saw of this film was the Blockbu...
7022    Piece of subtle art. Maybe a masterpiece. Doub...
Name: text, Length: 7023, dtype: object

In [5]:
def get_model(model_name):
    """Load the tokenizer and pretrained encoder for a supported model family.

    Args:
        model_name: One of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        A ``(tokenizer, model)`` tuple. Both are loaded with
        ``from_pretrained`` from the checkpoint mapped to ``model_name``.

    Raises:
        AssertionError: If ``model_name`` is not one of the supported names.
    """
    supported_families = ('bert', 'roberta', 'distilbert')
    assert model_name in supported_families

    # Hub checkpoint used for each model family.
    checkpoint_by_family = {
        'bert': 'bert-base-cased',  # https://huggingface.co/bert-base-cased
        'roberta': 'roberta-base',  # https://huggingface.co/roberta-base
        'distilbert': 'distilbert-base-cased',  # https://huggingface.co/distilbert-base-cased
    }

    # Encoder class used to instantiate each model family.
    class_by_family = {
        'bert': BertModel,
        'roberta': RobertaModel,
        'distilbert': DistilBertModel,
    }

    checkpoint = checkpoint_by_family[model_name]
    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    loaded_encoder = class_by_family[model_name].from_pretrained(checkpoint)
    return loaded_tokenizer, loaded_encoder

In [6]:
# Load the BERT tokenizer/encoder pair and choose the compute device:
# the first CUDA GPU when one is available, otherwise the CPU.
tokenizer, model = get_model('bert')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# Only query the GPU name when a GPU was actually selected; calling
# torch.cuda.get_device_name() on a CPU-only machine raises and would defeat
# the CPU fallback above.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(device))

cuda:0
NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [7]:
def tokenization(example):
    """Tokenize the ``"text"`` entry of ``example`` with the notebook-level tokenizer.

    Args:
        example: Mapping whose ``"text"`` value is passed to
            ``tokenizer.batch_encode_plus``.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for those texts, with
        special tokens added, truncation enabled and token type ids omitted.
    """
    encode_options = {
        "add_special_tokens": True,
        "return_token_type_ids": False,
        "truncation": True,
    }
    return tokenizer.batch_encode_plus(example["text"], **encode_options)

In [8]:
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset

In [9]:
class TextsDataset(Dataset):
    """Dataset of pre-tokenized texts exposing ``input_ids`` and ``attention_mask``.

    All texts are tokenized once, in the constructor, with a single
    ``batch_encode_plus`` call; items are then served by plain indexing into
    the stored encodings.
    """

    # Encoding fields returned for every item, in this order.
    _ITEM_FIELDS = ("input_ids", "attention_mask")

    def __init__(self, texts, tokenizer):
        """Tokenize ``texts`` with ``tokenizer`` and keep the result.

        Args:
            texts: Texts passed as-is to ``tokenizer.batch_encode_plus``.
            tokenizer: Object providing ``batch_encode_plus``.
        """
        # padding=True is applied within this single call, so every sequence
        # is padded against the rest of `texts`, not per DataLoader batch.
        encode_options = {
            "add_special_tokens": True,
            "return_token_type_ids": False,
            "truncation": True,
            "padding": True,
            "return_tensors": 'pt',
        }
        self.encodings = tokenizer.batch_encode_plus(texts, **encode_options)

    def __len__(self):
        """Return the number of encoded texts."""
        all_input_ids = self.encodings['input_ids']
        return len(all_input_ids)

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` entries at ``idx``."""
        return {field: self.encodings[field][idx] for field in self._ITEM_FIELDS}



In [10]:

# Tokenize the whole corpus up front and batch it in the original row order
# (shuffle=False), so that embedding i corresponds to row i of `texts`.
dataset = TextsDataset(texts.values.tolist(), tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
loader = DataLoader(
    dataset,
    batch_size=2,
    collate_fn=data_collator,
    # Pinned host memory only benefits host-to-GPU copies, so request it only
    # when CUDA is present instead of unconditionally.
    pin_memory=torch.cuda.is_available(),
    shuffle=False,
)

In [11]:
# Move the encoder to the device selected earlier (GPU when available, else CPU).
model = model.to(device)

In [12]:
# Display the module structure of the loaded encoder.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [13]:
from tqdm import tqdm
def get_embeddings_labels(model, loader):
    """Compute one first-token ([CLS]) embedding per input sequence.

    Despite the name, only embeddings are returned; no labels are read or
    produced.

    Args:
        model: Encoder called as ``model(**batch)`` whose output exposes
            ``last_hidden_state``.
        loader: Iterable of batches (dicts of tensors). Only the
            ``input_ids`` and ``attention_mask`` entries are forwarded to
            the model.

    Returns:
        A CPU tensor holding the per-batch embeddings concatenated along
        dim 0, in the order the loader produced them.
    """
    model.eval()  # evaluation mode: dropout layers are disabled

    # Follow the model's own placement rather than a notebook-level global, so
    # the inputs always land on the device the weights live on.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []

    with torch.no_grad():
        for batch in tqdm(loader):
            model_inputs = {
                key: val.to(model_device)
                for key, val in batch.items()
                if key in ('input_ids', 'attention_mask')
            }
            outputs = model(**model_inputs)
            # Hidden state at sequence position 0, i.e. the CLS token.
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            # Collect on the CPU so results do not accumulate in device memory.
            total_embeddings.append(cls_embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [14]:
# Embed every post; the recorded run took about five minutes for 3512 batches.
post_embeddings = get_embeddings_labels(model, loader)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 3512/3512 [05:19<00:00, 11.00it/s]


In [15]:
# Sanity check: one row per post, one column per hidden dimension.
post_embeddings.shape

torch.Size([7023, 768])

In [16]:
# Peek at the embedding vector of the first post.
post_embeddings[0]

tensor([ 1.4036e-01, -1.4070e-01, -5.7568e-01, -1.1817e-01, -3.1532e-01,
        -1.1438e-01,  4.3198e-01, -1.4429e-01,  3.2331e-03, -1.1878e+00,
        -3.3517e-01,  8.5946e-02, -1.0785e-01, -3.6509e-01, -3.5554e-02,
         1.1618e-01,  2.9538e-01, -6.3872e-02,  6.1660e-02, -4.6042e-01,
        -7.2604e-02, -3.3956e-01,  2.6281e-01, -3.3909e-01,  2.1391e-02,
        -1.2962e-01,  6.0904e-02,  3.6134e-01, -2.9707e-01,  7.4856e-01,
         1.0701e-01,  2.3395e-01, -3.4371e-01,  1.8407e-01,  4.9053e-02,
        -1.5357e-01,  2.0610e-02, -3.9094e-01, -6.3100e-02,  9.3924e-02,
        -2.2694e-01,  5.5329e-01,  2.8346e-01, -2.8457e-01,  2.5225e-02,
        -2.1027e-01,  3.0637e-01,  1.4088e-02,  2.5683e-03,  2.2604e-02,
        -1.5580e-01, -7.6782e-04,  1.3047e-01,  6.4576e-02, -1.3793e-01,
         1.8241e-01,  2.8353e-01,  1.1229e-01, -6.3608e-01,  5.5540e-01,
         3.0667e-01, -4.7828e-01,  1.1780e-01,  9.7994e-02,  1.9741e-01,
         2.6836e-01, -8.4358e-02, -4.3326e-02, -3.6

In [17]:
# Convert the tensor to a NumPy array explicitly instead of relying on pandas
# to coerce a torch.Tensor; detach()/cpu() keep this valid even if the tensor
# ever carries gradients or lives on the GPU.
texts_df = pd.DataFrame(post_embeddings.detach().cpu().numpy())
texts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.140363,-0.140695,-0.575681,-0.118175,-0.315325,-0.114379,0.431976,-0.144286,0.003233,-1.187819,...,0.051767,0.614535,-0.613107,0.131061,0.202857,0.175932,-0.167769,-0.137880,0.042959,0.142284
1,0.157530,-0.097739,-0.230651,-0.364431,-0.242782,0.310065,0.374489,-0.089235,0.202152,-1.130711,...,0.586031,0.652154,-0.112308,-0.085171,-0.051813,0.240048,0.200298,-0.300891,0.190542,0.019754
2,0.314568,-0.115163,-0.181322,-0.274697,-0.357378,0.285228,0.266597,0.002641,-0.033905,-1.092078,...,0.416194,0.641697,-0.326962,-0.042802,-0.073850,0.212023,-0.090192,-0.354050,-0.204324,-0.027024
3,0.415117,-0.241302,-0.260732,-0.436026,-0.194695,0.130078,0.458805,-0.235223,-0.032936,-1.008514,...,0.791115,0.562938,-0.194189,0.022461,0.108904,0.019627,0.362089,-0.150885,-0.048834,0.083070
4,0.614585,-0.235812,-0.047733,-0.406701,-0.284798,0.124150,0.545585,-0.284447,0.047563,-1.139113,...,0.605897,0.518612,0.007372,0.032436,0.015633,0.055697,0.145704,-0.061322,-0.021119,0.121545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,0.521686,0.292126,-0.210760,-0.219636,0.124439,-0.032428,0.055144,0.118430,-0.033624,-1.167992,...,0.575904,0.319888,-0.427476,0.161139,0.179984,0.053670,0.051578,-0.353712,-0.010725,-0.047067
7019,0.487033,0.442943,-0.251731,-0.303212,0.068587,-0.176149,0.235078,-0.050546,-0.011601,-1.161575,...,0.077758,0.174062,-0.359764,-0.196519,0.043920,0.178851,-0.054636,-0.233470,0.349205,-0.031997
7020,0.619477,0.274618,-0.126504,-0.110225,0.167652,-0.184623,0.244475,0.020286,0.058746,-1.046321,...,0.307723,0.187839,-0.378197,-0.223615,0.019644,0.250250,0.038904,-0.358137,-0.164277,0.171024
7021,0.694089,0.067175,-0.228680,-0.255529,0.134818,0.223566,0.249515,-0.032107,-0.160485,-1.041767,...,0.361885,0.352623,-0.307157,-0.139961,0.150053,0.015053,0.045805,0.037858,0.141017,0.124412


In [18]:
from sklearn.decomposition import PCA

# Compress the embeddings to 30 principal components.
# random_state pins the result: scikit-learn's default solver choice may pick
# a randomized SVD for data of this size, which would otherwise make the
# stored features differ from run to run.
pca = PCA(n_components=30, random_state=42)
posts_pca = pd.DataFrame(pca.fit_transform(texts_df))
posts_pca.columns = [f"PCA_{i}" for i in range(posts_pca.shape[1])]
posts_pca

Unnamed: 0,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,...,PCA_20,PCA_21,PCA_22,PCA_23,PCA_24,PCA_25,PCA_26,PCA_27,PCA_28,PCA_29
0,0.954851,-1.755630,0.206989,1.697702,2.381049,0.014750,-0.139384,0.573162,2.037537,-0.976337,...,0.337080,-0.184843,-0.193652,0.566699,-0.527048,-0.287264,-0.192171,-0.484622,-0.778505,-0.212134
1,3.082151,-0.872368,-1.121499,0.695177,-0.051375,0.234761,-0.309037,0.000413,0.246704,0.531294,...,-1.030763,0.205328,-0.432245,-0.031841,0.747930,0.438376,-0.222713,0.336269,0.001601,0.170153
2,2.298747,-0.771820,-1.484289,0.921830,-0.043767,0.577133,0.102365,0.678316,1.164005,-1.001139,...,0.150033,0.149849,-0.020774,-0.073812,0.182070,-0.879496,-0.853309,0.182570,-0.400530,0.316251
3,3.830452,-0.031845,-1.307990,-2.101638,-0.484521,-0.095325,0.206711,-0.808233,-0.210449,0.079049,...,0.199825,0.091199,-0.306309,0.109941,-0.088283,-0.034158,0.363915,-0.209983,-0.094214,0.132085
4,2.248865,0.231328,-1.637727,-1.685150,-0.175618,-0.363436,0.877843,-0.201346,-0.476314,0.037063,...,0.085809,0.336506,0.260088,-0.188166,-0.754898,0.082013,0.086568,-0.216844,-0.182454,-0.180040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,-2.656971,-1.065921,-1.688347,-0.109288,-0.562751,-0.528286,0.269985,0.190374,-1.075470,-0.433450,...,-0.560250,0.112874,-0.116136,-0.409503,-0.142918,0.364955,-0.388964,-0.519685,0.371051,-0.380896
7019,-2.547189,-0.549886,-0.247585,0.458581,-0.235309,-0.913795,0.242980,0.429369,0.440352,-0.022949,...,0.475040,-0.312022,0.349382,0.379183,-0.642329,0.759320,-0.269646,-0.491487,0.156947,0.422528
7020,-2.438387,-0.878830,-1.034503,-0.249474,-0.589286,-0.427734,0.258933,0.983647,-0.138927,-0.731650,...,-0.287236,-1.025114,-0.107608,0.214689,0.128779,0.021019,0.306247,0.006642,-0.097618,-0.182296
7021,-2.499125,-1.253238,-0.025752,-0.992780,0.589764,0.471233,0.410791,-0.301117,-0.129270,-0.344704,...,-0.124225,0.210372,-0.424361,-0.257300,0.021405,-0.209294,0.303103,0.456081,0.321806,0.232650


In [19]:
# Guard against hidden notebook state. `post_text_df` is rebound by later
# cells, and pd.concat(axis=1) aligns rows on the index, so a stale or
# mismatched frame would silently yield NaN-padded rows or duplicated columns.
assert post_text_df.index.equals(posts_pca.index), (
    "post_text_df and posts_pca must have identical row indexes"
)
assert not set(post_text_df.columns) & set(posts_pca.columns), (
    "post_text_df already contains PCA columns; reload the raw posts table"
)

# Attach the PCA features to the original post columns, row for row.
post_data = pd.concat([post_text_df, posts_pca], axis=1)
post_data

Unnamed: 0,post_id,text,topic,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,...,PCA_20,PCA_21,PCA_22,PCA_23,PCA_24,PCA_25,PCA_26,PCA_27,PCA_28,PCA_29
0,1,UK economy facing major risks\n\nThe UK manufa...,business,0.954851,-1.755630,0.206989,1.697702,2.381049,0.014750,-0.139384,...,0.337080,-0.184843,-0.193652,0.566699,-0.527048,-0.287264,-0.192171,-0.484622,-0.778505,-0.212134
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,3.082151,-0.872368,-1.121499,0.695177,-0.051375,0.234761,-0.309037,...,-1.030763,0.205328,-0.432245,-0.031841,0.747930,0.438376,-0.222713,0.336269,0.001601,0.170153
2,3,Asian quake hits European shares\n\nShares in ...,business,2.298747,-0.771820,-1.484289,0.921830,-0.043767,0.577133,0.102365,...,0.150033,0.149849,-0.020774,-0.073812,0.182070,-0.879496,-0.853309,0.182570,-0.400530,0.316251
3,4,India power shares jump on debut\n\nShares in ...,business,3.830452,-0.031845,-1.307990,-2.101638,-0.484521,-0.095325,0.206711,...,0.199825,0.091199,-0.306309,0.109941,-0.088283,-0.034158,0.363915,-0.209983,-0.094214,0.132085
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,2.248865,0.231328,-1.637727,-1.685150,-0.175618,-0.363436,0.877843,...,0.085809,0.336506,0.260088,-0.188166,-0.754898,0.082013,0.086568,-0.216844,-0.182454,-0.180040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie,-2.656971,-1.065921,-1.688347,-0.109288,-0.562751,-0.528286,0.269985,...,-0.560250,0.112874,-0.116136,-0.409503,-0.142918,0.364955,-0.388964,-0.519685,0.371051,-0.380896
7019,7316,I give this movie 2 stars purely because of it...,movie,-2.547189,-0.549886,-0.247585,0.458581,-0.235309,-0.913795,0.242980,...,0.475040,-0.312022,0.349382,0.379183,-0.642329,0.759320,-0.269646,-0.491487,0.156947,0.422528
7020,7317,I cant believe this film was allowed to be mad...,movie,-2.438387,-0.878830,-1.034503,-0.249474,-0.589286,-0.427734,0.258933,...,-0.287236,-1.025114,-0.107608,0.214689,0.128779,0.021019,0.306247,0.006642,-0.097618,-0.182296
7021,7318,The version I saw of this film was the Blockbu...,movie,-2.499125,-1.253238,-0.025752,-0.992780,0.589764,0.471233,0.410791,...,-0.124225,0.210372,-0.424361,-0.257300,0.021405,-0.209294,0.303103,0.456081,0.321806,0.232650


In [39]:
# Persist the posts together with their PCA features to the database.
post_data.to_sql(
    "fedorrybalov_post_data_dl_features",
    con=engine,
    if_exists="replace",  # an existing table with this name is replaced on every run
    index=False,  # the DataFrame index is not written as a column
    chunksize=1000,  # rows are written in batches of 1000
)

7023

In [40]:
# Read the freshly written features table back, presumably to verify the write.
# NOTE(review): this rebinds `post_text_df`, which the earlier concat cell also
# reads; re-running that cell afterwards would use this frame instead of the
# raw posts table.
post_text_df = pd.read_sql('SELECT * FROM fedorrybalov_post_data_dl_features;', con=engine) # read the table
post_text_df

Unnamed: 0,post_id,text,topic,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,...,PCA_20,PCA_21,PCA_22,PCA_23,PCA_24,PCA_25,PCA_26,PCA_27,PCA_28,PCA_29
0,1,UK economy facing major risks\n\nThe UK manufa...,business,0.954851,-1.755630,0.206989,1.697702,2.381049,0.014750,-0.139384,...,0.337080,-0.184843,-0.193652,0.566699,-0.527048,-0.287264,-0.192171,-0.484622,-0.778505,-0.212135
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,3.082151,-0.872368,-1.121499,0.695177,-0.051375,0.234761,-0.309037,...,-1.030763,0.205328,-0.432245,-0.031841,0.747930,0.438376,-0.222713,0.336269,0.001601,0.170153
2,3,Asian quake hits European shares\n\nShares in ...,business,2.298747,-0.771820,-1.484289,0.921830,-0.043767,0.577133,0.102365,...,0.150033,0.149849,-0.020774,-0.073812,0.182070,-0.879496,-0.853309,0.182570,-0.400530,0.316251
3,4,India power shares jump on debut\n\nShares in ...,business,3.830452,-0.031845,-1.307990,-2.101638,-0.484521,-0.095325,0.206711,...,0.199825,0.091199,-0.306309,0.109941,-0.088283,-0.034158,0.363915,-0.209983,-0.094214,0.132085
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,2.248865,0.231328,-1.637728,-1.685150,-0.175618,-0.363436,0.877843,...,0.085809,0.336506,0.260088,-0.188166,-0.754898,0.082013,0.086568,-0.216844,-0.182454,-0.180040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie,-2.656971,-1.065921,-1.688347,-0.109288,-0.562751,-0.528286,0.269985,...,-0.560250,0.112874,-0.116136,-0.409503,-0.142918,0.364955,-0.388964,-0.519685,0.371051,-0.380896
7019,7316,I give this movie 2 stars purely because of it...,movie,-2.547189,-0.549886,-0.247585,0.458581,-0.235309,-0.913795,0.242980,...,0.475040,-0.312022,0.349382,0.379183,-0.642329,0.759320,-0.269646,-0.491487,0.156947,0.422528
7020,7317,I cant believe this film was allowed to be mad...,movie,-2.438387,-0.878830,-1.034503,-0.249474,-0.589286,-0.427734,0.258933,...,-0.287236,-1.025114,-0.107608,0.214689,0.128779,0.021019,0.306247,0.006642,-0.097618,-0.182296
7021,7318,The version I saw of this film was the Blockbu...,movie,-2.499125,-1.253238,-0.025752,-0.992780,0.589764,0.471233,0.410791,...,-0.124225,0.210372,-0.424361,-0.257300,0.021405,-0.209294,0.303103,0.456081,0.321806,0.232650


In [None]:
# Load the table named fedorrybalov_lesson_22_user_data.
# NOTE(review): the table name and the displayed columns (user_id, gender, age,
# ...) indicate user data, yet the result is stored in `post_text_df`,
# overwriting the posts frame; a separate variable name looks intended. Confirm
# before re-running earlier cells.
post_text_df = pd.read_sql('SELECT * FROM fedorrybalov_lesson_22_user_data;', con=engine) # read the table
post_text_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads
...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic
163201,168549,0,18,Russia,Tula,2,Android,organic
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic
163203,168551,0,38,Russia,Moscow,3,iOS,organic
