In [1]:
from typing import List, Tuple, Iterable, Any
from warnings import warn
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import lightning as pl
from transformers import AutoTokenizer, RobertaModel

torch.manual_seed(42)
torch.set_float32_matmul_precision('high')

  warn(f"Failed to load image Python extension: {e}")


In [2]:
DATA_DIR = 'crisis_data'
FILE_BLACKLIST = [
    'crisis_data/Jan Szyszko_Córka leśniczego.xlsx',
    'crisis_data/Komenda Główna Policji.xlsx',
    'crisis_data/Ministerstwo Zdrowia_respiratory od handlarza bronią.xlsx',
    'crisis_data/Polska Grupa Energetyczna.xlsx',
    'crisis_data/Polski Związek Kolarski.xlsx',
    'crisis_data/Zbój_energetyk.xlsx'
]

crisis = pd.read_excel('crisis_data/Daty_kryzysów.xlsx').dropna()
crisis = crisis[~crisis['Plik'].apply(lambda x: os.path.join(DATA_DIR, x) in FILE_BLACKLIST)]

In [3]:
def clip_date_range(index: pd.DatetimeIndex, crisis_start: pd.Timestamp) -> pd.DatetimeIndex:
    return pd.date_range(max(index.min(), crisis_start - pd.Timedelta(days=60)), min(index.max(), crisis_start + pd.Timedelta(days=29)))

def extract_data(filename: str, crisis_start: pd.Timestamp, num_samples: int = 100) -> Tuple[pd.DataFrame, pd.DataFrame]:
    src_df = pd.read_excel(filename)
    
    new_cols = ['brak', 'negatywny', 'neutralny', 'pozytywny']
    new_cols_ex = [c for c in new_cols if c in src_df['Wydźwięk'].unique().tolist()]
    src_df[new_cols_ex] = pd.get_dummies(src_df['Wydźwięk'])
    for col in new_cols:
        if col not in src_df.columns:
            src_df[col] = 0

    df = src_df[['Data wydania'] + new_cols].groupby(['Data wydania']).sum()

    df = df.reindex(clip_date_range(df.index, crisis_start))
    df[new_cols] = df[new_cols].fillna(0)

    df['suma'] = df[new_cols].sum(axis=1)
    df['labels'] = df.index >= crisis_start
    if np.unique(df['labels']).shape[0] != 2:
        warn(f'Samples from only 1 class in {filename}.')
    if df.shape[0] == 0:
        warn(f'No data after clipping for {filename}.')

    text = src_df.apply(lambda x: ".".join([str(x['Tytuł publikacji']), str(x['Lead']), str(x['Kontekst publikacji'])]), axis=1)
    text_df = src_df[['Data wydania']].copy()
    text_df['text'] = text
    texts = []
    for date in df.index:
        daily_posts = text_df[text_df['Data wydania'] == date]
        texts.append(daily_posts if daily_posts.shape[0] <= num_samples else daily_posts.sample(n=num_samples))
    text_df = pd.concat(texts).reset_index(drop=True)
    
    return df, text_df

def load_data(filenames: Iterable[str], crisis_dates: Iterable[pd.Timestamp], num_samples: int = 100) -> Tuple[pd.DataFrame, pd.DataFrame]:
    assert len(filenames) == len(crisis_dates)
    dfs, text_dfs = [], []
    for i, (fname, date) in enumerate(tqdm(zip(filenames, crisis_dates), total=len(filenames))):
        df, text_df = extract_data(fname, date, num_samples)
        df = df.reset_index(names='Data wydania')
        df['group'] = i
        text_df['group'] = i
        dfs.append(df)
        text_dfs.append(text_df)
    return pd.concat(dfs, ignore_index=True), pd.concat(text_dfs, ignore_index=True)

In [4]:
class DictDataset(Dataset):
    def __init__(self, items: dict) -> None:
        super().__init__()
        self.items = items
        self.len = len(self.items[list(self.items.keys())[0]])
    
    def __getitem__(self, index):
        return {key: val[index] for key, val in self.items.items()}
    
    def __len__(self) -> int:
        return self.len

class SeriesDataset(Dataset):
    def __init__(self, series: pd.Series) -> None:
        super().__init__()
        self.series = series
    
    def __getitem__(self, index):
        return self.series.iloc[index]
    
    def __len__(self) -> int:
        return self.series.shape[0]
    
class TextVectorizer(pl.LightningModule):
    def __init__(self, pretrained_name: str, max_length: int = 256, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.max_length = max_length

        self.model = RobertaModel.from_pretrained(pretrained_name)
        self.model._modules['pooler'] = torch.nn.Identity()
    
    def forward(self, x) -> Any:
        return self.model(**x).pooler_output.mean(dim=1)

In [51]:
def add_embeddings(days_df: pd.DataFrame, text_df: pd.DataFrame, embeddings: List[torch.Tensor] | torch.Tensor) -> pd.DataFrame:
    if type(embeddings) == list:
        embeddings = torch.cat(embeddings, dim=0)
    embeddings = embeddings.numpy()
    sections = np.cumsum(text_df.groupby(['group', 'Data wydania']).count()['text']).tolist()[:-1]
    day_embeddings = np.stack([np.mean(t, axis=0) for t in np.vsplit(embeddings, sections)], axis=0)
    embedding_df = text_df[['group', 'Data wydania']].drop_duplicates().reset_index(drop=True)
    embedding_df['embedding'] = day_embeddings.tolist()
    days_df = days_df.join(day_embeddings.set_index(['group', 'Data wydania']), ['group', 'Data wydania'], how='left')
    days_df.loc[:, 'embedding'].iloc[days_df['embedding'].isna()] = pd.Series(np.zeros((days_df['embedding'].isna().sum(), 768)).tolist())
    return days_df

In [None]:
def create_dataset(df: pd.DataFrame) -> Dataset:
    sentiment_cols = ['brak', 'pozytywny', 'neutralny', 'negatywny']
    X = torch.tensor(df[sentiment_cols])
    y = torch.tensor(df['label'], dtype=torch.long)
    embeddings = torch.tensor(df['embedding'])
    groups = df['group'].to_list()




In [139]:
# df, text_df = load_data(crisis['Plik'].apply(lambda x: os.path.join(DATA_DIR, x)).to_list(), crisis['Data'].to_list())
# df.to_feather('other_data/days_df.feather')
# text_df.to_feather('other_data/posts_df.feather')

days_df = pd.read_feather('other_data/days_df.feather')
text_df = pd.read_feather('other_data/posts_df.feather')

In [6]:
# MAX_LENGTH = 256

# tokenizer = AutoTokenizer.from_pretrained("sdadas/polish-distilroberta")
# tokens  = tokenizer(text_df['text'].to_list(), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')

In [7]:
tokenizer = AutoTokenizer.from_pretrained('sdadas/polish-distilroberta')
ds = SeriesDataset(text_df['text'])
collate_fn = lambda x: tokenizer(x, truncation=True, padding=True, max_length=256, return_tensors='pt')
dl = DataLoader(ds, 256, num_workers=10, collate_fn=collate_fn, pin_memory=True)
model = TextVectorizer('sdadas/polish-distilroberta')
trainer = pl.Trainer(accelerator='gpu')
embeddings = trainer.predict(model, dl)

Some weights of the model checkpoint at sdadas/polish-distilroberta were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at sdadas/polish-distilroberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GP

Predicting: 0it [00:00, ?it/s]

In [96]:
day_embeddings = aggregate_embeddings(embeddings, text_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  days_df.loc[:, 'embedding'].iloc[days_df['embedding'].isna()] = pd.Series(np.zeros((days_df['embedding'].isna().sum(), 768)).tolist())


In [141]:
days_df

Unnamed: 0,Data wydania,brak,negatywny,neutralny,pozytywny,suma,labels,group,embedding
0,2022-11-06,0.0,2.0,150.0,26.0,178.0,False,0,"[0.023331861943006516, 0.16732020676136017, 0...."
1,2022-11-07,0.0,2.0,137.0,122.0,261.0,False,0,"[0.023742012679576874, 0.12002455443143845, 0...."
2,2022-11-08,0.0,0.0,35.0,19.0,54.0,False,0,"[0.02524435520172119, 0.14957286417484283, 0.0..."
3,2022-11-09,0.0,8.0,136.0,12.0,156.0,False,0,"[0.01904178224503994, 0.1732526421546936, 0.02..."
4,2022-11-10,0.0,22.0,1157.0,122.0,1301.0,False,0,"[0.005615352187305689, 0.15572427213191986, 0...."
...,...,...,...,...,...,...,...,...,...
6945,2015-09-12,0.0,0.0,0.0,0.0,0.0,True,77,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6946,2015-09-13,0.0,0.0,0.0,0.0,0.0,True,77,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6947,2015-09-14,0.0,0.0,0.0,0.0,0.0,True,77,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6948,2015-09-15,0.0,0.0,0.0,0.0,0.0,True,77,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [143]:
torch.tensor(days_df['embedding']).shape

torch.Size([6950, 768])