In [15]:
from typing import List, Tuple, Iterable
from warnings import warn
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import lightning as pl
from transformers import AutoTokenizer, EncoderDecoderModel

In [8]:
DATA_DIR = 'crisis_data'
FILE_BLACKLIST = [
    'crisis_data/Jan Szyszko_Córka leśniczego.xlsx',
    'crisis_data/Komenda Główna Policji.xlsx',
    'crisis_data/Ministerstwo Zdrowia_respiratory od handlarza bronią.xlsx',
    'crisis_data/Polska Grupa Energetyczna.xlsx',
    'crisis_data/Polski Związek Kolarski.xlsx',
    'crisis_data/Zbój_energetyk.xlsx'
]

crisis = pd.read_excel('crisis_data/Daty_kryzysów.xlsx').dropna()
crisis = crisis[~crisis['Plik'].apply(lambda x: os.path.join(DATA_DIR, x) in FILE_BLACKLIST)]

In [9]:
def clip_date_range(index: pd.DatetimeIndex, crisis_start: pd.Timestamp) -> pd.DatetimeIndex:
    return pd.date_range(max(index.min(), crisis_start - pd.Timedelta(days=60)), min(index.max(), crisis_start + pd.Timedelta(days=29)))

def extract_data(filename: str, crisis_start: pd.Timestamp, num_samples: int = 100) -> Tuple[pd.DataFrame, pd.DataFrame]:
    src_df = pd.read_excel(filename)
    
    new_cols = ['brak', 'negatywny', 'neutralny', 'pozytywny']
    new_cols_ex = [c for c in new_cols if c in src_df['Wydźwięk'].unique().tolist()]
    src_df[new_cols_ex] = pd.get_dummies(src_df['Wydźwięk'])
    for col in new_cols:
        if col not in src_df.columns:
            src_df[col] = 0

    df = src_df[['Data wydania'] + new_cols].groupby(['Data wydania']).sum()

    df = df.reindex(clip_date_range(df.index, crisis_start))
    df[new_cols] = df[new_cols].fillna(0)

    df['suma'] = df[new_cols].sum(axis=1)
    df['labels'] = df.index >= crisis_start
    if np.unique(df['labels']).shape[0] != 2:
        warn(f'Samples from only 1 class in {filename}.')
    if df.shape[0] == 0:
        warn(f'No data after clipping for {filename}.')

    text = src_df.apply(lambda x: ".".join([str(x['Tytuł publikacji']), str(x['Lead']), str(x['Kontekst publikacji'])]), axis=1)
    text_df = src_df[['Data wydania']].copy()
    text_df['text'] = text
    texts = []
    for date in df.index:
        daily_posts = text_df[text_df['Data wydania'] == date]
        texts.append(daily_posts if daily_posts.shape[0] <= num_samples else daily_posts.sample(n=num_samples))
    text_df = pd.concat(texts).reset_index(drop=True)
    
    return df, text_df

def load_data(filenames: Iterable[str], crisis_dates: Iterable[pd.Timestamp], num_samples: int = 100) -> Tuple[pd.DataFrame, pd.DataFrame]:
    assert len(filenames) == len(crisis_dates)
    dfs, text_dfs = [], []
    for i, (fname, date) in enumerate(tqdm(zip(filenames, crisis_dates), total=len(filenames))):
        df, text_df = extract_data(fname, date, num_samples)
        df = df.reset_index(names='Data wydania')
        df['group'] = i
        text_df['group'] = i
        dfs.append(df)
        text_dfs.append(text_df)
    return pd.concat(dfs, ignore_index=True), pd.concat(text_dfs, ignore_index=True)

In [10]:
# df, text_df = load_data(crisis['Plik'].apply(lambda x: os.path.join(DATA_DIR, x)).to_list(), crisis['Data'].to_list())
# df.to_feather('other_data/days_df.feather')
# text_df.to_feather('other_data/posts_df.feather')

days_df = pd.read_feather('other_data/days_df.feather')
text_df = pd.read_feather('other_data/posts_df.feather')

In [11]:
df

Unnamed: 0,Data wydania,brak,negatywny,neutralny,pozytywny,suma,labels,group
0,2022-11-06,0.0,2.0,150.0,26.0,178.0,False,0
1,2022-11-07,0.0,2.0,137.0,122.0,261.0,False,0
2,2022-11-08,0.0,0.0,35.0,19.0,54.0,False,0
3,2022-11-09,0.0,8.0,136.0,12.0,156.0,False,0
4,2022-11-10,0.0,22.0,1157.0,122.0,1301.0,False,0
...,...,...,...,...,...,...,...,...
6945,2015-09-12,0.0,0.0,0.0,0.0,0.0,True,77
6946,2015-09-13,0.0,0.0,0.0,0.0,0.0,True,77
6947,2015-09-14,0.0,0.0,0.0,0.0,0.0,True,77
6948,2015-09-15,0.0,0.0,0.0,0.0,0.0,True,77


In [12]:
tokenizer = AutoTokenizer.from_pretrained("sdadas/polish-distilroberta")

In [14]:
text_df['tokens'] = text_df['text'].apply(lambda x: tokenizer(x, truncation=True, max_length=512))

In [24]:
text_df['tokens']

0         [input_ids, attention_mask]
1         [input_ids, attention_mask]
2         [input_ids, attention_mask]
3         [input_ids, attention_mask]
4         [input_ids, attention_mask]
                     ...             
253697    [input_ids, attention_mask]
253698    [input_ids, attention_mask]
253699    [input_ids, attention_mask]
253700    [input_ids, attention_mask]
253701    [input_ids, attention_mask]
Name: tokens, Length: 253702, dtype: object

In [64]:
class SimpleTextDataset(Dataset):
    def __init__(self, text: pd.Series) -> None:
        super().__init__()
        self.text = text
    
    def __getitem__(self, index) -> str:
        return self.text.iloc[index]
    
    def __len__(self) -> int:
        return self.text.shape[0]

In [16]:
transformer = EncoderDecoderModel.from_pretrained('sdadas/polish-distilroberta')
trainer = pl.Trainer(accelerator='gpu')

You are using a model of type roberta to instantiate a model of type encoder-decoder. This is not supported for all configurations of models and can yield errors.


AssertionError: Config has to be initialized with encoder and decoder config

In [14]:
transformer

RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50001, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [78]:
text_df['tokens'].iloc[0]

{'input_ids': [0, 17774, 8191, 13499, 37967, 5, 786, 221, 12340, 20, 2944, 514, 6, 178, 8, 49000, 5, 5, 786, 221, 12340, 20, 2944, 514, 6, 178, 8, 49000, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [77]:
transformer(np.array([text_df['tokens'].iloc[0]]))

TypeError: 'int' object is not callable