In [1]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, random_split, DataLoader
from torch.nn.functional import cross_entropy
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchmetrics
import lightning as pl
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from sklearn.preprocessing import StandardScaler
import pandas as pd
import os
from tqdm import tqdm
from typing import List

torch.set_float32_matmul_precision('high')

In [2]:
DATA_DIR = 'crisis_data'
FILE_BLACKLIST = [
    'crisis_data/Afera Rywina.xlsx',
    'crisis_data/Ministerstwo Zdrowia_respiratory od handlarza bronią.xlsx',
    'crisis_data/Fake news_baza publikacji.xlsx'
]

files = [os.path.join(DATA_DIR, f) for f in os.listdir(DATA_DIR) if f[-5:] == '.xlsx']
for f in FILE_BLACKLIST:
    files.remove(f)

In [3]:
def extract_data(filename: str):
    src_df = pd.read_excel(filename)

    if src_df['Kryzys'].hasnans:
        src_df['Kryzys'] = src_df['Kryzys'].notna()
    else:
        src_df['Kryzys'] = (src_df['Kryzys'] != 'NIE') & (src_df['Kryzys'] != 'Nie')
    if src_df['Kryzys'].nunique() != 2:
        raise RuntimeError(f'Crisis column data error in file {filename}.')
    
    new_cols = ['brak', 'negatywny', 'neutralny', 'pozytywny']
    new_cols_ex = [c for c in new_cols if c in src_df['Wydźwięk'].unique().tolist()]
    src_df[new_cols_ex] = pd.get_dummies(src_df['Wydźwięk'])
    for col in new_cols:
        if col not in src_df.columns:
            src_df[col] = 0

    df = src_df[['Data wydania', 'Kryzys']].groupby(['Data wydania']).any()
    df = df.join(src_df[['Data wydania'] + new_cols].groupby(['Data wydania']).sum())

    df = df.reindex(pd.date_range(df.index.min(), df.index.max()))
    df[new_cols] = df[new_cols].fillna(0)
    df['Kryzys'] = df['Kryzys'].fillna(method='ffill') & df['Kryzys'].fillna(method='bfill')

    df['suma'] = df[new_cols].sum(axis=1)

    X = torch.tensor(df.drop('Kryzys', axis=1).values, dtype=torch.float32)
    zeros = X == 0
    X[1:] = X[1:] / X[:-1]
    X[0] = 1.
    X = X.pow(2)
    X = (X - 1) / (X + 1)
    X[zeros & X.isnan()] = 0.
    X[X.isnan()] = 1.

    y = torch.tensor(df['Kryzys'].values, dtype=torch.long)
    
    return X, y
    

In [4]:
def extract_raw_data(filename: str):
    src_df = pd.read_excel(filename)

    if src_df['Kryzys'].hasnans:
        src_df['Kryzys'] = src_df['Kryzys'].notna()
    else:
        src_df['Kryzys'] = (src_df['Kryzys'] != 'NIE') & (src_df['Kryzys'] != 'Nie')
    if src_df['Kryzys'].nunique() != 2:
        raise RuntimeError(f'Crisis column data error in file {filename}.')
    
    new_cols = ['brak', 'negatywny', 'neutralny', 'pozytywny']
    new_cols_ex = [c for c in new_cols if c in src_df['Wydźwięk'].unique().tolist()]
    src_df[new_cols_ex] = pd.get_dummies(src_df['Wydźwięk'])
    for col in new_cols:
        if col not in src_df.columns:
            src_df[col] = 0

    df = src_df[['Data wydania', 'Kryzys']].groupby(['Data wydania']).any()
    df = df.join(src_df[['Data wydania'] + new_cols].groupby(['Data wydania']).sum())

    df = df.reindex(pd.date_range(df.index.min(), df.index.max()))
    df[new_cols] = df[new_cols].fillna(0)
    df['Kryzys'] = df['Kryzys'].fillna(method='ffill') & df['Kryzys'].fillna(method='bfill')

    df['suma'] = df[new_cols].sum(axis=1)

    X = torch.tensor(df.drop('Kryzys', axis=1).values, dtype=torch.long)
    y = torch.tensor(df['Kryzys'].values, dtype=torch.long)
    
    return X, y
    

In [5]:
def transform_data(tensors: List):
    t2 = []
    for X, y in tensors:
        newX, newy = torch.zeros_like(X, dtype=torch.float32), y.clone().detach()
        zeros = X == 0
        newX[1:] = X[1:] / X[:-1]
        newX[0] = 1.
        newX = newX.pow(2)
        newX = (newX - 1) / (newX + 1)
        newX[zeros & newX.isnan()] = 0.
        newX[newX.isnan()] = 1.
        t2.append((newX, newy))
    return t2

In [6]:
def transform_data_simple(tensors: List):
    t2 = []
    for X, y in tensors:
        newX = torch.tensor(StandardScaler().fit_transform(X), dtype=torch.float32)
        newy = y.clone().detach()
        t2.append((newX, newy))
    return t2

In [7]:
def create_datasets(tensors, sequence_len: int = 20):
    X_seq, y_seq = [], []
    for (X, y) in tensors:
        for i in range(0, len(X) - sequence_len, sequence_len):
            X_seq.append(X[i:i+sequence_len])
            y_seq.append(y[i:i+sequence_len])
    X, y = torch.stack(X_seq), torch.stack(y_seq)
    ds = TensorDataset(X, y)
    return random_split(ds, (.8, .1, .1))
    

In [23]:
class MyModel(pl.LightningModule):
    def __init__(self, input_dim: int, hidden_dim: int, n_classes: int) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.n_classes = n_classes
        self.f1 = torchmetrics.F1Score('binary', average='macro')

        self.nets = nn.ModuleList([
            nn.Linear(self.input_dim, 16),
            nn.LSTM(16, self.hidden_dim, batch_first=True),
            nn.Sequential(
                nn.Linear(self.hidden_dim, self.n_classes),
                nn.Softmax()
            )
        ])
    
    def forward(self, x):
        x = self.nets[0](x)
        x, _ = self.nets[1](x)
        x = self.nets[2](x)
        return x

    def training_step(self, batch, batch_idx):
        X, y = batch
        y_pred = self(X)
        loss = cross_entropy(y_pred.view(-1, self.n_classes), y.view(-1))
        self.log('train_loss', loss)
        return loss

    @torch.no_grad()
    def validation_step(self, batch, batch_idx):
        X, y = batch
        y_pred = self(X)
        score = self.f1(torch.argmax(y_pred, -1), y)
        loss = cross_entropy(y_pred.view(-1, self.n_classes), y.view(-1))
        self.validation_step_losses.append(loss)
        self.log('val_score', score)
        self.log('val_loss', loss)
    
    @torch.no_grad()
    def test_step(self, batch, batch_idx):
        X, y = batch
        y_pred = self(X)
        score = self.f1(torch.argmax(y_pred, -1), y)
        loss = cross_entropy(y_pred.view(-1, self.n_classes), y.view(-1))
        self.log('test_score', score)
        self.log('test_loss', loss)

    def on_validation_epoch_start(self) -> None:
        self.validation_step_losses = []

    def on_validation_epoch_end(self):
        loss = torch.stack(self.validation_step_losses).mean(dim=0)
        self.scheduler.step(loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0002, weight_decay=0.01)
        self.scheduler = ReduceLROnPlateau(optimizer, 'min')
        return optimizer

In [24]:
# tensors = [extract_raw_data(f) for f in tqdm(files)]

# with open('other_data/tensors.pt', 'wb') as f:
#     torch.save(tensors, f)

with open('other_data/tensors.pt', 'rb') as f:
    tensors = torch.load(f)

In [25]:
tensors = transform_data_simple(tensors)

In [26]:
train_ds, val_ds, test_ds = create_datasets(tensors)

In [27]:
BATCH_SIZE = 32

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)

model = MyModel(5, 16, 2)
trainer = pl.Trainer(
    accelerator='gpu',
    callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=20)]
)
trainer.fit(model, train_dl, val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type          | Params
---------------------------------------
0 | f1   | BinaryF1Score | 0     
1 | nets | ModuleList    | 2.3 K 
---------------------------------------
2.3 K     Trainable params
0         Non-trainable params
2.3 K     Total params
0.009     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  input = module(input)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
trainer.test(model, test_dl)