In [None]:
!pip install transformers
!pip install sacremoses

In [None]:
from google.colab import files
import pandas as pd
import numpy as np
import random
import os
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split
import joblib
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
from transformers import HerbertTokenizer, RobertaModel, AutoTokenizer, BertModel

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import random_split
from torch.utils.data import TensorDataset, DataLoader
import torchvision.transforms as transforms
import torchvision
from sklearn.utils.class_weight import compute_class_weight

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Output directory for the saved figures. exist_ok=True keeps this cell
# re-runnable: plain os.mkdir raises FileExistsError on the second run.
os.makedirs("figs", exist_ok=True)

In [None]:
# [display name, tokenizer, encoder] triple; make_embedding unpacks it in
# exactly this order and make_experiment reads element 0 as the model name.
herbert_klej = [
    "Herbert-klej",
    HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1"),
    RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1"),
]

In [None]:
def common_compute(model, batch):
    """Run a forward pass on one batch and score it with cross-entropy.

    Args:
        model: callable mapping the batch inputs to per-class logits.
        batch: ``(inputs, targets)`` pair.

    Returns:
        ``(logits, loss, targets)`` where ``loss`` is
        ``F.cross_entropy(logits, targets)``.
    """
    inputs, targets = batch
    scores = model(inputs)
    return scores, F.cross_entropy(scores, targets), targets

def train_batch(model, optimizer, batch):
    """Perform one optimisation step on a single batch.

    Args:
        model: network being trained.
        optimizer: optimizer bound to ``model``'s parameters.
        batch: ``(inputs, targets)`` pair.

    Returns:
        ``(loss, n_correct)``: the batch loss as a detached scalar tensor and
        the number of correctly classified samples in the batch (a count, not
        a fraction).
    """
    # Clear gradients first so that anything accumulated before this call
    # cannot leak into the update.
    optimizer.zero_grad()
    logits, loss, y = common_compute(model, batch)
    loss.backward()
    optimizer.step()

    predicted = logits.detach().argmax(dim=-1)
    # Detach the returned loss: callers collect it per batch, and a
    # non-detached tensor would keep every batch's autograd graph referenced.
    return loss.detach(), (predicted == y).sum().item()

def validate_batch(model, batch):
    """Score one batch without updating the model.

    Returns:
        ``(loss, n_correct)``: the batch loss tensor and the number of samples
        whose arg-max prediction equals the target.
    """
    logits, loss, targets = common_compute(model, batch)
    best_class = torch.max(logits.data, dim=-1)[1]
    n_correct = (best_class == targets).sum().item()
    return loss, n_correct

def test_batch(model, batch):
    """Evaluate one test batch.

    Args:
        model: network under evaluation.
        batch: ``(inputs, targets)`` pair.

    Returns:
        ``(batch_size, n_correct, loss, predicted, targets)`` where
        ``batch_size`` is the number of target elements, ``n_correct`` the
        number of correct arg-max predictions, and ``predicted`` / ``targets``
        are tensors of class indices.
    """
    logits, loss, y = common_compute(model, batch)
    predicted = logits.detach().argmax(dim=-1)
    # y.numel() counts targets directly; the earlier np.array(y).size copied
    # the tensor to NumPy and cannot handle tensors that live off the CPU.
    return y.numel(), (predicted == y).sum().item(), loss, predicted, y

In [None]:
# Averaged confusion matrix

def get_confusion_matrix(CM_avrg, CM_std, model_name, data_type, classes, normalize, score_avrg=None):
  """Plot the mean and standard-deviation confusion matrices side by side.

  The figure is written to /content/figs/ and then offered for download
  through the Colab ``files`` helper.

  Args:
    CM_avrg: square matrix of per-cell means.
    CM_std: square matrix of per-cell standard deviations.
    model_name: embedding model name, used only in the output file name.
    data_type: data subset label, used in the titles and the file name.
    classes: mapping whose keys, in iteration order, are the tick labels.
    normalize: tag appended to the output file name.
    score_avrg: optional average accuracy to show in the panel titles. The
      titles omit the accuracy when this is None.
  """
  cls = list(classes)
  cms = {"Average": CM_avrg, "Std": CM_std}

  # The accuracy is an explicit, optional argument; reading it from an
  # enclosing scope raised NameError because no such global exists.
  acc_suffix = "" if score_avrg is None else " | Acc: {}".format(round(score_avrg, 2))

  fig, axes = plt.subplots(1, 2, figsize=(20,10), sharey='row')

  for i, (key, cm) in enumerate(cms.items()):
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cls)
    disp.plot(ax=axes[i], xticks_rotation=45)
    disp.ax_.set_title("{} | Model: Neural Network | Data type: {}{}".format(key, data_type, acc_suffix))
    # Drop the per-panel colorbar; a single shared one is added below.
    disp.im_.colorbar.remove()
    disp.ax_.set_xlabel('')
    disp.ax_.set_ylabel('')

  fig.text(0.40, 0.1, 'Predicted label', ha='left')
  fig.subplots_adjust(wspace=0.40, hspace=0.1)

  # NOTE(review): the shared colorbar is built from the last panel drawn (Std),
  # so its scale describes that panel only.
  fig.colorbar(disp.im_, ax=axes)

  # Save before showing: notebook backends may discard the figure on show(),
  # after which plt.gcf() would refer to a new, empty figure.
  out_path = '/content/figs/avrg_neural_network_{}_{}_{}.png'.format(model_name, data_type, normalize)
  fig.savefig(out_path, dpi=200)
  plt.show()
  plt.close(fig)

  files.download(out_path)

In [None]:
# Averaged learning curve

def draw_learning_curve(model_name, history, data_type, n_epochs, normalize, key='accuracy'):
  """Plot mean accuracy and loss per epoch with +/- one-std bands, then save and download.

  Args:
    model_name: embedding model name, used only in the output file name.
    history: mapping whose values are unpacked by position as
      (val_loss_mean, train_loss_mean, val_acc_mean, train_acc_mean,
       val_loss_std, train_loss_std, val_acc_std, train_acc_std);
      each value must support element-wise arithmetic.
    data_type: data subset label, used in the title and the file name.
    n_epochs: number of epochs; defines the x axis.
    normalize: tag appended to the output file name.
    key: accepted but not used by this function.
  """
  (val_loss_mean, train_loss_mean, val_acc_mean, train_acc_mean,
   val_loss_std, train_loss_std, val_acc_std, train_acc_std) = history.values()

  fontsize = 16
  epoch_vec = np.arange(0, n_epochs)
  fig, ax = plt.subplots(1, 2, figsize=(12, 8))
  band_color = sns.color_palette("flare")[4]

  # Right panel shows the loss, left panel the accuracy; both are drawn the same way.
  panels = (
      (ax[1], "Loss", train_loss_mean, train_loss_std, val_loss_mean, val_loss_std),
      (ax[0], "Accuracy", train_acc_mean, train_acc_std, val_acc_mean, val_acc_std),
  )

  for axis, y_label, tr_mean, tr_std, va_mean, va_std in panels:
    axis.plot(epoch_vec, tr_mean, label="train")
    axis.fill_between(epoch_vec, tr_mean - tr_std, tr_mean + tr_std, alpha=0.3, facecolor=band_color)
    # The validation curve is plotted against its implicit index.
    axis.plot(va_mean, label="val")
    axis.fill_between(epoch_vec, va_mean - va_std, va_mean + va_std, alpha=0.3, facecolor=band_color)

    axis.set_xlabel("Traning epoch", fontsize=fontsize)
    axis.set_ylabel(y_label, fontsize=fontsize)
    axis.set_yscale('log')
    axis.legend(fontsize=fontsize)

  fig.suptitle('Learning curve | Data type: {}'.format(data_type))

  out_path = '/content/figs/avgr_learning_curve_{}_{}_{}.png'.format(model_name, data_type, normalize)
  fig.savefig(out_path)
  files.download(out_path)

In [None]:
def print_classes(df):
  """Map each author's short name to its class index.

  Despite its name this function prints nothing; it returns the mapping.

  Classes are visited in ascending label order, so the key order of the
  returned dict follows the sorted labels. When the frame holds fewer than 8
  distinct labels, the values are the ranks 0..k-1 of the sorted labels;
  otherwise the values are the raw labels.

  Args:
    df: DataFrame with 'Label' and 'Author-short' columns; any index works.

  Returns:
    dict mapping str(author short name) to class index. The author name is
    taken from the first row carrying each label.
  """
  labels = df['Label'].to_numpy()
  author_names = df['Author-short'].to_numpy()
  unique_labels = np.unique(labels)  # sorted ascending

  # NOTE(review): the threshold 8 is kept from the original code; presumably
  # the full corpus has 8 authors and smaller subsets need relabelling - confirm.
  relabel = len(unique_labels) < 8

  authors = {}
  for rank, label in enumerate(unique_labels.tolist()):
    # Positional lookup on plain arrays: the frame's index labels (e.g. after
    # a shuffle or a train/test split) must not influence which row is read.
    first_row = np.flatnonzero(labels == label)[0]
    authors['{}'.format(author_names[first_row])] = rank if relabel else label

  return authors

In [None]:
def make_embedding(df, model):
  """Embed every text in ``df`` with a pretrained encoder.

  Each text is tokenised on its own, truncated to at most 512 tokens, and
  represented by the encoder's output vector at sequence position 0.

  Args:
    df: DataFrame with 'Text' and 'Label' columns.
    model: ``[name, tokenizer, encoder]`` sequence. The encoder is switched to
      eval mode as a side effect.

  Returns:
    DataFrame with columns 'embedding' (1-D numpy array per row) and 'label',
    carrying the same index as ``df``.
  """
  _, tokenizer, encoder = model
  encoder.eval()

  # Plain lists give positional access regardless of the frame's index labels.
  texts = df['Text'].tolist()
  labels = df['Label'].tolist()

  embedded = {}
  # Inference only: no_grad avoids building an autograd graph for every text.
  with torch.no_grad():
    for idx in tqdm(range(len(texts))):
      # truncation=True is required for max_length to take effect; without it
      # over-long texts are passed to the encoder at full length.
      inputs = tokenizer([texts[idx]], max_length=512, truncation=True, padding="longest",
                         add_special_tokens=True, return_tensors="pt")
      output = encoder(**inputs)
      first_token_vector = output[0][:, 0, :].detach().cpu().numpy()
      embedded[idx] = first_token_vector[0], labels[idx]

  df_embedded = pd.DataFrame.from_dict(embedded, orient='index', columns=['embedding', 'label'])
  # Re-use the caller's index so the result aligns with ``df`` on concat/join.
  df_embedded.index = df.index

  return df_embedded

In [None]:
def normalize_data(X):
  """Standardise every row of a 2-D array to zero mean and unit variance.

  Args:
    X: array-like of shape (n_rows, n_features).

  Returns:
    New float64 array of the same shape. Rows whose standard deviation is 0
    (constant rows) come back as all zeros instead of NaN.
  """
  X = np.asarray(X, dtype=np.float64)
  row_mean = X.mean(axis=1, keepdims=True)
  row_std = X.std(axis=1, keepdims=True)
  # Guard against division by zero for constant rows.
  safe_std = np.where(row_std == 0, 1.0, row_std)
  return (X - row_mean) / safe_std

In [None]:
def _split_to_loader(df_part, class_values, normalization, shuffle, batch_size=1024):
  """Turn one data split into a DataLoader of (embedding, class index) batches."""
  X = np.stack(df_part['embedding'].tolist())
  if normalization:
    X = normalize_data(X)
  # Map raw labels onto 0..n_classes-1 (rank within the sorted distinct labels),
  # which is the target encoding cross-entropy expects.
  y = np.searchsorted(class_values, df_part['Label'].to_numpy())
  dataset = TensorDataset(torch.as_tensor(X, dtype=torch.float32),
                          torch.as_tensor(y, dtype=torch.long))
  return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def make_experiment(embedding_model, df, data_type, n_realizations, normalization,
                    n_epochs=20, model_factory=None):
  """Train and test a classifier on text embeddings over several random splits.

  The texts are embedded once. Every realization then draws a fresh
  train/validation/test split, trains a new classifier and evaluates it on
  the test split. Learning curves and confusion matrices are averaged over
  the realizations and plotted.

  Args:
    embedding_model: ``[name, tokenizer, encoder]`` sequence; element 0 is used
      in output file names and the whole sequence is given to make_embedding.
    df: DataFrame with 'Text', 'Label' and 'Author-short' columns.
    data_type: label used in plot titles and output file names.
    n_realizations: number of independent repetitions (at least 1).
    normalization: when truthy, embeddings are passed through normalize_data
      and outputs are tagged "normalized.1"; otherwise "normalized.0".
    n_epochs: number of training epochs per realization.
    model_factory: zero-argument callable returning a fresh classifier;
      defaults to MLPClassifier. The classifier must produce at least as many
      logits as there are distinct labels in ``df``.

  Returns:
    ``(score_avrg, dict_history)``: the mean test accuracy in percent and a
    dict of per-epoch mean/std arrays for the training and validation loss
    and accuracy (accuracy as a fraction of samples).

  Raises:
    ValueError: if ``n_realizations`` is smaller than 1.
  """
  if n_realizations < 1:
    raise ValueError("n_realizations must be at least 1")
  if model_factory is None:
    model_factory = MLPClassifier

  normalization = bool(normalization)
  normalize = "normalized.1" if normalization else "normalized.0"
  model_name = embedding_model[0]

  # A clean 0..n-1 index keeps the embeddings aligned with their rows below.
  df = df.reset_index(drop=True)
  class_values = np.unique(df['Label'].to_numpy())
  n_classes = len(class_values)
  CM = np.zeros((n_classes, n_classes, n_realizations))
  scores = []

  train_loss_realizations = np.zeros((n_epochs, n_realizations))
  train_acc_realizations = np.zeros((n_epochs, n_realizations))
  val_loss_realizations = np.zeros((n_epochs, n_realizations))
  val_acc_realizations = np.zeros((n_epochs, n_realizations))

  embed_data = make_embedding(df, embedding_model)
  df = pd.concat([df, embed_data['embedding']], axis=1)

  for n in range(n_realizations):
    # Split into test, training and validation data for this independent
    # realization. train_test_split returns two parts per input array.
    df_trainval, df_test = train_test_split(df, test_size=0.2)
    df_train, df_val = train_test_split(df_trainval, test_size=0.1)

    train_loader = _split_to_loader(df_train, class_values, normalization, shuffle=True)
    val_loader = _split_to_loader(df_val, class_values, normalization, shuffle=False)
    test_loader = _split_to_loader(df_test, class_values, normalization, shuffle=False)

    model_NN = model_factory()
    optimizer = torch.optim.Adam(model_NN.parameters(), lr = 1e-4)

    # Training loop
    for epoch in range(n_epochs):
      model_NN.train()
      batch_losses = []
      n_correct = 0
      n_seen = 0
      for batch in tqdm(train_loader, position=0, leave=False, desc='epoch %d'%epoch):
        loss, correct = train_batch(model_NN, optimizer, batch)
        batch_losses.append(loss.item())
        n_correct += correct
        n_seen += len(batch[1])
      avg_train_loss = float(np.mean(batch_losses))
      print('train_loss', avg_train_loss)
      train_loss_realizations[epoch, n] = avg_train_loss
      # train_batch reports a count of correct predictions; divide by the
      # number of samples to obtain an accuracy.
      train_acc_realizations[epoch, n] = n_correct / n_seen

      model_NN.eval()
      with torch.no_grad():
        batch_losses = []
        n_correct = 0
        n_seen = 0
        for batch in val_loader:
          loss, correct = validate_batch(model_NN, batch)
          batch_losses.append(loss.item())
          n_correct += correct
          n_seen += len(batch[1])
      avg_val_loss = float(np.mean(batch_losses))
      print('val_loss', avg_val_loss)
      val_loss_realizations[epoch, n] = avg_val_loss
      val_acc_realizations[epoch, n] = n_correct / n_seen

    # Test loop
    model_NN.eval()
    true_labels = []
    pred_labels = []
    correct = 0
    total = 0
    with torch.no_grad():
      for batch in tqdm(test_loader, position=0, leave=False, desc='test'):
        batch_size, batch_correct, _, predicted, y = test_batch(model_NN, batch)
        total += batch_size
        correct += batch_correct
        true_labels.append(y)
        pred_labels.append(predicted)

    score = 100 * float(correct) / total
    print('Acc: {}'.format(score))
    scores.append(score)

    # confusion_matrix expects flat arrays in the order (y_true, y_pred);
    # labels= pins the shape even if a class is missing from this test split.
    y_true = torch.cat(true_labels).cpu().numpy()
    y_pred = torch.cat(pred_labels).cpu().numpy()
    CM[:, :, n] = confusion_matrix(y_true, y_pred, labels=np.arange(n_classes), normalize='true')

  # Data for the averaged learning curve
  val_loss_mean = np.mean(val_loss_realizations, axis=1)
  train_loss_mean = np.mean(train_loss_realizations, axis=1)
  val_acc_mean = np.mean(val_acc_realizations, axis=1)
  train_acc_mean = np.mean(train_acc_realizations, axis=1)

  val_loss_std = np.std(val_loss_realizations, axis=1)
  train_loss_std = np.std(train_loss_realizations, axis=1)
  val_acc_std = np.std(val_acc_realizations, axis=1)
  train_acc_std = np.std(train_acc_realizations, axis=1)

  # draw_learning_curve unpacks these values by position: keep the key order.
  dict_history = {"val_loss_mean":val_loss_mean,
                  "train_loss_mean":train_loss_mean,
                  "val_acc_mean":val_acc_mean,
                  "train_acc_mean":train_acc_mean,
                  "val_loss_std":val_loss_std,
                  "train_loss_std":train_loss_std,
                  "val_acc_std":val_acc_std,
                  "train_acc_std":train_acc_std}

  # Averaged learning curve
  draw_learning_curve(model_name, dict_history, data_type, n_epochs, normalize, key='accuracy')

  # Averaged confusion matrix
  CM_avrg = np.mean(CM, axis=2)
  CM_std = np.std(CM, axis=2)
  score_avrg = np.mean(scores)
  print('Average acc: {}'.format(round(score_avrg, 2)))

  classes = print_classes(df)

  get_confusion_matrix(CM_avrg, CM_std, model_name, data_type, classes, normalize)

  return score_avrg, dict_history

In [None]:
class MLPClassifier(nn.Module):
    """Three-layer perceptron mapping an embedding vector to class logits.

    Layer widths are input_dim -> 2*input_dim -> 4*input_dim -> n_classes,
    with ReLU after the first two layers and dropout before the output layer.

    Args:
        input_dim: size of each input vector (default 768).
        n_classes: number of output logits (default 4).
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, input_dim=768, n_classes=4, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, input_dim * 2)
        self.fc2 = nn.Linear(input_dim * 2, input_dim * 4)
        self.out = nn.Linear(input_dim * 4, n_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape (batch, n_classes) for a batch of inputs."""
        # Flatten everything but the batch dimension. view(x.size(0)) alone
        # asks for a 1-D tensor and fails for any input with more than one
        # feature per sample.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.out(x)
        return x

In [None]:
# The CSV is semicolon-delimited. The separator is passed by keyword because
# pandas 2.0+ accepts only the path as a positional argument of read_csv.
df_raw = pd.read_csv('/content/polish_poetry.csv', sep=";")
df_raw.shape

In [None]:
# Keep only the columns the experiments use, then split by row position:
# the first 200 rows become df_men, the remaining rows df_women.
# NOTE(review): this presumably relies on the CSV being ordered by author group - confirm.
df_men = df_raw[["Text", "Label", "Author-short"]].iloc[:200].reset_index(drop=True)
df_women = df_raw[["Text", "Label", "Author-short"]].iloc[200:].reset_index(drop=True)

In [None]:
# Full corpus restricted to the columns the experiments use, with rows shuffled
# (sample(frac=1) draws every row once in random order) and the index rebuilt.
df_all = df_raw[["Text", "Label", "Author-short"]].sample(frac=1).reset_index(drop=True)
df_all.head()

In [None]:
# Run the experiment on the full, shuffled corpus.
df = df_all
model = herbert_klej
data_type = 'all'
# The name must match the argument passed below; `n_realization` (without the
# trailing "s") left `n_realizations` undefined and the call raised NameError.
n_realizations = 10

# NOTE(review): the history is stored as `dict_history_men` although data_type
# is 'all'; the name is kept in case later cells refer to it.
score, dict_history_men = make_experiment(model, df, data_type, n_realizations, normalization=True)