<a href="https://colab.research.google.com/github/NikolaJanik/Polish_poetry_classification_with_transformers/blob/main/herBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Dependencies (for Colab)

In [None]:
!pip install transformers
!pip install sacremoses

# 2. Imports

In [None]:
import os

from google.colab import files
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split
import joblib
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
import torch
from transformers import HerbertTokenizer, RobertaModel, AutoTokenizer, BertModel

import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
from tensorflow.keras.models import load_model

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# 3. Load and clean data

In [None]:
# Load the poem corpus from Google Drive (the Drive must already be mounted
# under /content/drive for this path to resolve).
# `sep` is passed by keyword: pandas 2.x accepts only the file path positionally.
df_raw = pd.read_csv('/content/drive/MyDrive/wiersze_do_BERT_light.csv', sep=";")

# Remove the three unnamed columns (presumably artefacts of trailing ';'
# separators in the CSV -- TODO confirm).
df_raw = df_raw.drop(columns=['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'])

# Positional split of the corpus: the first 200 rows vs. everything after them.
df_men = df_raw[:200].reset_index(drop=True)
df_women = df_raw[200:].reset_index(drop=True)

In [None]:
def get_data_set(labels, df):
  """Select the rows of `df` whose 'Label' is listed in `labels`, then shuffle them.

  Rows are collected label by label (in the order of `labels`) through their
  positional indices, shuffled with `sample(frac=1)` and given a fresh
  0..n-1 index.

  Args:
    labels: iterable of label values to keep.
    df: DataFrame with a 'Label' column.

  Returns:
    A new, shuffled DataFrame containing only the selected rows.
  """
  positions = [
      position
      for wanted in labels
      for position in np.where(df['Label'] == wanted)[0]
  ]

  selected = df.iloc[positions]
  shuffled = selected.sample(frac=1)
  return shuffled.reset_index(drop=True)

In [None]:
# Keep all eight label values (0-7) and build the shuffled working frame.
labels = list(range(8))
df = get_data_set(labels, df_raw)

# Quick sanity report: how many distinct labels survived and the frame size.
n_distinct_labels = len(df['Label'].unique())
print("Number of classes: {}".format(n_distinct_labels))
print("Shape of new data set: {}".format(df.shape))

In [None]:
# NOTE(review): this cell relies on `herbert` (section 4) and `make_tokens`
# (section 9), which are defined further down the notebook -- those cells have
# to be executed before this one.
df_tokens, inputs = make_tokens(df_raw, herbert)

# The encoder was previously read from a global `model` that no earlier cell
# defines; take it from the [name, tokenizer, encoder] bundle instead.
_, _, model = herbert

# Slice (rather than index) row 1 so every array keeps a leading batch axis of 1.
input_ids = np.stack(df_tokens['input_ids'][1:2])
token_type_ids = np.stack(df_tokens['token_type_ids'][1:2])
attention_mask = np.stack(df_tokens['attention_mask'][1:2])

inputs = {
    "input_ids": torch.tensor(input_ids),
    "token_type_ids": torch.tensor(token_type_ids),
    "attention_mask": torch.tensor(attention_mask),
}

# Inference only: skip building the autograd graph.
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
def print_classes(df):
  """Return the number of distinct class labels in `df`.

  The label column is looked up as 'label' first (the name used by frames
  built in `make_embedding`) and as 'Label' otherwise (the name used by the
  raw CSV frames), so the helper works on both kinds of frame.

  Args:
    df: DataFrame with a 'label' or 'Label' column.

  Returns:
    int: number of unique values in the label column.

  Raises:
    KeyError: if `df` has neither a 'label' nor a 'Label' column.
  """
  for column in ('label', 'Label'):
    if column in df.columns:
      return len(df[column].unique())

  raise KeyError("expected a 'label' or 'Label' column")

In [None]:
# Count the label classes of the working frame; the bare name on the last line
# makes the notebook display the value.
classes = print_classes(df)
classes

# 4. Initialize HerBERT Model

In [None]:
# Every bundle is a list [display name, tokenizer, encoder]; the helpers below
# unpack it positionally.
# NOTE(review): the HerBERT checkpoint is loaded through RobertaModel -- check the
# load log for "newly initialized" weight warnings to confirm the weights map.
# NOTE(review): `bert` is not referenced by any other cell visible in this notebook.
HERBERT_CHECKPOINT = "allegro/herbert-large-cased"
BERT_CHECKPOINT = "bert-base-uncased"

herbert = [
    "Herbert",
    HerbertTokenizer.from_pretrained(HERBERT_CHECKPOINT),
    RobertaModel.from_pretrained(HERBERT_CHECKPOINT),
]
bert = [
    "Bert",
    AutoTokenizer.from_pretrained(BERT_CHECKPOINT),
    BertModel.from_pretrained(BERT_CHECKPOINT),
]

# 5. Generate Embeddings

In [None]:
def make_embedding(df, model):
  """Embed every poem of `df` with the first-token hidden state of an encoder.

  Args:
    df: DataFrame with a 'Text' column (poem text) and a 'Label' column.
    model: `[name, tokenizer, encoder]` bundle such as `herbert`.

  Returns:
    DataFrame indexed 0..len(df)-1 with two columns: '<name>_embedding'
    (one 1-D numpy vector per poem) and 'label'.
  """
  model_name, tokenizer, encoder = model
  embedding_column = '{}_embedding'.format(model_name)

  # Read rows positionally so the function also works when `df` does not carry
  # a default 0..n-1 index (e.g. after filtering).
  texts = df['Text'].tolist()
  poem_labels = df['Label'].tolist()

  embedded = {}
  # Inference only: no autograd graph is needed, which saves memory and time.
  with torch.no_grad():
    for idx in tqdm(range(len(texts))):
      # `truncation=True` is required for `max_length` to take effect; without
      # it poems longer than 512 tokens are passed to the encoder untruncated.
      inputs = tokenizer.batch_encode_plus(
          [texts[idx]],
          max_length=512,
          truncation=True,
          padding="longest",
          add_special_tokens=True,
          return_tensors="pt",
      )
      hidden_states = encoder(**inputs)[0]
      # Hidden state of the first token of the single sequence in the batch.
      first_token_vector = hidden_states[0, 0, :].detach().cpu().numpy()
      embedded[idx] = first_token_vector, poem_labels[idx]

  return pd.DataFrame.from_dict(embedded, orient='index', columns=[embedding_column, 'label'])

In [None]:
# Embed the whole corpus with HerBERT (one forward pass per poem).
# The result used to be stored only as `embedding` while the next line read an
# undefined `df_embedded`; bind both names to the same frame.
df_embedded = make_embedding(df_raw, herbert)
embedding = df_embedded

# Shuffle the embedded poems and renumber the rows; `df` is the frame the
# modelling cells below work on.
df = df_embedded.sample(frac=1).reset_index(drop=True)

# 6. Prepare X and y

In [None]:
def get_X_y(df):
  """Turn the embedded frame into arrays and a random train/val/test split.

  Args:
    df: DataFrame with a 'Herbert_embedding' column (one vector per row) and
      a 'label' column.

  Returns:
    Tuple `(X, y, X_train, X_test, y_train, y_test, X_val, y_val)`. The test
    part is 20% of all rows; the validation part is 20% of the remainder.
  """
  X = np.stack(df['Herbert_embedding'])
  y = df['label']

  # If there are fewer than 8 classes, re-encode the labels with factorize().
  n_label_values = len(df['label'].unique())
  if n_label_values < 8:
    y = df['label'].factorize()[0]

  # First hold out the test set, then carve the validation set out of the rest.
  X_fit, X_test, y_fit, y_test = train_test_split(X, y, test_size=0.2)
  X_train, X_val, y_train, y_val = train_test_split(X_fit, y_fit, test_size=0.2)

  print(X.shape)

  return X, y, X_train, X_test, y_train, y_test, X_val, y_val

In [None]:
# Draw one random train/validation/test split of the embedded poems.
# The names bound here are module-level globals reused by the modelling cells.
X, y, X_train, X_test, y_train, y_test,  X_val, y_val = get_X_y(df)

# 7. Run Machine Learning models

In [None]:
def run_models(X_train, y_train, X_test, y_test, data_type, classes):
  """Fit the configured classifiers, plot their confusion matrices, return the last result.

  Args:
    X_train, y_train: training features and integer labels.
    X_test, y_test: held-out features and integer labels.
    data_type: free-text tag shown in the plot title.
    classes: description of the classes used for the tick labels. Accepts a
      mapping (its keys are the names), an int (class count as returned by
      `print_classes`; ticks become 0..n-1), any other iterable of names, or
      None to keep the default ticks.

  Returns:
    `(score, cm)` -- accuracy and row-normalised confusion matrix of the last
    model in `models`.
  """
  # `print_classes` returns a plain count, so `classes.keys()` cannot be relied on.
  if classes is None:
    cls = None
  elif isinstance(classes, (int, np.integer)):
    cls = list(range(classes))
  else:
    cls = list(classes)  # for a mapping this is the list of its keys

  models = [
      ["decision_tree", DecisionTreeClassifier(max_depth=20)],
  ]

  for model_name, model_clf in models:
    model_clf.fit(X_train, y_train)
    y_pred = model_clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)

    # Pin the label set to the classes seen in training so the matrix keeps the
    # same shape even when a class is missing from this particular test split.
    cm = confusion_matrix(y_test, y_pred, labels=model_clf.classes_, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model_clf.classes_)
    disp.plot()
    disp.ax_.set_title("Model: {} | Data type: {} | Acc: {}".format(model_name, data_type, score))

    if cls is not None:
      tick_marks = np.arange(len(cls))
      plt.xticks(tick_marks, cls, rotation=45)
      plt.yticks(tick_marks, cls, rotation=50)

  return score, cm

In [None]:
# Single run of the classical models on the current split, tagged "all" in the
# plot title; the returned (score, cm) tuple is displayed as the cell output.
run_models(X_train, y_train, X_test, y_test, "all", classes)

In [None]:
# Repeat the random split + classical-ML run several times, then plot the mean
# and the standard deviation of the row-normalised confusion matrices.
model_name = "decision_tree"
data_type = 'women'
n_realizations = 20

# `print_classes` returns a class count (an int), so the class names are built
# here as "0".."n-1"; a dict is passed on because `run_models` reads its keys.
n_classes = print_classes(df)
classes = {str(label): label for label in range(n_classes)}
cls = list(classes.keys())

CM = np.zeros((n_classes, n_classes, n_realizations))
scores = []

for r in range(n_realizations):
  X, y, X_train, X_test, y_train, y_test, X_val, y_val = get_X_y(df)
  score, CM[:, :, r] = run_models(X_train, y_train, X_test, y_test, data_type, classes)
  scores.append(score)
  # run_models opens one figure per call; close it so 20 figures do not pile up.
  plt.close('all')

# Element-wise statistics over the realization axis.
CM_avrg = CM.mean(axis=2)
CM_std = CM.std(axis=2)
score_avrg = np.mean(scores)

cms = {"Average": CM_avrg, "Std": CM_std}

fig, axes = plt.subplots(1, 2, figsize=(20,10), sharey='row')

for i, (key, cm) in enumerate(cms.items()):
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cls)
  disp.plot(ax=axes[i], xticks_rotation=45)
  disp.ax_.set_title("{} | Model: {} | Data type: {} | Acc: {}". format(key, model_name, data_type, round(score_avrg, 2)))
  # Drop the per-panel colorbar and axis labels; shared ones are added below.
  disp.im_.colorbar.remove()
  disp.ax_.set_xlabel('')
  disp.ax_.set_ylabel('')

fig.text(0.40, 0.1, 'Predicted label', ha='left')
plt.subplots_adjust(wspace=0.40, hspace=0.1)

fig.colorbar(disp.im_, ax=axes)
plt.show()

# Resize the figure object itself: after plt.show() `plt.gcf()` may hand back a
# brand-new empty figure, so the resize would never reach the saved image.
fig.set_size_inches(10, 5)
out_path = '/content/figs/avrg_{}_{}.png'.format(model_name, data_type)
# The target directory does not exist on a fresh runtime.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.savefig(out_path, dpi=200)
files.download(out_path)

# 8. Run Neural Network model

In [None]:
def get_confusion_matrix(model, X_test, y_test, model_name, data_type, classes):
  """Score a probabilistic classifier on the test set and plot its confusion matrix.

  Args:
    model: fitted model whose `predict` returns one score vector per sample.
    X_test: test features.
    y_test: true integer labels of the test samples.
    model_name: name shown in the plot title.
    data_type: free-text tag shown in the plot title.
    classes: description of the classes used for the tick labels. Accepts a
      mapping (its keys are the names), an int (class count as returned by
      `print_classes`; ticks become 0..n-1), any other iterable of names, or
      None to keep the default ticks.

  Returns:
    `(score, cm)` -- accuracy and row-normalised confusion matrix.
  """
  # `print_classes` returns a plain count, so `classes.keys()` cannot be relied on.
  if classes is None:
    cls = None
  elif isinstance(classes, (int, np.integer)):
    cls = list(range(classes))
  else:
    cls = list(classes)  # for a mapping this is the list of its keys

  # Predicted class = index of the highest score in each row.
  y_pred = model.predict(X_test)
  pred_labels = np.argmax(y_pred, axis=1)

  true_labels = y_test
  score = accuracy_score(true_labels, pred_labels)
  cm = confusion_matrix(true_labels, pred_labels, normalize='true')

  disp = ConfusionMatrixDisplay(confusion_matrix=cm)
  disp.plot()
  # Three placeholders, three values: the title used to receive a stray global
  # `num_classes`, which was printed in place of the accuracy.
  disp.ax_.set_title("Model: {} | Data type: {} |  Acc: {}".format(model_name, data_type, score))

  if cls is not None:
    tick_marks = np.arange(len(cls))
    plt.xticks(tick_marks, cls, rotation=45)
    plt.yticks(tick_marks, cls, rotation=50)

  return score, cm

In [None]:
def draw_learning_curve(history, data_type, key='accuracy'):
  """Plot train/validation curves for `key` and for the loss, then save and download the figure.

  Args:
    history: object with a `.history` dict holding the per-epoch series `key`,
      'val_' + `key`, 'loss' and 'val_loss' (as returned by `model.fit`).
    data_type: free-text tag used in the figure title and the file name.
    key: name of the metric drawn in the left panel.
  """
  fig, ax = plt.subplots(1, 2, figsize=(12,6))

  # Left panel: the chosen metric; right panel: the loss.
  for axis, metric in zip(ax, (key, 'loss')):
    axis.plot(history.history[metric])
    axis.plot(history.history['val_' + metric])
    axis.set_ylabel(metric.title())
    axis.set_xlabel('Epoch')
    axis.legend(['train', 'val'])

  fig.suptitle('Learning curve | Data type: {}'.format(data_type))
  plt.show()

  out_path = '/content/figs/learning_curve_{}.png'.format(data_type)
  # The target directory does not exist on a fresh runtime.
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
  fig.savefig(out_path)
  files.download(out_path)


In [None]:
# One-hot encode the training and validation labels for the softmax classifier
# trained with categorical cross-entropy below.
# NOTE(review): y_train / y_val are rebound in place, so running this cell a
# second time would feed already-encoded arrays back into to_categorical.
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)

In [None]:
# Hyper-parameters of the dense classifier trained on the poem embeddings.
input_size = 1024  # NOTE(review): presumably the embedding width of X_train -- confirm
num_classes = 8
batch_size = 512
epochs = 200

# Three widening ReLU layers, light dropout, softmax output over the classes.
model_NN = Sequential([
    Dense(input_size, input_dim=input_size, activation='relu'),
    Dense(2*input_size, activation='relu'),
    Dense(4*input_size, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# `metrics` is given as a list: Keras 3 rejects a bare string here.
model_NN.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
history = model_NN.fit(X_train, y_train,
          batch_size=batch_size, epochs=epochs, verbose=1,
          validation_data=(X_val, y_val))

In [None]:
# Train the dense classifier on several independent random splits and collect
# the learning curves, the accuracy and the confusion matrix of every run.
n_realizations = 20
n_classes = 8
CM = np.zeros((n_classes, n_classes, n_realizations))
scores = []
train_loss_realizations = np.zeros((epochs, n_realizations))
train_acc_realizations = np.zeros((epochs, n_realizations))
val_loss_realizations = np.zeros((epochs, n_realizations))
val_acc_realizations = np.zeros((epochs, n_realizations))

# `get_confusion_matrix` reads the class names from the keys of `classes`, so a
# mapping "0".."n-1" is built once instead of passing the int from print_classes.
classes = {str(label): label for label in range(n_classes)}

for n in range(0, n_realizations):
  X, y, X_train, X_test, y_train, y_test,  X_val, y_val = get_X_y(df)
  # Fix the one-hot width so it matches the output layer even when a split
  # happens to miss the highest label.
  y_train = to_categorical(y_train, num_classes=num_classes)
  y_val = to_categorical(y_val, num_classes=num_classes)

  # A fresh model per realization, same architecture as the single run above.
  model_NN = Sequential([
    Dense(input_size, input_dim=input_size, activation='relu'),
    Dense(2*input_size, activation='relu'),
    Dense(4*input_size, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
  ])

  # `metrics` is given as a list: Keras 3 rejects a bare string here.
  model_NN.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
  history = model_NN.fit(X_train, y_train,
          batch_size=batch_size, epochs=epochs,verbose=0,
          validation_data=(X_val, y_val))

  train_loss_realizations[:,n] = history.history['loss']
  val_loss_realizations[:,n] = history.history['val_loss']
  train_acc_realizations[:,n] = history.history['accuracy']
  val_acc_realizations[:,n] = history.history['val_accuracy']

  score, CM[:,:,n] = get_confusion_matrix(model_NN, X_test, y_test, 'neural_network', 'all', classes)
  scores.append(score)
  # get_confusion_matrix opens one figure per call; close it so they do not pile up.
  plt.close('all')

In [None]:
# Per-epoch mean and standard deviation across realizations
# (rows are epochs, columns are realizations, hence axis=1).
train_loss_mean, train_loss_std = train_loss_realizations.mean(axis=1), train_loss_realizations.std(axis=1)
val_loss_mean, val_loss_std = val_loss_realizations.mean(axis=1), val_loss_realizations.std(axis=1)
train_acc_mean, train_acc_std = train_acc_realizations.mean(axis=1), train_acc_realizations.std(axis=1)
val_acc_mean, val_acc_std = val_acc_realizations.mean(axis=1), val_acc_realizations.std(axis=1)

In [None]:
# Averaged learning curves: mean line with a +/- one standard deviation band,
# accuracy on the left panel and loss on the right panel.
fontsize = 16
epoch_vec = np.arange(0,epochs)
clrs = sns.color_palette("flare")

fig, ax = plt.subplots(1,2,figsize = (12, 8))
ax[1].set_ylim([0,2])

panels = [
    (ax[0], "Accuracy", (train_acc_mean, train_acc_std), (val_acc_mean, val_acc_std)),
    (ax[1], "Loss", (train_loss_mean, train_loss_std), (val_loss_mean, val_loss_std)),
]

for panel_ax, panel_ylabel, train_stats, val_stats in panels:
  for curve_label, (curve_mean, curve_std) in (("train", train_stats), ("val", val_stats)):
    panel_ax.plot(epoch_vec, curve_mean, label = curve_label)
    panel_ax.fill_between(epoch_vec, curve_mean - curve_std, curve_mean + curve_std, alpha = 0.3, facecolor=clrs[4])
  # Axis label spelling fixed ("Traning" -> "Training").
  panel_ax.set_xlabel("Training epoch", fontsize=fontsize)
  panel_ax.set_ylabel(panel_ylabel, fontsize=fontsize)
  panel_ax.legend(fontsize = fontsize)

fig.suptitle('Learning curve | Data type: {}'.format(data_type))

out_path = '/content/figs/avgr_learning_curve_{}.png'.format(data_type)
# The target directory does not exist on a fresh runtime.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.savefig(out_path)
files.download(out_path)

In [None]:
# Element-wise mean and standard deviation of the stacked confusion matrices,
# taken over the last (realization) axis for the first n_classes rows/columns.
CM_block = CM[:n_classes, :n_classes, :]
CM_avrg = CM_block.mean(axis=2)
CM_std = CM_block.std(axis=2)
score_avrg = np.mean(scores)

In [None]:
# Plot the mean and standard deviation of the neural-network confusion matrices
# side by side with one shared colorbar.
# `print_classes` returns a class count (an int), so the class names are built
# here as "0".."n-1" instead of calling `.keys()` on that count.
classes = {str(label): label for label in range(print_classes(df))}
cls = list(classes.keys())

cms = {"Average": CM_avrg, "Std": CM_std}

fig, axes = plt.subplots(1, 2, figsize=(20,10), sharey='row')

for i, (key, cm) in enumerate(cms.items()):
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cls)
  disp.plot(ax=axes[i], xticks_rotation=45)
  disp.ax_.set_title("{} | Model: Neural Network | Data type: {} | Acc: {}".format(key, data_type, round(score_avrg,2)))
  # Drop the per-panel colorbar and axis labels; shared ones are added below.
  disp.im_.colorbar.remove()
  disp.ax_.set_xlabel('')
  disp.ax_.set_ylabel('')

fig.text(0.40, 0.1, 'Predicted label', ha='left')
plt.subplots_adjust(wspace=0.40, hspace=0.1)

fig.colorbar(disp.im_, ax=axes)
plt.show()

# Resize the figure object itself: after plt.show() `plt.gcf()` may hand back a
# brand-new empty figure, so the resize would never reach the saved image.
fig.set_size_inches(10, 5)
out_path = '/content/figs/avrg_neural_network_{}.png'.format(data_type)
# The target directory does not exist on a fresh runtime.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.savefig(out_path, dpi=200)
files.download(out_path)

In [None]:
# Confusion matrix of the most recently trained network on the current test split.
# NOTE(review): this rebinds `scores` -- the list of per-realization accuracies
# filled in the loop above -- to the single accuracy returned here.
scores, cm = get_confusion_matrix(model_NN, X_test, y_test, 'neural_network', 'all', classes)

In [None]:
# Learning curve of the most recent `fit` call (whatever `history` currently
# holds), tagged 'all' in the title and in the saved file name.
draw_learning_curve(history, 'all')

# 9. I don't know why I have this

In [None]:
def make_tokens(df, model):
  """Tokenize every text in `df['Text']` into encodings padded/truncated to 512 tokens.

  Args:
    df: DataFrame with a 'Text' column.
    model: `[name, tokenizer, encoder]` bundle; only the tokenizer is used.

  Returns:
    `(df_tokens, inputs)` where `df_tokens` has the columns 'tokens',
    'input_ids', 'token_type_ids' and 'attention_mask' (one entry per text),
    and `inputs` maps the last three names to stacked torch tensors.
  """
  _, tokenizer, _ = model
  fields = ('input_ids', 'token_type_ids', 'attention_mask')

  def encode(sent):
    return tokenizer.encode_plus(sent, max_length=512, padding='max_length', truncation=True)

  df_tokens = pd.DataFrame()
  df_tokens['tokens'] = df['Text'].map(encode)
  # Spread each field of the encoding into its own column.
  for field in fields:
    df_tokens[field] = df_tokens['tokens'].map(lambda encoding, field=field: encoding[field])

  # One (n_texts, 512) tensor per field.
  inputs = {field: torch.tensor(np.stack(df_tokens[field])) for field in fields}

  return df_tokens, inputs

In [None]:
# Scratch cell: take the token ids of the poem at label 2 as an array and keep
# positions 60..159 of it. `z` is not read by any other cell visible here.
z = np.stack(df_tokens['input_ids'][2])
z = np.stack(z[60:160])

In [None]:
# Sliding-window variant of the embedding: every poem is cut into `n_windows`
# overlapping token windows and each window is encoded separately.
model_name, tokenizer, model = herbert
window_step = 30   # offset between the starts of consecutive windows
window_size = 60   # tokens per window
n_windows = 3
# At most the first 400 poems, without running past the end of the table.
n_poems = min(400, len(df_tokens))

X_stack = []
embedded = {}
token_fields = ("input_ids", "token_type_ids", "attention_mask")

# Inference only: no autograd graph is needed.
with torch.no_grad():
  for idx in tqdm(range(n_poems)):
    poem_tokens = df_tokens.iloc[idx]

    for i in range(n_windows):
      # Dedicated names for the window bounds, so the global label vector `y`
      # is not overwritten by this loop.
      win_start = i * window_step
      win_stop = win_start + window_size

      # Batch of one: shape (1, window_size) for every field.
      inputs = {
          field: torch.tensor(np.array([poem_tokens[field][win_start:win_stop]]))
          for field in token_fields
      }

      single_poem_output = model(**inputs)
      X_single_poem = single_poem_output[0][:,0,:].detach().numpy()
      X_stack.append(X_single_poem[0])
      embedded[idx,i] = X_single_poem[0], df_raw['Label'].iloc[idx], df_raw['Author-short'].iloc[idx]

# Build the frame once, after the loops. Each record holds three values, so
# three column names are required (two names made `from_dict` raise).
df_embedded = pd.DataFrame.from_dict(
    embedded,
    orient='index',
    columns=['{}_embedding'.format(model_name), 'Label', 'Author-short'],
)

In [None]:
# Stack the window embeddings into one 2-D feature matrix.
X = np.stack(df_embedded["Herbert_embedding"])

# First 100 token ids of the first poem, right-padded to length 512 with the
# value 1 (the constant the original code padded with).
# The earlier `[0:1][0:100]` sliced rows twice, leaving a 1 x 512 array, and the
# single pad spec was then applied to both axes, producing a 512 x 1023 array.
# NOTE(review): intent inferred from the slice bounds and pad width -- confirm.
input_ids = np.asarray(df_tokens["input_ids"].iloc[0][:100])
input_ids = np.pad(input_ids, (0, 512 - len(input_ids)), mode='constant', constant_values=1)

In [None]:
# Summary statistics of the 'Words' column of the raw corpus; the tuple on the
# last line is displayed as the cell output.
word_counts = df_raw['Words']
text_length_mean, text_length_std, text_length_median = (
    np.mean(word_counts),
    np.std(word_counts),
    np.median(word_counts),
)
text_length_mean, text_length_std, text_length_median