In [None]:
!pip install transformers
!pip install sacremoses

In [None]:
!pip uninstall numba
!pip install umap-learn
!pip install -U numba

In [None]:
import os

from google.colab import files
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split
import joblib
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
import torch
from transformers import HerbertTokenizer, RobertaModel, AutoTokenizer, BertModel

import umap
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
from tensorflow.keras.models import load_model

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
def get_data_set(labels, df):
  """Return the rows of `df` whose 'Label' value is in `labels`, in random order.

  Rows are gathered label by label (a label listed twice contributes its rows
  twice), then shuffled with `DataFrame.sample(frac=1)` and given a fresh
  0..n-1 index.

  Args:
    labels: iterable of label values to keep.
    df: DataFrame with a 'Label' column.

  Returns:
    A shuffled DataFrame containing only the selected rows.
  """
  label_column = df['Label']
  # np.where yields positional indices, hence the .iloc lookup below.
  positions = [
      pos
      for wanted in labels
      for pos in np.where(label_column == wanted)[0]
  ]

  subset = df.iloc[positions]
  shuffled = subset.sample(frac = 1)
  return shuffled.reset_index(drop=True)

In [None]:
def print_classes(df):
  """Map each author's short name to its numeric class label.

  For every distinct value in df['Label'] (in ascending order) the author is
  taken from the first row carrying that label. The lookup is positional, so
  it works for any DataFrame index, and labels need not be the contiguous
  range 0..n-1.

  Args:
    df: DataFrame with 'Label' and 'Author-short' columns.

  Returns:
    dict mapping str(author short name) -> label.
  """
  # One representative (first) row per label, ordered by label value.
  first_rows = df.drop_duplicates(subset="Label").sort_values("Label")

  authors = {}
  for author, label in zip(first_rows["Author-short"], first_rows["Label"].tolist()):
    authors['{}'.format(author)] = label

  return authors

In [None]:
def make_tokens(df, model):
  """Tokenize df['Text'] and return both the per-row tokens and batched tensors.

  Every text is encoded with `tokenizer.encode_plus`, padded/truncated to
  512 tokens.

  Args:
    df: DataFrame with a 'Text' column.
    model: 3-element sequence (model_name, tokenizer, model); only the
      tokenizer is used here.

  Returns:
    (df_tokens, inputs) where df_tokens has the columns 'tokens',
    'input_ids', 'token_type_ids' and 'attention_mask', and inputs is a dict
    of torch tensors with the same three field names, one row per text.
  """
  _, tokenizer, _ = model
  fields = ('input_ids', 'token_type_ids', 'attention_mask')

  def encode(sent):
    return tokenizer.encode_plus(sent, max_length=512, padding='max_length', truncation=True)

  df_tokens = pd.DataFrame()
  df_tokens['tokens'] = df['Text'].map(encode)
  for field in fields:
    # Bind `field` as a default so each lambda extracts its own key.
    df_tokens[field] = df_tokens['tokens'].map(lambda t, key=field: t[key])

  # Stack the per-row lists into one (n_rows, 512) tensor per field.
  inputs = {field: torch.tensor(np.stack(df_tokens[field])) for field in fields}

  return df_tokens, inputs


In [None]:
# Tokenize every poem (padded/truncated to 512 tokens by make_tokens).
# NOTE(review): in the visible notebook `df_raw` and `herbert` are first assigned
# in later cells, so this cell fails on a fresh top-to-bottom run.
df_tokens, inputs = make_tokens(df_raw, herbert)

In [None]:
# Run the model on the whole tokenized batch at once.
# NOTE(review): `model` is only unpacked from `herbert` in the following cell, and
# no torch.no_grad() is active here, so gradients are tracked for the full batch.
outputs = model(**inputs)

In [None]:
# Split every tokenized poem into 9 overlapping windows and embed each window.
X_stack = []
embedded = {}
model_name, tokenizer, model = herbert

NUM_CHUNKS = 9      # windows per poem
CHUNK_LEN = 100     # tokens per window (the last window runs to MAX_LEN instead)
CHUNK_STRIDE = 60   # offset between consecutive window starts
MAX_LEN = 512       # padded sequence length produced by make_tokens
TOKEN_FIELDS = ("input_ids", "token_type_ids", "attention_mask")

# Inference only: no_grad avoids building an autograd graph for every window.
with torch.no_grad():
  # Iterate over all rows instead of a hard-coded 400.
  for idx in tqdm(range(len(df_tokens))):
    # Positional access for every field (the original mixed .iloc and .loc).
    row = df_tokens.iloc[idx]
    poem_label = df_raw['Label'].iloc[idx]

    for i in range(NUM_CHUNKS):
      start = i * CHUNK_STRIDE
      # The final window is cut at the end of the sequence; slicing clips any
      # earlier window that would run past MAX_LEN.
      stop = MAX_LEN if i == NUM_CHUNKS - 1 else start + CHUNK_LEN

      inputs = {
          field: torch.tensor(np.asarray(row[field])[None, start:stop])
          for field in TOKEN_FIELDS
      }

      single_poem_output = model(**inputs)
      # Hidden state of the first token of the window.
      X_single_poem = single_poem_output[0][:,0,:].numpy()
      X_stack.append(X_single_poem[0])
      embedded[idx,i] = X_single_poem[0], poem_label

# Build the frame once, after all windows are collected (it used to be rebuilt
# on every outer-loop iteration).
df_embedded = pd.DataFrame.from_dict(embedded,  orient='index', columns=['{}_embedding'.format(model_name), 'label'])



In [None]:
def make_embedding(df, model):
  """Embed every text in `df` with the given transformer, one text at a time.

  Each text is tokenized (truncated to 512 tokens), passed through the model,
  and the hidden state of the first token of the model's first output is kept
  as the embedding.

  Args:
    df: DataFrame with 'Text' and 'Label' columns.
    model: 3-element sequence (model_name, tokenizer, model).

  Returns:
    DataFrame indexed like `df` with the columns '<model_name>_embedding'
    (one 1-D numpy array per row) and 'label'.
  """
  model_name, tokenizer, model = model
  records = []

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    for pos in tqdm(range(df.shape[0])):
      # Positional access so a non-default index on `df` does not break lookups.
      single_poem_input = df['Text'].iloc[pos]
      inputs = tokenizer.batch_encode_plus(
          [single_poem_input],
          max_length = 512,
          truncation=True,  # max_length alone does not truncate long texts
          padding="longest",
          add_special_tokens=True,
          return_tensors="pt",
      )
      single_poem_output = model(**inputs)
      X_single_poem = single_poem_output[0][:,0,:].numpy()
      records.append((X_single_poem[0], df['Label'].iloc[pos]))

  # Reuse the caller's index so the result aligns with `df` in pd.concat/join.
  df_embedded = pd.DataFrame(records, index=df.index, columns=['{}_embedding'.format(model_name), 'label'])

  return df_embedded

In [None]:
def get_X_y(df):
  """Build the feature matrix / target and split them into train, val and test.

  X stacks the arrays stored in df['Herbert_embedding']; y is df['Label'],
  re-encoded with `factorize` when fewer than 8 distinct labels are present.
  20% of the rows are held out as the test set, then 20% of the remainder as
  the validation set. The shape of X is printed.

  Args:
    df: DataFrame with 'Herbert_embedding' and 'Label' columns.

  Returns:
    X, y, X_train, X_test, y_train, y_test, X_val, y_val
  """
  X = np.stack(df['Herbert_embedding'])
  labels = df['Label']

  # With fewer than 8 classes, re-encode the labels as consecutive integers.
  n_classes = len(labels.unique())
  y = labels.factorize()[0] if n_classes < 8 else labels

  X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.2)
  X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.2)

  print(X.shape)

  return X, y, X_train, X_test, y_train, y_test,  X_val, y_val

In [None]:
def normalize_data(X):
  """Standardize every row of X to zero mean and unit standard deviation.

  All rows are processed, whatever their number (the previous version only
  handled exactly the first 400 rows).

  Args:
    X: 2-D array-like of shape (n_samples, n_features).

  Returns:
    float ndarray of the same shape where each row has had its own mean
    subtracted and been divided by its own (population) standard deviation.
    As before, a constant row divides by zero and yields nan.
  """
  X = np.asarray(X, dtype=float)

  # keepdims keeps the statistics as column vectors so they broadcast per row.
  row_mean = X.mean(axis=1, keepdims=True)
  row_std = X.std(axis=1, keepdims=True)

  X_normalized = (X - row_mean) / row_std

  return X_normalized

In [None]:
# Bundle [display name, tokenizer, encoder]; the helper functions unpack it in this order.
# NOTE(review): the weights are loaded through RobertaModel — confirm that this is the
# architecture the "allegro/herbert-large-cased" checkpoint was published with.
HERBERT_CHECKPOINT = "allegro/herbert-large-cased"
herbert = [
    "Herbert",
    HerbertTokenizer.from_pretrained(HERBERT_CHECKPOINT),
    RobertaModel.from_pretrained(HERBERT_CHECKPOINT),
]
#bert = ["Bert", AutoTokenizer.from_pretrained("bert-base-uncased"), BertModel.from_pretrained("bert-base-uncased")]

In [None]:
# Load the semicolon-separated poems table.
# `sep` is passed by keyword: pandas 2.x no longer accepts it positionally.
df_raw = pd.read_csv('/content/drive/MyDrive/wiersze_do_BERT_light.csv', sep=";")
# Only the last expression of a cell is displayed, so print the columns explicitly.
print(df_raw.columns.tolist())
df_raw.shape

In [None]:
# Keep only the columns needed downstream: the text, its class label and the author.
df_BeforeEbedding = df_raw[['Text', 'Label', 'Author-short']].copy()
df_BeforeEbedding

In [None]:
# Embed every poem with HerBERT; make_embedding runs the model once per row.
embedding = make_embedding(df_BeforeEbedding, herbert)

In [None]:
# Attach the embedding column to the text/label/author frame.
# pd.concat(axis=1) aligns the two objects on their index.
df_AfterEmbedding = pd.concat([df_BeforeEbedding, embedding['Herbert_embedding']], axis=1)
df_AfterEmbedding

In [None]:
# Author short name -> numeric label mapping, as returned by print_classes.
classes = print_classes(df_AfterEmbedding)
classes

In [None]:
# Only the full feature matrix X and target y are kept here;
# the train/test/validation splits returned by get_X_y are discarded.
X, y, _,_,_,_,_,_ = get_X_y(df_AfterEmbedding)

**UMAP (data before normalization)**

In [None]:
# UMAP: 2-D projection of the standardized embeddings, coloured by author.
df_umap = pd.DataFrame()
df_umap["y"] = df_AfterEmbedding['Author-short']
data_type = 'all'
num_classes = len(classes)
n_neighbors = [10]
min_distnces= [0.1]
n_components=2
metric='euclidean'

# Standardization does not depend on the UMAP hyper-parameters, so do it once.
scaled_X = StandardScaler().fit_transform(X)

for n_neighbor in n_neighbors:
  for min_dist in min_distnces:
    reducer = umap.UMAP(n_neighbors=n_neighbor,
            min_dist=min_dist,
            n_components=n_components,
            metric=metric,
            random_state=42)  # fixed seed so the projection is reproducible
    embed = reducer.fit_transform(scaled_X)

    df_umap["comp-1"] = embed[:,0]
    df_umap["comp-2"] = embed[:,1]

    plt.figure(figsize = (10,10))
    sns.scatterplot(x="comp-1", y="comp-2", hue=df_umap.y.tolist(),
                   palette=sns.color_palette("Set2", num_classes),
                   data=df_umap, s=100).set(title="Poems data Umap projection | Data type: {} | N_neighbors: {} | Distance: {}".format(data_type, n_neighbor, min_dist))


# savefig does not create missing directories.
os.makedirs('/content/figs', exist_ok=True)
# Saves the current figure, i.e. the one for the last hyper-parameter combination.
plt.savefig('/content/figs/umap_{}.png'.format(data_type))
files.download('/content/figs/umap_{}.png'.format(data_type))

**3D PCA (data before normalization)**

In [None]:
# 3D PCA
data_type = 'all'
scaler = StandardScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)
pca = PCA(n_components=3)
pca.fit(scaled_X)
pca_X = pca.transform(scaled_X)

In [None]:
# 3-D scatter plot of the first three principal components, one colour per author.
Xax = pca_X[:,0]
Yax = pca_X[:,1]
Zax = pca_X[:,2]

# NOTE(review): colours and author names are hard-coded per label value;
# confirm they match the mapping computed by print_classes.
cdict = {0:'cyan',1:'red',2:'blue',3:'green',4:'yellow',5:'violet',6:'orange',7:'brown'}
label = {0:'K. K. Baczyński',1:'A. Mickiewicz',2:'J. Kochanowski',3:'Cz. Miłosz',4:'W. Szymborska',5:'H. Poświatowska',6:'M. P. Jasnorzewska',7:'E. Lipska'}


fig = plt.figure(figsize=(14,9))
ax = fig.add_subplot(111,
                     projection='3d')

for class_id in np.unique(y):
  ix = np.where(y == class_id)
  ax.scatter(Xax[ix],
             Yax[ix],
             Zax[ix],
             c=cdict[class_id],
             s=60,
             label=label[class_id])

ax.set_xlabel("PC1",
              fontsize=12)
ax.set_ylabel("PC2",
              fontsize=12)
ax.set_zlabel("PC3",
              fontsize=12)

ax.view_init(30, 140)
ax.legend()
plt.title("Poems data 3D PCA projection | Data type: {}".format(data_type))

# Save BEFORE plt.show(): once the inline backend has shown the figure,
# plt.savefig would write a new, empty one.
os.makedirs('/content/figs', exist_ok=True)
fig.savefig('/content/figs/pca_3D_{}.png'.format(data_type))
plt.show()

files.download('/content/figs/pca_3D_{}.png'.format(data_type))

**Normalization**

In [None]:
# Standardize each embedding vector (row-wise) with normalize_data.
X_normalized = normalize_data(X)
print(X_normalized)

**Euclidean distance and cosine similarity**

In [None]:
# Pairwise comparison of the normalized embeddings.
n_samples, n_features = X_normalized.shape

# Euclidean distance, one row of the matrix at a time: each step is vectorized
# and memory stays at O(n_samples * n_features).
distance_euclidean_X_normalized = np.zeros((n_samples, n_samples))
for i in range(n_samples):
  distance_euclidean_X_normalized[i, :] = np.linalg.norm(X_normalized - X_normalized[i, :], axis=1)

# Dot product divided by the number of features (taken from the data rather
# than the hard-coded 1024).
distance_cosinus_X_normalized = (X_normalized @ X_normalized.T) / n_features


In [None]:
# Side-by-side heat maps of the two pairwise matrices, each with its own colour bar.
distances = {"Euclidean distance":distance_euclidean_X_normalized, "Cosinus distance":distance_cosinus_X_normalized}
data_type = "all"
fig, ax = plt.subplots(1,2, figsize=(20,10), sharey='row')
for axis, (title, matrix) in zip(ax, distances.items()):
  im = axis.imshow(matrix)
  axis.set_title(title, fontsize=20)
  fig.colorbar(im, ax=axis)

# savefig does not create missing directories.
os.makedirs('/content/figs', exist_ok=True)
fig.savefig('/content/figs/euclidean_and_cosinus.png')
files.download('/content/figs/euclidean_and_cosinus.png')


In [None]:
# Pairwise Euclidean distance between the embedding vectors of the first 400 entries.
# NOTE(review): `embedded_all` is not assigned in any visible cell — confirm where it
# comes from. The size 400 is hard-coded here.
M = np.zeros((400,400))
for i in range(0,400):
  x_i = embedded_all['Herbert_embedding'][i]
  for j in range(0,400):
    x_j = embedded_all['Herbert_embedding'][j]

    # Square root of the sum of squared differences between the two vectors.
    M[i,j] = np.sqrt(np.sum( np.abs(x_i - x_j)**2))

In [None]:
# Heat map of the distance matrix M, saved to disk and downloaded from Colab.
fig, ax = plt.subplots()
im = ax.imshow(M)
fig.colorbar(im, ax=ax)
ax.set_title('Distance between vectors | data type: all')

fig.savefig('/content/odległość_wektorów_po_ombeddingu_all.png')
files.download('/content/odległość_wektorów_po_ombeddingu_all.png')