In [11]:
import json
import os
import pathlib

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as skmetrics
from keras import layers
from matplotlib.ticker import MaxNLocator
from sklearn.preprocessing import LabelEncoder

In [4]:
# The dataset lives in the `data` folder next to the notebook's working directory.
path_to_data = pathlib.Path.cwd().parent.absolute().joinpath('data/hromadske.jsonl')

# Keep every raw line (newline included); parsing is done in a later cell.
with path_to_data.open('r', encoding='utf-8') as f:
    data = list(f)

In [8]:
# Parse each JSONL line into a Python object.
# `data` is rebound from raw text lines to parsed records, so entries that are
# no longer strings are passed through untouched: this keeps the cell safe to
# re-run. Blank lines are skipped because `json.loads('')` raises.
parsed_records = []
for record in data:
    if isinstance(record, str):
        record = record.strip()
        if not record:
            continue
        record = json.loads(record)
    parsed_records.append(record)
data = parsed_records

In [12]:
df = pd.DataFrame.from_records(data)[['cleared_text', 'tonality', 'embedding_clf']]

# The modelling cells further down read `df_train` / `df_test` and the columns
# 'Embeddings', 'Class Name' and 'Encoded Label', none of which were created
# anywhere, so the notebook failed on a fresh kernel. Build them here.
# NOTE(review): assumes 'tonality' is the target and 'embedding_clf' holds the
# feature vectors, and that an 80/20 random split is acceptable -- confirm.
TRAIN_FRACTION = 0.8  # share of rows used for training; the rest is held out
SPLIT_SEED = 42       # fixed seed so the split is reproducible

df_labelled = (
    df.dropna(subset=['tonality', 'embedding_clf'])
      .rename(columns={'tonality': 'Class Name', 'embedding_clf': 'Embeddings'})
)
# Integer class ids, as needed by the sparse categorical cross-entropy loss.
df_labelled['Encoded Label'] = LabelEncoder().fit_transform(df_labelled['Class Name'])

df_train = df_labelled.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
df_test = df_labelled.drop(index=df_train.index)

df.head(3)

Unnamed: 0,cleared_text,tonality,embedding_clf
0,Росіяни масовано атакували енергетику України ...,негативна,"[0.010125471, 0.03413575, -0.0815052, -0.00182..."
1,Це була одна з найбільших атак на енергетику з...,нейтральна,"[-0.020650906, 0.018376114, -0.055988964, 0.01..."
2,"У Бєлгороді заявили про обстріл міста, відомо ...",негативна,"[-0.0119311195, 0.015456551, -0.014062805, 0.0..."


In [None]:
def build_classification_model(input_size: int, num_classes: int) -> keras.Model:
  """Build a one-hidden-layer classifier over fixed-size embedding vectors.

  Args:
    input_size: Length of each input embedding vector; also used as the
      width of the hidden layer.
    num_classes: Number of target classes, i.e. units in the output layer.

  Returns:
    An uncompiled `keras.Model` whose output holds one raw, unnormalised
    score (logit) per class.
  """
  # Pass the shape as a tuple: newer Keras releases reject a bare int here.
  inputs = keras.Input(shape=(input_size,))
  hidden = layers.Dense(input_size, activation='relu')(inputs)
  # No output activation: the notebook compiles the model with
  # SparseCategoricalCrossentropy(from_logits=True), which expects raw logits.
  # A sigmoid here would make the loss operate on already-squashed values.
  # argmax over logits picks the same class as argmax over probabilities.
  logits = layers.Dense(num_classes)(hidden)
  return keras.Model(inputs=[inputs], outputs=logits)
     

In [None]:
# The embedding dimensionality is read off the first training row.
first_embedding = df_train['Embeddings'].iloc[0]
embedding_size = len(first_embedding)

# One output unit per distinct class name present in the training split.
num_classes = len(df_train['Class Name'].unique())

classifier = build_classification_model(embedding_size, num_classes)
classifier.summary()

# Integer labels + from_logits=True: the loss applies the softmax itself,
# so the model is expected to output raw class scores.
classifier.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
NUM_EPOCHS = 20
BATCH_SIZE = 32
EARLY_STOPPING_PATIENCE = 3  # epochs without improvement before training stops

# Split the x and y components of the train and validation subsets.
# Each 'Embeddings' cell holds one vector; np.stack turns the column into a
# single 2-D array with one row per sample.
y_train = df_train['Encoded Label']
x_train = np.stack(df_train['Embeddings'])
y_val = df_test['Encoded Label']
x_val = np.stack(df_test['Embeddings'])

# Stop early on the validation loss. Monitoring the training accuracy (as
# before) keeps improving while the model overfits, so it never triggers when
# it matters even though validation data is passed to fit().
callback = keras.callbacks.EarlyStopping(monitor='val_loss',
                                         patience=EARLY_STOPPING_PATIENCE)

# Train the model for at most NUM_EPOCHS epochs.
history = classifier.fit(x=x_train,
                         y=y_train,
                         validation_data=(x_val, y_val),
                         callbacks=[callback],
                         batch_size=BATCH_SIZE,
                         epochs=NUM_EPOCHS,)

In [None]:
# Loss and metric values on the held-out split, returned as a dict.
eval_results = classifier.evaluate(x=x_val, y=y_val, return_dict=True)
eval_results

In [None]:
def plot_history(history):
  """
    Draw the training and validation learning curves side by side.

    The left panel shows the loss per epoch, the right panel the accuracy.

    Args:
      history: object whose `.history` dict holds the per-epoch values under
        'loss', 'val_loss', 'accuracy' and 'val_accuracy'
  """
  fig, axes = plt.subplots(1,2)
  fig.set_size_inches(20, 8)

  # (panel title / y-label, training key, validation key), left to right.
  panels = (
      ('Loss', 'loss', 'val_loss'),
      ('Accuracy', 'accuracy', 'val_accuracy'),
  )
  for ax, (title, train_key, val_key) in zip(axes, panels):
    ax.set_title(title)
    ax.plot(history.history[train_key], label = 'train')
    ax.plot(history.history[val_key], label = 'test')
    ax.set_ylabel(title)
    ax.set_xlabel('Epoch')
    # The explicit list is what the legend shows for the two curves.
    ax.legend(['Train', 'Validation'])

  plt.show()

# Show the learning curves recorded by the training run above.
plot_history(history=history)

In [None]:
# Per-class scores for every validation row, then the index of the highest
# score in each row as the predicted class id.
class_scores = classifier.predict(x=x_val)
y_hat = class_scores.argmax(axis=1)

In [None]:
# Map each class name to its integer id, ordered by id.
# The keys are later used as tick labels for the confusion matrix, whose
# rows/columns are ordered by sorted label value. The previous first-appearance
# order could therefore attach names to the wrong rows.
class_pairs = (
    df_test[['Class Name', 'Encoded Label']]
    .drop_duplicates()
    .sort_values('Encoded Label')
)
labels_dict = dict(zip(class_pairs['Class Name'], class_pairs['Encoded Label']))
labels_dict

In [None]:
# Order classes by encoded id and hand the same order to both the matrix and
# the tick labels, so every row/column is guaranteed to carry its own name
# regardless of the insertion order of `labels_dict`.
# NOTE: predictions for ids that are not in `labels_dict` are left out of the
# matrix by the explicit `labels=` argument.
ordered_classes = sorted(labels_dict.items(), key=lambda item: item[1])
class_names = [name for name, _ in ordered_classes]
class_ids = [class_id for _, class_id in ordered_classes]

cm = skmetrics.confusion_matrix(y_val, y_hat, labels=class_ids)
disp = skmetrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_names)
disp.plot(xticks_rotation='vertical')
# Title fixed: the data comes from hromadske.jsonl, not the newsgroup dataset.
disp.ax_.set_title('Confusion matrix for the Hromadske test dataset');
disp.ax_.grid(False)