# Chunk salience classification
We have trained a deep neural network to discriminate between useful and useless chunk setting a binary classification problem. 

In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sentence_transformers import SentenceTransformer
from rouge import Rouge

from transcript_utils import get_transcription, semantic_segmentation, extract_features

from tensorflow import keras
import tensorflow as tf

## 1. Dataset preparation

### 1.1. Dataset reading
A cleaned version of the golden set is used for the training and the test of the *chunck classifier*

In [3]:
# loading the dataset from the csv file
dataset_path = os.path.join(os.path.abspath(""), 'podcasts-no-audio-13GB')
dataset = pd.read_csv(os.path.join(dataset_path, "gold_set_cleaned.tsv"), sep='\t')
dataset.head()

Unnamed: 0,episode id,transcript,best_summary
0,spotify:episode:4KRC1TZ28FavN3J5zLHEtQ,What's up fellas? So I got a patron supported...,All right guys now as y'all guys might know so...
1,spotify:episode:4tdDQcsBOUVWnA9XrpgTzS,If you are bored you are boring. One of my ki...,It was the first and last time I ever said tha...
2,spotify:episode:626YAxomH0HZ6nCW9NLlGY,Visit Larisa English club.com English everyday...,Prepositions of movement review two is the sec...
3,spotify:episode:6AUFl7KQWN6pzGFEIEKFQu,So so and salutations Summers and welcome to t...,It only seems fitting to walk you through a fe...
4,spotify:episode:6IDbemwG5t6XMlctbqcna7,Hi everyone. This is Justin from a liquidy pla...,"This week on Nothing But A Bob Thang, Nathan a..."


### 1.2 Ground truth targets creation
In order to create the *ground truth targets* for the classifier,  we compare the chunk with the corresponding summary of the transcript it belongs to and, if the score obtained with a certain metric is below a threshold (strictly coupled with the metric), the chunk is not taken into account as a part of the transcript.

In [4]:
def isChunkUseful(chunk, summary, metric, threshold, verbose=False):
    """
    Function to check if a chunk is useful or not

    Parameters:
        - chunk: part of the transcript
        - summary: summary of a transcript
        - metric: function of ariety 2 (chunk, summary) used to evaluate the summary
        - threshold: value used to decide whether chunk is a good summary or not
    Returns:
        - True if the chunk is a good summary, False otherwise
    """
    score = metric(chunk, summary)
    if verbose: print(f"\tChunck: {chunk}\n\tSummary: {summary}\n\tScore: {score}")

    if score < threshold:
        result = False
    else:
        result = True
    
    return result

The chosen metric to compare the chuck with the description is ROUGE-L f1-score.

In [5]:
def rouge_score(candidate, reference, type='rouge-l', metric='f'):
    """
    ROUGE score
    Parameters:
        reference: reference text
        candidate: candidate text
        type: type of ROUGE, it can be rouge-1, rouge-2, rouge-l (default)
        metric: precision (p), recall (r) or f-score (f) (default)
    """
    rouge = Rouge()
    scores = rouge.get_scores(candidate, reference)
    return scores[0][type][metric]

### 1.3 Extracting the input and target data for the chunck classifier
We have all the tools to create the ground truth targets for the chunk selection classifier. 
The following code creates the dataset to train the chunk classifier:
- the input features are the chunk encoding with a sentence transformer
- the ground truth targets are created as aforementioned

In [11]:
threshold = 0.20
metric = rouge_score
verbose = False

# creation of the dataset for chunk classification
# creation of the targets

features = []
targets = []

# initalize the model for the sentence transformer
sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')

for i in tqdm(range(len(dataset)), desc="Extracting features and targets"):
    if verbose: print(f"Episode: {i}")
    chunks = semantic_segmentation(dataset.transcript[i], sentence_encoder)
    description = dataset.best_summary[i]

    num_chunks = len(chunks)
    if verbose: print(f"Num chunks: {num_chunks}")

    for j in range(num_chunks):
        if verbose: print(f"\tChunk {j}")
        features.append(extract_features(chunks[j], sentence_encoder))
        if isChunkUseful(' '.join(chunks[j]), description, metric, threshold, verbose):
            targets.append(1)
        else:
            targets.append(0)

y = np.array(targets)
y = y.reshape(y.shape[0], 1)
X = np.array(features)

Extracting features and targets: 100%|██████████| 141/141 [1:04:23<00:00, 27.40s/it]


In [17]:
# show the percentage of useful and unuseful chunks
positive = y[y==1].shape[0]
negative = y.shape[0] - positive
print(f"Percentage of useful chunks: {positive/(positive+negative)*100}%")
print(f"Percentage of unuseful chunks: {negative/(positive+negative)*100}%")

# store chunk classification dataset
chunck_classification_dataset = np.hstack((X, y))
df_chunk = pd.DataFrame(chunck_classification_dataset)
df_chunk.to_csv(os.path.join(dataset_path, "chunk_classification_dataset.csv"), header=False, index=False)

Percentage of useful chunks: 9.119270458363331%
Percentage of unuseful chunks: 90.88072954163667%


## 2. Training the chunk classifier

In [4]:
# load brass set
chunck_classification_dataset = pd.read_csv(os.path.join(dataset_path, "chunk_classification_dataset.csv"), header=None)

In [5]:
# The dataset contains 384 features and 1 target
y = chunck_classification_dataset.iloc[:,-1]
X = chunck_classification_dataset.drop(chunck_classification_dataset.columns[[-1]], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
X_train_positive = X_train[y_train>0]
X_train_negative = X_train[y_train==0][:X_train_positive.shape[0]]
y_train_positive = y_train[y_train>0]
y_train_negative = y_train[y_train==0][:X_train_positive.shape[0]]

X_train = np.vstack((X_train_positive,X_train_negative))
y_train = np.hstack((y_train_positive, y_train_negative))

In [7]:
# Neural Network for chunk classification

inputs = keras.Input(shape=(384))
x = keras.layers.Dense(512, activation='relu')(inputs)
x = keras.layers.Dense(256, activation='relu')(x)
x = keras.layers.Dropout(0.4)(x)
x = keras.layers.Dense(256, activation='relu', kernel_regularizer='l2')(x)
x = keras.layers.Dropout(0.4)(x)
x = keras.layers.Dense(128, activation='relu', kernel_regularizer='l2')(x)
output = keras.layers.Dense(1, activation='sigmoid', kernel_regularizer='l2')(x)
model = keras.Model(inputs, output)

model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=15,
    validation_split=0.15,
    validation_data=(X_test,y_test),
    callbacks=[keras.callbacks.EarlyStopping(monitor='loss', patience=3)]
)

model.save("modelChunkNN")

y_pred = model.predict(X_test)
y_pred = [1 if y>0.5 else 0 for y in y_pred]
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision_score(y_test, y_pred, average=None)}")
print(f"Recall: {recall_score(y_test, y_pred, average=None)}")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
INFO:tensorflow:Assets written to: modelChunkNN\assets
Accuracy: 0.5710650672482733
Precision: [0.96262341 0.15453863]
Recall: [0.54775281 0.7953668 ]
