In [6]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from tensorflow import keras
import tensorflow as tf
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from rouge import Rouge
from sklearn.metrics import accuracy_score, precision_score, recall_score
from get_transcripts import semantic_segmentation, extract_features

In [7]:
dataset = pd.read_csv("gold_set_cleaned.tsv", sep='\t')
dataset.head()

Unnamed: 0,episode id,transcript,best_summary
0,spotify:episode:4KRC1TZ28FavN3J5zLHEtQ,What's up fellas? So I got a patron supported...,All right guys now as y'all guys might know so...
1,spotify:episode:4tdDQcsBOUVWnA9XrpgTzS,If you are bored you are boring. One of my ki...,It was the first and last time I ever said tha...
2,spotify:episode:626YAxomH0HZ6nCW9NLlGY,Visit Larisa English club.com English everyday...,Prepositions of movement review two is the sec...
3,spotify:episode:6AUFl7KQWN6pzGFEIEKFQu,So so and salutations Summers and welcome to t...,It only seems fitting to walk you through a fe...
4,spotify:episode:6IDbemwG5t6XMlctbqcna7,Hi everyone. This is Justin from a liquidy pla...,"This week on Nothing But A Bob Thang, Nathan a..."


In [8]:
def isChunkUseful(chunk, summary, metric, threshold, verbose=False):
    score = metric(chunk, summary)
    if verbose: print(f"\tChunck: {chunk}\n\tSummary: {summary}\n\tScore: {score}")

    if score < threshold:
        result = False
    else:
        result = True
    
    return result

In [9]:
def rouge_score(candidate, reference, type='rouge-l', metric='f'):
    rouge = Rouge()
    scores = rouge.get_scores(candidate, reference)
    return scores[0][type][metric]

In [10]:
threshold = 0.20
metric = rouge_score
verbose = False

features = []
targets = []

sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')

for i in tqdm(range(len(dataset)), desc="Extracting features and targets"):
    if verbose: print(f"Episode: {i}")
    chunks = semantic_segmentation(dataset.transcript[i], sentence_encoder)
    description = dataset.best_summary[i]

    num_chunks = len(chunks)
    if verbose: print(f"Num chunks: {num_chunks}")

    for j in range(num_chunks):
        if verbose: print(f"\tChunk {j}")
        features.append(extract_features(chunks[j], sentence_encoder))
        if isChunkUseful(' '.join(chunks[j]), description, metric, threshold, verbose):
            targets.append(1)
        else:
            targets.append(0)

y = np.array(targets)
y = y.reshape(y.shape[0], 1)
X = np.array(features)

Extracting features and targets: 100%|██████████| 141/141 [20:05<00:00,  8.55s/it]


In [11]:
positive = y[y==1].shape[0]
negative = y.shape[0] - positive
print(f"Percentage of useful chunks: {positive/(positive+negative)*100}%")
print(f"Percentage of unuseful chunks: {negative/(positive+negative)*100}%")

chunck_classification_dataset = np.hstack((X, y))
df_chunk = pd.DataFrame(chunck_classification_dataset)
df_chunk.to_csv("chunk_classification_dataset.csv", header=False, index=False)

Percentage of useful chunks: 9.119270458363331%
Percentage of unuseful chunks: 90.88072954163667%


In [12]:
chunck_classification_dataset = pd.read_csv("chunk_classification_dataset.csv", header=None)

In [13]:
y = chunck_classification_dataset.iloc[:,-1]
X = chunck_classification_dataset.drop(chunck_classification_dataset.columns[[-1]], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
X_train_positive = X_train[y_train>0]
X_train_negative = X_train[y_train==0][:X_train_positive.shape[0]]
y_train_positive = y_train[y_train>0]
y_train_negative = y_train[y_train==0][:X_train_positive.shape[0]]

X_train = np.vstack((X_train_positive,X_train_negative))
y_train = np.hstack((y_train_positive, y_train_negative))

In [17]:
inputs = keras.Input(shape=(384,))
x = keras.layers.Dense(512, activation='relu')(inputs)
x = keras.layers.Dense(256, activation='relu')(x)
x = keras.layers.Dropout(0.4)(x)
x = keras.layers.Dense(256, activation='relu', kernel_regularizer='l2')(x)
x = keras.layers.Dropout(0.4)(x)
x = keras.layers.Dense(128, activation='relu', kernel_regularizer='l2')(x)
output = keras.layers.Dense(1, activation='sigmoid', kernel_regularizer='l2')(x)
model = keras.Model(inputs, output)

model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=15,
    validation_split=0.15,
    validation_data=(X_test,y_test),
    callbacks=[keras.callbacks.EarlyStopping(monitor='loss', patience=3)]
)

model.save("modelChunkNN.h5")
y_pred = model.predict(X_test)
y_pred = [1 if y>0.5 else 0 for y in y_pred]
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision_score(y_test, y_pred, average=None)}")
print(f"Recall: {recall_score(y_test, y_pred, average=None)}")

Epoch 1/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 3.6683 - precision_1: 0.5250 - recall_1: 0.5662 - val_loss: 1.3647 - val_precision_1: 0.0947 - val_recall_1: 0.9961
Epoch 2/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0977 - precision_1: 0.5641 - recall_1: 0.8122 - val_loss: 0.6237 - val_precision_1: 0.2009 - val_recall_1: 0.4942
Epoch 3/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.6681 - precision_1: 0.6627 - recall_1: 0.6458 - val_loss: 0.6804 - val_precision_1: 0.1751 - val_recall_1: 0.6718
Epoch 4/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.6128 - precision_1: 0.7206 - recall_1: 0.7885 - val_loss: 0.6540 - val_precision_1: 0.1860 - val_recall_1: 0.6680
Epoch 5/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.5261 - precision_1: 0.7883 - recall_1: 0.7868 - val_loss: 0.6842 - val_pr



[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.573973100690658
Precision: [0.95769764 0.1512605 ]
Recall: [0.55417335 0.76447876]
