In [1]:
#!pip install pymongo

In [2]:
from pymongo import MongoClient
from tqdm import tqdm
import json
import pandas as pd
import numpy as np

# Extract data

In [5]:
# Fetch a capped sample of videos from the `embeddings` collection.
# NOTE(review): the server address is hard-coded; consider moving it to configuration.
MAX_VIDEOS = 1000  # upper bound on the number of documents pulled from MongoDB

client = MongoClient('mongodb://49.13.173.177:27020/')
embs = client.sponsoredbye.embeddings

# The projection {"_id": 0} drops MongoDB's `_id` field on the server side
# (presumably because an ObjectId is not JSON-serialisable -- the next cell dumps
# `data` to JSON), and `.limit()` lets the server stop after MAX_VIDEOS documents
# instead of breaking out of a still-open cursor on the client.
cursor = embs.find({"embeddings.added_label": 1}, {"_id": 0}).limit(MAX_VIDEOS)

data = list(cursor)

# Save json

In [None]:
# Cache the fetched records on disk so the notebook can be re-run without MongoDB.
json_string = json.dumps(data)
# `with` guarantees the file is flushed and closed even if the write fails.
with open("data.json", "w") as file:
    file.write(json_string)

# Read json

In [None]:
# Reload the records previously written to data.json.
file_path = 'data.json'

with open(file_path, 'r') as file:
    raw_json = file.read()
data = json.loads(raw_json)

# Extract embeddings and labels

In [6]:


# Build one row per video. Each of the list-valued columns holds one entry per
# segment of that video, in the order the segments appear in video['embeddings'].
tot_embeddings, id_, tot_time, tot_label, tot_add_label = [], [], [], [], []

for video in data:
    segments = video['embeddings']

    tot_embeddings.append([segment['embedding'] for segment in segments])
    tot_label.append([segment['label'] for segment in segments])
    tot_add_label.append([segment['added_label'] for segment in segments])
    # Start first, then end, so the pair matches the column name
    # `timme_start_end` (it was previously stored as [end, start]).
    tot_time.append([video['start_times'], video['end_times']])
    id_.append(video['videoID'])

# NOTE: the column names 'ambeddings' and 'timme_start_end' are misspelled but are
# kept as-is because the helper functions below index the frame by 'ambeddings'.
df_vids = pd.DataFrame({'id_video': id_,
                        'ambeddings': tot_embeddings,
                        'timme_start_end': tot_time,
                        'label': tot_label,
                        'label_added': tot_add_label})

# Helper functions

In [12]:
def extract_embes_labels_logistic(df):
    """Flatten the per-video embeddings and labels into two segment-level lists.

    Parameters
    ----------
    df : DataFrame with the columns 'ambeddings' and 'label', where each cell
        is an iterable holding one item per segment of that video.

    Returns
    -------
    (embeddings, labels) : tuple of two flat lists, built by concatenating the
        per-video sequences in row order. Video boundaries are discarded.
    """
    flat_embeddings = []
    for video_embeddings in df['ambeddings']:
        flat_embeddings.extend(video_embeddings)

    flat_labels = []
    for video_labels in df['label']:
        flat_labels.extend(video_labels)

    return flat_embeddings, flat_labels

def extract_embes_labels_NN(df):
    """Return the per-video embeddings and labels without flattening them.

    Parameters
    ----------
    df : DataFrame with the columns 'ambeddings' and 'label'.

    Returns
    -------
    (embeddings, labels) : tuple of two lists with one entry per row of `df`,
        each entry being that row's cell value unchanged.
    """
    return list(df['ambeddings']), list(df['label'])



# Logistic Classifier

In [None]:
# Split by video (rows of df_vids), then flatten each side to segment level:
# the first 800 rows are used for training, the remaining rows for testing.
train_videos = df_vids.iloc[:800]
test_videos = df_vids.iloc[800:]

X_train, Y_train = extract_embes_labels_logistic(train_videos)
X_test, Y_test = extract_embes_labels_logistic(test_videos)

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [None]:
# Baseline: logistic regression on individual segment embeddings.
# `multi_class` is intentionally not passed: 'auto' was its default value and the
# parameter is deprecated in recent scikit-learn releases, so omitting it keeps
# the same behaviour without the deprecation warning.
logistic_class = LogisticRegression(
    random_state=0,
    n_jobs=-1,
    max_iter=10000,  # generous iteration cap for the lbfgs solver
    solver='lbfgs',
    C=1,
)
logistic_class.fit(X_train, Y_train)
predictions = logistic_class.predict(X_test)

In [None]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores) of the
# logistic-regression predictions on the test segments.
f1_score(Y_test, predictions, average="macro")

0.7580870769640886

# Neural network classifier

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Masking,TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Extract embeddings and labels + padding 

In [51]:
# Per-video sequences: X[k] is the list of segment embeddings of video k and
# y[k] the matching list of segment labels.
X, y = extract_embes_labels_NN(df_vids)

max_len = max(map(len, X))   # number of segments in the longest video
vector_dim = len(X[0][0])    # length of a single segment embedding

# Zero-pad every video at the end ('post') up to the longest one. The zeros
# match the mask_value=0. of the Masking layer used by the model.
X_padded = pad_sequences(X,dtype='float32', padding='post')
# Pad the labels the same way and add a trailing axis of size 1 so the targets
# line up with the one-unit-per-timestep output of the network.
y_padded = pad_sequences(y, maxlen=max_len, dtype='float32', padding='post')[..., np.newaxis]

789 768


### Neural network structure

In [43]:
# Sequence tagger: one sigmoid output per segment (timestep) of a video.
model = Sequential([
    # Timesteps whose features are all 0. are masked for the layers that follow.
    Masking(mask_value=0., input_shape=(max_len, vector_dim)),
    Bidirectional(LSTM(50, return_sequences=True)),
    # Optional second recurrent layer, currently disabled:
    # Bidirectional(LSTM(50, return_sequences=True)),
    TimeDistributed(Dense(1, activation='sigmoid')),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_2 (Masking)         (None, 789, 768)          0         
                                                                 
 bidirectional_2 (Bidirecti  (None, 789, 100)          327600    
 onal)                                                           
                                                                 
 time_distributed_2 (TimeDi  (None, 789, 1)            101       
 stributed)                                                      
                                                                 
Total params: 327701 (1.25 MB)
Trainable params: 327701 (1.25 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [44]:
# Train on the first 800 videos, one video per batch. The targets are the padded
# labels `y_padded` built in the padding cell; the previously used name
# `padded_targets` is not defined anywhere in the notebook and raises a
# NameError on a fresh kernel.
model.fit(X_padded[:800], y_padded[:800], epochs=10, batch_size=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a219455a6b0>

In [45]:
# Per-segment sigmoid outputs for the videos not used in training (index 800 onwards).
# NOTE(review): this rebinds `predictions`, which the logistic-regression cell also
# assigns; re-running the logistic f1_score cell after this one would score the
# network's outputs instead of the logistic ones.
predictions = model.predict(X_padded[800:])
# Threshold at 0.5 to turn the sigmoid outputs into hard 0/1 labels.
binary_predictions = (predictions >= 0.5).astype(int)



In [59]:
# Remove the padded part from the results. The unpadded labels `y_test` are used
# directly to get the true length (number of segments) of each test video.
y_test = y[800:]

# Keep only the first len(labels) predictions of each video; everything after
# that position belongs to the zero padding.
cr_bin_predictions = [
    video_predictions[:len(video_labels)]
    for video_predictions, video_labels in zip(binary_predictions, y_test)
]


In [73]:
# Flatten the per-video predictions and labels to segment level so they can be
# scored the same way as the logistic baseline.
predictions_ = []
for video_predictions in cr_bin_predictions:
    predictions_.extend(video_predictions)

real = []
for video_labels in y_test:
    real.extend(video_labels)

# Both lists must have the same length for the score to be meaningful.
print(len(predictions_),len(real))
f1_score(real, predictions_, average="macro")

27024 27024


0.9030000527159652