## Setup

In [115]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import re
from collections import Counter

In [116]:
from bertopic import BERTopic
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

In [117]:
from sklearn.preprocessing import LabelEncoder

In [118]:
from tensorflow.keras.layers import SimpleRNN

In [119]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Bidirectional

In [120]:
import keras_tuner as kt
# import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Nadam, Adagrad, Adadelta, Adam, RMSprop, SGD

## Prep

In [None]:
# train=pd.read_csv("Web Scraping/youtube_history_chrome_linux.csv")
train=pd.read_csv("https://drive.google.com/uc?id=1HE3Vp9Expz8EruSEDRXScuzNvoVHQcn7")


In [122]:
train.head(3)

Unnamed: 0,Date,Title,Creator,Views
0,Today,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133K views
1,Today,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66K views
2,Today,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805K views


In [123]:
train.tail(1)

Unnamed: 0,Date,Title,Creator,Views
688,May 19,REAL MADRID VS SEVILLA & BARCELONA VS VILLARRE...,Faysal,41K views


# Data Preprocessing

In [124]:
def normalize_date(date_str):
    DAY_NAMES = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    current_time = datetime.now()

    if date_str == "Today":
        return pd.to_datetime(current_time.date())
    elif date_str == "Yesterday":
        return pd.to_datetime((current_time - timedelta(days=1)).date())
    elif date_str in DAY_NAMES:
        today_date = current_time.date()
        today_weekday = today_date.weekday()
        days_map = {
            "Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3,
            "Friday": 4, "Saturday": 5, "Sunday": 6
        }
        target_weekday = days_map[date_str]
        days_ago = (today_weekday - target_weekday + 7) % 7

        # If date_str is today's weekday (e.g., input "Friday" and today is Friday),
        # and "Today" is handled separately, this implies the "Friday" entry refers to
        # the previous week's Friday.
        if days_ago == 0 and date_str != current_time.strftime('%A'):
             # This case might need refinement based on exact data semantics.
             # For simplicity, if days_ago is 0, it means the most recent occurrence was today.
             # If "Today" is explicitly used for current day, a named day matching today's weekday
             # might imply last week, but the current calculation (days_ago=0) correctly points to today.
             # The problem description implies "Today" and "Yesterday" are special strings.
             # Other day names refer to their most recent occurrence.
             pass
        calculated_date = today_date - timedelta(days=days_ago)
        return pd.to_datetime(calculated_date)

    try:
        # Attempt to parse with year first (e.g., "Jun 06, 2025")
        return pd.to_datetime(date_str, format='%b %d, %Y')
    except ValueError:
        # If year is missing, assume current year (2025 as per original logic)
        # This part might need adjustment if the data can span multiple years without explicit year numbers
        # The original code used a hardcoded 2025, respecting that.
        return pd.to_datetime(f'{date_str}, 2025', format='%b %d, %Y')


In [125]:
train['Date'] = train['Date'].apply(normalize_date)

In [126]:
train.head(1)

Unnamed: 0,Date,Title,Creator,Views
0,2025-06-07,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133K views


In [127]:
def clean_views(view_str_input):
    if pd.isna(view_str_input):
        return np.nan

    if not isinstance(view_str_input, str):
        if isinstance(view_str_input, (int, float)):
            return float(view_str_input)
        return np.nan

    s = view_str_input.lower().replace(" views", "").strip().replace(',', '')
    if not s:
        return np.nan
    multiplier = 1.0
    if s.endswith('k'):
        multiplier = 1000.0
        s = s[:-1].strip()
    elif s.endswith('m'):
        multiplier = 1_000_000.0
        s = s[:-1].strip()
    if not s:
        return np.nan
    try:
        return float(s) * multiplier
    except ValueError:
        return np.nan

In [128]:
train['Views'] = train['Views'].apply(clean_views)

In [129]:
train.head(1)

Unnamed: 0,Date,Title,Creator,Views
0,2025-06-07,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0


In [130]:
titles = train['Title'].tolist()

In [131]:
titles

['Spain vs. France FULL REACTION 👀 Lamine Yamal plays like he’s with his friends – Shaka | ESPN FC',
 "Did Spain vs. France SETTLE the Lamine Yamal-Ousmane Dembele Ballon d'Or debate?! 👀 | ESPN FC",
 'FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA NATIONS LEAGUE 2024/2025',
 'Spain vs France 5-4 - Highlights & All Goals - Nations League 2025',
 '🔴Spain vs France (5-4) Extended HIGHLIGHTS | UEFA Nations League Semi-Final',
 'Tyrese Haliburton Keeps Hitting Game Winners...',
 "Lamine Yamal Just ENDED the Ballon d'Or Debate! 🏆",
 'LAMINE YAMAL & RAYAN CHERKI ARE SAVING FOOTBALL.',
 'The moment 48k Chile fans applauded MESSI when he came on his first game for Argentina in 2025',
 'Lamine Yamal Defeats Kylian Mbappe 5 Times in a Row (France vs Spain Reaction)',
 'Club World Cup TICKET CRISIS?',
 'Lamine Yamal BLASTS Desire Doue and Mbappe after win - Post Match Interview - Spain 5-4 France',
 'LIKE THAT - Call of Duty Montage',
 'Camp Nou Construction Update (June 4, 2025)',
 'Lamine Record

In [132]:
all_words = ' '.join(train['Title'].astype(str)).lower().split()
all_words_clean = [re.sub(r'\W+', '', word) for word in all_words if re.sub(r'\W+', '', word) != '']

unique_words = set(all_words_clean)

print(f"Number of unique words in the Title column: {len(unique_words)}")

Number of unique words in the Title column: 1954


In [133]:
word_counts = Counter(all_words_clean)

In [134]:
most_common = word_counts.most_common(100)
print("20 most frequent words:")
for word, count in most_common:
    print(f"{word}: {count}")

20 most frequent words:
the: 171
to: 94
in: 72
vs: 62
is: 62
a: 51
and: 47
of: 45
on: 43
highlights: 38
barcelona: 38
at: 36
for: 34
league: 33
i: 32
game: 31
3: 30
official: 28
final: 26
lamine: 25
2025: 24
match: 24
this: 24
season: 24
united: 24
nba: 23
why: 23
with: 22
4: 22
messi: 22
after: 22
1: 22
2: 22
real: 21
trailer: 21
his: 20
fc: 20
full: 19
yamal: 19
how: 19
man: 19
5: 18
live: 18
spain: 17
uefa: 17
first: 17
win: 17
my: 17
inter: 17
2425: 17
reaction: 16
linux: 16
just: 15
world: 15
laliga: 15
grand: 15
f1: 15
france: 14
insane: 14
you: 14
prix: 14
code: 14
tottenham: 14
that: 13
10: 13
has: 13
it: 13
max: 13
new: 13
nations: 12
fans: 12
pacers: 12
from: 12
by: 12
not: 12
was: 12
eminem: 12
manchester: 12
all: 11
best: 11
champions: 11
knicks: 11
madrid: 11
joey: 11
tyrese: 10
haliburton: 10
club: 10
your: 10
ai: 10
moments: 10
sports: 10
claude: 10
no: 10
la: 10
finals: 9
up: 9
out: 9
me: 9
over: 9
as: 9


In [135]:
least_common = word_counts.most_common()[::-1]
print("20 least frequent words:")
for word, count in least_common[:20]:
    print(f"{word}: {count}")


20 least frequent words:
watchalong: 1
hardest: 1
yachty: 1
lil: 1
250: 1
compaq: 1
activate: 1
power: 1
gaming: 1
victus: 1
appreciation: 1
villareal: 1
23: 1
espanyol: 1
secured: 1
trofi: 1
angkat: 1
main: 1
superman: 1
emiliaromagna: 1


# Modelling

### Predicting topic keywords from existing videos 

In [136]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=3, prediction_data=True)

model = BERTopic(
    language="multilingual",
    nr_topics=13,
    embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

In [137]:
topics, probs = model.fit_transform(titles)

2025-06-07 10:45:48,021 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 22/22 [00:04<00:00,  4.80it/s]
2025-06-07 10:45:58,376 - BERTopic - Embedding - Completed ✓
2025-06-07 10:45:58,376 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-07 10:45:59,475 - BERTopic - Dimensionality - Completed ✓
2025-06-07 10:45:59,475 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-07 10:45:59,822 - BERTopic - Cluster - Completed ✓
2025-06-07 10:45:59,823 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-07 10:45:59,864 - BERTopic - Representation - Completed ✓
2025-06-07 10:45:59,865 - BERTopic - Topic reduction - Reducing number of topics
2025-06-07 10:45:59,870 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-07 10:45:59,886 - BERTopic - Representation - Completed ✓
2025-06-07 10:45:59,889 - BERTopic - Topic reduction - Redu

In [138]:
train['Predicted_Topic'] = topics

### Mapping

In [139]:
train['Topic_Keywords'] = train['Predicted_Topic'].apply(lambda x: model.get_topic(x))

In [140]:
print(train[['Title', 'Predicted_Topic', 'Topic_Keywords']])

                                                 Title  Predicted_Topic  \
0    Spain vs. France FULL REACTION 👀 Lamine Yamal ...                0   
1    Did Spain vs. France SETTLE the Lamine Yamal-O...                0   
2    FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...                0   
3    Spain vs France 5-4 - Highlights & All Goals -...                0   
4    🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...                0   
..                                                 ...              ...   
684  SEVILLA FC 2 - 2 REAL MADRID I HIGHLIGHTS LALI...                0   
685  HP Victus Gaming 16 - Turn On Adaptive Battery...               -1   
686  How to Activate Adaptive Battery Optimizer on ...                0   
687          Lil Yachty with the HARDEST walk out EVER               -1   
688  REAL MADRID VS SEVILLA & BARCELONA VS VILLARRE...                0   

                                        Topic_Keywords  
0    [(vs, 0.038367261420305075), (the, 0.

In [141]:
# --- Start: Dynamic Topic Mapping ---
print("Inspect Discovered Topic Keywords (Top 5 words per topic):")
# model.get_topics() returns a dictionary: {topic_id: [(word, score), ...]}
# We iterate through sorted keys to have a consistent order for inspection
discovered_topics_keywords = model.get_topics()
for topic_id in sorted(discovered_topics_keywords.keys()):
    # model.get_topic(topic_id) also works and might be more direct here
    words_scores = discovered_topics_keywords[topic_id]
    words = [word for word, score in words_scores[:5]] # Get top 5 words
    print(f"Topic {topic_id}: {words}")
dynamic_topic_mapping = {}
for topic_id, topic_words_scores in discovered_topics_keywords.items():
    # Create a name from the top 2-3 keywords
    top_words = [str(word) for word, score in topic_words_scores[:3]] # Take top 3 words
    if top_words:
        # Join top words to form a name, e.g., "word1_word2_word3"
        dynamic_name = "_".join(top_words)
    else:
        # Fallback name if a topic has no words (should be rare for actual topics)
        dynamic_name = f"topic_{topic_id}"
    dynamic_topic_mapping[topic_id] = dynamic_name
# BERTopic often assigns -1 to outliers. Ensure it has a clear name.
# If -1 was in discovered_topics_keywords and got a name like "word1_word2", that's fine.
# If -1 represents outliers and wasn't explicitly in get_topics() or needs a generic name:
if -1 in train['Predicted_Topic'].unique(): # Check if -1 exists in predictions
    if -1 not in dynamic_topic_mapping:
        dynamic_topic_mapping[-1] = "Other_Outliers"
    elif not dynamic_topic_mapping.get(-1): # If it's there but empty
         dynamic_topic_mapping[-1] = "Other_Outliers"
print("\nDynamically Generated Topic Mapping:")
print(dynamic_topic_mapping)
# Now, use this dynamically generated mapping
train["Mapped_Topic_Name"] = train["Predicted_Topic"].map(dynamic_topic_mapping)
# --- End: Dynamic Topic Mapping ---

Inspect Discovered Topic Keywords (Top 5 words per topic):
Topic -1: ['the', 'in', 'live', 'just', 'champions']
Topic 0: ['vs', 'the', 'barcelona', 'highlights', 'to']
Topic 1: ['trailer', 'official', 'the', 'eminem', 'lamine']
Topic 2: ['is', 'transfer', 'what', 'over', 'season']
Topic 3: ['tech', 'most', 'your', 'development', '99']
Topic 4: ['linux', 'distro', 'fedora', 'windows', 'to']
Topic 5: ['code', 'using', 'neovim', 'vs', 'setup']
Topic 6: ['elephant', 'baby', 'dog', 'worst', 'mom']
Topic 7: ['google', 'microsoft', 'netflix', 'opened', 'gates']
Topic 8: ['claude', 'is', 'code', 'it', 'bronny']
Topic 9: ['seconds', '100', 'in', 'bash', '103']
Topic 10: ['forever', 'way', 'changed', 'has', 'tmux']
Topic 11: ['credits', 'end', 'cards', 'blatter', 'money']

Dynamically Generated Topic Mapping:
{-1: 'the_in_live', 0: 'vs_the_barcelona', 1: 'trailer_official_the', 2: 'is_transfer_what', 3: 'tech_most_your', 4: 'linux_distro_fedora', 5: 'code_using_neovim', 6: 'elephant_baby_dog', 7

In [142]:
print("\nDataFrame with dynamically mapped topic names:")
print(train[["Date", "Title", "Predicted_Topic", "Mapped_Topic_Name", "Topic_Keywords"]].head(3))


DataFrame with dynamically mapped topic names:
        Date                                              Title  \
0 2025-06-07  Spain vs. France FULL REACTION 👀 Lamine Yamal ...   
1 2025-06-07  Did Spain vs. France SETTLE the Lamine Yamal-O...   
2 2025-06-07  FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...   

   Predicted_Topic Mapped_Topic_Name  \
0                0  vs_the_barcelona   
1                0  vs_the_barcelona   
2                0  vs_the_barcelona   

                                      Topic_Keywords  
0  [(vs, 0.038367261420305075), (the, 0.032968890...  
1  [(vs, 0.038367261420305075), (the, 0.032968890...  
2  [(vs, 0.038367261420305075), (the, 0.032968890...  


In [143]:
hashable_cols = train.columns[train.applymap(lambda x: not isinstance(x, (list, dict))).all()]
train[hashable_cols].nunique()

Date                  18
Title                581
Creator              320
Views                366
Predicted_Topic       13
Mapped_Topic_Name     13
dtype: int64

In [144]:
train["Topic_Keywords"].head(3)

0    [(vs, 0.038367261420305075), (the, 0.032968890...
1    [(vs, 0.038367261420305075), (the, 0.032968890...
2    [(vs, 0.038367261420305075), (the, 0.032968890...
Name: Topic_Keywords, dtype: object

In [145]:
topic_info = model.get_topic_info()
print(topic_info)

    Topic  Count                               Name  \
0      -1     95                -1_the_in_live_just   
1       0    286      0_vs_the_barcelona_highlights   
2       1    152      1_trailer_official_the_eminem   
3       2     29            2_is_transfer_what_over   
4       3     24       3_tech_most_your_development   
5       4     23      4_linux_distro_fedora_windows   
6       5     19             5_code_using_neovim_vs   
7       6     18          6_elephant_baby_dog_worst   
8       7     15  7_google_microsoft_netflix_opened   
9       8     14                8_claude_is_code_it   
10      9      5              9_seconds_100_in_bash   
11     10      5         10_forever_way_changed_has   
12     11      4       11_credits_end_cards_blatter   

                                       Representation  \
0   [the, in, live, just, champions, mind, has, up...   
1   [vs, the, barcelona, highlights, to, league, i...   
2   [trailer, official, the, eminem, lamine, ai, n...   
3

In [146]:
for topic_id in topic_info['Topic'].unique():
    print(f"Topic {topic_id}:")
    print(model.get_topic(topic_id))

Topic -1:
[('the', np.float64(0.05143111313125825)), ('in', np.float64(0.03887464448203324)), ('live', np.float64(0.033049956843593783)), ('just', np.float64(0.029883748838936897)), ('champions', np.float64(0.02712586746827267)), ('mind', np.float64(0.026276786415329538)), ('has', np.float64(0.02592612018926992)), ('up', np.float64(0.022858986781492046)), ('at', np.float64(0.022317272946082607)), ('paris', np.float64(0.021950414535847743))]
Topic 0:
[('vs', np.float64(0.038367261420305075)), ('the', np.float64(0.032968890895126834)), ('barcelona', np.float64(0.032279415194303614)), ('highlights', np.float64(0.03165936191625454)), ('to', np.float64(0.029249197620512216)), ('league', np.float64(0.02841295962175117)), ('in', np.float64(0.026674733381649076)), ('nba', np.float64(0.025444115580079568)), ('game', np.float64(0.02503645370610425)), ('united', np.float64(0.023689701618520576))]
Topic 1:
[('trailer', np.float64(0.057788803229924156)), ('official', np.float64(0.057646200965298475

In [147]:
train.head(3)

Unnamed: 0,Date,Title,Creator,Views,Predicted_Topic,Topic_Keywords,Mapped_Topic_Name
0,2025-06-07,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
1,2025-06-07,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
2,2025-06-07,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona


In [148]:
for col in train.columns:
    print(train[col].value_counts())

Date
2025-05-26    66
2025-06-07    64
2025-05-22    57
2025-05-21    56
2025-05-25    52
2025-05-19    50
2025-05-30    44
2025-05-24    43
2025-06-06    41
2025-06-02    33
2025-05-29    30
2025-06-04    28
2025-06-01    28
2025-05-27    26
2025-05-23    26
2025-05-28    19
2025-06-03    15
2025-05-20    11
Name: count, dtype: int64
Title
Tottenham Hotspur 1-0 Manchester United | Europa League 24/25 Match Highlights         4
Inter Miami CF vs. Columbus Crew | Full Match Highlights | Messi Brace + 3 Assists!    3
Microsoft just opened the flood gates…                                                 3
The Season Is OVER….                                                                   3
Black And Cant Play Basketball Disease (NBA Edition)                                   3
                                                                                      ..
Tailwind CSS is the worst…                                                             1
The Wrexham Situation.            

In [149]:
train.drop(["Predicted_Topic", "Topic_Keywords"], axis=1, inplace=True)

In [150]:
train.head(3)

Unnamed: 0,Date,Title,Creator,Views,Mapped_Topic_Name
0,2025-06-07,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0,vs_the_barcelona
1,2025-06-07,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66000.0,vs_the_barcelona
2,2025-06-07,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805000.0,vs_the_barcelona


In [151]:
train['Date']=pd.to_datetime(train['Date'])

In [152]:
train = train.reset_index().sort_values(by=['Date', 'index'], ascending=[True, True]).drop(columns='index')

### Predicting next topic to be watched

In [153]:
le = LabelEncoder()

In [154]:
encoded_topics = le.fit_transform(train['Mapped_Topic_Name'])

In [155]:
num_classes = len(le.classes_)
print(f"Number of unique topic classes: {num_classes}")
print(f"Topic classes: {le.classes_}")

Number of unique topic classes: 13
Topic classes: ['claude_is_code' 'code_using_neovim' 'credits_end_cards'
 'elephant_baby_dog' 'forever_way_changed' 'google_microsoft_netflix'
 'is_transfer_what' 'linux_distro_fedora' 'seconds_100_in'
 'tech_most_your' 'the_in_live' 'trailer_official_the' 'vs_the_barcelona']


In [156]:
sequence_length = 15 # ? look at 15 previous topidcs ot predict
X = [] # ? input sequences features
y = [] # ? target values

for i in range(len(encoded_topics) - sequence_length):
    # ? window of 15 cons topics as input
    X.append(encoded_topics[i:i+sequence_length])
    # ? next topic after this window (predict)
    y.append(encoded_topics[i+sequence_length])

X = np.array(X)
y = np.array(y)

### Simple RNN

In [157]:
optimizer_rnn = tf.keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5)

model_rnn = Sequential([
    Embedding(input_dim=num_classes, output_dim=128, input_length=sequence_length),
    SimpleRNN(128, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),
    SimpleRNN(64, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),
    # ? extra layer testing
    SimpleRNN(32),
    BatchNormalization(),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model_rnn.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer_rnn, metrics=['accuracy'])

### Regular LSTM (Non-Biderectional)

In [158]:
optimizer_lstm = tf.keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5)

model_lstm = Sequential([
    Embedding(input_dim=num_classes, output_dim=128, input_length=sequence_length),
    LSTM(128, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),
    # ? extra layer testing
    LSTM(32),
    BatchNormalization(),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer_lstm, metrics=['accuracy'])

### Biderectional LSTM

In [159]:
optimizer_lstm_bi = tf.keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5)

model_bidirectional = Sequential([
    Embedding(input_dim=num_classes, output_dim=128, input_length=sequence_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    BatchNormalization(),
    Dropout(0.3),
    Bidirectional(LSTM(64, return_sequences=True) ),
    BatchNormalization(),
    Dropout(0.3),
    # ? extra layer testing
    Bidirectional(LSTM(32)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])
model_bidirectional.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer_lstm_bi, metrics=['accuracy'])

### The Actual Predicting

In [160]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [161]:
from tensorflow.keras.optimizers import Nadam, Adagrad, Adadelta, Adam, RMSprop, SGD

In [162]:
models = {
    'Simple RNN': model_rnn,
    'Regular LSTM': model_lstm,
    'Bidirectional LSTM': model_bidirectional
}

results = {}
# epochs = 30
# epochs = 60
epochs = 45
batch_size = 16

# ? early stpping
early_stopping = EarlyStopping(
    monitor='val_accuracy',           # ? monitor val acc
    patience=10,                      # ? wait 10 epoch for improv
    restore_best_weights=True,        # ? restore beset weight when stoppiogn
    verbose=1,                        # ? print wehn stopping
    mode='max'                        # ? max acc
)

# ? optional: learning rate reducton
reduce_lr = ReduceLROnPlateau(
    monitor='val_accuracy',           # ? monitor val acc
    factor=0.5,                       # ? reduce LR by half
    patience=5,                       # ? wait 5 epochs before reducing
    min_lr=1e-7,                     #  ? wait Minimum learning rate
    verbose=1,                        # ? wait when reducing
    mode='max'
)

for model_name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {model_name}")
    print(f"{'='*50}")

    # ? training with callbacks
    history = model.fit(
        X, y,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.2,
        verbose=1,
        callbacks=[early_stopping, reduce_lr]  # ? callbacks
    )

    # ? results
    results[model_name] = {
        'model': model,
        'history': history,
        'final_accuracy': history.history['accuracy'][-1],
        'final_val_accuracy': history.history['val_accuracy'][-1] if 'val_accuracy' in history.history else None,
        'best_val_accuracy': max(history.history['val_accuracy']) if 'val_accuracy' in history.history else None,
        'epochs_trained': len(history.history['accuracy'])  # ? actual epochs trained
    }

    # ? prediction
    last_sequence = encoded_topics[-15:]
    input_seq = np.array([last_sequence])
    predicted_class = model.predict(input_seq, verbose=0).argmax()
    predicted_topic = le.inverse_transform([predicted_class])[0]

    print(f"{model_name} prediction: {predicted_topic}")
    print(f"Final training accuracy: {results[model_name]['final_accuracy']:.4f}")
    print(f"Best validation accuracy: {results[model_name]['best_val_accuracy']:.4f}")
    print(f"Epochs trained: {results[model_name]['epochs_trained']}")
    if results[model_name]['final_val_accuracy']:
        print(f"Final validation accuracy: {results[model_name]['final_val_accuracy']:.4f}")


Training Simple RNN
Epoch 1/45
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.0805 - loss: 3.4435 - val_accuracy: 0.2815 - val_loss: 2.5636 - learning_rate: 0.0010
Epoch 2/45
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1287 - loss: 2.9674 - val_accuracy: 0.2296 - val_loss: 2.6196 - learning_rate: 0.0010
Epoch 3/45
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.1751 - loss: 2.8059 - val_accuracy: 0.4000 - val_loss: 2.2179 - learning_rate: 0.0010
Epoch 4/45
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.1970 - loss: 2.5915 - val_accuracy: 0.4370 - val_loss: 2.0922 - learning_rate: 0.0010
Epoch 5/45
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.2350 - loss: 2.5363 - val_accuracy: 0.2963 - val_loss: 2.2309 - learning_rate: 0.0010
Epoch 6/45
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

### insane tuning

In [163]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)


#### RNN tuning

#### 

In [164]:
print("RNN HYPERPARAMETER TUNING")
print("="*60)

def build_rnn_model(hp):
    model = Sequential()

    # Tunable embedding dimension
    model.add(Embedding(
        input_dim=num_classes,
        output_dim=hp.Choice('embedding_dim', [32, 64, 128]),
        input_length=sequence_length
    ))

    # First RNN layer with tunable units
    model.add(SimpleRNN(
        hp.Int('rnn_units_1', 64, 256, step=64),
        return_sequences=True
    ))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_1', 0.2, 0.5, step=0.1)))

    # Second RNN layer with tunable units
    model.add(SimpleRNN(hp.Int('rnn_units_2', 32, 128, step=32)))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_2', 0.2, 0.5, step=0.1)))

    # Output layer
    model.add(Dense(num_classes, activation='softmax'))

    # Tunable optimizer selection
    optimizer_name = hp.Choice('optimizer', [
        'Nadam', 'Adagrad', 'Adadelta', 'AdamW', 'adam', 'rmsprop', 'sgd'
    ])
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])

    # Create optimizer based on choice
    if optimizer_name == 'Nadam':
        optimizer = Nadam(learning_rate=learning_rate)
    elif optimizer_name == 'Adagrad':
        optimizer = Adagrad(learning_rate=learning_rate)
    elif optimizer_name == 'Adadelta':
        optimizer = Adadelta(learning_rate=learning_rate)
    elif optimizer_name == 'AdamW':
        optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate, weight_decay=1e-5)
    elif optimizer_name == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    elif optimizer_name == 'sgd':
        optimizer = SGD(learning_rate=learning_rate)

    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

# Set up RNN tuner
print("Setting up RNN hyperparameter tuning...")
rnn_tuner = kt.RandomSearch(
    build_rnn_model,
    objective='val_accuracy',
    max_trials=30,  # Try 30 different combinations
    executions_per_trial=1,
    directory='rnn_tuner_dir',
    project_name='youtube_rnn_prediction_tuning'
)

# Early stopping for RNN tuning
rnn_early_stop = tf.keras.callbacks.EarlyStopping(
    patience=3,
    restore_best_weights=True,
    monitor='val_accuracy',
    mode='max'
)

print("Starting RNN hyperparameter search...")
rnn_tuner.search(
    X_train, y_train,
    epochs=20,
    validation_data=(X_val, y_val),
    callbacks=[rnn_early_stop],
    verbose=1
)

# Get the best RNN model and hyperparameters
print("Getting best RNN results...")
best_rnn_model = rnn_tuner.get_best_models(1)[0]
best_rnn_hp = rnn_tuner.get_best_hyperparameters(1)[0]

# Evaluate best RNN model
rnn_loss, rnn_acc = best_rnn_model.evaluate(X_val, y_val, verbose=0)
print(f"\nBest RNN Validation Accuracy: {rnn_acc:.4f}")
print(f"Best RNN Hyperparameters:")
for param, value in best_rnn_hp.values.items():
    print(f"   - {param}: {value}")

# Save the best RNN model for predictions
best_tuned_rnn_model = best_rnn_model

Trial 30 Complete [00h 00m 08s]
val_accuracy: 0.37037035822868347

Best val_accuracy So Far: 0.46666666865348816
Total elapsed time: 00h 03m 31s
Getting best RNN results...

Best RNN Validation Accuracy: 0.4667
Best RNN Hyperparameters:
   - embedding_dim: 128
   - rnn_units_1: 64
   - dropout_1: 0.2
   - rnn_units_2: 32
   - dropout_2: 0.30000000000000004
   - optimizer: Nadam
   - learning_rate: 0.01


#### LSTM Tuning

In [165]:
print("LSTM HYPERPARAMETER TUNING")
print("="*60)

def build_lstm_model(hp):
    model = Sequential()

    # Tunable embedding dimension
    model.add(Embedding(
        input_dim=num_classes,
        output_dim=hp.Choice('embedding_dim', [64, 128, 256]),
        input_length=sequence_length
    ))

    # First LSTM layer with tunable units
    model.add(LSTM(
        hp.Int('lstm_units_1', 64, 256, step=64),
        return_sequences=True
    ))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_1', 0.2, 0.5, step=0.1)))

    # Second LSTM layer with tunable units
    model.add(LSTM(
        hp.Int('lstm_units_2', 32, 128, step=32),
        return_sequences=True
    ))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_2', 0.2, 0.5, step=0.1)))

    # Optional third layer
    if hp.Boolean('use_third_layer'):
        model.add(LSTM(hp.Int('lstm_units_3', 16, 64, step=16)))
        model.add(BatchNormalization())
        model.add(Dropout(hp.Float('dropout_3', 0.2, 0.5, step=0.1)))
    else:
        model.add(LSTM(hp.Int('lstm_units_final', 16, 64, step=16)))
        model.add(BatchNormalization())
        model.add(Dropout(hp.Float('dropout_final', 0.2, 0.5, step=0.1)))

    # Output layer
    model.add(Dense(num_classes, activation='softmax'))

    # Tunable optimizer selection
    optimizer_name = hp.Choice('optimizer', [
        'Nadam', 'Adagrad', 'Adadelta', 'AdamW', 'adam', 'rmsprop'
    ])
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4, 5e-4])

    # Create optimizer based on choice
    if optimizer_name == 'Nadam':
        optimizer = Nadam(learning_rate=learning_rate)
    elif optimizer_name == 'Adagrad':
        optimizer = Adagrad(learning_rate=learning_rate)
    elif optimizer_name == 'Adadelta':
        optimizer = Adadelta(learning_rate=learning_rate)
    elif optimizer_name == 'AdamW':
        optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate, weight_decay=1e-5)
    elif optimizer_name == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)

    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Set up LSTM tuner
print("Setting up LSTM hyperparameter tuning...")
lstm_tuner = kt.RandomSearch(
    build_lstm_model,
    objective='val_accuracy',
    max_trials=25,  # Try 25 different combinations
    executions_per_trial=1,
    directory='lstm_tuner_dir',
    project_name='youtube_lstm_prediction_tuning'
)

# Early stopping for LSTM tuning
lstm_early_stop = tf.keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
    monitor='val_accuracy',
    mode='max'
)

print("Starting LSTM hyperparameter search...")
lstm_tuner.search(
    X_train, y_train,
    epochs=25,
    validation_data=(X_val, y_val),
    callbacks=[lstm_early_stop],
    verbose=1
)

# Get the best LSTM model and hyperparameters
print("Getting best LSTM results...")
best_lstm_model = lstm_tuner.get_best_models(1)[0]
best_lstm_hp = lstm_tuner.get_best_hyperparameters(1)[0]

# Evaluate best LSTM model
lstm_loss, lstm_acc = best_lstm_model.evaluate(X_val, y_val, verbose=0)
print(f"\nBest LSTM Validation Accuracy: {lstm_acc:.4f}")
print(f"Best LSTM Hyperparameters:")
for param, value in best_lstm_hp.values.items():
    print(f"   - {param}: {value}")

# Save the best LSTM model for predictions
best_tuned_lstm_model = best_lstm_model

LSTM HYPERPARAMETER TUNING
Setting up LSTM hyperparameter tuning...
Reloading Tuner from lstm_tuner_dir/youtube_lstm_prediction_tuning/tuner0.json
Starting LSTM hyperparameter search...
Getting best LSTM results...

Best LSTM Validation Accuracy: 0.4667
Best LSTM Hyperparameters:
   - embedding_dim: 64
   - lstm_units_1: 128
   - dropout_1: 0.2
   - lstm_units_2: 32
   - dropout_2: 0.30000000000000004
   - use_third_layer: False
   - lstm_units_final: 16
   - dropout_final: 0.4
   - optimizer: rmsprop
   - learning_rate: 0.0005
   - lstm_units_3: 16
   - dropout_3: 0.2


#### Bi-LSTM Tuning

In [166]:
print("BIDIRECTIONAL LSTM HYPERPARAMETER TUNING")
print("="*60)

def build_bidirectional_lstm_model(hp):
    model = Sequential()

    # Tunable embedding dimension
    model.add(Embedding(
        input_dim=num_classes,
        output_dim=hp.Choice('embedding_dim', [64, 128, 256]),
        input_length=sequence_length
    ))

    # First Bidirectional LSTM layer with tunable units
    model.add(Bidirectional(LSTM(
        hp.Int('bi_lstm_units_1', 64, 256, step=64),
        return_sequences=True
    )))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_1', 0.2, 0.5, step=0.1)))

    # Second Bidirectional LSTM layer with tunable units
    model.add(Bidirectional(LSTM(
        hp.Int('bi_lstm_units_2', 32, 128, step=32),
        return_sequences=True
    )))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_2', 0.2, 0.5, step=0.1)))

    # Optional third layer
    if hp.Boolean('use_third_layer'):
        model.add(Bidirectional(LSTM(hp.Int('bi_lstm_units_3', 16, 64, step=16))))
        model.add(BatchNormalization())
        model.add(Dropout(hp.Float('dropout_3', 0.2, 0.5, step=0.1)))
    else:
        model.add(Bidirectional(LSTM(hp.Int('bi_lstm_units_final', 16, 64, step=16))))
        model.add(BatchNormalization())
        model.add(Dropout(hp.Float('dropout_final', 0.2, 0.5, step=0.1)))

    # Output layer
    model.add(Dense(num_classes, activation='softmax'))

    # Tunable optimizer selection
    optimizer_name = hp.Choice('optimizer', [
        'Nadam', 'Adagrad', 'Adadelta', 'AdamW', 'adam', 'rmsprop'
    ])
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4, 5e-4])

    # Create optimizer based on choice
    if optimizer_name == 'Nadam':
        optimizer = Nadam(learning_rate=learning_rate)
    elif optimizer_name == 'Adagrad':
        optimizer = Adagrad(learning_rate=learning_rate)
    elif optimizer_name == 'Adadelta':
        optimizer = Adadelta(learning_rate=learning_rate)
    elif optimizer_name == 'AdamW':
        optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate, weight_decay=1e-5)
    elif optimizer_name == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)

    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Set up Bidirectional LSTM tuner
print("Setting up Bidirectional LSTM hyperparameter tuning...")
bi_lstm_tuner = kt.RandomSearch(
    build_bidirectional_lstm_model,
    objective='val_accuracy',
    max_trials=25,  # Try 25 different combinations
    executions_per_trial=1,
    directory='bi_lstm_tuner_dir',
    project_name='youtube_bi_lstm_prediction_tuning'
)

# Early stopping for Bidirectional LSTM tuning
bi_lstm_early_stop = tf.keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
    monitor='val_accuracy',
    mode='max'
)

print("Starting Bidirectional LSTM hyperparameter search...")
bi_lstm_tuner.search(
    X_train, y_train,
    epochs=25,
    validation_data=(X_val, y_val),
    callbacks=[bi_lstm_early_stop],
    verbose=1
)

# Get the best Bidirectional LSTM model and hyperparameters
print("Getting best Bidirectional LSTM results...")
best_bi_lstm_model = bi_lstm_tuner.get_best_models(1)[0]
best_bi_lstm_hp = bi_lstm_tuner.get_best_hyperparameters(1)[0]

# Evaluate best Bidirectional LSTM model
bi_lstm_loss, bi_lstm_acc = best_bi_lstm_model.evaluate(X_val, y_val, verbose=0)
print(f"\nBest Bidirectional LSTM Validation Accuracy: {bi_lstm_acc:.4f}")
print(f"Best Bidirectional LSTM Hyperparameters:")
for param, value in best_bi_lstm_hp.values.items():
    print(f"   - {param}: {value}")

# Save the best Bidirectional LSTM model for predictions
best_tuned_bi_lstm_model = best_bi_lstm_model

BIDIRECTIONAL LSTM HYPERPARAMETER TUNING
Setting up Bidirectional LSTM hyperparameter tuning...
Reloading Tuner from bi_lstm_tuner_dir/youtube_bi_lstm_prediction_tuning/tuner0.json
Starting Bidirectional LSTM hyperparameter search...
Getting best Bidirectional LSTM results...

Best Bidirectional LSTM Validation Accuracy: 0.5481
Best Bidirectional LSTM Hyperparameters:
   - embedding_dim: 256
   - bi_lstm_units_1: 64
   - dropout_1: 0.30000000000000004
   - bi_lstm_units_2: 96
   - dropout_2: 0.2
   - use_third_layer: True
   - bi_lstm_units_final: 48
   - dropout_final: 0.2
   - optimizer: AdamW
   - learning_rate: 0.01
   - bi_lstm_units_3: 64
   - dropout_3: 0.4


### Comparison

In [174]:
### Ultimate Model Comparison - All Models Combined
print("MODEL COMPARISON - ALL MODELS")
print("="*80)

# Combine all models (regular + tuned)
all_models = {
    # Regular models
    'Regular Simple RNN': {
        'model': results['Simple RNN']['model'],
        'val_acc': results['Simple RNN']['best_val_accuracy'],
        'type': 'regular',
        'epochs_trained': results['Simple RNN']['epochs_trained'],
        'final_train_acc': results['Simple RNN']['final_accuracy']
    },
    'Regular LSTM': {
        'model': results['Regular LSTM']['model'],
        'val_acc': results['Regular LSTM']['best_val_accuracy'],
        'type': 'regular',
        'epochs_trained': results['Regular LSTM']['epochs_trained'],
        'final_train_acc': results['Regular LSTM']['final_accuracy']
    },
    'Regular Bidirectional LSTM': {
        'model': results['Bidirectional LSTM']['model'],
        'val_acc': results['Bidirectional LSTM']['best_val_accuracy'],
        'type': 'regular',
        'epochs_trained': results['Bidirectional LSTM']['epochs_trained'],
        'final_train_acc': results['Bidirectional LSTM']['final_accuracy']
    },
    # Tuned models
    'Tuned Simple RNN': {
        'model': best_tuned_rnn_model,
        'val_acc': rnn_acc,
        'type': 'tuned',
        'epochs_trained': 'Variable (Early Stopping)',
        'final_train_acc': 'N/A'
    },
    'Tuned LSTM': {
        'model': best_tuned_lstm_model,
        'val_acc': lstm_acc,
        'type': 'tuned',
        'epochs_trained': 'Variable (Early Stopping)',
        'final_train_acc': 'N/A'
    },
    'Tuned Bidirectional LSTM': {
        'model': best_tuned_bi_lstm_model,
        'val_acc': bi_lstm_acc,
        'type': 'tuned',
        'epochs_trained': 'Variable (Early Stopping)',
        'final_train_acc': 'N/A'
    }
}

# Sort models by validation accuracy (descending)
sorted_models = sorted(all_models.items(), key=lambda x: x[1]['val_acc'], reverse=True)

print("📊 DETAILED MODEL PERFORMANCE:")
print("-" * 80)
print(f"{'Rank':<4} {'Model Name':<25} {'Type':<8} {'Val Acc':<8} {'Params':<12} {'Next Prediction':<20}")
print("-" * 80)

# Store predictions for later comparison
all_predictions = {}

for rank, (model_name, model_info) in enumerate(sorted_models, 1):
    model = model_info['model']
    val_acc = model_info['val_acc']
    model_type = model_info['type']

    # Make prediction
    last_sequence = encoded_topics[-15:]
    input_seq = np.array([last_sequence])
    predicted_class = model.predict(input_seq, verbose=0).argmax()
    predicted_topic = le.inverse_transform([predicted_class])[0]

    # Store prediction for comparison table
    all_predictions[model_name] = predicted_topic

    # Display summary
    params = f"{model.count_params():,}"
    print(f"{rank:<4} {model_name:<25} {model_type:<8} {val_acc:<8.4f} {params:<12} {predicted_topic:<20}")

# Find best overall model
best_model_name, best_model_info = sorted_models[0]
print(f"\n🥇 OVERALL WINNER: {best_model_name}")
print(f"   Validation Accuracy: {best_model_info['val_acc']:.4f}")
print(f"   Model Type: {best_model_info['type'].title()}")
print(f"   Parameters: {best_model_info['model'].count_params():,}")

# Detailed breakdown by type
print(f"\n📈 PERFORMANCE BY TYPE:")
regular_models = [(name, info) for name, info in all_models.items() if info['type'] == 'regular']
tuned_models = [(name, info) for name, info in all_models.items() if info['type'] == 'tuned']

print(f"\n🔧 Regular Models:")
for name, info in sorted(regular_models, key=lambda x: x[1]['val_acc'], reverse=True):
    print(f"   {name}: {info['val_acc']:.4f} (epochs: {info['epochs_trained']})")

print(f"\n⚡ Tuned Models:")
for name, info in sorted(tuned_models, key=lambda x: x[1]['val_acc'], reverse=True):
    print(f"   {name}: {info['val_acc']:.4f}")

MODEL COMPARISON - ALL MODELS
📊 DETAILED MODEL PERFORMANCE:
--------------------------------------------------------------------------------
Rank Model Name                Type     Val Acc  Params       Next Prediction     
--------------------------------------------------------------------------------
1    Tuned Bidirectional LSTM  tuned    0.5481   475,533      vs_the_barcelona    
2    Tuned Simple RNN          tuned    0.4667   17,933       vs_the_barcelona    
3    Tuned LSTM                tuned    0.4667   124,317      trailer_official_the
4    Regular LSTM              regular  0.4593   196,397      vs_the_barcelona    
5    Regular Bidirectional LSTM regular  0.4519   473,037      trailer_official_the
6    Regular Simple RNN        regular  0.4370   51,341       vs_the_barcelona    

🥇 OVERALL WINNER: Tuned Bidirectional LSTM
   Validation Accuracy: 0.5481
   Model Type: Tuned
   Parameters: 475,533

📈 PERFORMANCE BY TYPE:

🔧 Regular Models:
   Regular LSTM: 0.4593 (epochs: 2

### Next 10 topics

In [None]:
def predict_next_n_topics(model, le, encoded_topics, n_predictions=10, sequence_length=15):
    # ? start with the last sequence from training data
    current_sequence = list(encoded_topics[-sequence_length:])
    predicted_topics = []

    for i in range(n_predictions):
        # ? reshape for model input
        input_seq = np.array([current_sequence])

        # ? predict
        predicted_probs = model.predict(input_seq, verbose=0)
        predicted_class = predicted_probs.argmax()

        # ? store prediction
        predicted_topics.append(predicted_class)

        # ? update sequence: remove first element, add prediction
        current_sequence = current_sequence[1:] + [predicted_class]

    # ? convert encoded predictions back to topic names
    predicted_topic_names = le.inverse_transform(predicted_topics)
    return predicted_topic_names, predicted_topics

In [176]:
print("\n" + "="*100)
print("NEXT 10 TOPIC PREDICTIONS - ALL MODELS")
print("="*100)

# Generate predictions for all models
all_model_predictions = {}

for model_name, model_info in all_models.items():
    model = model_info['model']
    predicted_names, predicted_encoded = predict_next_n_topics(
        model, le, encoded_topics, n_predictions=10, sequence_length=15
    )
    all_model_predictions[model_name] = {
        'names': predicted_names,
        'encoded': predicted_encoded,
        'val_acc': model_info['val_acc']
    }

# Display predictions for each model
for model_name, predictions in all_model_predictions.items():
    model_type = all_models[model_name]['type']
    val_acc = predictions['val_acc']

    print(f"\n{'-'*60}")
    print(f"🤖 {model_name.upper()} ({model_type.title()}) - Val Acc: {val_acc:.4f}")
    print(f"{'-'*60}")

    for i, (name, encoded) in enumerate(zip(predictions['names'], predictions['encoded']), 1):
        print(f"{i:2d}. {name} (encoded: {encoded})")

    print(f"\n📝 Sequence: {' → '.join(predictions['names'])}")

# Comprehensive comparison table
print(f"\n" + "="*120)
print("📊 SIDE-BY-SIDE COMPARISON TABLE")
print("="*120)

# Create headers
model_names = list(all_model_predictions.keys())
header = f"{'Pos':<4}"
for name in model_names:
    header += f"{name:<20}"
print(header)
print("-" * 120)

# Create comparison rows
for i in range(10):
    row = f"{i+1:<4}"
    for model_name in model_names:
        prediction = all_model_predictions[model_name]['names'][i]
        # Truncate long topic names for display
        display_pred = prediction[:17] + "..." if len(prediction) > 20 else prediction
        row += f"{display_pred:<20}"
    print(row)

# Accuracy ranking
print(f"\n🏅 MODEL RANKING BY VALIDATION ACCURACY:")
print("-" * 50)
sorted_by_acc = sorted(all_model_predictions.items(), key=lambda x: x[1]['val_acc'], reverse=True)

for rank, (model_name, pred_info) in enumerate(sorted_by_acc, 1):
    model_type = all_models[model_name]['type']
    emoji = "🥇" if rank == 1 else "🥈" if rank == 2 else "🥉" if rank == 3 else "🏅"
    print(f"{emoji} {rank}. {model_name} ({model_type}): {pred_info['val_acc']:.4f}")

# Consensus analysis
print(f"\n🔍 PREDICTION CONSENSUS ANALYSIS:")
print("-" * 40)

# Count how often each topic appears in position 1
first_predictions = [pred_info['names'][0] for pred_info in all_model_predictions.values()]
from collections import Counter
consensus_count = Counter(first_predictions)

print("Next topic consensus:")
for topic, count in consensus_count.most_common():
    percentage = (count / len(all_model_predictions)) * 100
    print(f"  {topic}: {count}/{len(all_model_predictions)} models ({percentage:.1f}%)")

# Summary statistics
print(f"\n📋 SUMMARY:")
print(f"   Total Models Compared: {len(all_models)}")
print(f"   Regular Models: {len([m for m in all_models.values() if m['type'] == 'regular'])}")
print(f"   Tuned Models: {len([m for m in all_models.values() if m['type'] == 'tuned'])}")
print(f"   Best Validation Accuracy: {max(info['val_acc'] for info in all_models.values()):.4f}")
print(f"   Average Validation Accuracy: {sum(info['val_acc'] for info in all_models.values()) / len(all_models):.4f}")


NEXT 10 TOPIC PREDICTIONS - ALL MODELS

------------------------------------------------------------
🤖 REGULAR SIMPLE RNN (Regular) - Val Acc: 0.4370
------------------------------------------------------------
 1. vs_the_barcelona (encoded: 12)
 2. vs_the_barcelona (encoded: 12)
 3. vs_the_barcelona (encoded: 12)
 4. linux_distro_fedora (encoded: 7)
 5. trailer_official_the (encoded: 11)
 6. vs_the_barcelona (encoded: 12)
 7. trailer_official_the (encoded: 11)
 8. vs_the_barcelona (encoded: 12)
 9. vs_the_barcelona (encoded: 12)
10. vs_the_barcelona (encoded: 12)

📝 Sequence: vs_the_barcelona → vs_the_barcelona → vs_the_barcelona → linux_distro_fedora → trailer_official_the → vs_the_barcelona → trailer_official_the → vs_the_barcelona → vs_the_barcelona → vs_the_barcelona

------------------------------------------------------------
🤖 REGULAR LSTM (Regular) - Val Acc: 0.4593
------------------------------------------------------------
 1. vs_the_barcelona (encoded: 12)
 2. vs_the_barc