## Setup

In [139]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import re
from collections import Counter

In [140]:
from bertopic import BERTopic
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

In [141]:
from sklearn.preprocessing import LabelEncoder

In [142]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Bidirectional
# import tensorflow_addons as tfa

## Prep

In [143]:
train=pd.read_csv("Web Scraping/youtube_history_chrome_linux.csv")


In [144]:
train

Unnamed: 0,Date,Title,Creator,Views
0,Today,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133K views
1,Today,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66K views
2,Today,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805K views
3,Today,Spain vs France 5-4 - Highlights & All Goals -...,Dollar VFX,848K views
4,Today,🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...,SPORTS EXTRA,625K views
...,...,...,...,...
684,May 19,SEVILLA FC 2 - 2 REAL MADRID I HIGHLIGHTS LALI...,LALIGA EA SPORTS,2.6M views
685,May 19,HP Victus Gaming 16 - Turn On Adaptive Battery...,HardReset.Info,250 views
686,May 19,How to Activate Adaptive Battery Optimizer on ...,HardReset.Info,231 views
687,May 19,Lil Yachty with the HARDEST walk out EVER,The Extra,13M views


# Data Preprocessing

In [145]:
def normalize_date(date_str):
    DAY_NAMES = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    current_time = datetime.now()

    if date_str == "Today":
        return pd.to_datetime(current_time.date())
    elif date_str == "Yesterday":
        return pd.to_datetime((current_time - timedelta(days=1)).date())
    elif date_str in DAY_NAMES:
        today_date = current_time.date()
        today_weekday = today_date.weekday()
        days_map = {
            "Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3,
            "Friday": 4, "Saturday": 5, "Sunday": 6
        }
        target_weekday = days_map[date_str]
        days_ago = (today_weekday - target_weekday + 7) % 7

        # If date_str is today's weekday (e.g., input "Friday" and today is Friday),
        # and "Today" is handled separately, this implies the "Friday" entry refers to
        # the previous week's Friday.
        if days_ago == 0 and date_str != current_time.strftime('%A'):
             # This case might need refinement based on exact data semantics.
             # For simplicity, if days_ago is 0, it means the most recent occurrence was today.
             # If "Today" is explicitly used for current day, a named day matching today's weekday
             # might imply last week, but the current calculation (days_ago=0) correctly points to today.
             # The problem description implies "Today" and "Yesterday" are special strings.
             # Other day names refer to their most recent occurrence.
             pass
        calculated_date = today_date - timedelta(days=days_ago)
        return pd.to_datetime(calculated_date)

    try:
        # Attempt to parse with year first (e.g., "Jun 06, 2025")
        return pd.to_datetime(date_str, format='%b %d, %Y')
    except ValueError:
        # If year is missing, assume current year (2025 as per original logic)
        # This part might need adjustment if the data can span multiple years without explicit year numbers
        # The original code used a hardcoded 2025, respecting that.
        return pd.to_datetime(f'{date_str}, 2025', format='%b %d, %Y')


In [146]:
train['Date'] = train['Date'].apply(normalize_date)

In [147]:
train.head(1)

Unnamed: 0,Date,Title,Creator,Views
0,2025-06-06,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133K views


In [148]:
def clean_views(view_str_input):
    if pd.isna(view_str_input):
        return np.nan

    if not isinstance(view_str_input, str):
        if isinstance(view_str_input, (int, float)):
            return float(view_str_input)
        return np.nan

    s = view_str_input.lower().replace(" views", "").strip().replace(',', '')
    if not s:
        return np.nan
    multiplier = 1.0
    if s.endswith('k'):
        multiplier = 1000.0
        s = s[:-1].strip()
    elif s.endswith('m'):
        multiplier = 1_000_000.0
        s = s[:-1].strip()
    if not s:
        return np.nan
    try:
        return float(s) * multiplier
    except ValueError:
        return np.nan

In [149]:
train['Views'] = train['Views'].apply(clean_views)

In [150]:
train.head(1)

Unnamed: 0,Date,Title,Creator,Views
0,2025-06-06,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0


In [151]:
train

Unnamed: 0,Date,Title,Creator,Views
0,2025-06-06,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0
1,2025-06-06,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66000.0
2,2025-06-06,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805000.0
3,2025-06-06,Spain vs France 5-4 - Highlights & All Goals -...,Dollar VFX,848000.0
4,2025-06-06,🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...,SPORTS EXTRA,625000.0
...,...,...,...,...
684,2025-05-19,SEVILLA FC 2 - 2 REAL MADRID I HIGHLIGHTS LALI...,LALIGA EA SPORTS,2600000.0
685,2025-05-19,HP Victus Gaming 16 - Turn On Adaptive Battery...,HardReset.Info,250.0
686,2025-05-19,How to Activate Adaptive Battery Optimizer on ...,HardReset.Info,231.0
687,2025-05-19,Lil Yachty with the HARDEST walk out EVER,The Extra,13000000.0


In [152]:
train.head()

Unnamed: 0,Date,Title,Creator,Views
0,2025-06-06,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0
1,2025-06-06,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66000.0
2,2025-06-06,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805000.0
3,2025-06-06,Spain vs France 5-4 - Highlights & All Goals -...,Dollar VFX,848000.0
4,2025-06-06,🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...,SPORTS EXTRA,625000.0


In [153]:
titles = train['Title'].tolist()

In [154]:
titles

['Spain vs. France FULL REACTION 👀 Lamine Yamal plays like he’s with his friends – Shaka | ESPN FC',
 "Did Spain vs. France SETTLE the Lamine Yamal-Ousmane Dembele Ballon d'Or debate?! 👀 | ESPN FC",
 'FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA NATIONS LEAGUE 2024/2025',
 'Spain vs France 5-4 - Highlights & All Goals - Nations League 2025',
 '🔴Spain vs France (5-4) Extended HIGHLIGHTS | UEFA Nations League Semi-Final',
 'Tyrese Haliburton Keeps Hitting Game Winners...',
 "Lamine Yamal Just ENDED the Ballon d'Or Debate! 🏆",
 'LAMINE YAMAL & RAYAN CHERKI ARE SAVING FOOTBALL.',
 'The moment 48k Chile fans applauded MESSI when he came on his first game for Argentina in 2025',
 'Lamine Yamal Defeats Kylian Mbappe 5 Times in a Row (France vs Spain Reaction)',
 'Club World Cup TICKET CRISIS?',
 'Lamine Yamal BLASTS Desire Doue and Mbappe after win - Post Match Interview - Spain 5-4 France',
 'LIKE THAT - Call of Duty Montage',
 'Camp Nou Construction Update (June 4, 2025)',
 'Lamine Record

In [155]:
all_words = ' '.join(train['Title'].astype(str)).lower().split()
all_words_clean = [re.sub(r'\W+', '', word) for word in all_words if re.sub(r'\W+', '', word) != '']

unique_words = set(all_words_clean)

print(f"Number of unique words in the Title column: {len(unique_words)}")

Number of unique words in the Title column: 1954


In [156]:
word_counts = Counter(all_words_clean)
most_common = word_counts.most_common(100)
print("20 most frequent words:")
for word, count in most_common:
    print(f"{word}: {count}")


20 most frequent words:
the: 171
to: 94
in: 72
vs: 62
is: 62
a: 51
and: 47
of: 45
on: 43
highlights: 38
barcelona: 38
at: 36
for: 34
league: 33
i: 32
game: 31
3: 30
official: 28
final: 26
lamine: 25
2025: 24
match: 24
this: 24
season: 24
united: 24
nba: 23
why: 23
with: 22
4: 22
messi: 22
after: 22
1: 22
2: 22
real: 21
trailer: 21
his: 20
fc: 20
full: 19
yamal: 19
how: 19
man: 19
5: 18
live: 18
spain: 17
uefa: 17
first: 17
win: 17
my: 17
inter: 17
2425: 17
reaction: 16
linux: 16
just: 15
world: 15
laliga: 15
grand: 15
f1: 15
france: 14
insane: 14
you: 14
prix: 14
code: 14
tottenham: 14
that: 13
10: 13
has: 13
it: 13
max: 13
new: 13
nations: 12
fans: 12
pacers: 12
from: 12
by: 12
not: 12
was: 12
eminem: 12
manchester: 12
all: 11
best: 11
champions: 11
knicks: 11
madrid: 11
joey: 11
tyrese: 10
haliburton: 10
club: 10
your: 10
ai: 10
moments: 10
sports: 10
claude: 10
no: 10
la: 10
finals: 9
up: 9
out: 9
me: 9
over: 9
as: 9


In [157]:
least_common = word_counts.most_common()[::-1]  # Balik urutan
print("20 kata yang paling jarang muncul:")
for word, count in least_common[:20]:
    print(f"{word}: {count}")


20 kata yang paling jarang muncul:
watchalong: 1
hardest: 1
yachty: 1
lil: 1
250: 1
compaq: 1
activate: 1
power: 1
gaming: 1
victus: 1
appreciation: 1
villareal: 1
23: 1
espanyol: 1
secured: 1
trofi: 1
angkat: 1
main: 1
superman: 1
emiliaromagna: 1


# Model

In [158]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=3, prediction_data=True)

model = BERTopic(
    language="multilingual",
    nr_topics=13,
    embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)
topics, probs = model.fit_transform(titles)
train['Predicted_Topic'] = topics
train['Topic_Name'] = train['Predicted_Topic'].apply(lambda x: model.get_topic(x))
print(train[['Title', 'Predicted_Topic', 'Topic_Name']])


2025-06-06 14:57:09,752 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 22/22 [00:02<00:00,  8.80it/s]
2025-06-06 14:57:17,265 - BERTopic - Embedding - Completed ✓
2025-06-06 14:57:17,265 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-06 14:57:17,956 - BERTopic - Dimensionality - Completed ✓
2025-06-06 14:57:17,957 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-06 14:57:18,152 - BERTopic - Cluster - Completed ✓
2025-06-06 14:57:18,152 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-06 14:57:18,177 - BERTopic - Representation - Completed ✓
2025-06-06 14:57:18,177 - BERTopic - Topic reduction - Reducing number of topics
2025-06-06 14:57:18,179 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-06 14:57:18,188 - BERTopic - Representation - Completed ✓
2025-06-06 14:57:18,189 - BERTopic - Topic reduction - Redu

                                                 Title  Predicted_Topic  \
0    Spain vs. France FULL REACTION 👀 Lamine Yamal ...                0   
1    Did Spain vs. France SETTLE the Lamine Yamal-O...                0   
2    FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...                0   
3    Spain vs France 5-4 - Highlights & All Goals -...                0   
4    🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...                0   
..                                                 ...              ...   
684  SEVILLA FC 2 - 2 REAL MADRID I HIGHLIGHTS LALI...                0   
685  HP Victus Gaming 16 - Turn On Adaptive Battery...               -1   
686  How to Activate Adaptive Battery Optimizer on ...                0   
687          Lil Yachty with the HARDEST walk out EVER               -1   
688  REAL MADRID VS SEVILLA & BARCELONA VS VILLARRE...                0   

                                            Topic_Name  
0    [(vs, 0.038367261420305075), (the, 0.

## Custom Start

In [159]:
train['Topic_Keywords'] = train['Predicted_Topic'].apply(lambda x: model.get_topic(x))

In [160]:
# --- Start: Dynamic Topic Mapping ---
print("Inspect Discovered Topic Keywords (Top 5 words per topic):")
# model.get_topics() returns a dictionary: {topic_id: [(word, score), ...]}
# We iterate through sorted keys to have a consistent order for inspection
discovered_topics_keywords = model.get_topics()
for topic_id in sorted(discovered_topics_keywords.keys()):
    # model.get_topic(topic_id) also works and might be more direct here
    words_scores = discovered_topics_keywords[topic_id]
    words = [word for word, score in words_scores[:5]] # Get top 5 words
    print(f"Topic {topic_id}: {words}")
dynamic_topic_mapping = {}
for topic_id, topic_words_scores in discovered_topics_keywords.items():
    # Create a name from the top 2-3 keywords
    top_words = [str(word) for word, score in topic_words_scores[:3]] # Take top 3 words
    if top_words:
        # Join top words to form a name, e.g., "word1_word2_word3"
        dynamic_name = "_".join(top_words)
    else:
        # Fallback name if a topic has no words (should be rare for actual topics)
        dynamic_name = f"topic_{topic_id}"
    dynamic_topic_mapping[topic_id] = dynamic_name
# BERTopic often assigns -1 to outliers. Ensure it has a clear name.
# If -1 was in discovered_topics_keywords and got a name like "word1_word2", that's fine.
# If -1 represents outliers and wasn't explicitly in get_topics() or needs a generic name:
if -1 in train['Predicted_Topic'].unique(): # Check if -1 exists in predictions
    if -1 not in dynamic_topic_mapping:
        dynamic_topic_mapping[-1] = "Other_Outliers"
    elif not dynamic_topic_mapping.get(-1): # If it's there but empty
         dynamic_topic_mapping[-1] = "Other_Outliers"
print("\nDynamically Generated Topic Mapping:")
print(dynamic_topic_mapping)
# Now, use this dynamically generated mapping
train["Mapped_Topic_Name"] = train["Predicted_Topic"].map(dynamic_topic_mapping)
# --- End: Dynamic Topic Mapping ---

Inspect Discovered Topic Keywords (Top 5 words per topic):
Topic -1: ['the', 'in', 'live', 'just', 'champions']
Topic 0: ['vs', 'the', 'barcelona', 'highlights', 'to']
Topic 1: ['trailer', 'official', 'the', 'eminem', 'lamine']
Topic 2: ['is', 'transfer', 'what', 'over', 'season']
Topic 3: ['tech', 'most', 'your', 'development', '99']
Topic 4: ['linux', 'distro', 'fedora', 'windows', 'to']
Topic 5: ['code', 'using', 'neovim', 'vs', 'setup']
Topic 6: ['elephant', 'baby', 'dog', 'worst', 'mom']
Topic 7: ['google', 'microsoft', 'netflix', 'opened', 'gates']
Topic 8: ['claude', 'is', 'code', 'it', 'bronny']
Topic 9: ['seconds', '100', 'in', 'bash', '103']
Topic 10: ['forever', 'way', 'changed', 'has', 'tmux']
Topic 11: ['credits', 'end', 'cards', 'blatter', 'money']

Dynamically Generated Topic Mapping:
{-1: 'the_in_live', 0: 'vs_the_barcelona', 1: 'trailer_official_the', 2: 'is_transfer_what', 3: 'tech_most_your', 4: 'linux_distro_fedora', 5: 'code_using_neovim', 6: 'elephant_baby_dog', 7

In [161]:
print("\nDataFrame with dynamically mapped topic names:")
print(train[["Date", "Title", "Predicted_Topic", "Mapped_Topic_Name", "Topic_Keywords"]].head())


DataFrame with dynamically mapped topic names:
        Date                                              Title  \
0 2025-06-06  Spain vs. France FULL REACTION 👀 Lamine Yamal ...   
1 2025-06-06  Did Spain vs. France SETTLE the Lamine Yamal-O...   
2 2025-06-06  FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...   
3 2025-06-06  Spain vs France 5-4 - Highlights & All Goals -...   
4 2025-06-06  🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...   

   Predicted_Topic Mapped_Topic_Name  \
0                0  vs_the_barcelona   
1                0  vs_the_barcelona   
2                0  vs_the_barcelona   
3                0  vs_the_barcelona   
4                0  vs_the_barcelona   

                                      Topic_Keywords  
0  [(vs, 0.038367261420305075), (the, 0.032968890...  
1  [(vs, 0.038367261420305075), (the, 0.032968890...  
2  [(vs, 0.038367261420305075), (the, 0.032968890...  
3  [(vs, 0.038367261420305075), (the, 0.032968890...  
4  [(vs, 0.038367261420305075), (

## Custom End

In [162]:
hashable_cols = train.columns[train.applymap(lambda x: not isinstance(x, (list, dict))).all()]
train[hashable_cols].nunique()

Date                  19
Title                581
Creator              320
Views                366
Predicted_Topic       13
Mapped_Topic_Name     13
dtype: int64

In [163]:
train["Topic_Name"].head()

0    [(vs, 0.038367261420305075), (the, 0.032968890...
1    [(vs, 0.038367261420305075), (the, 0.032968890...
2    [(vs, 0.038367261420305075), (the, 0.032968890...
3    [(vs, 0.038367261420305075), (the, 0.032968890...
4    [(vs, 0.038367261420305075), (the, 0.032968890...
Name: Topic_Name, dtype: object

In [164]:
topic_info = model.get_topic_info()
print(topic_info)


    Topic  Count                               Name  \
0      -1     95                -1_the_in_live_just   
1       0    286      0_vs_the_barcelona_highlights   
2       1    152      1_trailer_official_the_eminem   
3       2     29            2_is_transfer_what_over   
4       3     24       3_tech_most_your_development   
5       4     23      4_linux_distro_fedora_windows   
6       5     19             5_code_using_neovim_vs   
7       6     18          6_elephant_baby_dog_worst   
8       7     15  7_google_microsoft_netflix_opened   
9       8     14                8_claude_is_code_it   
10      9      5              9_seconds_100_in_bash   
11     10      5         10_forever_way_changed_has   
12     11      4       11_credits_end_cards_blatter   

                                       Representation  \
0   [the, in, live, just, champions, mind, has, up...   
1   [vs, the, barcelona, highlights, to, league, i...   
2   [trailer, official, the, eminem, lamine, ai, n...   
3

In [165]:
for topic_id in topic_info['Topic'].unique():
    print(f"Topic {topic_id}:")
    print(model.get_topic(topic_id))
    print()


Topic -1:
[('the', np.float64(0.05143111313125825)), ('in', np.float64(0.03887464448203324)), ('live', np.float64(0.033049956843593783)), ('just', np.float64(0.029883748838936897)), ('champions', np.float64(0.02712586746827267)), ('mind', np.float64(0.026276786415329538)), ('has', np.float64(0.02592612018926992)), ('up', np.float64(0.022858986781492046)), ('at', np.float64(0.022317272946082607)), ('paris', np.float64(0.021950414535847743))]

Topic 0:
[('vs', np.float64(0.038367261420305075)), ('the', np.float64(0.032968890895126834)), ('barcelona', np.float64(0.032279415194303614)), ('highlights', np.float64(0.03165936191625454)), ('to', np.float64(0.029249197620512216)), ('league', np.float64(0.02841295962175117)), ('in', np.float64(0.026674733381649076)), ('nba', np.float64(0.025444115580079568)), ('game', np.float64(0.02503645370610425)), ('united', np.float64(0.023689701618520576))]

Topic 1:
[('trailer', np.float64(0.057788803229924156)), ('official', np.float64(0.0576462009652984

In [166]:
train.head()

Unnamed: 0,Date,Title,Creator,Views,Predicted_Topic,Topic_Name,Topic_Keywords,Mapped_Topic_Name
0,2025-06-06,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
1,2025-06-06,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
2,2025-06-06,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
3,2025-06-06,Spain vs France 5-4 - Highlights & All Goals -...,Dollar VFX,848000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
4,2025-06-06,🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...,SPORTS EXTRA,625000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona


In [167]:
print(train[["Date", "Title", "Predicted_Topic", "Mapped_Topic_Name"]].head())


        Date                                              Title  \
0 2025-06-06  Spain vs. France FULL REACTION 👀 Lamine Yamal ...   
1 2025-06-06  Did Spain vs. France SETTLE the Lamine Yamal-O...   
2 2025-06-06  FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...   
3 2025-06-06  Spain vs France 5-4 - Highlights & All Goals -...   
4 2025-06-06  🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...   

   Predicted_Topic Mapped_Topic_Name  
0                0  vs_the_barcelona  
1                0  vs_the_barcelona  
2                0  vs_the_barcelona  
3                0  vs_the_barcelona  
4                0  vs_the_barcelona  


In [168]:
train.head(20)

Unnamed: 0,Date,Title,Creator,Views,Predicted_Topic,Topic_Name,Topic_Keywords,Mapped_Topic_Name
0,2025-06-06,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
1,2025-06-06,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
2,2025-06-06,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
3,2025-06-06,Spain vs France 5-4 - Highlights & All Goals -...,Dollar VFX,848000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
4,2025-06-06,🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...,SPORTS EXTRA,625000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
5,2025-06-06,Tyrese Haliburton Keeps Hitting Game Winners...,Kenny For Real,80000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
6,2025-06-06,Lamine Yamal Just ENDED the Ballon d'Or Debate! 🏆,GRETT,2900.0,1,"[(trailer, 0.057788803229924156), (official, 0...","[(trailer, 0.057788803229924156), (official, 0...",trailer_official_the
7,2025-06-06,LAMINE YAMAL & RAYAN CHERKI ARE SAVING FOOTBALL.,CultureCams,12000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
8,2025-06-06,The moment 48k Chile fans applauded MESSI when...,SEVEN MASTERS,7700.0,-1,"[(the, 0.05143111313125825), (in, 0.0388746444...","[(the, 0.05143111313125825), (in, 0.0388746444...",the_in_live
9,2025-06-06,Lamine Yamal Defeats Kylian Mbappe 5 Times in ...,NickRTFM Extra,42000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona


In [169]:
for col in train.columns:
    print(train[col].value_counts())

Date
2025-05-26    66
2025-05-22    57
2025-05-21    56
2025-05-25    52
2025-05-19    50
2025-06-06    46
2025-05-30    44
2025-05-24    43
2025-06-05    41
2025-06-02    33
2025-05-29    30
2025-06-04    28
2025-06-01    28
2025-05-23    26
2025-05-27    26
2025-05-28    19
2025-05-31    18
2025-06-03    15
2025-05-20    11
Name: count, dtype: int64
Title
Tottenham Hotspur 1-0 Manchester United | Europa League 24/25 Match Highlights         4
Inter Miami CF vs. Columbus Crew | Full Match Highlights | Messi Brace + 3 Assists!    3
Microsoft just opened the flood gates…                                                 3
The Season Is OVER….                                                                   3
Black And Cant Play Basketball Disease (NBA Edition)                                   3
                                                                                      ..
Tailwind CSS is the worst…                                                             1
The Wrexham Situa

In [170]:
train

Unnamed: 0,Date,Title,Creator,Views,Predicted_Topic,Topic_Name,Topic_Keywords,Mapped_Topic_Name
0,2025-06-06,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
1,2025-06-06,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
2,2025-06-06,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
3,2025-06-06,Spain vs France 5-4 - Highlights & All Goals -...,Dollar VFX,848000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
4,2025-06-06,🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...,SPORTS EXTRA,625000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
...,...,...,...,...,...,...,...,...
684,2025-05-19,SEVILLA FC 2 - 2 REAL MADRID I HIGHLIGHTS LALI...,LALIGA EA SPORTS,2600000.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
685,2025-05-19,HP Victus Gaming 16 - Turn On Adaptive Battery...,HardReset.Info,250.0,-1,"[(the, 0.05143111313125825), (in, 0.0388746444...","[(the, 0.05143111313125825), (in, 0.0388746444...",the_in_live
686,2025-05-19,How to Activate Adaptive Battery Optimizer on ...,HardReset.Info,231.0,0,"[(vs, 0.038367261420305075), (the, 0.032968890...","[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
687,2025-05-19,Lil Yachty with the HARDEST walk out EVER,The Extra,13000000.0,-1,"[(the, 0.05143111313125825), (in, 0.0388746444...","[(the, 0.05143111313125825), (in, 0.0388746444...",the_in_live


In [171]:
train.drop(["Predicted_Topic", "Topic_Name"], axis=1, inplace=True)

In [172]:
train

Unnamed: 0,Date,Title,Creator,Views,Topic_Keywords,Mapped_Topic_Name
0,2025-06-06,Spain vs. France FULL REACTION 👀 Lamine Yamal ...,ESPN FC,133000.0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
1,2025-06-06,Did Spain vs. France SETTLE the Lamine Yamal-O...,ESPN FC,66000.0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
2,2025-06-06,FULL HIGHLIGHT! SPAIN (5) VS (4) FRANCE | UEFA...,RCTI Sports,805000.0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
3,2025-06-06,Spain vs France 5-4 - Highlights & All Goals -...,Dollar VFX,848000.0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
4,2025-06-06,🔴Spain vs France (5-4) Extended HIGHLIGHTS | U...,SPORTS EXTRA,625000.0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
...,...,...,...,...,...,...
684,2025-05-19,SEVILLA FC 2 - 2 REAL MADRID I HIGHLIGHTS LALI...,LALIGA EA SPORTS,2600000.0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
685,2025-05-19,HP Victus Gaming 16 - Turn On Adaptive Battery...,HardReset.Info,250.0,"[(the, 0.05143111313125825), (in, 0.0388746444...",the_in_live
686,2025-05-19,How to Activate Adaptive Battery Optimizer on ...,HardReset.Info,231.0,"[(vs, 0.038367261420305075), (the, 0.032968890...",vs_the_barcelona
687,2025-05-19,Lil Yachty with the HARDEST walk out EVER,The Extra,13000000.0,"[(the, 0.05143111313125825), (in, 0.0388746444...",the_in_live


In [173]:
train['Date']=pd.to_datetime(train['Date'])

In [174]:
train = train.reset_index().sort_values(by=['Date', 'index'], ascending=[True, True]).drop(columns='index')

In [175]:
le = LabelEncoder()

In [176]:
encoded_topics = le.fit_transform(train['Mapped_Topic_Name'])

In [177]:
num_classes = len(le.classes_)
print(f"Number of unique topic classes: {num_classes}")
print(f"Topic classes: {le.classes_}")

Number of unique topic classes: 13
Topic classes: ['claude_is_code' 'code_using_neovim' 'credits_end_cards'
 'elephant_baby_dog' 'forever_way_changed' 'google_microsoft_netflix'
 'is_transfer_what' 'linux_distro_fedora' 'seconds_100_in'
 'tech_most_your' 'the_in_live' 'trailer_official_the' 'vs_the_barcelona']


In [178]:
sequence_length = 15
X = []
y = []

for i in range(len(encoded_topics) - sequence_length):
    X.append(encoded_topics[i:i+sequence_length])
    y.append(encoded_topics[i+sequence_length])

X = np.array(X)
y = np.array(y)

In [179]:
optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5)

model = Sequential([
    Embedding(input_dim=num_classes, output_dim=128, input_length=sequence_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    BatchNormalization(),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [180]:
model.fit(X, y, epochs=30, batch_size=16)


Epoch 1/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.2072 - loss: 3.1162
Epoch 2/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3172 - loss: 2.4297
Epoch 3/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3643 - loss: 2.1439
Epoch 4/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3761 - loss: 2.1350
Epoch 5/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3978 - loss: 1.9755
Epoch 6/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4797 - loss: 1.6637
Epoch 7/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4355 - loss: 1.7320
Epoch 8/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.4818 - loss: 1.5908
Epoch 9/30
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fd5592b7440>

In [181]:
last_sequence = encoded_topics[-15:]
input_seq = np.array([last_sequence])

predicted_class = model.predict(input_seq, verbose=0).argmax()
predicted_topic = le.inverse_transform([predicted_class])[0]

print("Next topic prediction:", predicted_topic)


Next topic prediction: trailer_official_the


In [182]:
seq_length = 15  # sequence length for model

# Suppose encoded_topics is a list/array of encoded historical topics
current_sequence = list(encoded_topics[-seq_length:])  # take the last 15 topics as a list
predicted_topics = []

for _ in range(50):  # predict the next 50 topics
    input_seq = np.array([current_sequence])  # shape (1, seq_length)
    predicted_class = model.predict(input_seq, verbose=0).argmax()  # predict the next class/topic

    predicted_topics.append(predicted_class)
    # update current_sequence: remove the first element, add the latest prediction
    current_sequence = current_sequence[1:] + [predicted_class]

# Convert the predicted encoded classes to the original topic names
predicted_topic_names = le.inverse_transform(predicted_topics)
print(predicted_topic_names)


['trailer_official_the' 'trailer_official_the' 'is_transfer_what'
 'the_in_live' 'trailer_official_the' 'trailer_official_the'
 'trailer_official_the' 'trailer_official_the' 'the_in_live' 'the_in_live'
 'trailer_official_the' 'elephant_baby_dog' 'elephant_baby_dog'
 'the_in_live' 'vs_the_barcelona' 'vs_the_barcelona' 'vs_the_barcelona'
 'vs_the_barcelona' 'vs_the_barcelona' 'vs_the_barcelona'
 'trailer_official_the' 'elephant_baby_dog' 'the_in_live'
 'is_transfer_what' 'trailer_official_the' 'the_in_live' 'the_in_live'
 'the_in_live' 'is_transfer_what' 'vs_the_barcelona' 'vs_the_barcelona'
 'claude_is_code' 'claude_is_code' 'elephant_baby_dog' 'elephant_baby_dog'
 'elephant_baby_dog' 'elephant_baby_dog' 'elephant_baby_dog'
 'elephant_baby_dog' 'elephant_baby_dog' 'elephant_baby_dog'
 'vs_the_barcelona' 'vs_the_barcelona' 'vs_the_barcelona'
 'vs_the_barcelona' 'vs_the_barcelona' 'vs_the_barcelona'
 'trailer_official_the' 'trailer_official_the' 'vs_the_barcelona']
