In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    log_loss
)
import joblib

from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier

# ============================================================
# 1. LOAD DATA
# ============================================================
df = pd.read_csv("YouTube_Engineered_Features_Final.csv")
df = df.dropna(subset=["sector"]).reset_index(drop=True)

text_cols = [
    "youtube_title", "youtube_description",
    "youtube_tags", "youtube_categories",
    "content_fine_category"
]
for c in text_cols:
    if c in df.columns:
        df[c] = df[c].fillna("").astype(str)

# ============================================================
# 2. NUMERIC FEATURES
# ============================================================
def safe_div(a, b):
    return np.where(b == 0, 0.0, a / b)

num_base = {
    "engagement_rate": df.get("engagement_rate", 0),
    "like_rate": df.get("like_rate", 0),
    "comment_rate": df.get("comment_rate", 0),
    "views_per_subscriber": df.get("views_per_subscriber", 0),
    "likes_per_subscriber": df.get("likes_per_subscriber", 0),
    "comments_per_subscriber": df.get("comments_per_subscriber", 0),
    "like_to_comment_ratio": df.get("like_to_comment_ratio", 0),
    "audience_engagement_index": df.get("audience_engagement_index", 0),
    "video_completeness_score": df.get("video_completeness_score", 0),
    "tts_quality_indicator": df.get("tts_quality_indicator", 0),
    "production_polish_score": df.get("production_polish_score", 0),
    "title_length_chars": df.get("title_length_chars", 0),
    "title_word_count": df.get("title_word_count", 0),
    "description_length_chars": df.get("description_length_chars", 0),
    "description_word_count": df.get("description_word_count", 0),
    "title_sentiment": df.get("title_sentiment", 0),
    "title_subjectivity": df.get("title_subjectivity", 0),
    "description_sentiment": df.get("description_sentiment", 0),
    "description_subjectivity": df.get("description_subjectivity", 0),
    "meta_description_sentiment": df.get("meta_description_sentiment", 0),
}

num_df = pd.DataFrame(num_base).astype(float)

# interactions / logs
num_df["log_views_ps"] = np.log1p(num_df["views_per_subscriber"])
num_df["log_like_rate"] = np.log1p(num_df["like_rate"])
num_df["log_comment_rate"] = np.log1p(num_df["comment_rate"])
num_df["engagement_per_word"] = safe_div(
    num_df["engagement_rate"],
    num_df["title_word_count"] + num_df["description_word_count"] + 1
)
num_df["sentiment_gap"] = (
    num_df["title_sentiment"].fillna(0) - num_df["description_sentiment"].fillna(0)
)

num_df = num_df.fillna(0.0)
NUMERIC_COLUMNS = list(num_df.columns)

# ============================================================
# 3. CATEGORICAL + TEXT FOR CATBOOST
# ============================================================
cat_cols = []
if "content_parent_category" in df.columns:
    cat_cols.append("content_parent_category")
if "youtube_channel" in df.columns:
    cat_cols.append("youtube_channel")
if "engagement_tier" in df.columns:
    cat_cols.append("engagement_tier")

text_feature_indices = []
cat_feature_indices = []

all_model_cols = NUMERIC_COLUMNS + cat_cols + text_cols

model_df = pd.concat(
    [num_df.reset_index(drop=True),
     df[cat_cols + text_cols].reset_index(drop=True)],
    axis=1
)

for i, col in enumerate(all_model_cols):
    if col in text_cols:
        text_feature_indices.append(i)
    elif col in cat_cols:
        cat_feature_indices.append(i)

# ============================================================
# 4. TARGET ENCODING
# ============================================================
le = LabelEncoder()
y = le.fit_transform(df["sector"].values)

# ============================================================
# 5. TRAIN/VAL SPLIT
# ============================================================
X = model_df.values
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# numeric block for XGBoost
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[:, :len(NUMERIC_COLUMNS)])
X_val_num = scaler.transform(X_val[:, :len(NUMERIC_COLUMNS)])

# ============================================================
# 6. FAST CATBOOST
# ============================================================
train_pool = Pool(
    X_train, y_train,
    cat_features=cat_feature_indices,
    text_features=text_feature_indices
)
val_pool = Pool(
    X_val, y_val,
    cat_features=cat_feature_indices,
    text_features=text_feature_indices
)

cat_model = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="TotalF1:average=Macro",
    iterations=400,
    learning_rate=0.08,
    depth=6,
    l2_leaf_reg=5.0,
    auto_class_weights="Balanced",
    random_seed=42,
    task_type="CPU",
    text_processing={
        "feature_processing": {
            "default": [
                "BoW:top_tokens_count=8000",
                "NaiveBayes",
                "BM25"
            ]
        }
    },
    verbose=50
)

cat_model.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True,
    early_stopping_rounds=40
)

# ============================================================
# 7. FAST XGBOOST
# ============================================================
xgb_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.08,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    num_class=len(le.classes_),
    reg_lambda=1.0,
    reg_alpha=0.1,
    tree_method="hist",
    eval_metric="mlogloss"
)

xgb_model.fit(
    X=X_train_num,
    y=y_train,
    eval_set=[(X_val_num, y_val)],
    verbose=50
)

# ============================================================
# 8. STACKED META‚ÄëLEARNER (NO CALIBRATION)
# ============================================================
cat_proba_train = cat_model.predict_proba(train_pool)
xgb_proba_train = xgb_model.predict_proba(X_train_num)
stack_train = np.hstack([cat_proba_train, xgb_proba_train])

cat_proba_val = cat_model.predict_proba(val_pool)
xgb_proba_val = xgb_model.predict_proba(X_val_num)
stack_val = np.hstack([cat_proba_val, xgb_proba_val])

meta_learner = LogisticRegression(
    C=2.0,
    multi_class="multinomial",
    max_iter=200,
    n_jobs=-1
)
meta_learner.fit(stack_train, y_train)

y_val_pred = meta_learner.predict(stack_val)
y_val_proba = meta_learner.predict_proba(stack_val)

# ============================================================
# 9. ACCURACY CHECKS
# ============================================================
print("=== Individual models ===")
print("CatBoost  acc:", accuracy_score(y_val, np.argmax(cat_proba_val, axis=1)))
print("XGBoost   acc:", accuracy_score(y_val, np.argmax(xgb_proba_val, axis=1)))

print("\n=== Stacked model (recommended) ===")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Macro F1:", f1_score(y_val, y_val_pred, average="macro"))
print("Log loss:", log_loss(y_val, y_val_proba))
print(classification_report(y_val, y_val_pred, target_names=le.classes_))

val_df = pd.DataFrame({
    "sector_true": le.inverse_transform(y_val),
    "sector_pred": le.inverse_transform(y_val_pred)
})
val_df["correct"] = (val_df["sector_true"] == val_df["sector_pred"]).astype(int)
sector_perf = (
    val_df.groupby("sector_true")["correct"]
    .agg(["mean", "count"])
    .rename(columns={"mean": "accuracy", "count": "support"})
    .reset_index()
)
print("\n=== Per‚Äësector accuracy (stacked) ===")
print(sector_perf)

# ============================================================
# 10. SAVE ARTIFACTS
# ============================================================
joblib.dump(le, "sector_label_encoder.joblib")
joblib.dump(scaler, "xgb_numeric_scaler.joblib")
joblib.dump(cat_model, "sector_catboost_native.joblib")
joblib.dump(xgb_model, "sector_xgb_numeric.joblib")
joblib.dump(meta_learner, "sector_stack_meta_lr.joblib")
sector_perf.to_csv("sector_stack_validation_metrics.csv", index=False)


0:	learn: 0.9984380	test: 1.0000000	best: 1.0000000 (0)	total: 4.6s	remaining: 30m 35s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.
[0]	validation_0-mlogloss:1.77580
[50]	validation_0-mlogloss:1.59916
[100]	validation_0-mlogloss:1.58896
[150]	validation_0-mlogloss:1.58972
[200]	validation_0-mlogloss:1.59242
[249]	validation_0-mlogloss:1.59855
=== Individual models ===
CatBoost  acc: 1.0
XGBoost   acc: 0.35083333333333333

=== Stacked model (recommended) ===
Accuracy: 0.6483333333333333
Macro F1: 0.6471569307503162
Log loss: 0.9390782715068235
                      precision    recall  f1-score   support

           Education       0.66      0.55      0.60       400
       Entertainment       0.58      0.61      0.59       400
           Lifestyle       0.61      0.66      0.63       400
     News & Politics       0.68      0.74      0.70       400
Science & Technology       0.72      0.74      0.73       400


In [5]:
# train_metric_models_stacked_clean.py

import pandas as pd
import numpy as np
import joblib
from textblob import TextBlob
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ----------------------------------------------------------
# 1. Load data
# ----------------------------------------------------------

df = pd.read_csv("YouTube_Engineered_Features_Final.csv").reset_index(drop=True)

# we need these ONLY as labels (not as features)
target_cols = [
    "youtube_view_count",
    "youtube_like_count",
    "youtube_comment_count",
    "engagement_rate",
]

# inputs for feature building
input_cols = [
    "youtube_title",
    "youtube_description",
    "duration_seconds",
    "youtube_channel_follower_count",
]

df = df.dropna(subset=target_cols + input_cols).reset_index(drop=True)

# ----------------------------------------------------------
# 2. Build model features from text + basic stats
# ----------------------------------------------------------

def extract_sentiment(text: str):
    blob = TextBlob(str(text))
    return blob.sentiment.polarity, blob.sentiment.subjectivity

def build_eng_features_row(row):
    title = str(row["youtube_title"])
    desc = str(row["youtube_description"])
    duration = float(row["duration_seconds"])
    followers = float(row["youtube_channel_follower_count"])

    title_len = len(title)
    desc_len = len(desc)
    title_words = len(title.split())
    desc_words = len(desc.split())

    t_pol, t_sub = extract_sentiment(title)
    d_pol, d_sub = extract_sentiment(desc)
    c_pol, c_sub = extract_sentiment(title + " " + desc)

    return {
        "title_length_chars": title_len,
        "description_length_chars": desc_len,
        "title_word_count": title_words,
        "description_word_count": desc_words,
        "title_sentiment": t_pol,
        "title_subjectivity": t_sub,
        "description_sentiment": d_pol,
        "description_subjectivity": d_sub,
        "combined_sentiment": c_pol,
        "combined_subjectivity": c_sub,
        "duration_seconds": duration,
        "youtube_channel_follower_count": followers,
    }

ENG_FEATURE_COLS = [
    "title_length_chars",
    "description_length_chars",
    "title_word_count",
    "description_word_count",
    "title_sentiment",
    "title_subjectivity",
    "description_sentiment",
    "description_subjectivity",
    "combined_sentiment",
    "combined_subjectivity",
    "duration_seconds",
    "youtube_channel_follower_count",
]

print("Building feature frame...")
feat_rows = []
for _, row in tqdm(df[input_cols + target_cols].iterrows(), total=len(df)):
    feat_rows.append(build_eng_features_row(row))

X_raw = pd.DataFrame(feat_rows)[ENG_FEATURE_COLS]

# ----------------------------------------------------------
# 3. Targets (labels) and transforms
# ----------------------------------------------------------

y_views_raw = df["youtube_view_count"].astype(float).values
y_likes_raw = df["youtube_like_count"].astype(float).values
y_comments_raw = df["youtube_comment_count"].astype(float).values
y_eng_raw = df["engagement_rate"].astype(float).values

# clip extreme outliers at 99.9% quantile
for arr in (y_views_raw, y_likes_raw, y_comments_raw):
    hi = np.quantile(arr, 0.999)
    arr[arr > hi] = hi

# log1p transform for count targets (better behaved) [web:183][web:186]
y_views = np.log1p(y_views_raw)
y_likes = np.log1p(y_likes_raw)
y_comments = np.log1p(y_comments_raw)
y_eng = y_eng_raw

# ----------------------------------------------------------
# 4. Train/validation split
# ----------------------------------------------------------

(
    X_train_raw, X_val_raw,
    yv_tr, yv_val,
    yl_tr, yl_val,
    yc_tr, yc_val,
    ye_tr, ye_val,
) = train_test_split(
    X_raw,
    y_views,
    y_likes,
    y_comments,
    y_eng,
    test_size=0.2,
    random_state=42,
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# ----------------------------------------------------------
# 5. Define models
# ----------------------------------------------------------

rf_params = dict(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=3,
    n_jobs=-1,
    random_state=42,
)

gb_params = dict(
    n_estimators=350,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    random_state=42,
)

def make_stack():
    return StackingRegressor(
        estimators=[
            ("rf", RandomForestRegressor(**rf_params)),
            ("gb", GradientBoostingRegressor(**gb_params)),
        ],
        final_estimator=GradientBoostingRegressor(
            n_estimators=200, learning_rate=0.05, max_depth=2, random_state=42
        ),
        n_jobs=-1,
    )

views_model = make_stack()
likes_model = make_stack()
comments_model = make_stack()
eng_model = GradientBoostingRegressor(**gb_params)

# ----------------------------------------------------------
# 6. Train with simple progress messages
# ----------------------------------------------------------

print("\nTraining models...")
for name, model, y_tr in [
    ("Views", views_model, yv_tr),
    ("Likes", likes_model, yl_tr),
    ("Comments", comments_model, yc_tr),
    ("Engagement rate", eng_model, ye_tr),
]:
    print(f"- Fitting {name} model...")
    model.fit(X_train, y_tr)

# ----------------------------------------------------------
# 7. Accuracy checks (validation metrics)
# ----------------------------------------------------------

def print_reg_metrics(name, y_true_log, y_pred_log, is_log_target=True):
    if is_log_target:
        y_true = np.expm1(y_true_log)
        y_pred = np.expm1(y_pred_log)
    else:
        y_true = y_true_log
        y_pred = y_pred_log

    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)

    print(f"\n=== {name} ===")
    print(f"MAE  : {mae:.3f}")
    print(f"RMSE : {rmse:.3f}")
    print(f"R^2  : {r2:.3f}")

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

yv_pred_log = views_model.predict(X_val)
yl_pred_log = likes_model.predict(X_val)
yc_pred_log = comments_model.predict(X_val)
ye_pred = eng_model.predict(X_val)

print_reg_metrics("Views (log target, stacked)", yv_val, yv_pred_log, is_log_target=True)
print_reg_metrics("Likes (log target, stacked)", yl_val, yl_pred_log, is_log_target=True)
print_reg_metrics("Comments (log target, stacked)", yc_val, yc_pred_log, is_log_target=True)

print("\n=== Engagement rate (GBM) ===")
mae_eng = mean_absolute_error(ye_val, ye_pred)
rmse_eng = mean_squared_error(ye_val, ye_pred, squared=False)
r2_eng = r2_score(ye_val, ye_pred)
print(f"MAE  : {mae_eng:.4f}")
print(f"RMSE : {rmse_eng:.4f}")
print(f"R^2  : {r2_eng:.3f}")

# ----------------------------------------------------------
# 8. Save artifacts for Streamlit
# ----------------------------------------------------------

joblib.dump(scaler, "scaler_engagement_sentiment_v2.pkl")
joblib.dump(views_model, "model_views_regressor_stacked_log.pkl")
joblib.dump(likes_model, "model_likes_regressor_stacked_log.pkl")
joblib.dump(comments_model, "model_comments_regressor_stacked_log.pkl")
joblib.dump(eng_model, "model_engagement_rate_regressor_gb_v2.pkl")


Building feature frame...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11481/11481 [00:32<00:00, 352.95it/s]



Training models...
- Fitting Views model...
- Fitting Likes model...
- Fitting Comments model...
- Fitting Engagement rate model...





=== Views (log target, stacked) ===
MAE  : 246576.523
RMSE : 1195785.536
R^2  : 0.050

=== Likes (log target, stacked) ===
MAE  : 6070.811
RMSE : 36042.928
R^2  : 0.028

=== Comments (log target, stacked) ===
MAE  : 311.227
RMSE : 1499.151
R^2  : 0.103

=== Engagement rate (GBM) ===
MAE  : 0.0338
RMSE : 0.0702
R^2  : -0.073


['model_engagement_rate_regressor_gb_v2.pkl']

In [6]:
# train_hashtag_suggestion_fast.py

import ast
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, jaccard_score

# -----------------------------
# 1. CONFIG
# -----------------------------
MAX_TAGS = 1000       # keep at most this many hashtags
MIN_TAG_COUNT = 30    # minimum frequency
RANDOM_STATE = 42

# -----------------------------
# 2. LOAD DATA
# -----------------------------
df = pd.read_csv("YouTube_Engineered_Features_Final.csv")

# must have tags and text
df = df.dropna(subset=["youtube_tags", "youtube_title", "youtube_description"]).reset_index(drop=True)

def parse_tags(x):
    if isinstance(x, list):
        return [str(t).strip() for t in x if str(t).strip()]
    try:
        v = ast.literal_eval(str(x))
        if isinstance(v, list):
            return [str(t).strip() for t in v if str(t).strip()]
    except Exception:
        pass
    return []

df["tag_list"] = df["youtube_tags"].apply(parse_tags)
df = df[df["tag_list"].map(len) > 0].reset_index(drop=True)

# -----------------------------
# 3. BUILD COMBINED TEXT INPUT
# -----------------------------
def stringify_list_col(x):
    if isinstance(x, list):
        return " ".join(str(t).strip() for t in x if str(t).strip())
    try:
        v = ast.literal_eval(str(x))
        if isinstance(v, list):
            return " ".join(str(t).strip() for t in v if str(t).strip())
    except Exception:
        pass
    return str(x).strip()

title = df["youtube_title"].fillna("").astype(str)
desc = df["youtube_description"].fillna("").astype(str)

meta_parts = []
for col in ["content_fine_category", "youtube_categories", "meta_keywords"]:
    if col in df.columns:
        if col == "youtube_categories" or col == "meta_keywords":
            meta_parts.append(df[col].apply(stringify_list_col).fillna("").astype(str))
        else:
            meta_parts.append(df[col].fillna("").astype(str))

if meta_parts:
    meta_text = meta_parts[0]
    for s in meta_parts[1:]:
        meta_text = meta_text + " " + s
else:
    meta_text = pd.Series([""] * len(df))

hashtag_text = df["tag_list"].apply(lambda tags: " ".join(tags))

df["all_text"] = (
    title + " " +
    desc + " " +
    meta_text + " " +
    hashtag_text
).str.strip()

# -----------------------------
# 4. FILTER TO TOP HASHTAGS
# -----------------------------
mlb_full = MultiLabelBinarizer()
Y_full = mlb_full.fit_transform(df["tag_list"])
classes = mlb_full.classes_

tag_counts = Y_full.sum(axis=0)
tag_info = list(zip(classes, tag_counts))
tag_info.sort(key=lambda x: x[1], reverse=True)

kept_tags = [t for t, c in tag_info if c >= MIN_TAG_COUNT][:MAX_TAGS]
print("Total unique tags:", len(classes))
print("Tags kept:", len(kept_tags))

mlb = MultiLabelBinarizer(classes=kept_tags)
Y = mlb.fit_transform(df["tag_list"])

row_mask = Y.sum(axis=1) > 0
df = df[row_mask].reset_index(drop=True)
Y = Y[row_mask]

X_text = df["all_text"].fillna("").astype(str)

# -----------------------------
# 5. TRAIN / VALIDATION SPLIT
# -----------------------------
X_train_text, X_val_text, Y_train, Y_val = train_test_split(
    X_text, Y, test_size=0.2, random_state=RANDOM_STATE
)

# -----------------------------
# 6. TEXT VECTORIZER + MODEL
# -----------------------------
# Fast, high‚Äëquality TF‚ÄëIDF for titles/descriptions/metadata [web:213]
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
)

print("Fitting TF‚ÄëIDF...")
X_train = tfidf.fit_transform(X_train_text)
X_val = tfidf.transform(X_val_text)

# One‚Äëvs‚ÄëRest Logistic Regression for multilabel hashtags [web:201][web:203]
base_lr = LogisticRegression(
    solver="saga",
    penalty="l2",
    C=2.0,
    max_iter=200,
    n_jobs=-1,
)

clf = OneVsRestClassifier(base_lr, n_jobs=-1)

print("Training hashtag model...")
clf.fit(X_train, Y_train)

# -----------------------------
# 7. EVALUATION
# -----------------------------
# probabilities -> threshold 0.3 for each tag independently
Y_val_proba = clf.predict_proba(X_val)
Y_val_pred = (Y_val_proba >= 0.3).astype(int)

micro_f1 = f1_score(Y_val, Y_val_pred, average="micro")
macro_f1 = f1_score(Y_val, Y_val_pred, average="macro")
jacc = jaccard_score(Y_val, Y_val_pred, average="samples")

print("=== HASHTAG MODEL METRICS ===")
print(f"Micro F1  : {micro_f1:.4f}")
print(f"Macro F1  : {macro_f1:.4f}")
print(f"Jaccard@0.3: {jacc:.4f}")

# -----------------------------
# 8. SAVE ARTIFACTS
# -----------------------------
joblib.dump(tfidf, "tfidf_hashtags.pkl")
joblib.dump(mlb, "mlb_hashtags.pkl")
joblib.dump(clf, "hashtag_model_ovr_lr.pkl")


Total unique tags: 104992
Tags kept: 197




Fitting TF‚ÄëIDF...
Training hashtag model...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


=== HASHTAG MODEL METRICS ===
Micro F1  : 0.3306
Macro F1  : 0.2680
Jaccard@0.3: 0.1831


['hashtag_model_ovr_lr.pkl']

In [7]:
# evaluate_hashtag_model.py

import ast
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, jaccard_score

# -----------------------------
# 1. Load data and model artifacts
# -----------------------------
df = pd.read_csv("YouTube_Engineered_Features_Final.csv").reset_index(drop=True)

tfidf = joblib.load("tfidf_hashtags.pkl")
mlb = joblib.load("mlb_hashtags.pkl")
clf = joblib.load("hashtag_model_ovr_lr.pkl")

# -----------------------------
# 2. Prepare tags and text (same as training)
# -----------------------------
def parse_tags(x):
    if isinstance(x, list):
        return [str(t).strip() for t in x if str(t).strip()]
    try:
        v = ast.literal_eval(str(x))
        if isinstance(v, list):
            return [str(t).strip() for t in v if str(t).strip()]
    except Exception:
        pass
    return []

def stringify_list_col(x):
    if isinstance(x, list):
        return " ".join(str(t).strip() for t in x if str(t).strip())
    try:
        v = ast.literal_eval(str(x))
        if isinstance(v, list):
            return " ".join(str(t).strip() for t in v if str(t).strip())
    except Exception:
        pass
    return str(x).strip()

df = df.dropna(subset=["youtube_tags", "youtube_title", "youtube_description"]).reset_index(drop=True)
df["tag_list"] = df["youtube_tags"].apply(parse_tags)
df = df[df["tag_list"].map(len) > 0].reset_index(drop=True)

title = df["youtube_title"].fillna("").astype(str)
desc = df["youtube_description"].fillna("").astype(str)

meta_parts = []
for col in ["content_fine_category", "youtube_categories", "meta_keywords"]:
    if col in df.columns:
        if col in ["youtube_categories", "meta_keywords"]:
            meta_parts.append(df[col].apply(stringify_list_col).fillna("").astype(str))
        else:
            meta_parts.append(df[col].fillna("").astype(str))

if meta_parts:
    meta_text = meta_parts[0]
    for s in meta_parts[1:]:
        meta_text = meta_text + " " + s
else:
    meta_text = pd.Series([""] * len(df))

hashtag_text = df["tag_list"].apply(lambda tags: " ".join(tags))

df["all_text"] = (
    title + " " +
    desc + " " +
    meta_text + " " +
    hashtag_text
).str.strip()

X_text = df["all_text"].fillna("").astype(str)

# restrict labels to tags known to mlb (same as training)
Y = MultiLabelBinarizer(classes=mlb.classes_).fit_transform(df["tag_list"])

# remove rows with no kept tags
row_mask = Y.sum(axis=1) > 0
X_text = X_text[row_mask]
Y = Y[row_mask]

# -----------------------------
# 3. Vectorize and predict
# -----------------------------
X = tfidf.transform(X_text)

Y_proba = clf.predict_proba(X)   # shape (n_samples, n_tags)

def evaluate_at_threshold(th):
    Y_pred = (Y_proba >= th).astype(int)
    micro_f1 = f1_score(Y, Y_pred, average="micro")
    macro_f1 = f1_score(Y, Y_pred, average="macro")
    jacc = jaccard_score(Y, Y_pred, average="samples")
    print(f"\n=== Threshold = {th:.2f} ===")
    print(f"Micro F1   : {micro_f1:.4f}")
    print(f"Macro F1   : {macro_f1:.4f}")
    print(f"Jaccard    : {jacc:.4f}")

evaluate_at_threshold(0.2)
evaluate_at_threshold(0.3)





=== Threshold = 0.20 ===
Micro F1   : 0.6406
Macro F1   : 0.5697
Jaccard    : 0.4627

=== Threshold = 0.30 ===
Micro F1   : 0.5046
Macro F1   : 0.4289
Jaccard    : 0.3268
