In [3]:
# train_sector_model.py  (no channel, no fine/parent/youtube_categories)

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from catboost import CatBoostClassifier, Pool
import joblib

# ============================================================
# 1. LOAD DATA
# ============================================================

# Read the engineered feature table and keep only rows that carry a sector
# label (the prediction target); the index is rebuilt so it stays contiguous.
df = (
    pd.read_csv("YouTube_Engineered_Features_Final.csv")
    .dropna(subset=["sector"])
    .reset_index(drop=True)
)

# ------------------------------------------------------------
# TEXT FEATURES
# ------------------------------------------------------------
TEXT_COLS = ["youtube_title", "youtube_description"]
# Missing text becomes the empty string and every value is coerced to str.
df[TEXT_COLS] = df[TEXT_COLS].fillna("").astype(str)

# ------------------------------------------------------------
# NO CATEGORICAL FEATURES (we exclude youtube_channel, fine/parent/etc.)
# ------------------------------------------------------------
CAT_COLS: list[str] = []

# ------------------------------------------------------------
# NUMERIC FEATURES (safe, non-leaky)
# ------------------------------------------------------------
# Candidate numeric inputs. This is a wish-list: names that are absent from
# the CSV are filtered out below instead of raising.
# NOTE(review): the engagement-derived columns (views/likes/comments per
# subscriber, *_rate, ...) look like post-publication metrics — confirm they
# are available wherever this model is used for inference.
NUM_COLS = [
    "title_length_chars",
    "title_word_count",
    "description_length_chars",
    "description_word_count",
    "title_sentiment",
    "title_subjectivity",
    "description_sentiment",
    "description_subjectivity",
    "meta_description_sentiment",
    "duration_seconds",
    "youtube_channel_follower_count",
    "audience_engagement_index",
    "video_completeness_score",
    "tts_quality_indicator",
    "production_polish_score",
    "views_per_subscriber",
    "likes_per_subscriber",
    "comments_per_subscriber",
    "engagement_rate",
    "like_rate",
    "comment_rate",
    "like_to_comment_ratio",
]

# Keep only the columns actually present in df. Missing names are dropped
# silently, so the final feature set depends on the CSV that was loaded.
NUM_COLS = [c for c in NUM_COLS if c in df.columns]
# Missing numeric values are imputed with 0.0 and everything is cast to float.
df[NUM_COLS] = df[NUM_COLS].fillna(0.0).astype(float)

# ============================================================
# 2. BUILD TRAINING FRAME
# ============================================================

def build_sector_training_frame(df: pd.DataFrame):
    """Assemble the feature matrix and the encoded target for the sector model.

    The feature frame holds the columns named in the module-level
    ``TEXT_COLS`` followed by those in ``NUM_COLS``, copied unchanged from
    ``df`` and sharing its index.

    Args:
        df: Frame with a ``sector`` column plus every column listed in
            ``TEXT_COLS`` and ``NUM_COLS``.

    Returns:
        Tuple ``(X, y, feature_cols, text_idx, cat_idx, le_sector, encoders)``
        where ``y`` is the label-encoded sector, ``feature_cols`` is the
        column order of ``X``, ``text_idx`` are the positions of the text
        columns in that order, ``cat_idx`` is always an empty list,
        ``le_sector`` is the fitted ``LabelEncoder`` and ``encoders`` is an
        empty dict kept for API symmetry.
    """
    # Target: sector names (as strings) mapped to integer class ids.
    le_sector = LabelEncoder()
    y = le_sector.fit_transform(df["sector"].astype(str))

    # Text columns first, numeric columns after, all in a single frame.
    ordered_names = [*TEXT_COLS, *NUM_COLS]
    X = pd.DataFrame({name: df[name] for name in ordered_names}, index=df.index)

    feature_cols = list(X.columns)

    # Positional indices of the text columns, for CatBoost's text_features.
    position_of = {name: pos for pos, name in enumerate(feature_cols)}
    text_idx = [position_of[name] for name in TEXT_COLS]

    cat_idx: list[int] = []  # no categorical features
    encoders = {}  # kept for API symmetry; empty because no CAT_COLS

    return X, y, feature_cols, text_idx, cat_idx, le_sector, encoders


# Build features/target once for the whole labelled frame.
X_sector, y_sector, feature_cols_sector, text_idx, cat_idx, le_sector, cat_encoders = \
    build_sector_training_frame(df)

# ============================================================
# 3. TRAIN / VALIDATION SPLIT
# ============================================================

# 80/20 split, stratified on the encoded sector so both parts keep the same
# class mix; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_sector,
    y_sector,
    test_size=0.2,
    random_state=42,
    stratify=y_sector,
)

# text_features holds the positional indices of the text columns.
train_pool = Pool(X_train, y_train, text_features=text_idx)
val_pool = Pool(X_val, y_val, text_features=text_idx)

# ============================================================
# 4. TRAIN CATBOOST (SECTOR PREDICTION)
# ============================================================

# Multiclass classifier scored on macro-F1, with balanced class weights.
sector_model = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="TotalF1:average=Macro",
    iterations=500,
    learning_rate=0.08,
    depth=6,
    auto_class_weights="Balanced",
    random_seed=42,
    # Text feature extractors configured under the "default" key:
    # bag-of-words (top 8000 tokens), NaiveBayes and BM25.
    text_processing={
        "feature_processing": {
            "default": [
                "BoW:top_tokens_count=8000",
                "NaiveBayes",
                "BM25",
            ]
        }
    },
    verbose=50,
)

# NOTE(review): val_pool drives both early stopping / best-iteration
# selection here and the metrics reported afterwards, so those metrics are
# somewhat optimistic; a separate held-out test split would avoid that.
sector_model.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True,
    early_stopping_rounds=50,
)

# ============================================================
# 5. ACCURACY CHECKS
# ============================================================

# Predicted class ids, flattened to 1-D and cast to int so they line up with
# the integer labels produced by the LabelEncoder.
y_val_pred = sector_model.predict(val_pool).astype(int).ravel()

acc = accuracy_score(y_val, y_val_pred)
macro_f1 = f1_score(y_val, y_val_pred, average="macro")

print("=== SECTOR MODEL METRICS (text + numeric only) ===")
print("Accuracy :", acc)
print("Macro F1 :", macro_f1)
print(classification_report(y_val, y_val_pred, target_names=le_sector.classes_))

# Per-sector breakdown using the original sector names.
val_df = pd.DataFrame({
    "sector_true": le_sector.inverse_transform(y_val),
    "sector_pred": le_sector.inverse_transform(y_val_pred),
})
val_df["correct"] = (val_df["sector_true"] == val_df["sector_pred"]).astype(int)
# Grouping by the TRUE sector: "accuracy" here is the share of each sector's
# validation rows that were predicted correctly (i.e. per-class recall),
# and "support" is the number of validation rows in that sector.
sector_perf = (
    val_df.groupby("sector_true")["correct"]
    .agg(["mean", "count"])
    .rename(columns={"mean": "accuracy", "count": "support"})
    .reset_index()
)
print("\nPer‑sector accuracy:")
print(sector_perf)

sector_perf.to_csv("sector_validation_metrics.csv", index=False)

# ============================================================
# 6. SAVE ARTIFACTS
# ============================================================

# Everything is written to the current working directory.
# NOTE(review): the "extra text" sector cell later in this file writes to the
# same file names, so running it replaces these artifacts.
sector_model.save_model("sector_model.cbm")
joblib.dump(le_sector, "label_encoder_sector.pkl")
joblib.dump(feature_cols_sector, "sector_feature_cols.joblib")
joblib.dump(text_idx, "sector_text_idx.joblib")
joblib.dump(cat_idx, "sector_cat_idx.joblib")          # empty list
joblib.dump(cat_encoders, "sector_cat_encoders.joblib")  # empty dict


0:	learn: 0.2531741	test: 0.2587594	best: 0.2587594 (0)	total: 3.69s	remaining: 30m 41s
50:	learn: 0.4586093	test: 0.4587602	best: 0.4598703 (49)	total: 3m 27s	remaining: 30m 30s
100:	learn: 0.5097587	test: 0.5047481	best: 0.5047481 (100)	total: 7m 2s	remaining: 27m 49s
150:	learn: 0.5516949	test: 0.5381790	best: 0.5381790 (150)	total: 10m 43s	remaining: 24m 47s
200:	learn: 0.5823063	test: 0.5539518	best: 0.5539518 (200)	total: 14m 13s	remaining: 21m 9s
250:	learn: 0.6057994	test: 0.5683452	best: 0.5689522 (245)	total: 17m 32s	remaining: 17m 23s
300:	learn: 0.6293990	test: 0.5775385	best: 0.5775385 (300)	total: 20m 49s	remaining: 13m 46s
350:	learn: 0.6408540	test: 0.5817196	best: 0.5834477 (349)	total: 24m 8s	remaining: 10m 14s
400:	learn: 0.6504682	test: 0.5888616	best: 0.5915244 (391)	total: 27m 37s	remaining: 6m 49s
450:	learn: 0.6618873	test: 0.5944585	best: 0.5951202 (441)	total: 31m 38s	remaining: 3m 26s
499:	learn: 0.6742555	test: 0.5952554	best: 0.5952554 (499)	total: 35m 22s	

['sector_cat_encoders.joblib']

In [4]:
# train_sector_model_with_extra_text.py

import ast
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from catboost import CatBoostClassifier, Pool
import joblib

# ============================================================
# 1. LOAD DATA
# ============================================================

# Load the engineered features and drop rows without a sector label; the
# index is rebuilt so it stays contiguous.
df = (
    pd.read_csv("YouTube_Engineered_Features_Final.csv")
    .dropna(subset=["sector"])
    .reset_index(drop=True)
)

# ---------- helper to stringify list-like columns ----------
def stringify_list_col(x):
    """Flatten a list-like cell into a single space-separated string.

    Accepts either a real ``list`` or the textual form of one (for example
    ``"['Education', 'Science']"`` as read back from a CSV). Items are
    converted with ``str``, stripped, and empty items are skipped. Any other
    value is returned as its stripped string form.

    Args:
        x: Cell value: a list, a string holding a list literal, a plain
            string, ``None`` or NaN.

    Returns:
        The joined items, the stripped string, or ``""`` for missing values.
    """
    # Missing cells used to come back as the literal text "nan" / "None",
    # which then leaked into the text feature as a spurious token.
    if x is None or (isinstance(x, float) and x != x):
        return ""

    if isinstance(x, list):
        items = x
    else:
        raw = str(x)
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treat the cell as plain text.
            parsed = None
        if not isinstance(parsed, list):
            return raw.strip()
        items = parsed

    stripped = (str(item).strip() for item in items)
    return " ".join(token for token in stripped if token)

# ============================================================
# 2. BUILD TEXT FEATURE WITH EXTRA SIGNAL
# ============================================================

# base text
# Title and description: missing values become "" and everything is str.
df["youtube_title"] = df["youtube_title"].fillna("").astype(str)
df["youtube_description"] = df["youtube_description"].fillna("").astype(str)

# convert categories to plain text
# NOTE(review): the first sector cell in this file deliberately excludes
# fine/parent/youtube_categories. If content_fine_category is derived from
# (or is a sub-level of) the sector taxonomy, feeding it in here is target
# leakage and would explain a near-perfect validation score — confirm these
# columns are genuinely available at inference time before trusting metrics.
df["fine_cat_text"] = df["content_fine_category"].fillna("").astype(str)
# youtube_categories is passed through stringify_list_col without a prior
# fillna, so missing cells are handled entirely by that helper.
df["yt_cat_text"] = df["youtube_categories"].apply(stringify_list_col)

# combined text feature the model will use: the four pieces joined by
# single spaces, in the order title, description, fine category, categories
df["all_text"] = (
    df["youtube_title"] + " " +
    df["youtube_description"] + " " +
    df["fine_cat_text"] + " " +
    df["yt_cat_text"]
)

# ============================================================
# 3. NUMERIC FEATURES (safe, non-leaky)
# ============================================================

# Candidate numeric inputs; names missing from the CSV are filtered out
# below rather than raising.
NUM_COLS = [
    "title_length_chars",
    "title_word_count",
    "description_length_chars",
    "description_word_count",
    "title_sentiment",
    "title_subjectivity",
    "description_sentiment",
    "description_subjectivity",
    "meta_description_sentiment",
    "duration_seconds",
    "youtube_channel_follower_count",
    "audience_engagement_index",
    "video_completeness_score",
    "tts_quality_indicator",
    "production_polish_score",
    "views_per_subscriber",
    "likes_per_subscriber",
    "comments_per_subscriber",
    "engagement_rate",
    "like_rate",
    "comment_rate",
    "like_to_comment_ratio",
]

# Keep only columns present in df (absent names are dropped silently).
NUM_COLS = [c for c in NUM_COLS if c in df.columns]
# Impute missing numeric values with 0.0 and cast everything to float.
df[NUM_COLS] = df[NUM_COLS].fillna(0.0).astype(float)

# ============================================================
# 4. BUILD TRAINING FRAME
# ============================================================

def build_sector_training_frame(df: pd.DataFrame):
    """Assemble the feature matrix and the encoded target for the sector model.

    The feature frame consists of the combined ``all_text`` column followed
    by the columns named in the module-level ``NUM_COLS``.

    Args:
        df: Frame with ``sector``, ``all_text`` and every ``NUM_COLS`` column.

    Returns:
        Tuple ``(X, y, feature_cols, text_idx, cat_idx, le_sector, encoders)``
        where ``y`` is the label-encoded sector, ``feature_cols`` is the
        column order of ``X``, ``text_idx`` is the position of ``all_text``,
        ``cat_idx`` is always an empty list, ``le_sector`` is the fitted
        ``LabelEncoder`` and ``encoders`` is an empty dict kept for
        consistency.
    """
    # Target: sector names (as strings) mapped to integer class ids.
    le_sector = LabelEncoder()
    y = le_sector.fit_transform(df["sector"].astype(str))

    # One text column up front, numeric columns after it.
    ordered_names = ["all_text", *NUM_COLS]
    X = pd.DataFrame({name: df[name] for name in ordered_names}, index=df.index)

    feature_cols = X.columns.tolist()

    # Position of the single text feature within the column order.
    text_idx = [feature_cols.index("all_text")]
    cat_idx: list[int] = []  # no explicit categorical features
    encoders = {}  # kept for consistency

    return X, y, feature_cols, text_idx, cat_idx, le_sector, encoders


# Build features/target once for the whole labelled frame.
X_sector, y_sector, feature_cols_sector, text_idx, cat_idx, le_sector, cat_encoders = \
    build_sector_training_frame(df)

# ============================================================
# 5. TRAIN / VALIDATION SPLIT
# ============================================================

# 80/20 split, stratified on the encoded sector; fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X_sector,
    y_sector,
    test_size=0.2,
    random_state=42,
    stratify=y_sector,
)

train_pool = Pool(
    X_train,
    label=y_train,
    text_features=text_idx,   # only all_text is text
)
val_pool = Pool(
    X_val,
    label=y_val,
    text_features=text_idx,
)

# ============================================================
# 6. TRAIN CATBOOST SECTOR MODEL
# ============================================================

# Multiclass classifier scored on macro-F1, with balanced class weights.
sector_model = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="TotalF1:average=Macro",
    iterations=500,
    learning_rate=0.08,
    depth=6,
    auto_class_weights="Balanced",
    random_seed=42,
    # Text feature extractors configured under the "default" key:
    # bag-of-words (top 8000 tokens), NaiveBayes and BM25.
    text_processing={
        "feature_processing": {
            "default": [
                "BoW:top_tokens_count=8000",
                "NaiveBayes",
                "BM25",
            ]
        }
    },
    verbose=50,
)

# NOTE(review): val_pool is used for early stopping / best-iteration
# selection and again for the reported metrics, which makes them optimistic.
sector_model.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True,
    early_stopping_rounds=50,
)

# ============================================================
# 7. ACCURACY CHECKS
# ============================================================

# Predicted class ids, flattened to 1-D and cast to int so they line up with
# the integer labels produced by the LabelEncoder.
y_val_pred = sector_model.predict(val_pool).astype(int).ravel()

acc = accuracy_score(y_val, y_val_pred)
macro_f1 = f1_score(y_val, y_val_pred, average="macro")

print("=== SECTOR MODEL METRICS (text + extra text + numeric) ===")
print("Accuracy :", acc)
print("Macro F1 :", macro_f1)
print(classification_report(y_val, y_val_pred, target_names=le_sector.classes_))

# Per-sector breakdown using the original sector names.
val_df = pd.DataFrame({
    "sector_true": le_sector.inverse_transform(y_val),
    "sector_pred": le_sector.inverse_transform(y_val_pred),
})
val_df["correct"] = (val_df["sector_true"] == val_df["sector_pred"]).astype(int)
# Grouped by TRUE sector, so "accuracy" is per-class recall and "support"
# is the number of validation rows in that sector.
sector_perf = (
    val_df.groupby("sector_true")["correct"]
    .agg(["mean", "count"])
    .rename(columns={"mean": "accuracy", "count": "support"})
    .reset_index()
)
print("\nPer‑sector accuracy:")
print(sector_perf)

sector_perf.to_csv("sector_validation_metrics.csv", index=False)

# ============================================================
# 8. SAVE ARTIFACTS
# ============================================================

# NOTE(review): these file names are identical to the ones written by the
# text+numeric sector cell earlier in this file, so this run overwrites them.
sector_model.save_model("sector_model.cbm")
joblib.dump(le_sector, "label_encoder_sector.pkl")
joblib.dump(feature_cols_sector, "sector_feature_cols.joblib")
joblib.dump(text_idx, "sector_text_idx.joblib")
joblib.dump(cat_idx, "sector_cat_idx.joblib")          # still empty
joblib.dump(cat_encoders, "sector_cat_encoders.joblib")  # still empty


0:	learn: 0.4461354	test: 0.4449311	best: 0.4449311 (0)	total: 2.16s	remaining: 17m 59s
50:	learn: 0.9333860	test: 0.9287363	best: 0.9287363 (50)	total: 2m 17s	remaining: 20m 10s
100:	learn: 0.9800791	test: 0.9791826	best: 0.9791826 (100)	total: 4m 33s	remaining: 18m 2s
150:	learn: 0.9898985	test: 0.9879350	best: 0.9879350 (150)	total: 9m 6s	remaining: 21m 4s
200:	learn: 0.9934448	test: 0.9921015	best: 0.9921015 (191)	total: 13m 14s	remaining: 19m 41s
250:	learn: 0.9937583	test: 0.9925219	best: 0.9925219 (216)	total: 17m 8s	remaining: 16m 59s
300:	learn: 0.9946939	test: 0.9937644	best: 0.9937644 (296)	total: 21m 10s	remaining: 13m 59s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9937644052
bestIteration = 296

Shrink model to first 297 iterations.
=== SECTOR MODEL METRICS (text + extra text + numeric) ===
Accuracy : 0.99375
Macro F1 : 0.9937644052241983
                      precision    recall  f1-score   support

           Education       1.00      0.99      0

['sector_cat_encoders.joblib']

In [5]:
# train_catboost_metrics_richer.py

import ast
import pandas as pd
import numpy as np
import joblib
from textblob import TextBlob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor, Pool

# ----------------------------------------------------------
# 1. Load data
# ----------------------------------------------------------

# Columns that must be non-null for a row to take part in training.
required_cols = [
    "youtube_title",
    "youtube_description",
    "duration_seconds",
    "youtube_channel_follower_count",
    "youtube_view_count",
    "youtube_like_count",
    "youtube_comment_count",
    "engagement_rate",
]

# Load, drop incomplete rows, and rebuild a contiguous index.
df = (
    pd.read_csv("YouTube_Engineered_Features_Final.csv")
    .dropna(subset=required_cols)
    .reset_index(drop=True)
)

# ----------------------------------------------------------
# 2. Helper functions for features
# ----------------------------------------------------------

# Lower-case call-to-action terms counted by keyword_features().
# They are matched as substrings, so shorter entries also hit inside longer
# ones (e.g. "sub" is counted again inside every "subscribe").
CALL_TO_ACTION_WORDS = [
    "subscribe", "sub", "like", "comment", "share",
    "watch", "click", "link", "join", "follow",
]

def extract_sentiment(text: str):
    """Return TextBlob's ``(polarity, subjectivity)`` for ``text``.

    The input is passed through ``str()`` first, so non-string values are
    accepted.
    """
    sentiment = TextBlob(str(text)).sentiment
    return sentiment.polarity, sentiment.subjectivity

def readability_features(text: str) -> tuple[float, float, float]:
    """Compute cheap readability statistics for ``text``.

    Words are whitespace-separated tokens (punctuation stays attached) and
    the sentence count is the number of ``.``, ``!`` and ``?`` characters,
    with a floor of one so the division below is always defined.

    Args:
        text: Text to analyse; may be empty.

    Returns:
        ``(avg_word_len, avg_sentence_len, flesch_proxy)`` where
        ``avg_word_len`` is characters per word (0.0 when there are no
        words), ``avg_sentence_len`` is words per sentence and
        ``flesch_proxy`` is a Flesch-style score.
    """
    words = text.split()
    n_words = len(words)
    avg_word_len = (sum(len(w) for w in words) / n_words) if n_words > 0 else 0.0
    n_sentences = max(text.count(".") + text.count("!") + text.count("?"), 1)
    avg_sentence_len = n_words / n_sentences
    # Flesch-like proxy: it borrows the general shape of the Flesch Reading
    # Ease formula but uses characters per word instead of syllables, so it
    # is a rough stand-in rather than the real score.
    flesch_proxy = 206.835 - 1.015 * avg_sentence_len - 0.84 * avg_word_len
    return avg_word_len, avg_sentence_len, flesch_proxy

def keyword_features(text: str):
    """Count call-to-action terms in ``text``.

    Matching is case-insensitive and substring-based: each entry of
    ``CALL_TO_ACTION_WORDS`` is counted wherever it occurs, including inside
    longer words, and overlapping entries are each counted.

    Returns:
        ``(count, present)`` where ``count`` is the total number of hits and
        ``present`` is 1 if there was at least one hit, else 0.
    """
    lowered = text.lower()
    total_hits = 0
    for term in CALL_TO_ACTION_WORDS:
        total_hits += lowered.count(term)
    return total_hits, int(total_hits > 0)

# ----------------------------------------------------------
# 3. Build feature frame
# ----------------------------------------------------------

print("Building feature frame...")

# One dict per row: feat_rows becomes the feature frame X, targets_rows the
# target frame T. Both lists are filled in the same loop, so rows line up.
feat_rows = []
targets_rows = []

for _, row in tqdm(df[required_cols].iterrows(), total=len(df)):
    title = str(row["youtube_title"])
    desc = str(row["youtube_description"])
    full_text = f"{title} {desc}"

    # sentiment (title, description, and both combined)
    t_pol, t_sub = extract_sentiment(title)
    d_pol, d_sub = extract_sentiment(desc)
    c_pol, c_sub = extract_sentiment(full_text)

    # readability
    avg_word_len, avg_sentence_len, flesch_proxy = readability_features(full_text)

    # call-to-action features
    cta_count, cta_present = keyword_features(full_text)

    duration = float(row["duration_seconds"])
    subs = float(row["youtube_channel_follower_count"])
    views = float(row["youtube_view_count"])
    likes = float(row["youtube_like_count"])
    comments = float(row["youtube_comment_count"])  # read but not used below
    eng_rate = float(row["engagement_rate"])

    # Ratios used only as regression TARGETS (never as features); the
    # max(..., 1.0) floor avoids division by zero.
    views_per_sub = views / max(subs, 1.0)
    likes_per_view = likes / max(views, 1.0)

    # Features: duration, subscriber count and text-derived statistics only.
    feat_rows.append({
        "duration_seconds": duration,
        "subs": subs,
        "title_length_chars": len(title),
        "description_length_chars": len(desc),
        "title_word_count": len(title.split()),
        "description_word_count": len(desc.split()),
        "title_sentiment": t_pol,
        "title_subjectivity": t_sub,
        "description_sentiment": d_pol,
        "description_subjectivity": d_sub,
        "combined_sentiment": c_pol,
        "combined_subjectivity": c_sub,
        "avg_word_length": avg_word_len,
        "avg_sentence_length": avg_sentence_len,
        "flesch_proxy": flesch_proxy,
        "cta_word_count": cta_count,
        "cta_present": cta_present,
    })

    # Targets: the two ratios are stored as log1p(ratio); engagement_rate is
    # stored untransformed.
    targets_rows.append({
        "log_views_per_sub": np.log1p(views_per_sub),
        "log_likes_per_view": np.log1p(likes_per_view),
        "engagement_rate": eng_rate,
    })

X = pd.DataFrame(feat_rows)
T = pd.DataFrame(targets_rows)

# ----------------------------------------------------------
# 4. Train/validation split
# ----------------------------------------------------------

# Single 80/20 split shared by all three targets (X and T are split
# together, so each model sees the same train/validation rows).
X_train, X_val, T_train, T_val = train_test_split(
    X, T, test_size=0.2, random_state=42
)

# Pools for the first target; pools for the other targets are built later.
train_pool = Pool(X_train, label=T_train["log_views_per_sub"])
val_pool = Pool(X_val, label=T_val["log_views_per_sub"])

# ----------------------------------------------------------
# 5. Define and train CatBoost models
# ----------------------------------------------------------

def make_regressor():
    """Return a fresh, unfitted CatBoostRegressor with the shared settings.

    RMSE is used both as the loss and as the evaluation metric; training
    progress is logged every 50 iterations.
    """
    settings = {
        "loss_function": "RMSE",
        "eval_metric": "RMSE",
        "iterations": 400,
        "learning_rate": 0.05,
        "depth": 6,
        "random_seed": 42,
        "verbose": 50,
    }
    return CatBoostRegressor(**settings)

# Each model is fitted on the same train rows and validated on the same
# validation rows; use_best_model=True keeps the iteration with the best
# validation score (no early_stopping_rounds is set, so all iterations run).
print("\nTraining CatBoost for log_views_per_sub...")
model_views = make_regressor()
model_views.fit(train_pool, eval_set=val_pool, use_best_model=True)

print("\nTraining CatBoost for log_likes_per_view...")
train_pool_l = Pool(X_train, label=T_train["log_likes_per_view"])
val_pool_l = Pool(X_val, label=T_val["log_likes_per_view"])
model_likes = make_regressor()
model_likes.fit(train_pool_l, eval_set=val_pool_l, use_best_model=True)

print("\nTraining CatBoost for engagement_rate...")
train_pool_e = Pool(X_train, label=T_train["engagement_rate"])
val_pool_e = Pool(X_val, label=T_val["engagement_rate"])
# Same settings as make_regressor() except for 500 iterations instead of 400.
model_eng = CatBoostRegressor(
    loss_function="RMSE",
    eval_metric="RMSE",
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=50,
)
model_eng.fit(train_pool_e, eval_set=val_pool_e, use_best_model=True)

# ----------------------------------------------------------
# 6. Accuracy checks
# ----------------------------------------------------------

def print_reg_metrics_ratio(name, y_true_log, y_pred_log, scale_desc):
    """Print MAE, RMSE and R^2 for a log1p-scale target on its original scale.

    Both arrays are mapped back with ``expm1`` before scoring, so the
    reported errors are in units of the underlying ratio rather than its log.

    Args:
        name: Label printed in the report header.
        y_true_log: True target values on the ``log1p`` scale.
        y_pred_log: Predicted target values on the ``log1p`` scale.
        scale_desc: Short description of the scale, shown in the header.
    """
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)

    mae = mean_absolute_error(y_true, y_pred)
    # The ``squared=False`` shortcut was deprecated in scikit-learn 1.4 and
    # removed in later releases; the root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"\n=== {name} ({scale_desc}) ===")
    print(f"MAE  : {mae:.4f}")
    print(f"RMSE : {rmse:.4f}")
    print(f"R^2  : {r2:.3f}")

# Already imported at the top of this cell; repeating it here is harmless.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# views per sub (log1p-scale target, reported on the ratio scale)
y_v_true = T_val["log_views_per_sub"].values
y_v_pred = model_views.predict(X_val)
print_reg_metrics_ratio(
    "Views per subscriber",
    y_v_true,
    y_v_pred,
    "ratio target"
)

# likes per view (log1p-scale target, reported on the ratio scale)
y_l_true = T_val["log_likes_per_view"].values
y_l_pred = model_likes.predict(X_val)
print_reg_metrics_ratio(
    "Likes per view",
    y_l_true,
    y_l_pred,
    "ratio target"
)

# engagement rate: trained on the untransformed value, so no expm1
# back-transform is applied before scoring
y_e_true = T_val["engagement_rate"].values
y_e_pred = model_eng.predict(X_val)

mae_e = mean_absolute_error(y_e_true, y_e_pred)
# ``squared=False`` was deprecated in scikit-learn 1.4 and removed in later
# releases; taking the root of the MSE is equivalent and version-independent.
rmse_e = np.sqrt(mean_squared_error(y_e_true, y_e_pred))
r2_e = r2_score(y_e_true, y_e_pred)

print("\n=== Engagement rate (CatBoost) ===")
print(f"MAE  : {mae_e:.4f}")
print(f"RMSE : {rmse_e:.4f}")
print(f"R^2  : {r2_e:.3f}")

# ----------------------------------------------------------
# 7. Save models and feature list
# ----------------------------------------------------------

# The feature column order is saved alongside the models so inference code
# can rebuild the input frame in the same order.
model_views.save_model("cb_log_views_per_sub.cbm")
model_likes.save_model("cb_log_likes_per_view.cbm")
model_eng.save_model("cb_engagement_rate.cbm")
joblib.dump(list(X.columns), "metric_feature_cols.pkl")


Building feature frame...


100%|██████████| 11481/11481 [00:44<00:00, 260.72it/s]



Training CatBoost for log_views_per_sub...
0:	learn: 1.5426372	test: 1.5553494	best: 1.5553494 (0)	total: 8.12ms	remaining: 3.24s
50:	learn: 1.3544042	test: 1.3765340	best: 1.3765340 (50)	total: 374ms	remaining: 2.56s
100:	learn: 1.3240459	test: 1.3683984	best: 1.3683984 (100)	total: 764ms	remaining: 2.26s
150:	learn: 1.3024256	test: 1.3661057	best: 1.3658618 (147)	total: 1.1s	remaining: 1.82s
200:	learn: 1.2826009	test: 1.3658290	best: 1.3647044 (176)	total: 1.43s	remaining: 1.42s
250:	learn: 1.2603046	test: 1.3662387	best: 1.3647044 (176)	total: 1.73s	remaining: 1.03s
300:	learn: 1.2408680	test: 1.3666738	best: 1.3647044 (176)	total: 2.03s	remaining: 667ms
350:	learn: 1.2207824	test: 1.3668765	best: 1.3647044 (176)	total: 2.35s	remaining: 328ms
399:	learn: 1.2027819	test: 1.3681217	best: 1.3647044 (176)	total: 2.68s	remaining: 0us

bestTest = 1.364704434
bestIteration = 176

Shrink model to first 177 iterations.

Training CatBoost for log_likes_per_view...
0:	learn: 0.0485941	test: 



['metric_feature_cols.pkl']

In [7]:
# train_best_performance_tier_models.py

import pandas as pd
import numpy as np
from textblob import TextBlob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from catboost import CatBoostClassifier, Pool
import joblib

# ----------------------------------------------------------
# 1. Load data
# ----------------------------------------------------------

# Columns that must be non-null for a row to take part in training.
required_cols = [
    "youtube_title",
    "youtube_description",
    "duration_seconds",
    "youtube_channel_follower_count",
    "youtube_view_count",
    "youtube_like_count",
    "youtube_comment_count",
    "engagement_rate",
]

# Load, drop incomplete rows, and rebuild a contiguous index.
df = (
    pd.read_csv("YouTube_Engineered_Features_Final.csv")
    .dropna(subset=required_cols)
    .reset_index(drop=True)
)

# ----------------------------------------------------------
# 2. Feature helpers
# ----------------------------------------------------------

# Lower-case call-to-action terms counted by keyword_features().
# They are matched as substrings, so shorter entries also hit inside longer
# ones (e.g. "sub" is counted again inside every "subscribe").
CALL_TO_ACTION_WORDS = [
    "subscribe", "sub", "like", "comment", "share",
    "watch", "click", "link", "join", "follow",
]

def extract_sentiment(text: str):
    """Return TextBlob's ``(polarity, subjectivity)`` for ``text``.

    The input is passed through ``str()`` first, so non-string values are
    accepted.
    """
    sentiment = TextBlob(str(text)).sentiment
    return sentiment.polarity, sentiment.subjectivity

def readability_features(text: str):
    """Compute cheap readability statistics for ``text``.

    Words are whitespace-separated tokens; the sentence count is the number
    of ``.``, ``!`` and ``?`` characters, floored at one.

    Returns:
        ``(avg_word_len, avg_sentence_len, flesch_proxy)``: characters per
        word (0.0 for text without words), words per sentence, and a
        Flesch-style score derived from those two averages.
    """
    tokens = text.split()
    token_count = len(tokens)

    if token_count > 0:
        avg_word_len = sum(map(len, tokens)) / token_count
    else:
        avg_word_len = 0.0

    # At least one sentence is assumed so the division is always defined.
    terminators = text.count(".") + text.count("!") + text.count("?")
    sentence_count = terminators if terminators > 1 else 1
    avg_sentence_len = token_count / sentence_count

    flesch_proxy = 206.835 - 1.015 * avg_sentence_len - 0.84 * avg_word_len
    return avg_word_len, avg_sentence_len, flesch_proxy

def keyword_features(text: str):
    """Count call-to-action terms in ``text``.

    Matching is case-insensitive and substring-based: each entry of
    ``CALL_TO_ACTION_WORDS`` is counted wherever it occurs, including inside
    longer words, and overlapping entries are each counted.

    Returns:
        ``(count, present)`` where ``count`` is the total number of hits and
        ``present`` is 1 if there was at least one hit, else 0.
    """
    lowered = text.lower()
    total_hits = 0
    for term in CALL_TO_ACTION_WORDS:
        total_hits += lowered.count(term)
    return total_hits, int(total_hits > 0)

# ----------------------------------------------------------
# 3. Build features and targets
# ----------------------------------------------------------

print("Building feature and target frames...")

# One dict per row: feat_rows becomes the feature frame X, target_rows the
# target frame T. Both lists are filled in the same loop, so rows line up.
feat_rows = []
target_rows = []

for _, row in tqdm(df[required_cols].iterrows(), total=len(df)):
    title = str(row["youtube_title"])
    desc = str(row["youtube_description"])
    full_text = f"{title} {desc}"

    duration = float(row["duration_seconds"])
    subs = float(row["youtube_channel_follower_count"])
    views = float(row["youtube_view_count"])
    likes = float(row["youtube_like_count"])
    comments = float(row["youtube_comment_count"])  # read but not used below
    eng_rate = float(row["engagement_rate"])

    # Ratios used only as TARGETS: they are stored in target_rows and never
    # added to the feature dict, which holds only title/description-derived
    # values, duration and subscriber count. The max(..., 1.0) floor avoids
    # division by zero.
    views_per_sub = views / max(subs, 1.0)
    likes_per_view = likes / max(views, 1.0)

    # sentiment (title, description, and both combined)
    t_pol, t_sub = extract_sentiment(title)
    d_pol, d_sub = extract_sentiment(desc)
    c_pol, c_sub = extract_sentiment(full_text)

    # readability
    avg_word_len, avg_sentence_len, flesch_proxy = readability_features(full_text)

    # call-to-action
    cta_count, cta_present = keyword_features(full_text)

    feat_rows.append({
        "duration_seconds": duration,
        "subs": subs,
        "title_length_chars": len(title),
        "description_length_chars": len(desc),
        "title_word_count": len(title.split()),
        "description_word_count": len(desc.split()),
        "title_sentiment": t_pol,
        "title_subjectivity": t_sub,
        "description_sentiment": d_pol,
        "description_subjectivity": d_sub,
        "combined_sentiment": c_pol,
        "combined_subjectivity": c_sub,
        "avg_word_length": avg_word_len,
        "avg_sentence_length": avg_sentence_len,
        "flesch_proxy": flesch_proxy,
        "cta_word_count": cta_count,
        "cta_present": cta_present,
    })

    # Raw (untransformed) metric values; they are bucketed into tiers later.
    target_rows.append({
        "views_per_sub": views_per_sub,
        "likes_per_view": likes_per_view,
        "engagement_rate": eng_rate,
    })

X = pd.DataFrame(feat_rows)
T = pd.DataFrame(target_rows)

# ----------------------------------------------------------
# 4. Create tier labels (Low / Mid / High) using quantiles
# ----------------------------------------------------------

def make_tiers(series, low_q=0.33, high_q=0.66):
    """Bucket a numeric series into "LOW" / "MID" / "HIGH" by quantile.

    Args:
        series: Numeric pandas Series to bucket.
        low_q: Quantile at or below which a value is labelled "LOW".
        high_q: Quantile at or above which a value is labelled "HIGH".

    Returns:
        A Series of tier labels with the same index as ``series``. The
        "LOW" test is applied first, so a value equal to both cut points is
        "LOW"; anything that passes neither test is "MID".
    """
    lo, hi = (series.quantile(q) for q in (low_q, high_q))

    def _label(value):
        if value <= lo:
            return "LOW"
        return "HIGH" if value >= hi else "MID"

    return series.apply(_label)

# Tier labels are computed independently per metric, with cut points taken
# from the full data set (before any train/validation split).
T["tier_views_per_sub"] = make_tiers(T["views_per_sub"])
T["tier_likes_per_view"] = make_tiers(T["likes_per_view"])
T["tier_eng_rate"] = make_tiers(T["engagement_rate"])

# ----------------------------------------------------------
# 5. Generic training function (fixed)
# ----------------------------------------------------------

def train_tier_model(X, y, label_name):
    """Train and evaluate one CatBoost tier classifier.

    The data is split 80/20 (stratified on ``y``, fixed seed), a multiclass
    CatBoost model is fitted with early stopping on the validation part, and
    accuracy, macro-F1 and a classification report for that validation part
    are printed.

    Args:
        X: Feature frame.
        y: Series of tier labels aligned with ``X``.
        label_name: Human-readable metric name used in the printed output.

    Returns:
        ``(model, classes)`` where ``model`` is the fitted classifier and
        ``classes`` is ``np.unique(y)``.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    train_pool = Pool(X_train, label=y_train)
    val_pool = Pool(X_val, label=y_val)

    classifier_settings = {
        "loss_function": "MultiClass",
        "eval_metric": "TotalF1:average=Macro",
        "iterations": 400,
        "learning_rate": 0.05,
        "depth": 6,
        "random_seed": 42,
        "auto_class_weights": "Balanced",
        "verbose": 50,
    }
    model = CatBoostClassifier(**classifier_settings)

    print(f"\nTraining {label_name} tier model...")
    model.fit(
        train_pool,
        eval_set=val_pool,
        use_best_model=True,
        early_stopping_rounds=50,
    )

    # Predictions are flattened to 1-D and compared with the validation
    # labels as they were passed in.
    predicted = model.predict(val_pool).reshape(-1)
    actual = y_val.values

    acc = accuracy_score(actual, predicted)
    macro_f1 = f1_score(actual, predicted, average="macro")

    print(f"\n=== {label_name} TIER MODEL ===")
    print("Accuracy :", acc)
    print("Macro F1 :", macro_f1)
    print(classification_report(actual, predicted, digits=3))

    return model, np.unique(y)

# ----------------------------------------------------------
# 6. Train three tier models
# ----------------------------------------------------------

# One label series per metric; all three models share the feature frame X.
y_vps = T["tier_views_per_sub"]
y_lpv = T["tier_likes_per_view"]
y_er  = T["tier_eng_rate"]

model_vps, classes_vps = train_tier_model(X, y_vps, "Views per subscriber")
model_lpv, classes_lpv = train_tier_model(X, y_lpv, "Likes per view")
model_er,  classes_er  = train_tier_model(X, y_er,  "Engagement rate")

# ----------------------------------------------------------
# 7. Save artifacts
# ----------------------------------------------------------

# Models, the feature column order, and the class labels of each model are
# written to the current working directory.
model_vps.save_model("cb_tier_views_per_sub.cbm")
model_lpv.save_model("cb_tier_likes_per_view.cbm")
model_er.save_model("cb_tier_eng_rate.cbm")

joblib.dump(list(X.columns), "tier_metric_feature_cols.pkl")
joblib.dump(classes_vps, "tier_classes_views_per_sub.pkl")
joblib.dump(classes_lpv, "tier_classes_likes_per_view.pkl")
joblib.dump(classes_er,  "tier_classes_eng_rate.pkl")


Building feature and target frames...


100%|██████████| 11481/11481 [00:36<00:00, 318.32it/s]



Training Views per subscriber tier model...
0:	learn: 0.4313976	test: 0.4414337	best: 0.4414337 (0)	total: 11.2ms	remaining: 4.47s
50:	learn: 0.4858310	test: 0.4643804	best: 0.4661543 (39)	total: 817ms	remaining: 5.59s
100:	learn: 0.5144146	test: 0.4766655	best: 0.4796358 (94)	total: 1.67s	remaining: 4.95s
150:	learn: 0.5365792	test: 0.4796975	best: 0.4824427 (146)	total: 2.6s	remaining: 4.28s
200:	learn: 0.5517939	test: 0.4806667	best: 0.4852494 (193)	total: 3.37s	remaining: 3.33s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.485249434
bestIteration = 193

Shrink model to first 194 iterations.

=== Views per subscriber TIER MODEL ===
Accuracy : 0.501523726599913
Macro F1 : 0.48613172789486764
              precision    recall  f1-score   support

        HIGH      0.510     0.684     0.584       781
         LOW      0.510     0.530     0.520       758
         MID      0.468     0.285     0.354       758

    accuracy                          0.502      2297
  

['tier_classes_eng_rate.pkl']