In [18]:
"""
================================================================================
COMPLETE YOUTUBE CONTENT OPTIMIZATION PIPELINE - WITH PROGRESS BARS
================================================================================
Sectors: Education, Entertainment, Science & Technology, Lifestyle, Sports, News & Politics

STEP 1: Download data (2000 per sector)
STEP 2: Feature engineering (42+ features)
STEP 3: Data preprocessing & validation
STEP 4: Model training & evaluation
STEP 5: Generate reports & visualizations
================================================================================
"""

from datasets import load_dataset
import pandas as pd
import numpy as np
from collections import defaultdict
from textblob import TextBlob
import re
import time
import warnings
from tqdm import tqdm
from tqdm.auto import trange

# ML imports
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score, roc_curve,
    auc, accuracy_score
)
from xgboost import XGBClassifier
import joblib

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

warnings.filterwarnings('ignore')

# ============================================================
# CONFIGURATION
# ============================================================

TARGET_SECTORS = [
    "Education",
    "Entertainment",
    "Science & Technology",
    "Lifestyle",
    "Sports",
    "News & Politics"
]

HF_TOKEN = "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"
TARGET_PER_SECTOR = 2000

required_base_cols = [
    "resolution", "duration_seconds", "content_parent_category", "content_fine_category",
    "youtube_title", "youtube_description", "text_to_speech_word_count",
    "youtube_categories", "youtube_tags", "youtube_channel",
    "youtube_view_count", "youtube_comment_count", "youtube_like_count",
    "youtube_channel_follower_count", "youtube_upload_date", "youtube_age_limit",
    "text_to_speech"
]

print("\n" + "="*80)
print("COMPLETE YOUTUBE CONTENT OPTIMIZATION PIPELINE - WITH PROGRESS BARS".center(80))
print("="*80)
print(f"\nConfiguration:")
print(f"  Sectors: {len(TARGET_SECTORS)}")
print(f"  Videos per sector: {TARGET_PER_SECTOR:,}")
print(f"  Total target: {TARGET_PER_SECTOR * len(TARGET_SECTORS):,}\n")

# ============================================================
# STEP 1: DOWNLOAD DATA WITH PROGRESS BAR
# ============================================================

print("="*80)
print("[STEP 1] DOWNLOADING DATA (2000 per sector)".center(80))
print("="*80 + "\n")

start_time = time.time()
rows_per_sector = defaultdict(list)
total_checked = 0
download_pbar = tqdm(total=TARGET_PER_SECTOR * len(TARGET_SECTORS), 
                      desc="Overall Progress", unit="videos", 
                      bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')

dataset = load_dataset(
    "HuggingFaceFV/finevideo",
    split="train",
    streaming=True,
    token=HF_TOKEN
)

sector_pbars = {sector: tqdm(total=TARGET_PER_SECTOR, 
                             desc=f"  {sector:20}", 
                             unit="videos",
                             bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}',
                             leave=False) 
               for sector in TARGET_SECTORS}

for sample in dataset:
    total_checked += 1
    meta = sample["json"]
    sector = meta.get("content_parent_category")
    
    if sector not in TARGET_SECTORS:
        continue
    if len(rows_per_sector[sector]) >= TARGET_PER_SECTOR:
        continue
    
    # Extract base fields
    row = {col: meta.get(col) for col in required_base_cols}
    
    # Extract nested content_metadata fields
    content_meta = meta.get("content_metadata", {})
    row["fps"] = content_meta.get("fps")
    row["meta_title"] = content_meta.get("title")
    row["meta_description"] = content_meta.get("description")
    row["num_scenes"] = len(content_meta.get("scenes", [])) if content_meta and "scenes" in content_meta else 0
    row["num_qa_pairs"] = len(content_meta.get("qAndA", [])) if content_meta and "qAndA" in content_meta else 0
    row["num_trim_suggestions"] = len(content_meta.get("trimmingSuggestions", [])) if content_meta and "trimmingSuggestions" in content_meta else 0
    
    storylines = content_meta.get("storylines", {})
    row["has_storyline"] = int(bool(storylines and "description" in storylines))
    
    tts = meta.get("text_to_speech", "")
    row["tts_word_count"] = len(str(tts).split()) if tts else 0
    
    tts_timecoded = meta.get("timecoded_text_to_speech", [])
    row["num_tts_segments"] = len(tts_timecoded) if tts_timecoded else 0
    
    row["sector"] = sector
    rows_per_sector[sector].append(row)
    
    sector_pbars[sector].update(1)
    download_pbar.update(1)
    
    if all(len(rows_per_sector[s]) >= TARGET_PER_SECTOR for s in TARGET_SECTORS):
        break

# Close progress bars
for pbar in sector_pbars.values():
    pbar.close()
download_pbar.close()

all_rows = []
for sector in TARGET_SECTORS:
    all_rows.extend(rows_per_sector[sector][:TARGET_PER_SECTOR])

df_raw = pd.DataFrame(all_rows)
df_raw = df_raw.sample(frac=1).reset_index(drop=True)
df_raw.to_csv("YouTube_Raw_Final_6Sectors.csv", index=False)

print(f"\n✓ Download Complete! ({time.time() - start_time:.1f}s)")
print(f"  Total records: {len(df_raw):,}")
for sector in TARGET_SECTORS:
    count = (df_raw['sector'] == sector).sum()
    print(f"    {sector:20}: {count:,}")

# ============================================================
# STEP 2: FEATURE ENGINEERING WITH PROGRESS BAR
# ============================================================

print("\n" + "="*80)
print("[STEP 2] ENHANCED FEATURE ENGINEERING".center(80))
print("="*80 + "\n")

df = df_raw.copy()

def text_length(text):
    return len(str(text).strip())

def word_count(text):
    return len(str(text).strip().split())

def has_question(text):
    return int("?" in str(text))

def has_exclamation(text):
    return int("!" in str(text))

def extract_hashtags(text):
    return re.findall(r"#\w+", str(text))

def num_hashtags(text):
    return len(extract_hashtags(text))

def num_mentions(text):
    return len(re.findall(r"@\w+", str(text)))

def num_links(text):
    return len(re.findall(r"https?://\S+", str(text)))

def get_sentiment(text):
    if pd.isna(text):
        return 0
    return TextBlob(str(text)).sentiment.polarity

def get_subjectivity(text):
    if pd.isna(text):
        return 0
    return TextBlob(str(text)).sentiment.subjectivity

# Feature extraction stages with progress
stages = [
    ("Title Features", [
        ("title_length_chars", lambda: df['youtube_title'].apply(text_length)),
        ("title_word_count", lambda: df['youtube_title'].apply(word_count)),
        ("title_has_question_mark", lambda: df['youtube_title'].apply(has_question)),
        ("title_has_exclamation_mark", lambda: df['youtube_title'].apply(has_exclamation)),
        ("num_hashtags_in_title", lambda: df['youtube_title'].apply(num_hashtags)),
        ("title_sentiment", lambda: df['youtube_title'].apply(get_sentiment)),
        ("title_subjectivity", lambda: df['youtube_title'].apply(get_subjectivity)),
    ]),
    ("Description Features", [
        ("description_length_chars", lambda: df['youtube_description'].apply(text_length)),
        ("description_word_count", lambda: df['youtube_description'].apply(word_count)),
        ("description_has_question_mark", lambda: df['youtube_description'].apply(has_question)),
        ("description_has_exclamation_mark", lambda: df['youtube_description'].apply(has_exclamation)),
        ("num_hashtags_in_description", lambda: df['youtube_description'].apply(num_hashtags)),
        ("description_num_mentions", lambda: df['youtube_description'].apply(num_mentions)),
        ("description_num_links", lambda: df['youtube_description'].apply(num_links)),
        ("description_sentiment", lambda: df['youtube_description'].apply(get_sentiment)),
        ("description_subjectivity", lambda: df['youtube_description'].apply(get_subjectivity)),
    ]),
    ("Engagement Metrics", [
        ("views_per_subscriber", lambda: df['youtube_view_count'] / (df['youtube_channel_follower_count'] + 1)),
        ("likes_per_subscriber", lambda: df['youtube_like_count'] / (df['youtube_channel_follower_count'] + 1)),
        ("comments_per_subscriber", lambda: df['youtube_comment_count'] / (df['youtube_channel_follower_count'] + 1)),
        ("audience_engagement_index", lambda: (df['youtube_like_count'] + df['youtube_comment_count']) / (df['youtube_channel_follower_count'] + 1)),
        ("engagement_rate", lambda: (df['youtube_like_count'] + df['youtube_comment_count']) / (df['youtube_view_count'] + 1)),
        ("like_rate", lambda: df['youtube_like_count'] / (df['youtube_view_count'] + 1)),
        ("comment_rate", lambda: df['youtube_comment_count'] / (df['youtube_view_count'] + 1)),
        ("like_to_comment_ratio", lambda: df['youtube_like_count'] / (df['youtube_comment_count'] + 1)),
    ]),
    ("Content Metadata", [
        ("fps", lambda: df['fps'].fillna(30.0)),
        ("num_scenes", lambda: df['num_scenes'].fillna(0)),
        ("num_qa_pairs", lambda: df['num_qa_pairs'].fillna(0)),
        ("num_trim_suggestions", lambda: df['num_trim_suggestions'].fillna(0)),
        ("has_storyline", lambda: df['has_storyline'].fillna(0)),
        ("tts_word_count", lambda: df['tts_word_count'].fillna(0)),
        ("num_tts_segments", lambda: df['num_tts_segments'].fillna(0)),
        ("meta_description_length", lambda: df['meta_description'].apply(text_length)),
        ("meta_description_sentiment", lambda: df['meta_description'].apply(get_sentiment)),
    ]),
    ("Production Quality", [
        ("video_completeness_score", lambda: (
            (df['title_length_chars'] > 10).astype(int) +
            (df['description_length_chars'] > 50).astype(int) +
            (df['num_hashtags_in_title'] + df['num_hashtags_in_description'] > 0).astype(int) +
            (df['num_scenes'] > 0).astype(int)
        ) / 4),
        ("tts_quality_indicator", lambda: (df['tts_word_count'] > 0).astype(int)),
        ("production_polish_score", lambda: (
            (df['num_trim_suggestions'] > 0).astype(int) * 0.3 +
            (df['num_qa_pairs'] > 0).astype(int) * 0.3 +
            (df['has_storyline'] > 0).astype(int) * 0.4
        )),
        ("is_short_form", lambda: (df['duration_seconds'] < 300).astype(int)),
        ("is_long_form", lambda: (df['duration_seconds'] > 1800).astype(int)),
    ]),
]

# Normalize numeric columns first
df['youtube_view_count'] = pd.to_numeric(df['youtube_view_count'], errors='coerce').fillna(1)
df['youtube_like_count'] = pd.to_numeric(df['youtube_like_count'], errors='coerce').fillna(0)
df['youtube_comment_count'] = pd.to_numeric(df['youtube_comment_count'], errors='coerce').fillna(0)
df['youtube_channel_follower_count'] = pd.to_numeric(df['youtube_channel_follower_count'], errors='coerce').fillna(1)
df['duration_seconds'] = pd.to_numeric(df['duration_seconds'], errors='coerce').fillna(0)

feature_pbar = tqdm(total=len(stages), desc="Feature Engineering Stages", unit="stage",
                    bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}')

for stage_name, features in stages:
    for feature_name, feature_func in tqdm(features, desc=f"  {stage_name}", leave=False):
        df[feature_name] = feature_func()
    feature_pbar.update(1)

feature_pbar.close()

# Engagement tier
valid_engagement = df['engagement_rate'].fillna(0)
q_low = valid_engagement.quantile(0.33)
q_high = valid_engagement.quantile(0.66)

def assign_tier(engagement):
    if engagement <= q_low:
        return 'LOW'
    elif engagement <= q_high:
        return 'MID'
    else:
        return 'HIGH'

df['engagement_tier'] = df['engagement_rate'].apply(assign_tier)
df.to_csv("YouTube_Engineered_Features_Final.csv", index=False)

print(f"\n✓ Feature Engineering Complete!")
print(f"  Total features: {len(df.columns)}")
print(f"  Engagement Tiers:")
for tier in ['LOW', 'MID', 'HIGH']:
    count = (df['engagement_tier'] == tier).sum()
    pct = (count / len(df)) * 100
    print(f"    {tier}: {count:,} ({pct:.1f}%)")

# ============================================================
# STEP 3: DATA PREPROCESSING WITH PROGRESS
# ============================================================

print("\n" + "="*80)
print("[STEP 3] DATA VALIDATION & PREPROCESSING".center(80))
print("="*80 + "\n")

preprocess_pbar = tqdm(total=4, desc="Preprocessing", unit="step",
                       bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}')

df = df.dropna(subset=['engagement_tier'])
preprocess_pbar.update(1)

numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df[col].isna().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)
preprocess_pbar.update(1)

z_scores = np.abs(stats.zscore(df[['engagement_rate']]))
df = df[(z_scores < 3).all(axis=1)]
preprocess_pbar.update(1)

exclude_cols = [
    'youtube_title', 'youtube_description', 'youtube_tags', 'youtube_categories',
    'youtube_channel', 'youtube_upload_date', 'text_to_speech', 'meta_title',
    'meta_description', 'engagement_tier', 'sector'
]

feature_cols = [col for col in df.columns if col not in exclude_cols]
preprocess_pbar.update(1)
preprocess_pbar.close()

print(f"\n✓ Data preprocessing complete")
print(f"  Final dataset size: {len(df):,} rows")
print(f"  Selected features: {len(feature_cols)}")

X = df[feature_cols].copy()
y_engagement = df['engagement_tier']

# ============================================================
# STEP 4: MODEL TRAINING WITH PROGRESS
# ============================================================

print("\n" + "="*80)
print("[STEP 4] MODEL TRAINING WITH RIGOROUS VALIDATION".center(80))
print("="*80 + "\n")

X_train, X_test, y_train, y_test = train_test_split(
    X, y_engagement, test_size=0.2, random_state=42, stratify=y_engagement
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train-Test Split:")
print(f"  Training: {len(X_train):,} ({len(X_train)/len(X)*100:.1f}%)")
print(f"  Testing: {len(X_test):,} ({len(X_test)/len(X)*100:.1f}%)\n")

# Train engagement model with progress
print("Training XGBoost Model (Engagement Tier)...")
model_e = XGBClassifier(
    n_estimators=150, max_depth=7, learning_rate=0.1, 
    random_state=42, use_label_encoder=False, eval_metric='mlogloss'
)

cv_results_e = cross_validate(
    model_e, X_train_scaled, y_train, 
    cv=5, 
    scoring={
        'accuracy': 'accuracy',
        'precision_weighted': 'precision_weighted',
        'recall_weighted': 'recall_weighted',
        'f1_weighted': 'f1_weighted'
    }, 
    return_train_score=True
)

train_pbar = tqdm(total=1, desc="  Training", unit="model", leave=False)
model_e.fit(X_train_scaled, y_train)
train_pbar.update(1)
train_pbar.close()

y_pred_e = model_e.predict(X_test_scaled)
y_pred_proba_e = model_e.predict_proba(X_test_scaled)

acc_e = accuracy_score(y_test, y_pred_e)
roc_auc_e = roc_auc_score(y_test, y_pred_proba_e, multi_class='ovr', average='weighted')

print(f"  Accuracy: {acc_e:.4f}")
print(f"  CV Score: {cv_results_e['test_accuracy'].mean():.4f} ± {cv_results_e['test_accuracy'].std():.4f}")
print(f"  ROC-AUC: {roc_auc_e:.4f}")

joblib.dump(model_e, 'model_engagement_tier_final.pkl')
joblib.dump(scaler, 'scaler_final.pkl')

# Train sector model with progress
print("\nTraining XGBoost Model (Sector Classification)...")

le_sector = LabelEncoder()
df['sector_encoded'] = le_sector.fit_transform(df['sector'])
y_sector = df['sector_encoded']

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y_sector, test_size=0.2, random_state=42, stratify=y_sector
)

scaler_s = StandardScaler()
X_train_s_scaled = scaler_s.fit_transform(X_train_s)
X_test_s_scaled = scaler_s.transform(X_test_s)

cv_results_s = cross_validate(
    model_e, X_train_s_scaled, y_train_s, 
    cv=5, 
    scoring={
        'accuracy': 'accuracy',
        'precision_weighted': 'precision_weighted',
        'recall_weighted': 'recall_weighted',
        'f1_weighted': 'f1_weighted'
    }, 
    return_train_score=True
)

model_s = XGBClassifier(
    n_estimators=150, max_depth=7, learning_rate=0.1, 
    random_state=42, use_label_encoder=False, eval_metric='mlogloss'
)

train_pbar = tqdm(total=1, desc="  Training", unit="model", leave=False)
model_s.fit(X_train_s_scaled, y_train_s)
train_pbar.update(1)
train_pbar.close()

y_pred_s = model_s.predict(X_test_s_scaled)
acc_s = accuracy_score(y_test_s, y_pred_s)

print(f"  Accuracy: {acc_s:.4f}")
print(f"  CV Score: {cv_results_s['test_accuracy'].mean():.4f} ± {cv_results_s['test_accuracy'].std():.4f}")

joblib.dump(model_s, 'model_sector_classifier_final.pkl')
joblib.dump(scaler_s, 'scaler_sector_final.pkl')
joblib.dump(le_sector, 'label_encoder_sector_final.pkl')

# ============================================================
# STEP 5: EVALUATION & VISUALIZATION WITH PROGRESS
# ============================================================

print("\n" + "="*80)
print("[STEP 5] GENERATING REPORTS & VISUALIZATIONS".center(80))
print("="*80 + "\n")

viz_pbar = tqdm(total=5, desc="Generating Visualizations", unit="viz",
                bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}')

# Confusion matrix - Engagement
cm_engagement = confusion_matrix(y_test, y_pred_e, labels=['LOW', 'MID', 'HIGH'])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_engagement, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['LOW', 'MID', 'HIGH'],
            yticklabels=['LOW', 'MID', 'HIGH'],
            cbar_kws={'label': 'Count'})
plt.title('Confusion Matrix - Engagement Tier Prediction', fontsize=14, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('confusion_matrix_engagement_tier.png', dpi=300, bbox_inches='tight')
plt.close()
viz_pbar.update(1)

# Confusion matrix - Sector
cm_sector = confusion_matrix(y_test_s, y_pred_s)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm_sector, annot=True, fmt='d', cmap='Blues', 
            xticklabels=le_sector.classes_,
            yticklabels=le_sector.classes_,
            cbar_kws={'label': 'Count'})
plt.title('Confusion Matrix - Sector Classification', fontsize=14, fontweight='bold')
plt.ylabel('True Sector')
plt.xlabel('Predicted Sector')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('confusion_matrix_sector_classification.png', dpi=300, bbox_inches='tight')
plt.close()
viz_pbar.update(1)

# Feature importance - Engagement
feature_importance_e = pd.DataFrame({
    'feature': feature_cols,
    'importance': model_e.feature_importances_
}).sort_values('importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
top_15 = feature_importance_e.head(15)
ax.barh(range(len(top_15)), top_15['importance'].values, color='steelblue')
ax.set_yticks(range(len(top_15)))
ax.set_yticklabels(top_15['feature'].values)
ax.set_xlabel('Importance Score')
ax.set_title('Top 15 Features - Engagement Tier Prediction', fontsize=14, fontweight='bold')
ax.invert_yaxis()
plt.tight_layout()
plt.savefig('feature_importance_engagement_tier.png', dpi=300, bbox_inches='tight')
plt.close()
viz_pbar.update(1)

# Feature importance - Sector
feature_importance_s = pd.DataFrame({
    'feature': feature_cols,
    'importance': model_s.feature_importances_
}).sort_values('importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
top_15 = feature_importance_s.head(15)
ax.barh(range(len(top_15)), top_15['importance'].values, color='coral')
ax.set_yticks(range(len(top_15)))
ax.set_yticklabels(top_15['feature'].values)
ax.set_xlabel('Importance Score')
ax.set_title('Top 15 Features - Sector Classification', fontsize=14, fontweight='bold')
ax.invert_yaxis()
plt.tight_layout()
plt.savefig('feature_importance_sector_classification.png', dpi=300, bbox_inches='tight')
plt.close()
viz_pbar.update(1)

# ROC curves
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for idx, class_label in enumerate(['LOW', 'MID', 'HIGH']):
    y_bin = (y_test == class_label).astype(int)
    y_pred_proba_bin = y_pred_proba_e[:, idx]
    
    fpr, tpr, _ = roc_curve(y_bin, y_pred_proba_bin)
    roc_auc = auc(fpr, tpr)
    
    axes[idx].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
    axes[idx].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
    axes[idx].set_xlim([0.0, 1.0])
    axes[idx].set_ylim([0.0, 1.05])
    axes[idx].set_xlabel('False Positive Rate')
    axes[idx].set_ylabel('True Positive Rate')
    axes[idx].set_title(f'ROC Curve - {class_label}')
    axes[idx].legend(loc="lower right")
    axes[idx].grid(alpha=0.3)

plt.suptitle('ROC-AUC Curves - Engagement Tier', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('roc_auc_engagement_tier.png', dpi=300, bbox_inches='tight')
plt.close()
viz_pbar.update(1)

viz_pbar.close()

# Generate text report
report = f"""
{'='*80}
YOUTUBE CONTENT OPTIMIZATION PIPELINE - FINAL REPORT
{'='*80}

PIPELINE SUMMARY:
  ✓ Step 1: Downloaded {len(df_raw):,} videos (2,000 per sector)
  ✓ Step 2: Engineered {len(feature_cols)} features from rich metadata
  ✓ Step 3: Preprocessed & validated data ({len(df):,} final records)
  ✓ Step 4: Trained 2 XGBoost models with 5-fold cross-validation
  ✓ Step 5: Generated visualizations & reports

ENGAGEMENT TIER PREDICTION MODEL:
  Algorithm: XGBoost
  Test Accuracy: {acc_e:.4f}
  Cross-Val Accuracy: {cv_results_e['test_accuracy'].mean():.4f} ± {cv_results_e['test_accuracy'].std():.4f}
  ROC-AUC: {roc_auc_e:.4f}
  
  Classification Report:
{classification_report(y_test, y_pred_e, target_names=['LOW', 'MID', 'HIGH'])}

SECTOR CLASSIFICATION MODEL:
  Algorithm: XGBoost
  Test Accuracy: {acc_s:.4f}
  Cross-Val Accuracy: {cv_results_s['test_accuracy'].mean():.4f} ± {cv_results_s['test_accuracy'].std():.4f}
  
  Classification Report:
{classification_report(y_test_s, y_pred_s, target_names=le_sector.classes_)}

TOP 10 FEATURES (Engagement Tier):
{feature_importance_e.head(10).to_string(index=False)}

TOP 10 FEATURES (Sector Classification):
{feature_importance_s.head(10).to_string(index=False)}

OUTPUT FILES:
  Data:
    - YouTube_Raw_Final_6Sectors.csv
    - YouTube_Engineered_Features_Final.csv
  
  Models:
    - model_engagement_tier_final.pkl
    - model_sector_classifier_final.pkl
    - scaler_final.pkl
    - scaler_sector_final.pkl
    - label_encoder_sector_final.pkl
  
  Visualizations:
    - confusion_matrix_engagement_tier.png
    - confusion_matrix_sector_classification.png
    - feature_importance_engagement_tier.png
    - feature_importance_sector_classification.png
    - roc_auc_engagement_tier.png
  
  Report:
    - pipeline_final_report.txt

READY FOR DEPLOYMENT:
  → Streamlit dashboard (streamlit run streamlit_dashboard_final.py)
  → API server
  → Production use

{'='*80}
"""

with open('pipeline_final_report.txt', 'w') as f:
    f.write(report)

print(report)

# ============================================================
# FINAL SUMMARY
# ============================================================

print("\n" + "="*80)
print("PIPELINE EXECUTION COMPLETE!".center(80))
print("="*80)

print(f"\n✓ All 5 steps completed successfully!")
print(f"✓ Total execution time: {time.time() - start_time:.1f} seconds")
print(f"\nNext steps:")
print(f"  1. Review the generated visualizations and reports")
print(f"  2. Run: streamlit run streamlit_dashboard_final.py")
print(f"  3. Deploy models to production\n")



      COMPLETE YOUTUBE CONTENT OPTIMIZATION PIPELINE - WITH PROGRESS BARS       

Configuration:
  Sectors: 6
  Videos per sector: 2,000
  Total target: 12,000

                  [STEP 1] DOWNLOADING DATA (2000 per sector)                   



Overall Progress:   0%|          | 0/12000 [00:00<?]

Resolving data files:   0%|          | 0/1357 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1357 [00:00<?, ?it/s]


[A

[A[A


[A[A[A



[A[A[A[A




Overall Progress:   0%|          | 1/12000 [00:32<109:55:04]
[A



[A[A[A[A


Overall Progress:   0%|          | 4/12000 [00:33<20:53:51] 
[A



Overall Progress:   0%|          | 12/12000 [00:33<5:08:47]

[A[A

[A[A
Overall Progress:   0%|          | 17/12000 [00:33<3:07:23]




[A[A[A[A[A

Overall Progress:   0%|          | 24/12000 [00:33<1:45:13]




Overall Progress:   0%|          | 28/12000 [00:50<1:45:11]
[A



Overall Progress:   0%|          | 29/12000 [01:01<7:06:54]
Overall Progress:   0%|          | 30/12000 [01:02<6:33:30]



[A[A[A[A
Overall Progress:   0%|          | 35/12000 [01:02<4:09:46]




[A[A[A[A[A



[A[A[A[A
Overall Progress:   0%|          | 41/12000 [01:02<2:34:08]


Overall Progress:   0%|          | 46/12000 [01:02<1:46:49]

[A[A


[A[A[A


Overall Progress:   0%|          | 50/12000 [01:20<1:46:47]
Overall Progress:   0%|          | 51/12000 [01:33<7:39:05]




[A[A[A[A[A



✓ Download Complete! (21554.3s)
  Total records: 12,000
    Education           : 2,000
    Entertainment       : 2,000
    Science & Technology: 2,000
    Lifestyle           : 2,000
    Sports              : 2,000
    News & Politics     : 2,000

                     [STEP 2] ENHANCED FEATURE ENGINEERING                      



Feature Engineering Stages: 100%|██████████| 5/5



✓ Feature Engineering Complete!
  Total features: 59
  Engagement Tiers:
    LOW: 3,960 (33.0%)
    MID: 3,960 (33.0%)
    HIGH: 4,080 (34.0%)

                    [STEP 3] DATA VALIDATION & PREPROCESSING                    



Preprocessing: 100%|██████████| 4/4


✓ Data preprocessing complete
  Final dataset size: 11,884 rows
  Selected features: 48

                [STEP 4] MODEL TRAINING WITH RIGOROUS VALIDATION                






ValueError: could not convert string to float: '640x360'