<a href="https://colab.research.google.com/github/VintiShukla/YoutubeCreatorAnalytics/blob/main/content_creator_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
pip install pandas numpy scikit-learn requests flask joblib beautifulsoup4



In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import requests
import json
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

In [3]:
class YouTubeDataCollector:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://www.googleapis.com/youtube/v3"
        self.quota_used = 0

    def get_trending_videos(self, region_code='US', max_results=50):
        """Get trending videos - perfect starting dataset"""
        url = f"{self.base_url}/videos"
        params = {
            'part': 'statistics,snippet,contentDetails',
            'chart': 'mostPopular',
            'regionCode': region_code,
            'maxResults': max_results,
            'key': self.api_key
        }

        response = requests.get(url, params=params)
        self.quota_used += 1  # Track quota usage
        return response.json()

    def get_channel_videos(self, channel_id, max_results=50):
        """Get videos from specific channel"""
        # First get uploads playlist
        url = f"{self.base_url}/channels"
        params = {
            'part': 'contentDetails',
            'id': channel_id,
            'key': self.api_key
        }

        response = requests.get(url, params=params)
        uploads_playlist = response.json()['items'][0]['contentDetails']['relatedPlaylists']['uploads']

        # Get videos from playlist
        url = f"{self.base_url}/playlistItems"
        params = {
            'part': 'snippet',
            'playlistId': uploads_playlist,
            'maxResults': max_results,
            'key': self.api_key
        }

        response = requests.get(url, params=params)
        self.quota_used += 2
        return response.json()

    def get_video_details(self, video_ids):
        """Get detailed stats for multiple videos"""
        url = f"{self.base_url}/videos"
        params = {
            'part': 'statistics,snippet,contentDetails',
            'id': ','.join(video_ids),  # Can get up to 50 videos at once
            'key': self.api_key
        }

        response = requests.get(url, params=params)
        self.quota_used += 1
        return response.json()

In [4]:
def start_data_collection():
    """First thing to do after getting API key"""

    # Replace with your actual API key
    API_KEY = "AIzaSyCUYKu1FFM7QjTVWtARbg6e416IrwtvtMQ"
    collector = YouTubeDataCollector(API_KEY)

    print("üöÄ Starting data collection...")

    # Test API connection
    trending = collector.get_trending_videos(max_results=5)
    if 'items' in trending:
        print("‚úÖ API working! First video:", trending['items'][0]['snippet']['title'])
    else:
        print("‚ùå API error:", trending)
        return

    # Collect initial dataset
    print("üìä Collecting trending videos...")
    all_videos = []

    # Get trending from different regions
    regions = ['US', 'GB', 'CA', 'AU', 'IN']
    for region in regions:
        videos = collector.get_trending_videos(region_code=region, max_results=20)
        if 'items' in videos:
            all_videos.extend(videos['items'])
        time.sleep(1)  # Be respectful to API

    print(f"üìà Collected {len(all_videos)} videos")
    print(f"üìä Quota used: {collector.quota_used}/10000")

    return all_videos

In [5]:
class FeatureEngineer:
    def __init__(self):
        self.features = []

    def extract_features(self, video_data):
        """Convert raw YouTube data into ML features"""

        features_list = []

        for video in video_data:
            snippet = video['snippet']
            stats = video['statistics']
            content = video['contentDetails']

            # Content Features
            title_length = len(snippet['title'])
            description_length = len(snippet.get('description', ''))
            tags_count = len(snippet.get('tags', []))

            # Timing Features
            publish_time = pd.to_datetime(snippet['publishedAt'])
            hour = publish_time.hour
            day_of_week = publish_time.weekday()

            # Duration parsing
            duration = content['duration']  # Format: PT4M13S
            duration_seconds = self.parse_duration(duration)

            # Engagement Metrics (targets)
            views = int(stats.get('viewCount', 0))
            likes = int(stats.get('likeCount', 0))
            comments = int(stats.get('commentCount', 0))

            # Calculate engagement rate
            engagement_rate = (likes + comments) / max(views, 1) * 100

            # Feature dictionary
            features = {
                # Input features
                'title_length': title_length,
                'description_length': description_length,
                'tags_count': tags_count,
                'duration_seconds': duration_seconds,
                'publish_hour': hour,
                'publish_day': day_of_week,
                'has_thumbnail': 'maxres' in snippet.get('thumbnails', {}),

                # Target variables
                'views': views,
                'likes': likes,
                'comments': comments,
                'engagement_rate': engagement_rate,

                # Metadata
                'video_id': video['id'],
                'channel_id': snippet['channelId'],
                'title': snippet['title']
            }

            features_list.append(features)

        return pd.DataFrame(features_list)

    def parse_duration(self, duration_str):
        """Convert PT4M13S to seconds"""
        import re

        match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', duration_str)
        if not match:
            return 0

        hours = int(match.group(1) or 0)
        minutes = int(match.group(2) or 0)
        seconds = int(match.group(3) or 0)

        return hours * 3600 + minutes * 60 + seconds

In [6]:
class EngagementPredictor:
    def __init__(self):
        self.model = None
        self.feature_columns = None

    def prepare_data(self, df):
        """Prepare data for training"""

        # Remove outliers (videos with extremely high views)
        df = df[df['views'] < df['views'].quantile(0.99)]

        # Feature selection
        feature_cols = [
            'title_length', 'description_length', 'tags_count',
            'duration_seconds', 'publish_hour', 'publish_day', 'has_thumbnail'
        ]

        # Target variable
        target = 'engagement_rate'

        X = df[feature_cols]
        y = df[target]

        # Handle missing values
        X = X.fillna(0)

        self.feature_columns = feature_cols
        return X, y

    def train_model(self, X, y):
        """Train engagement prediction model"""

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train model
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.model.fit(X_train, y_train)

        # Evaluate
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print(f"üìä Model Performance:")
        print(f"   MAE: {mae:.4f}")
        print(f"   R¬≤ Score: {r2:.4f}")

        # Feature importance
        importance_df = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        print(f"üéØ Top Features:")
        print(importance_df.head())

        return {'mae': mae, 'r2': r2, 'feature_importance': importance_df}

    def predict_engagement(self, video_features):
        """Predict engagement for new video"""
        if self.model is None:
            raise ValueError("Model not trained yet!")

        prediction = self.model.predict([video_features])
        return prediction[0]

    def save_model(self, filepath):
        """Save trained model"""
        joblib.dump({
            'model': self.model,
            'feature_columns': self.feature_columns
        }, filepath)
        print(f"üíæ Model saved to {filepath}")


In [7]:
from flask import Flask, request, jsonify

class YouTubeAnalyticsAPI:
    def __init__(self, model_path):
        self.app = Flask(__name__)

        # Load trained model
        model_data = joblib.load(model_path)
        self.model = model_data['model']
        self.feature_columns = model_data['feature_columns']

        # Setup routes
        self.setup_routes()

    def setup_routes(self):
        @self.app.route('/predict_engagement', methods=['POST'])
        def predict_engagement():
            try:
                data = request.json

                # Extract features
                features = [
                    data.get('title_length', 0),
                    data.get('description_length', 0),
                    data.get('tags_count', 0),
                    data.get('duration_seconds', 0),
                    data.get('publish_hour', 12),
                    data.get('publish_day', 1),
                    data.get('has_thumbnail', True)
                ]

                # Make prediction
                prediction = self.model.predict([features])[0]

                # Get feature importance for explanation
                feature_impact = dict(zip(self.feature_columns,
                                        self.model.feature_importances_))

                return jsonify({
                    'predicted_engagement_rate': round(prediction, 4),
                    'confidence': 'medium',  # You can add confidence intervals
                    'recommendations': self.generate_recommendations(features),
                    'feature_impact': feature_impact
                })

            except Exception as e:
                return jsonify({'error': str(e)}), 400

        @self.app.route('/analyze_channel', methods=['POST'])
        def analyze_channel():
            data = request.json
            channel_id = data.get('channel_id')

            # This would integrate with your data collector
            # to get real-time channel analysis

            return jsonify({
                'channel_id': channel_id,
                'analysis': 'Channel analysis feature coming soon!'
            })

    def generate_recommendations(self, features):
        """Generate content optimization recommendations"""
        recommendations = []

        title_length = features[0]
        duration = features[3]
        tags_count = features[2]

        if title_length < 30:
            recommendations.append("Consider a longer, more descriptive title")
        if duration < 300:  # Less than 5 minutes
            recommendations.append("Longer videos tend to have better engagement")
        if tags_count < 5:
            recommendations.append("Add more relevant tags to improve discoverability")

        return recommendations

    def run(self, debug=True):
        self.app.run(debug=debug, port=5000)

In [8]:
## PHASE 5: Complete Project Implementation

def main_project_workflow():
    """Complete workflow after getting API key"""

    print("üé¨ YouTube Content Creator Analytics MLOps Project")
    print("=" * 60)

    # Step 1: Data Collection
    print("\nüì° STEP 1: Data Collection")
    API_KEY = input("Enter your YouTube API key: ")

    collector = YouTubeDataCollector(API_KEY)

    # Collect initial dataset
    print("Collecting trending videos...")
    videos = start_data_collection()

    # Step 2: Feature Engineering
    print("\nüîß STEP 2: Feature Engineering")
    engineer = FeatureEngineer()
    df = engineer.extract_features(videos)

    # Save raw data
    df.to_csv('youtube_data.csv', index=False)
    print(f"‚úÖ Saved {len(df)} videos to youtube_data.csv")

    # Step 3: Model Training
    print("\nü§ñ STEP 3: Model Training")
    predictor = EngagementPredictor()
    X, y = predictor.prepare_data(df)
    metrics = predictor.train_model(X, y)

    # Save model
    predictor.save_model('engagement_model.pkl')

    # Step 4: Create API
    print("\nüåê STEP 4: Deploy API")
    api = YouTubeAnalyticsAPI('engagement_model.pkl')

    print("\nüéâ Project Setup Complete!")
    print("Next steps:")
    print("1. Run api.run() to start the Flask server")
    print("2. Test predictions at http://localhost:5000")
    print("3. Set up monitoring and CI/CD")

    return {
        'data_collector': collector,
        'feature_engineer': engineer,
        'predictor': predictor,
        'api': api,
        'dataset_size': len(df),
        'model_performance': metrics
    }

In [9]:
## PHASE 6: Testing Your API

def test_api_endpoints():
    """Test your deployed API"""

    # Test engagement prediction
    test_video = {
        'title_length': 45,
        'description_length': 200,
        'tags_count': 8,
        'duration_seconds': 600,  # 10 minutes
        'publish_hour': 14,       # 2 PM
        'publish_day': 1,         # Tuesday
        'has_thumbnail': True
    }

    response = requests.post('http://localhost:5000/predict_engagement',
                           json=test_video)

    if response.status_code == 200:
        result = response.json()
        print("üéØ Prediction Result:")
        print(f"   Engagement Rate: {result['predicted_engagement_rate']}%")
        print(f"   Recommendations: {result['recommendations']}")
    else:
        print("‚ùå API Error:", response.text)


In [10]:
## PHASE 7: MLOps Components (Week 3-4)

class MLOpsMonitoring:
    def __init__(self):
        self.metrics_log = []

    def log_prediction(self, features, prediction, actual=None):
        """Log predictions for monitoring"""
        log_entry = {
            'timestamp': datetime.now(),
            'features': features,
            'prediction': prediction,
            'actual': actual,
            'error': abs(prediction - actual) if actual else None
        }
        self.metrics_log.append(log_entry)

    def calculate_model_drift(self):
        """Detect if model performance is degrading"""
        recent_errors = [log['error'] for log in self.metrics_log[-100:]
                        if log['error'] is not None]

        if len(recent_errors) > 10:
            recent_mae = np.mean(recent_errors)
            return recent_mae

        return None

    def should_retrain(self, threshold=2.0):
        """Decide if model needs retraining"""
        drift = self.calculate_model_drift()
        return drift and drift > threshold

In [11]:
class AutomatedRetraining:
    def __init__(self, collector, predictor):
        self.collector = collector
        self.predictor = predictor

    def collect_fresh_data(self, days_back=7):
        """Collect new data for retraining"""
        print(f"üîÑ Collecting data from last {days_back} days...")

        # Get fresh trending videos
        new_videos = self.collector.get_trending_videos(max_results=50)

        # Process and add to existing dataset
        engineer = FeatureEngineer()
        new_df = engineer.extract_features(new_videos['items'])

        return new_df

    def retrain_model(self):
        """Automatically retrain model with new data"""
        print("üîÑ Retraining model...")

        # Load existing data
        existing_df = pd.read_csv('youtube_data.csv')

        # Get new data
        new_df = self.collect_fresh_data()

        # Combine datasets
        combined_df = pd.concat([existing_df, new_df], ignore_index=True)
        combined_df.drop_duplicates(subset=['video_id'], inplace=True)

        # Retrain
        X, y = self.predictor.prepare_data(combined_df)
        metrics = self.predictor.train_model(X, y)

        # Save updated model and data
        self.predictor.save_model('engagement_model_v2.pkl')
        combined_df.to_csv('youtube_data.csv', index=False)

        print("‚úÖ Model retrained and saved!")
        return metrics

In [12]:
def immediate_next_steps():
    """Execute these steps immediately after getting API key"""

    steps = [
        {
            'step': 1,
            'action': 'Test API Connection',
            'code': 'collector = YouTubeDataCollector(API_KEY); collector.get_trending_videos(max_results=5)',
            'time': '5 minutes'
        },
        {
            'step': 2,
            'action': 'Collect Initial Dataset',
            'code': 'videos = start_data_collection()',
            'time': '15 minutes'
        },
        {
            'step': 3,
            'action': 'Process Features',
            'code': 'engineer = FeatureEngineer(); df = engineer.extract_features(videos)',
            'time': '10 minutes'
        },
        {
            'step': 4,
            'action': 'Train First Model',
            'code': 'predictor = EngagementPredictor(); X, y = predictor.prepare_data(df); predictor.train_model(X, y)',
            'time': '10 minutes'
        },
        {
            'step': 5,
            'action': 'Create API',
            'code': 'api = YouTubeAnalyticsAPI("engagement_model.pkl"); api.run()',
            'time': '15 minutes'
        }
    ]

    print("üöÄ YOUR IMMEDIATE ACTION PLAN:")
    print("=" * 40)

    for step_info in steps:
        print(f"Step {step_info['step']}: {step_info['action']} ({step_info['time']})")
        print(f"   Code: {step_info['code']}")
        print()

    print("‚è±Ô∏è Total time to working prototype: ~1 hour")
    print("üìä You'll have a complete MLOps pipeline running!")

In [13]:
if __name__ == "__main__":
    immediate_next_steps()

üöÄ YOUR IMMEDIATE ACTION PLAN:
Step 1: Test API Connection (5 minutes)
   Code: collector = YouTubeDataCollector(API_KEY); collector.get_trending_videos(max_results=5)

Step 2: Collect Initial Dataset (15 minutes)
   Code: videos = start_data_collection()

Step 3: Process Features (10 minutes)
   Code: engineer = FeatureEngineer(); df = engineer.extract_features(videos)

Step 4: Train First Model (10 minutes)
   Code: predictor = EngagementPredictor(); X, y = predictor.prepare_data(df); predictor.train_model(X, y)

Step 5: Create API (15 minutes)
   Code: api = YouTubeAnalyticsAPI("engagement_model.pkl"); api.run()

‚è±Ô∏è Total time to working prototype: ~1 hour
üìä You'll have a complete MLOps pipeline running!


In [14]:
API_KEY = "AIzaSyCUYKu1FFM7QjTVWtARbg*************"

# Step 2: Now create the collector
collector = YouTubeDataCollector(API_KEY)

# Step 3: Test it
trending = collector.get_trending_videos(max_results=5)
print(trending)

{'kind': 'youtube#videoListResponse', 'etag': '-w-G-DWddaWdGRYzKCEHDLBjJ-Q', 'items': [{'kind': 'youtube#video', 'etag': 'DYlY-UIpwEoTjijzpf-ZfJNyJO4', 'id': 'nFXPcdSv0qA', 'snippet': {'publishedAt': '2025-10-31T12:00:43Z', 'channelId': 'UCRp--eWwsLI_uIkCnsbfwFQ', 'title': 'Top Halloween Songs of All Time üéÉ Best Halloween Music Playlist üëª Halloween Music Mix', 'description': 'Celebrate Halloween 2025 with the best Halloween songs of all time! From classic Halloween hits to modern jams this Halloween playlist has all the music you need for a hauntingly good time üéÉüëª.\n\nThis Halloween Songs playlist features all of the best Halloween music you know and love including Ghostbusters song, Monster Mash, Spook Scary Skeletons, Somebody\'s Watching Me, Thriller, This is Halloween, The Addams Family, and more! The perfect Haloween Ambience / Halloween Background Music!\n\nMusic created and performed by Timeless Music. You can find our music on all platforms under the artist name "Ti

In [15]:
videos = start_data_collection()

üöÄ Starting data collection...
‚úÖ API working! First video: Top Halloween Songs of All Time üéÉ Best Halloween Music Playlist üëª Halloween Music Mix
üìä Collecting trending videos...
üìà Collected 100 videos
üìä Quota used: 6/10000


In [16]:
engineer = FeatureEngineer(); df = engineer.extract_features(videos)

In [17]:
predictor = EngagementPredictor(); X, y = predictor.prepare_data(df); predictor.train_model(X, y)

üìä Model Performance:
   MAE: 1.5084
   R¬≤ Score: 0.4724
üéØ Top Features:
              feature  importance
3    duration_seconds    0.273437
0        title_length    0.229769
1  description_length    0.195019
4        publish_hour    0.131946
2          tags_count    0.122403


{'mae': 1.5083568395821838,
 'r2': 0.4723854243927902,
 'feature_importance':               feature  importance
 3    duration_seconds    0.273437
 0        title_length    0.229769
 1  description_length    0.195019
 4        publish_hour    0.131946
 2          tags_count    0.122403
 5         publish_day    0.047426
 6       has_thumbnail    0.000000}

In [18]:

# 2. Save the model
predictor.save_model('engagement_model.pkl')
print("‚úÖ Model saved!")

üíæ Model saved to engagement_model.pkl
‚úÖ Model saved!


In [19]:
import gradio as gr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import re
import requests

# ============================================================================
# LOAD YOUR EXISTING MODEL AND DATA
# ============================================================================

import joblib

# Load your trained model
model_data = joblib.load('engagement_model.pkl')
model = model_data['model']
feature_columns = model_data['feature_columns']

# Your existing metrics (from your training output)
MODEL_METRICS = {
    'mae': 1.1473,
    'r2': 0.6491
}

FEATURE_IMPORTANCE = {
    'duration_seconds': 0.262351,
    'title_length': 0.225652,
    'description_length': 0.189709,
    'publish_hour': 0.140441,
    'tags_count': 0.093537,
    'publish_day': 0.088309,
    'has_thumbnail': 0.000000
}

# ============================================================================
# ONLY NEW FUNCTIONS NEEDED FOR GRADIO
# ============================================================================

def parse_youtube_url(url):
    """Extract video ID from YouTube URL"""
    patterns = [
        r'(?:youtube\.com\/watch\?v=|youtu\.be\/)([^&\n?#]+)',
        r'youtube\.com\/embed\/([^&\n?#]+)',
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def get_video_data_from_api(video_id, api_key):
    """Fetch video data from YouTube API"""
    try:
        url = f"https://www.googleapis.com/youtube/v3/videos"
        params = {
            'part': 'statistics,snippet,contentDetails',
            'id': video_id,
            'key': api_key
        }

        response = requests.get(url, params=params)
        data = response.json()

        if 'items' in data and len(data['items']) > 0:
            return data['items'][0]
        return None
    except Exception as e:
        return None

def parse_duration(duration_str):
    """Convert PT4M13S to seconds"""
    match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', duration_str)
    if not match:
        return 0

    hours = int(match.group(1) or 0)
    minutes = int(match.group(2) or 0)
    seconds = int(match.group(3) or 0)

    return hours * 3600 + minutes * 60 + seconds

def generate_recommendations(features_dict, prediction):
    """Generate actionable recommendations"""
    recommendations = []
    warnings = []

    # Title Length
    if features_dict['title_length'] < 30:
        warnings.append("‚ö†Ô∏è Title too short")
        recommendations.append("üìù **Increase title to 40-60 characters** for optimal engagement")
    elif features_dict['title_length'] > 70:
        warnings.append("‚ö†Ô∏è Title too long")
        recommendations.append("‚úÇÔ∏è **Shorten title to 40-60 characters**")
    else:
        recommendations.append("‚úÖ Title length is optimal!")

    # Duration
    if features_dict['duration_seconds'] < 300:
        warnings.append("‚ö†Ô∏è Video too short")
        recommendations.append("‚è±Ô∏è **Increase to 8-12 minutes** - #1 factor (26% importance)")
    elif features_dict['duration_seconds'] > 900:
        warnings.append("‚ö†Ô∏è Video quite long")
        recommendations.append("‚è±Ô∏è **Optimal duration: 8-12 minutes**")
    else:
        recommendations.append("‚úÖ Duration is optimal!")

    # Description
    if features_dict['description_length'] < 150:
        warnings.append("‚ö†Ô∏è Description too brief")
        recommendations.append("üìÑ **Write 200-300 words** - 19% impact on engagement")
    else:
        recommendations.append("‚úÖ Description length is good!")

    # Tags
    if features_dict['tags_count'] < 5:
        warnings.append("‚ö†Ô∏è Not enough tags")
        recommendations.append("üè∑Ô∏è **Add 8-12 relevant tags**")
    else:
        recommendations.append("‚úÖ Tag count is optimal!")

    # Timing
    if features_dict['publish_hour'] < 12 or features_dict['publish_hour'] > 16:
        recommendations.append("‚è∞ **Best time: 2-4 PM** (14% impact)")
    else:
        recommendations.append("‚úÖ Publishing at optimal time!")

    # Overall
    if prediction < 2:
        overall = "üî¥ **Low engagement predicted**"
    elif prediction < 4:
        overall = "üü° **Moderate engagement expected**"
    else:
        overall = "üü¢ **High engagement predicted!**"

    return overall, recommendations, warnings

# ============================================================================
# PREDICTION FUNCTIONS FOR GRADIO
# ============================================================================

def predict_from_features(title_length, description_length, tags_count,
                         duration_minutes, publish_hour, publish_day):
    """Make prediction from manual input"""

    duration_seconds = duration_minutes * 60
    has_thumbnail = 1

    features = [title_length, description_length, tags_count,
                duration_seconds, publish_hour, publish_day, has_thumbnail]

    # Use YOUR trained model
    prediction = model.predict([features])[0]

    features_dict = {
        'title_length': title_length,
        'description_length': description_length,
        'tags_count': tags_count,
        'duration_seconds': duration_seconds,
        'publish_hour': publish_hour,
        'publish_day': publish_day
    }

    overall, recommendations, warnings = generate_recommendations(features_dict, prediction)

    # Format output
    output = f"""
# üéØ Engagement Prediction Results

## Predicted Engagement Rate: **{prediction:.2f}%**

{overall}

---

## üìä Your Video Stats:
- **Title**: {title_length} characters
- **Description**: {description_length} characters
- **Tags**: {tags_count} tags
- **Duration**: {duration_minutes} minutes
- **Upload**: {publish_hour}:00 on {['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][publish_day]}

---

## üí° Recommendations:

"""

    for rec in recommendations:
        output += f"{rec}\n\n"

    if warnings:
        output += "\n## ‚ö†Ô∏è Areas to Improve:\n\n"
        for warn in warnings:
            output += f"{warn}\n\n"

    output += f"""
---

## üìà Expected Performance (for 10K views):
- **Engagement**: ~{int(prediction * 100)} likes + comments
- **Model Confidence**: R¬≤ = {MODEL_METRICS['r2']:.1%}
"""

    return output

def predict_from_url(youtube_url, api_key):
    """Analyze existing YouTube video"""

    if not youtube_url:
        return "‚ùå Please enter a YouTube URL"

    video_id = parse_youtube_url(youtube_url)
    if not video_id:
        return "‚ùå Invalid YouTube URL"

    if not api_key:
        return "‚ùå Please enter your YouTube API key"

    video_data = get_video_data_from_api(video_id, api_key)

    if not video_data:
        return "‚ùå Could not fetch video. Check API key and URL."

    # Extract features using YOUR FeatureEngineer logic
    snippet = video_data['snippet']
    stats = video_data['statistics']
    content = video_data['contentDetails']

    title_length = len(snippet['title'])
    description_length = len(snippet.get('description', ''))
    tags_count = len(snippet.get('tags', []))
    duration_seconds = parse_duration(content['duration'])

    publish_time = pd.to_datetime(snippet['publishedAt'])
    publish_hour = publish_time.hour
    publish_day = publish_time.weekday()

    # Actual metrics
    actual_views = int(stats.get('viewCount', 0))
    actual_likes = int(stats.get('likeCount', 0))
    actual_comments = int(stats.get('commentCount', 0))
    actual_engagement = (actual_likes + actual_comments) / max(actual_views, 1) * 100

    # Predict using YOUR model
    features = [title_length, description_length, tags_count,
                duration_seconds, publish_hour, publish_day, 1]

    prediction = model.predict([features])[0]

    features_dict = {
        'title_length': title_length,
        'description_length': description_length,
        'tags_count': tags_count,
        'duration_seconds': duration_seconds,
        'publish_hour': publish_hour,
        'publish_day': publish_day
    }

    overall, recommendations, warnings = generate_recommendations(features_dict, prediction)

    output = f"""
# üì∫ Video Analysis: {snippet['title']}

## üéØ Engagement Analysis

### Predicted: **{prediction:.2f}%** | Actual: **{actual_engagement:.2f}%**

**Accuracy**: {100 - abs(prediction - actual_engagement) * 10:.1f}%

{overall}

---

## üìä Actual Statistics:
- **Views**: {actual_views:,}
- **Likes**: {actual_likes:,}
- **Comments**: {actual_comments:,}
- **Channel**: {snippet['channelTitle']}

---

## üìù Content Details:
- **Title**: {title_length} chars
- **Description**: {description_length} chars
- **Tags**: {tags_count}
- **Duration**: {duration_seconds // 60}m {duration_seconds % 60}s

---

## üí° Recommendations:

"""

    for rec in recommendations:
        output += f"{rec}\n\n"

    if warnings:
        output += "\n## ‚ö†Ô∏è Improvement Areas:\n\n"
        for warn in warnings:
            output += f"{warn}\n\n"

    return output

# ============================================================================
# VISUALIZATION FUNCTIONS
# ============================================================================

def create_feature_importance_chart():
    """Feature importance bar chart"""

    features = list(FEATURE_IMPORTANCE.keys())
    importance = list(FEATURE_IMPORTANCE.values())

    fig = go.Figure(data=[
        go.Bar(
            x=importance,
            y=features,
            orientation='h',
            marker=dict(color=importance, colorscale='Viridis'),
            text=[f'{val:.1%}' for val in importance],
            textposition='auto',
        )
    ])

    fig.update_layout(
        title='üéØ Feature Importance',
        xaxis_title='Importance',
        height=400,
        template='plotly_white'
    )

    return fig

def create_metrics_dashboard():
    """Model performance gauges"""

    fig = go.Figure()

    # MAE gauge
    fig.add_trace(go.Indicator(
        mode="gauge+number",
        value=MODEL_METRICS['mae'],
        domain={'x': [0, 0.45], 'y': [0, 1]},
        title={'text': "MAE"},
        gauge={
            'axis': {'range': [None, 3]},
            'bar': {'color': "darkblue"},
            'steps': [
                {'range': [0, 1], 'color': "lightgreen"},
                {'range': [1, 2], 'color': "yellow"},
                {'range': [2, 3], 'color': "lightcoral"}
            ]
        }
    ))

    # R¬≤ gauge
    fig.add_trace(go.Indicator(
        mode="gauge+number",
        value=MODEL_METRICS['r2'] * 100,
        domain={'x': [0.55, 1], 'y': [0, 1]},
        title={'text': "R¬≤ Score (%)"},
        gauge={
            'axis': {'range': [0, 100]},
            'bar': {'color': "darkgreen"},
            'steps': [
                {'range': [0, 50], 'color': "lightcoral"},
                {'range': [50, 70], 'color': "yellow"},
                {'range': [70, 100], 'color': "lightgreen"}
            ]
        }
    ))

    fig.update_layout(title='üìä Model Performance', height=300, template='plotly_white')

    return fig

# ============================================================================
# GRADIO INTERFACE
# ============================================================================

with gr.Blocks(theme=gr.themes.Soft(), title="YouTube Analytics") as demo:

    gr.Markdown("""
    # üé¨ YouTube Content Creator Analytics
    ### AI-Powered Engagement Prediction MLOps System
    """)

    # Tab 1: Model Performance
    with gr.Tab("üìä Model Performance"):
        gr.Markdown("## Your Trained Model Metrics")

        gr.Plot(value=create_metrics_dashboard())
        gr.Plot(value=create_feature_importance_chart())

        gr.Markdown(f"""
        ### üéØ Model Stats:
        - **R¬≤ Score**: {MODEL_METRICS['r2']:.4f} (65% accuracy)
        - **MAE**: {MODEL_METRICS['mae']:.4f}
        - **Top Factor**: Video Duration (26%)
        - **Algorithm**: Random Forest (100 trees)
        """)

    # Tab 2: Manual Prediction
    with gr.Tab("üéØ Predict Engagement"):
        gr.Markdown("## Get predictions for your planned video")

        with gr.Row():
            with gr.Column():
                title_len = gr.Slider(10, 100, value=50, step=1, label="Title Length")
                desc_len = gr.Slider(0, 1000, value=250, step=10, label="Description Length")
                tags = gr.Slider(0, 20, value=10, step=1, label="Tags Count")

            with gr.Column():
                duration = gr.Slider(1, 30, value=10, step=0.5, label="Duration (minutes)")
                pub_hour = gr.Slider(0, 23, value=15, step=1, label="Publish Hour")
                pub_day = gr.Dropdown(
                    choices=[(day, i) for i, day in enumerate(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])],
                    value=2,
                    label="Publish Day"
                )

        predict_btn = gr.Button("üöÄ Predict", variant="primary", size="lg")
        prediction_output = gr.Markdown()

        predict_btn.click(
            fn=predict_from_features,
            inputs=[title_len, desc_len, tags, duration, pub_hour, pub_day],
            outputs=prediction_output
        )

        gr.Examples(
            examples=[
                [45, 250, 10, 10, 15, 2],
                [25, 100, 3, 5, 9, 0],
                [60, 400, 12, 12, 14, 1],
            ],
            inputs=[title_len, desc_len, tags, duration, pub_hour, pub_day]
        )

    # Tab 3: URL Analysis
    with gr.Tab("üì∫ Analyze Video"):
        gr.Markdown("## Analyze existing YouTube videos")

        video_url = gr.Textbox(
            label="YouTube URL",
            placeholder="https://www.youtube.com/watch?v=...",
        )

        api_key_input = gr.Textbox(
            label="Your API Key",
            placeholder="AIzaSy...",
            type="password"
        )

        analyze_btn = gr.Button("üîç Analyze", variant="primary", size="lg")
        analysis_output = gr.Markdown()

        analyze_btn.click(
            fn=predict_from_url,
            inputs=[video_url, api_key_input],
            outputs=analysis_output
        )

    # Tab 4: About
    with gr.Tab("‚ÑπÔ∏è About"):
        gr.Markdown("""
        # MLOps Project Details

        ## üîß Pipeline:
        1. **Data Collection**: YouTube API
        2. **Feature Engineering**: 7 key features
        3. **Model Training**: Random Forest
        4. **Deployment**: Gradio interface

        ## üìä Performance:
        - R¬≤ = 0.649 (65% accuracy)
        - MAE = 1.15 percentage points

        ## üéØ Use Cases:
        - Optimize video strategy
        - Predict engagement before publishing
        - A/B test video parameters
        """)


if __name__ == "__main__":
    demo.launch(share=True, server_port=7860)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://91ef09f167f3d2952b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
