In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# YouTube Data Exploratory Analysis\n",
    "\n",
    "This notebook explores the YouTube data fetched using the YouTube Data API and processed with PySpark. We'll analyze channel metrics, video performance, and engagement patterns to identify insights."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Import necessary libraries\n",
    "import os\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "# Add project root to path to import local modules\n",
    "sys.path.append(os.path.abspath(os.path.join('..')))\n",
    "\n",
    "# Local imports\n",
    "from src.config import DATA_PROCESSED_DIR\n",
    "from src.analyze_data import YouTubeDataAnalyzer\n",
    "\n",
    "# Set display options\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_rows', 100)\n",
    "pd.set_option('display.float_format', '{:.2f}'.format)\n",
    "\n",
    "# Set plot style.\n",
    "# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and\n",
    "# later releases dropped the old names, so use whichever name this install provides.\n",
    "whitegrid_style = next(\n",
    "    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid') if name in plt.style.available),\n",
    "    None,\n",
    ")\n",
    "if whitegrid_style is not None:\n",
    "    plt.style.use(whitegrid_style)\n",
    "else:\n",
    "    # Neither style sheet is registered; fall back to seaborn's own whitegrid style.\n",
    "    sns.set_style('whitegrid')\n",
    "sns.set_palette('viridis')\n",
    "plt.rcParams['figure.figsize'] = (12, 8)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize PySpark Analyzer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Build the analyzer and unpack the three DataFrames returned by load_data()\n",
    "analyzer = YouTubeDataAnalyzer()\n",
    "channel_df, videos_df, comments_df = analyzer.load_data()\n",
    "\n",
    "# Materialise the PySpark DataFrames as pandas for easier exploration\n",
    "channel_pd, videos_pd = channel_df.toPandas(), videos_df.toPandas()\n",
    "if comments_df.count() > 0:\n",
    "    comments_pd = comments_df.toPandas()\n",
    "else:\n",
    "    # No comment rows: use an empty frame so later cells can test `.empty`\n",
    "    comments_pd = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Channel Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Display channel information\n",
    "channel_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Basic channel metrics, each read from the first row of channel_pd.\n",
    "# Every entry is (label, column, format spec); an empty spec prints the value as-is.\n",
    "channel_fields = [\n",
    "    ('Channel Title', 'channel_title', ''),\n",
    "    ('Subscriber Count', 'subscriber_count', ','),\n",
    "    ('Total Views', 'view_count', ','),\n",
    "    ('Video Count', 'video_count', ','),\n",
    "    ('Published At', 'published_at', ''),\n",
    "]\n",
    "for label, column, spec in channel_fields:\n",
    "    print(f\"{label}: {channel_pd[column].iloc[0]:{spec}}\")\n",
    "\n",
    "# Age is measured from the publish timestamp to the current local time\n",
    "print(f\"Channel Age (days): {(datetime.now() - channel_pd['published_at'].iloc[0]).days:,}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Video Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Overview of the video dataset\n",
    "videos_pd.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Descriptive statistics for the key numeric metrics\n",
    "metric_columns = ['view_count', 'like_count', 'comment_count', 'duration_seconds', 'engagement_rate']\n",
    "videos_pd[metric_columns].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Histogram of per-video view counts with a KDE curve overlaid\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.histplot(videos_pd['view_count'], kde=True, bins=30, ax=ax)\n",
    "ax.set_title('Distribution of Video Views')\n",
    "ax.set_xlabel('Views')\n",
    "ax.set_ylabel('Frequency')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Histogram of video durations, converted from seconds to minutes, with a KDE curve\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.histplot(videos_pd['duration_seconds'] / 60, kde=True, bins=30, ax=ax)\n",
    "ax.set_title('Distribution of Video Durations (minutes)')\n",
    "ax.set_xlabel('Duration (minutes)')\n",
    "ax.set_ylabel('Frequency')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Ten most-viewed videos, restricted to the headline columns\n",
    "top_video_columns = ['title', 'view_count', 'like_count', 'comment_count', 'engagement_rate', 'published_at']\n",
    "top_videos = videos_pd.sort_values(by='view_count', ascending=False).iloc[:10]\n",
    "top_videos[top_video_columns]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Video Performance by Category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Number of videos in each category\n",
    "category_counts = (\n",
    "    videos_pd['category_name']\n",
    "    .value_counts()\n",
    "    .rename_axis('Category')\n",
    "    .reset_index(name='Count')\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    category_counts,\n",
    "    x='Category',\n",
    "    y='Count',\n",
    "    color='Count',\n",
    "    title='Video Count by Category',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Category', yaxis_title='Number of Videos')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Mean view count per category, highest first\n",
    "category_views = (\n",
    "    videos_pd.groupby('category_name')['view_count']\n",
    "    .mean()\n",
    "    .rename_axis('Category')\n",
    "    .reset_index(name='Average Views')\n",
    "    .sort_values(by='Average Views', ascending=False)\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    category_views,\n",
    "    x='Category',\n",
    "    y='Average Views',\n",
    "    color='Average Views',\n",
    "    title='Average Views by Category',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Category', yaxis_title='Average Views')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Mean engagement rate per category, highest first\n",
    "category_engagement = (\n",
    "    videos_pd.groupby('category_name')['engagement_rate']\n",
    "    .mean()\n",
    "    .rename_axis('Category')\n",
    "    .reset_index(name='Average Engagement Rate')\n",
    "    .sort_values(by='Average Engagement Rate', ascending=False)\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    category_engagement,\n",
    "    x='Category',\n",
    "    y='Average Engagement Rate',\n",
    "    color='Average Engagement Rate',\n",
    "    title='Average Engagement Rate by Category',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Category', yaxis_title='Engagement Rate (%)')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Video Performance by Duration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Create duration categories.\n",
    "# right=False makes each bin closed on the left and open on the right, e.g. [0, 60),\n",
    "# so a 0-second duration is binned instead of becoming NaN, and a video of exactly\n",
    "# 60 seconds lands in '1-5 mins' rather than '< 1 min', which matches the labels.\n",
    "videos_pd['duration_category'] = pd.cut(\n",
    "    videos_pd['duration_seconds'],\n",
    "    bins=[0, 60, 300, 600, 1200, float('inf')],\n",
    "    labels=['< 1 min', '1-5 mins', '5-10 mins', '10-20 mins', '> 20 mins'],\n",
    "    right=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Shortest-to-longest ordering of the duration buckets (also used by later cells)\n",
    "duration_order = ['< 1 min', '1-5 mins', '5-10 mins', '10-20 mins', '> 20 mins']\n",
    "\n",
    "# Number of videos in each duration bucket, sorted by bucket order\n",
    "duration_counts = (\n",
    "    videos_pd['duration_category']\n",
    "    .value_counts()\n",
    "    .rename_axis('Duration')\n",
    "    .reset_index(name='Count')\n",
    "    .assign(Duration=lambda frame: pd.Categorical(frame['Duration'], categories=duration_order, ordered=True))\n",
    "    .sort_values(by='Duration')\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    duration_counts,\n",
    "    x='Duration',\n",
    "    y='Count',\n",
    "    color='Count',\n",
    "    title='Video Count by Duration',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Duration', yaxis_title='Number of Videos')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Average views by duration category.\n",
    "# duration_category is a categorical column, so observed=False is passed explicitly:\n",
    "# buckets with no videos stay in the result (with a NaN mean) on every pandas version,\n",
    "# and pandas no longer warns about the changing default.\n",
    "duration_views = videos_pd.groupby('duration_category', observed=False)['view_count'].mean().reset_index()\n",
    "duration_views.columns = ['Duration', 'Average Views']\n",
    "\n",
    "# Set categorical order (duration_order comes from the video-count-by-duration cell)\n",
    "duration_views['Duration'] = pd.Categorical(duration_views['Duration'], categories=duration_order, ordered=True)\n",
    "duration_views = duration_views.sort_values('Duration')\n",
    "\n",
    "fig = px.bar(duration_views, x='Duration', y='Average Views', color='Average Views',\n",
    "             title='Average Views by Video Duration')\n",
    "fig.update_layout(xaxis_title='Duration',\n",
    "                 yaxis_title='Average Views')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Engagement rate by duration category.\n",
    "# duration_category is a categorical column, so observed=False is passed explicitly:\n",
    "# buckets with no videos stay in the result (with a NaN mean) on every pandas version,\n",
    "# and pandas no longer warns about the changing default.\n",
    "duration_engagement = videos_pd.groupby('duration_category', observed=False)['engagement_rate'].mean().reset_index()\n",
    "duration_engagement.columns = ['Duration', 'Average Engagement Rate']\n",
    "\n",
    "# Set categorical order (duration_order comes from the video-count-by-duration cell)\n",
    "duration_engagement['Duration'] = pd.Categorical(duration_engagement['Duration'], categories=duration_order, ordered=True)\n",
    "duration_engagement = duration_engagement.sort_values('Duration')\n",
    "\n",
    "fig = px.bar(duration_engagement, x='Duration', y='Average Engagement Rate', color='Average Engagement Rate',\n",
    "             title='Average Engagement Rate by Video Duration')\n",
    "fig.update_layout(xaxis_title='Duration',\n",
    "                 yaxis_title='Engagement Rate (%)')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Publishing Patterns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Tag every video with the weekday name of its publish timestamp\n",
    "videos_pd['day_of_week'] = videos_pd['published_at'].dt.day_name()\n",
    "\n",
    "# Monday-first ordering of the weekdays (also used by later cells)\n",
    "day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
    "\n",
    "# Number of videos published on each weekday, sorted Monday to Sunday\n",
    "day_counts = (\n",
    "    videos_pd['day_of_week']\n",
    "    .value_counts()\n",
    "    .rename_axis('Day')\n",
    "    .reset_index(name='Count')\n",
    "    .assign(Day=lambda frame: pd.Categorical(frame['Day'], categories=day_order, ordered=True))\n",
    "    .sort_values(by='Day')\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    day_counts,\n",
    "    x='Day',\n",
    "    y='Count',\n",
    "    color='Count',\n",
    "    title='Video Publishing Frequency by Day of Week',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Day of Week', yaxis_title='Number of Videos')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Mean view count per publishing weekday, sorted Monday to Sunday\n",
    "# (day_order comes from the publishing-frequency-by-day cell)\n",
    "day_views = (\n",
    "    videos_pd.groupby('day_of_week')['view_count']\n",
    "    .mean()\n",
    "    .rename_axis('Day')\n",
    "    .reset_index(name='Average Views')\n",
    "    .assign(Day=lambda frame: pd.Categorical(frame['Day'], categories=day_order, ordered=True))\n",
    "    .sort_values(by='Day')\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    day_views,\n",
    "    x='Day',\n",
    "    y='Average Views',\n",
    "    color='Average Views',\n",
    "    title='Average Views by Publishing Day',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Day of Week', yaxis_title='Average Views')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Tag every video with the hour component of its publish timestamp\n",
    "videos_pd['hour_of_day'] = videos_pd['published_at'].dt.hour\n",
    "\n",
    "# Number of videos published in each hour, in ascending hour order\n",
    "hour_counts = (\n",
    "    videos_pd['hour_of_day']\n",
    "    .value_counts()\n",
    "    .rename_axis('Hour')\n",
    "    .reset_index(name='Count')\n",
    "    .sort_values(by='Hour')\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    hour_counts,\n",
    "    x='Hour',\n",
    "    y='Count',\n",
    "    color='Count',\n",
    "    title='Video Publishing Frequency by Hour of Day',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Hour of Day (UTC)', yaxis_title='Number of Videos')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Channel Growth Over Time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Tag every video with the monthly period of its publish timestamp\n",
    "videos_pd['year_month'] = videos_pd['published_at'].dt.to_period('M')\n",
    "\n",
    "# Count the videos published in each month\n",
    "monthly_videos = (\n",
    "    videos_pd.groupby('year_month')\n",
    "    .size()\n",
    "    .rename_axis('Year-Month')\n",
    "    .reset_index(name='Videos Published')\n",
    ")\n",
    "# The period values are converted to plain strings before plotting\n",
    "monthly_videos['Year-Month'] = monthly_videos['Year-Month'].astype(str)\n",
    "\n",
    "fig = px.line(\n",
    "    monthly_videos,\n",
    "    x='Year-Month',\n",
    "    y='Videos Published',\n",
    "    markers=True,\n",
    "    title='Videos Published by Month',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Month', yaxis_title='Number of Videos')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Order the videos by publish time and add a running total of their view counts\n",
    "videos_pd = (\n",
    "    videos_pd\n",
    "    .sort_values(by='published_at')\n",
    "    .assign(cumulative_views=lambda frame: frame['view_count'].cumsum())\n",
    ")\n",
    "\n",
    "# Running total plotted against publish date\n",
    "fig = px.line(\n",
    "    videos_pd,\n",
    "    x='published_at',\n",
    "    y='cumulative_views',\n",
    "    title='Cumulative Channel Views Over Time',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Date', yaxis_title='Cumulative Views')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Engagement Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Pairwise correlation of the key numeric metrics, drawn as an annotated heatmap\n",
    "corr_columns = ['view_count', 'like_count', 'comment_count', 'duration_seconds', 'engagement_rate']\n",
    "correlation = videos_pd[corr_columns].corr()\n",
    "\n",
    "fig = px.imshow(\n",
    "    correlation,\n",
    "    text_auto=True,\n",
    "    aspect='auto',\n",
    "    title='Correlation Between Video Metrics',\n",
    ")\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Views against likes on log-log axes; marker size follows comment count\n",
    "# and marker colour follows engagement rate\n",
    "fig = px.scatter(\n",
    "    videos_pd,\n",
    "    x='view_count',\n",
    "    y='like_count',\n",
    "    size='comment_count',\n",
    "    color='engagement_rate',\n",
    "    hover_name='title',\n",
    "    log_x=True,\n",
    "    log_y=True,\n",
    "    title='Video Views vs. Likes',\n",
    ")\n",
    "fig.update_layout(xaxis_title='Views (log scale)', yaxis_title='Likes (log scale)')\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comment Analysis (if available)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Comment charts are only drawn when comment rows were loaded\n",
    "if comments_pd.empty:\n",
    "    print(\"No comments data available.\")\n",
    "else:\n",
    "    print(f\"Total comments: {len(comments_pd)}\")\n",
    "\n",
    "    # Ten authors with the most comments\n",
    "    top_commenters = (\n",
    "        comments_pd['author_name']\n",
    "        .value_counts()\n",
    "        .head(10)\n",
    "        .rename_axis('Author')\n",
    "        .reset_index(name='Comment Count')\n",
    "    )\n",
    "\n",
    "    fig = px.bar(\n",
    "        top_commenters,\n",
    "        x='Author',\n",
    "        y='Comment Count',\n",
    "        color='Comment Count',\n",
    "        title='Top 10 Commenters',\n",
    "    )\n",
    "    fig.update_layout(xaxis_title='Author', yaxis_title='Number of Comments')\n",
    "    fig.show()\n",
    "\n",
    "    # Comments per calendar date\n",
    "    comments_pd['date'] = pd.to_datetime(comments_pd['published_at']).dt.date\n",
    "    comments_over_time = (\n",
    "        comments_pd.groupby('date')\n",
    "        .size()\n",
    "        .rename_axis('Date')\n",
    "        .reset_index(name='Comment Count')\n",
    "    )\n",
    "\n",
    "    fig = px.line(\n",
    "        comments_over_time,\n",
    "        x='Date',\n",
    "        y='Comment Count',\n",
    "        title='Comments Over Time',\n",
    "    )\n",
    "    fig.update_layout(xaxis_title='Date', yaxis_title='Number of Comments')\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Keyword Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Define common keywords to search for\n",
    "keywords = ['how to', 'tutorial', 'review', 'guide', 'tips', 'introduction']\n",
    "keyword_stats = []\n",
    "\n",
    "for keyword in keywords:\n",
    "    # Find videos containing the keyword in title or description.\n",
    "    # na=False counts a missing title/description as no match, so the mask is\n",
    "    # strictly boolean and can safely be used to index videos_pd.\n",
    "    mask = (videos_pd['title'].str.contains(keyword, case=False, na=False) |\n",
    "            videos_pd['description'].str.contains(keyword, case=False, na=False))\n",
    "    matching_videos = videos_pd[mask]\n",
    "\n",
    "    # Keywords with no matching videos are left out of the summary\n",
    "    if len(matching_videos) > 0:\n",
    "        keyword_stats.append({\n",
    "            'keyword': keyword,\n",
    "            'video_count': len(matching_videos),\n",
    "            'avg_views': matching_videos['view_count'].mean(),\n",
    "            'avg_likes': matching_videos['like_count'].mean(),\n",
    "            'avg_comments': matching_videos['comment_count'].mean(),\n",
    "            'avg_engagement': matching_videos['engagement_rate'].mean()\n",
    "        })\n",
    "\n",
    "# Convert to DataFrame\n",
    "keyword_df = pd.DataFrame(keyword_stats)\n",
    "\n",
    "if not keyword_df.empty:\n",
    "    # Sort by video count\n",
    "    keyword_df = keyword_df.sort_values('video_count', ascending=False)\n",
    "\n",
    "    # Display keyword statistics. A bare expression inside an if-block is not\n",
    "    # rendered by the notebook, so display() (provided by the IPython kernel)\n",
    "    # is called explicitly.\n",
    "    display(keyword_df)\n",
    "\n",
    "    # Plot video count by keyword\n",
    "    fig = px.bar(keyword_df, x='keyword', y='video_count', color='avg_views',\n",
    "                 title='Video Count by Keyword')\n",
    "    fig.update_layout(xaxis_title='Keyword',\n",
    "                     yaxis_title='Number of Videos')\n",
    "    fig.show()\n",
    "\n",
    "    # Plot average views by keyword\n",
    "    fig = px.bar(keyword_df, x='keyword', y='avg_views', color='avg_engagement',\n",
    "                 title='Average Views by Keyword')\n",
    "    fig.update_layout(xaxis_title='Keyword',\n",
    "                     yaxis_title='Average Views')\n",
    "    fig.show()\n",
    "else:\n",
    "    print(\"No keyword matches found.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusions\n",
    "\n",
    "Based on the analysis above, here are the key insights:\n",
    "\n",
    "1. **Video Performance**: [Add observations about top-performing videos]\n",