---
## Uncovering Patterns in Video Titles

This section begins a deeper exploration of video **titles and comments**, using Natural Language Processing (NLP) techniques to extract meaningful insights. Focusing on high-engagement videos (those with an engagement rate greater than 10%), the analysis identifies the most frequently used words in their titles. These words may provide clues about the content strategies driving audience engagement. This work lays the foundation for a dedicated notebook that will delve further into NLP methods, enabling a richer understanding of audience behavior and preferences through textual data.

In [None]:
# Purpose: count the most frequent words in the titles of high-engagement
# videos (engagement_rate > 0.1) and show the top 15 as a horizontal bar chart.

# 1. Clean and extract words
# Compiled once outside the loop. The pattern keeps runs of lowercase ASCII
# letters only, so digits, punctuation and non-ASCII characters are dropped.
word_pattern = re.compile(r'\b[a-z]+\b')
high_engagement_titles = video_df.loc[video_df['engagement_rate'] > 0.1, 'title']

all_words = []
# dropna() skips missing titles, which would otherwise reach .lower() as a
# float NaN and raise AttributeError; astype(str) guards non-string values.
for title in high_engagement_titles.dropna().astype(str):
    all_words.extend(word_pattern.findall(title.lower()))

# 2. Count and filter
word_counts = Counter(all_words)
stopwords = {'the', 'and', 'to', 'of', 'a', 'in', 'is', 'it', 'that', 'this'}
# Keep only non-stopwords that are longer than three characters.
top_words = {word: count for word, count in word_counts.items()
             if word not in stopwords and len(word) > 3}

# 3. Get top 15 (list of (word, count) pairs, most frequent first)
top_15 = Counter(top_words).most_common(15)

# 4. Plot with improved formatting
if top_15:
    words, counts = zip(*top_15)
    max_count = max(counts)

    plt.figure(figsize=(10, 6))
    sns.barplot(
        x=list(counts),
        y=list(words),
        palette='magma_r',
        edgecolor='black'
    )

    # 5. Professional annotations
    plt.title("Top 15 Title Words in High-Engagement Videos (ER > 10%)", pad=15)
    plt.xlabel("Frequency Count")
    plt.ylabel("")
    # Offset the value labels relative to the longest bar: a fixed offset in
    # data units puts labels far from the bars when the counts are small.
    label_offset = 0.01 * max_count
    for i, count in enumerate(counts):
        plt.text(count + label_offset, i, f"{count}", va='center')
    # Leave headroom on the right so the labels stay inside the axes.
    plt.xlim(0, max_count * 1.1)

    plt.tight_layout()
    plt.show()
else:
    # Nothing survived the engagement / stopword / length filters.
    print("Warning: No title words to plot after filtering!")

In [None]:
# Purpose: relate the sentiment polarity of a random sample of comments to the
# view count of the video each comment belongs to, shown as a hexbin density.

# 1. Sample and merge data (ensure 'video_id' exists in both DataFrames)
# Cap the sample at the number of available comments: DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
sample_size = min(1000, len(comments_df))
comment_sample = comments_df.sample(sample_size, random_state=42).copy()
merged_data = comment_sample.merge(
    # One row per video: duplicate video_id rows on the right side would
    # duplicate comments in a left merge and inflate the plotted sample.
    video_df[['video_id', 'viewCount']].drop_duplicates(subset='video_id'),
    on='video_id',
    how='left'
)

# 2. Calculate sentiment ON THE MERGED DATA
# TextBlob polarity per comment; missing comments stay NaN so they can be
# filtered out below instead of being scored as the literal string "nan".
merged_data['sentiment'] = merged_data['comment'].apply(
    lambda x: TextBlob(str(x)).sentiment.polarity if pd.notnull(x) else np.nan
)

# 3. Filter valid data
# Rows with no matching video have NaN viewCount after the left merge and are
# excluded by the > 0 comparison.
plot_data = merged_data[
    (merged_data['viewCount'] > 0) &
    (merged_data['sentiment'].notna())
].copy()

# 4. Create visualization
if not plot_data.empty:
    plt.figure(figsize=(12, 6))
    hexbin = plt.hexbin(
        x=plot_data['sentiment'],
        y=np.log10(plot_data['viewCount'] + 1),  # +1 to avoid log(0)
        gridsize=20,
        cmap='Reds',
        mincnt=1,  # leave bins with no comments blank
        # x spans the interval [-1, 1]; y spans 0 up to the largest log view count.
        extent=[-1, 1, 0, np.log10(plot_data['viewCount'].max() + 1)]
    )

    # Formatting
    plt.title(f"Comment Sentiment vs. Video Popularity (n={len(plot_data)})", pad=20)
    plt.xlabel("Comment Sentiment (Negative to Positive)")
    plt.ylabel("Log10(View Count + 1)")
    cb = plt.colorbar(hexbin, label='Number of Comments')
    # Dotted reference line at neutral sentiment.
    plt.axvline(0, color='black', linestyle=':', alpha=0.5)

    plt.tight_layout()
    plt.show()
else:
    print("Warning: No valid data points after filtering!")
    print("Check if: 1) Comments merged correctly, 2) View counts exist, 3) Sentiment was calculated")