In [1]:
# =======================================================
# 5. Trend Identification Using NLP
# Goal: Use NLP to analyze TikTok video descriptions and hashtags to identify emerging trends.
# =======================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy
from collections import Counter
from datetime import timedelta
from textblob import TextBlob

# --- Configuration ---
N_TOPICS = 5
N_TOP_WORDS = 10
N_TOP_ENTITIES = 20
WINDOW_DAYS = 30  # Rolling window for trend analysis

# ======================
# DATA LOADING & PREPARATION
# ======================
print("Loading data...")
df = pd.read_csv('data/processed/tiktok_processed.csv')
df['create_time'] = pd.to_datetime(df['create_time'])
df = df.sort_values('create_time')
df['clean_description'] = df['clean_description'].fillna('')

# Use virality score from q4 instead of engagement rate
def calculate_virality_score(group):
    """Calculate virality score using correlation-based weights"""
    corr_views_likes = group['plays'].corr(group['likes']) or 0.5
    corr_views_comments = group['plays'].corr(group['comments']) or 0.5
    corr_views_shares = group['plays'].corr(group['shares']) or 0.5
    
    return (group['plays'] + 
            (1 - corr_views_likes) * group['likes'] + 
            (1 - corr_views_comments) * group['comments'] + 
            (1 - corr_views_shares) * group['shares'])

# Calculate rolling virality scores
df['virality_score'] = df.groupby(pd.Grouper(key='create_time', freq=f'{WINDOW_DAYS}D')).apply(
    lambda x: calculate_virality_score(x)
).reset_index(level=0, drop=True)

# Define trending threshold based on virality score
virality_threshold = df['virality_score'].quantile(0.80)
df['is_trending'] = (df['virality_score'] >= virality_threshold).astype(int)

# ======================
# TEMPORAL TREND ANALYSIS
# ======================
def analyze_emerging_topics(df, window_days=30):
    """Analyze topic evolution over time"""
    results = []
    for end_date in df['create_time'].unique():
        start_date = end_date - pd.Timedelta(days=window_days)
        window_data = df[(df['create_time'] >= start_date) & (df['create_time'] <= end_date)]
        
        if len(window_data) < 100:  # Skip if too few samples
            continue
            
        # Vectorize text
        vectorizer = CountVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 2))
        dtm = vectorizer.fit_transform(window_data['clean_description'])
        
        # Apply LDA
        lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=42)
        topic_dist = lda.fit_transform(dtm)
        
        # Get top words for trending content
        trending_mask = window_data['is_trending'] == 1
        if trending_mask.any():
            trending_topics = topic_dist[trending_mask].mean(axis=0)
            results.append({
                'date': end_date,
                'top_topics': trending_topics,
                'vocab': vectorizer.get_feature_names_out()
            })
    
    return pd.DataFrame(results)

# Analyze topic evolution
print("\nAnalyzing topic evolution...")
topic_evolution = analyze_emerging_topics(df)

# ======================
# ENHANCED SENTIMENT ANALYSIS
# ======================
def detailed_sentiment_analysis(text):
    """Perform detailed sentiment analysis"""
    blob = TextBlob(text)
    return {
        'polarity': blob.sentiment.polarity,
        'subjectivity': blob.sentiment.subjectivity,
        'word_count': len(blob.words),
        'has_exclamation': '!' in text,
        'has_question': '?' in text,
        'has_emoji': any(char in text for char in ['😀', '😂', '❤️', '🔥'])  # Add more emojis
    }

print("\nPerforming enhanced sentiment analysis...")
sentiment_features = df['clean_description'].apply(detailed_sentiment_analysis)
df = pd.concat([df, pd.DataFrame(sentiment_features.tolist())], axis=1)

# ======================
# NAMED ENTITY TRACKING
# ======================
def track_entity_trends(df, window_days=30):
    """Track entity popularity over time"""
    nlp = spacy.load("en_core_web_sm")
    entity_trends = []
    
    for end_date in df['create_time'].unique():
        start_date = end_date - pd.Timedelta(days=window_days)
        window_data = df[(df['create_time'] >= start_date) & (df['create_time'] <= end_date)]
        
        entities = []
        for text in window_data['clean_description']:
            doc = nlp(text)
            entities.extend([ent.text.lower() for ent in doc.ents])
        
        if entities:
            entity_counts = Counter(entities)
            entity_trends.append({
                'date': end_date,
                'top_entities': dict(entity_counts.most_common(10)),
                'total_mentions': len(entities)
            })
    
    return pd.DataFrame(entity_trends)

print("\nTracking entity trends...")
entity_trends = track_entity_trends(df)

# ======================
# VISUALIZATION & REPORTING
# ======================

# 1. Topic Evolution Plot
plt.figure(figsize=(15, 6))
for topic_idx in range(N_TOPICS):
    plt.plot(topic_evolution['date'], 
             topic_evolution['top_topics'].apply(lambda x: x[topic_idx]),
             label=f'Topic {topic_idx+1}')
plt.title('Topic Evolution Over Time')
plt.xlabel('Date')
plt.ylabel('Topic Prominence')
plt.legend()
plt.grid(True)
plt.show()

# 2. Sentiment Analysis Visualization
plt.figure(figsize=(12, 6))
sns.scatterplot(data=df, x='polarity', y='virality_score', 
                hue='is_trending', alpha=0.5)
plt.title('Sentiment Polarity vs Virality Score')
plt.show()

# 3. Entity Trend Analysis
plt.figure(figsize=(15, 6))
entity_trends.set_index('date')['total_mentions'].plot()
plt.title('Entity Mention Volume Over Time')
plt.grid(True)
plt.show()

# Save results
output_path = 'data/processed/nlp_trends_analysis.csv'
df.to_csv(output_path, index=False)
print(f"\nAnalysis complete. Results saved to {output_path}")

Loading data...

Analyzing topic evolution...


  df['virality_score'] = df.groupby(pd.Grouper(key='create_time', freq=f'{WINDOW_DAYS}D')).apply(


KeyboardInterrupt: 