In [None]:
# Tạo file requirements.txt
"""
pymongo==4.6.0
pandas==2.1.3
numpy==1.24.3
matplotlib==3.8.2
seaborn==0.13.0
plotly==5.18.0
dash==2.14.2
dash-bootstrap-components==1.5.0
beautifulsoup4==4.12.2
requests==2.31.0
snscrape==0.7.0.20230622
praw==7.7.1
textblob==0.17.1
vaderSentiment==3.3.2
nltk==3.8.1
wordcloud==1.9.3
pyvi==0.1.1
underthesea==6.7.0
scikit-learn==1.3.2
pyspark==3.5.0
streamlit==1.29.0
"""

# Cài đặt trong Jupyter Notebook
%pip install -r requirements.txt

In [32]:
# cell 1: Import & Setup
import pymongo
from pymongo import MongoClient
import os
from datetime import datetime

# Kết nối MongoDB
MONGO_URI = "mongodb+srv://TrumBeoo:1xr1R8BRdLafRzTg@trumbeoo.c0hnfng.mongodb.net/social_media_analysis?" \
"retryWrites=true&w=majority&appName=TrumBeoo"

client = MongoClient(MONGO_URI)
db = client['social_media_analysis']

# Tạo collections
posts_collection = db['posts']
analysis_collection = db['sentiment_analysis']
trends_collection = db['trends']

# Tạo indexes để tối ưu query
posts_collection.create_index([("created_at", -1)])
posts_collection.create_index([("hashtags", 1)])
posts_collection.create_index([("topic", 1)])

print("✅ MongoDB connected successfully!")

✅ MongoDB connected successfully!


In [None]:
# cell 2: Twitter Data Collection với snscrape (không cần API key)
import snscrape.modules.twitter as sntwitter
import json
import re
from datetime import datetime, timedelta

class TwitterCrawler:
    def __init__(self):
        pass
    
    def search_tweets(self, query, max_results=100, since_date=None):
        """
        Thu thập tweets về chủ đề cụ thể bằng snscrape
        """
        tweets_data = []
        
        try:
            # Tạo query với ngày nếu có
            if since_date:
                search_query = f"{query} since:{since_date}"
            else:
                # Mặc định lấy tweets trong 7 ngày qua
                since_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
                search_query = f"{query} since:{since_date}"
            
            print(f"Searching: {search_query}")
            
            # Sử dụng snscrape để thu thập tweets
            tweets = sntwitter.TwitterSearchScraper(search_query).get_items()
            
            count = 0
            for tweet in tweets:
                if count >= max_results:
                    break
                
                # Extract hashtags từ text
                hashtags = re.findall(r'#\w+', tweet.content)
                hashtags = [tag[1:] for tag in hashtags]  # Bỏ dấu #
                
                tweet_doc = {
                    'tweet_id': str(tweet.id),
                    'text': tweet.content,
                    'created_at': tweet.date,
                    'likes': tweet.likeCount or 0,
                    'retweets': tweet.retweetCount or 0,
                    'replies': tweet.replyCount or 0,
                    'hashtags': hashtags,
                    'lang': tweet.lang or 'unknown',
                    'username': tweet.user.username,
                    'user_followers': tweet.user.followersCount or 0,
                    'topic': query,
                    'source': 'twitter',
                    'collected_at': datetime.now()
                }
                tweets_data.append(tweet_doc)
                count += 1
            
            return tweets_data
        
        except Exception as e:
            print(f"Error collecting tweets: {e}")
            return []
    
    def save_to_mongodb(self, tweets_data):
        """Lưu vào MongoDB"""
        if tweets_data:
            # Kiểm tra trùng lặp trước khi insert
            new_tweets = []
            for tweet in tweets_data:
                existing = posts_collection.find_one({'tweet_id': tweet['tweet_id']})
                if not existing:
                    new_tweets.append(tweet)
            
            if new_tweets:
                posts_collection.insert_many(new_tweets)
                print(f"✅ Saved {len(new_tweets)} new tweets to MongoDB")
            else:
                print("ℹ️ No new tweets to save")

# Sử dụng snscrape (không cần API key)
crawler = TwitterCrawler()

# Thu thập dữ liệu về các chủ đề
topics = [
    "AI education",
    "trí tuệ nhân tạo giáo dục", 
    "AI học tập",
    "#AIEducation",
    "machine learning giáo dục"
]

for topic in topics:
    print(f"🔍 Collecting tweets about: {topic}")
    tweets = crawler.search_tweets(topic, max_results=50)
    crawler.save_to_mongodb(tweets)
    print(f"Found {len(tweets)} tweets\n")

In [34]:
# cell 3: Reddit Data Collection
import praw
from datetime import datetime

class RedditCrawler:
    def __init__(self, client_id, client_secret, user_agent):
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
    
    def search_posts(self, query, subreddit_name='all', limit=100):
        """Thu thập posts từ Reddit"""
        posts_data = []
        
        try:
            subreddit = self.reddit.subreddit(subreddit_name)
            
            for post in subreddit.search(query, limit=limit, sort='new'):
                post_doc = {
                    'post_id': post.id,
                    'title': post.title,
                    'text': post.selftext,
                    'created_at': datetime.fromtimestamp(post.created_utc),
                    'score': post.score,
                    'num_comments': post.num_comments,
                    'upvote_ratio': post.upvote_ratio,
                    'subreddit': post.subreddit.display_name,
                    'author': str(post.author),
                    'url': post.url,
                    'topic': query,
                    'source': 'reddit',
                    'collected_at': datetime.now()
                }
                posts_data.append(post_doc)
            
            return posts_data
        
        except Exception as e:
            print(f"Error collecting Reddit posts: {e}")
            return []
    
    def save_to_mongodb(self, posts_data):
        if posts_data:
            posts_collection.insert_many(posts_data)
            print(f"✅ Saved {len(posts_data)} Reddit posts to MongoDB")

# Sử dụng
reddit_crawler = RedditCrawler(
    client_id="k6ozqL3mwwC0cGNUSmcdlQ",
    client_secret="JR6XLrrWpp2oNi5RNk0uV2GrrCaelw",
    user_agent="windows:ai-trend-collector:v1.0 (by /u/trung_it)"
)

reddit_topics = ["AI education", "artificial intelligence learning", "EdTech"]
for topic in reddit_topics:
    posts = reddit_crawler.search_posts(topic, limit=100)
    reddit_crawler.save_to_mongodb(posts)

✅ Saved 100 Reddit posts to MongoDB
✅ Saved 100 Reddit posts to MongoDB
✅ Saved 100 Reddit posts to MongoDB


In [35]:
# cell 4: Generate Mock Data
import random
from datetime import datetime, timedelta

class MockDataGenerator:
    def __init__(self):
        self.topics = ["AI trong giáo dục", "trí tuệ nhân tạo", "học máy", "EdTech"]
        self.hashtags = [
            "AIEducation", "EdTech", "MachineLearning", "DeepLearning",
            "GiáoDục", "HọcTậpAI", "TríTuệNhânTạo", "CôngNghệGiáoDục"
        ]
        self.positive_words = [
            "tuyệt vời", "hiệu quả", "hữu ích", "tiện lợi", "sáng tạo",
            "amazing", "excellent", "great", "helpful", "innovative"
        ]
        self.negative_words = [
            "khó", "phức tạp", "tốn kém", "lo ngại", "rủi ro",
            "difficult", "complex", "expensive", "worried", "risk"
        ]
        self.neutral_words = [
            "nghiên cứu", "phát triển", "ứng dụng", "thảo luận",
            "research", "development", "application", "discussion"
        ]
    
    def generate_post(self, sentiment_type='mixed'):
        """Tạo post giả lập"""
        if sentiment_type == 'positive':
            words = self.positive_words
        elif sentiment_type == 'negative':
            words = self.negative_words
        else:
            words = self.positive_words + self.negative_words + self.neutral_words
        
        text = f"{random.choice(self.topics)} {random.choice(words)} " \
               f"trong {random.choice(['lớp học', 'trường học', 'đại học', 'khóa học'])}. " \
               f"#{random.choice(self.hashtags)} #{random.choice(self.hashtags)}"
        
        days_ago = random.randint(0, 90)
        created_at = datetime.now() - timedelta(days=days_ago)
        
        return {
            'text': text,
            'created_at': created_at,
            'likes': random.randint(0, 1000),
            'retweets': random.randint(0, 500),
            'replies': random.randint(0, 200),
            'hashtags': random.sample(self.hashtags, random.randint(1, 3)),
            'topic': random.choice(self.topics),
            'source': 'mock_data',
            'collected_at': datetime.now()
        }
    
    def generate_dataset(self, num_posts=1000):
        """Tạo dataset với tỷ lệ cảm xúc cân bằng"""
        posts = []
        
        # 40% positive, 30% negative, 30% neutral
        for _ in range(int(num_posts * 0.4)):
            posts.append(self.generate_post('positive'))
        
        for _ in range(int(num_posts * 0.3)):
            posts.append(self.generate_post('negative'))
        
        for _ in range(int(num_posts * 0.3)):
            posts.append(self.generate_post('mixed'))
        
        return posts
    
    def save_to_mongodb(self, posts):
        if posts:
            posts_collection.insert_many(posts)
            print(f"✅ Generated and saved {len(posts)} mock posts")

# Tạo dữ liệu mẫu
generator = MockDataGenerator()
mock_posts = generator.generate_dataset(1000)
generator.save_to_mongodb(mock_posts)

# Kiểm tra dữ liệu
print(f"\n📊 Total posts in database: {posts_collection.count_documents({})}")

✅ Generated and saved 1000 mock posts

📊 Total posts in database: 1600


In [36]:
# cell 5: Vietnamese Sentiment Analysis
from underthesea import sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import re

class SentimentAnalyzer:
    def __init__(self):
        self.vader = SentimentIntensityAnalyzer()
        
        # Từ điển cảm xúc tiếng Việt tùy chỉnh
        self.vietnamese_positive = [
            'tốt', 'hay', 'tuyệt', 'hiệu quả', 'hữu ích', 'tiện lợi', 
            'sáng tạo', 'xuất sắc', 'hoàn hảo', 'thú vị'
        ]
        self.vietnamese_negative = [
            'xấu', 'kém', 'tệ', 'khó', 'phức tạp', 'lo ngại', 
            'rủi ro', 'nguy hiểm', 'thất bại', 'không tốt'
        ]
    
    def clean_text(self, text):
        """Làm sạch text"""
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'@\w+', '', text)  # Remove mentions
        text = re.sub(r'#', '', text)  # Remove hashtag symbol
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        return text.lower().strip()
    
    def analyze_vietnamese(self, text):
        """Phân tích cảm xúc tiếng Việt"""
        cleaned_text = self.clean_text(text)
        
        # Đếm từ tích cực và tiêu cực
        positive_count = sum(1 for word in self.vietnamese_positive if word in cleaned_text)
        negative_count = sum(1 for word in self.vietnamese_negative if word in cleaned_text)
        
        # Sử dụng underthesea
        try:
            underthesea_result = sentiment(text)
            if underthesea_result == 'positive':
                base_score = 0.6
            elif underthesea_result == 'negative':
                base_score = -0.6
            else:
                base_score = 0.0
        except:
            base_score = 0.0
        
        # Tính điểm cuối cùng
        score = base_score + (positive_count * 0.1) - (negative_count * 0.1)
        score = max(-1, min(1, score))  # Giới hạn [-1, 1]
        
        # Phân loại
        if score >= 0.2:
            label = 'positive'
        elif score <= -0.2:
            label = 'negative'
        else:
            label = 'neutral'
        
        return {
            'score': round(score, 3),
            'label': label,
            'positive_words': positive_count,
            'negative_words': negative_count
        }
    
    def analyze_english(self, text):
        """Phân tích cảm xúc tiếng Anh"""
        # VADER Sentiment
        vader_scores = self.vader.polarity_scores(text)
        compound_score = vader_scores['compound']
        
        if compound_score >= 0.05:
            label = 'positive'
        elif compound_score <= -0.05:
            label = 'negative'
        else:
            label = 'neutral'
        
        return {
            'score': round(compound_score, 3),
            'label': label,
            'vader_scores': vader_scores
        }
    
    def analyze(self, text, lang='auto'):
        """Phân tích tự động dựa vào ngôn ngữ"""
        if lang == 'auto':
            # Phát hiện ngôn ngữ đơn giản
            if any(ord(char) > 127 for char in text):
                lang = 'vi'
            else:
                lang = 'en'
        
        if lang == 'vi':
            return self.analyze_vietnamese(text)
        else:
            return self.analyze_english(text)

# Áp dụng phân tích cảm xúc cho tất cả posts
analyzer = SentimentAnalyzer()

def analyze_all_posts():
    """Phân tích cảm xúc cho tất cả posts trong database"""
    posts = posts_collection.find({'sentiment': {'$exists': False}})
    count = 0
    
    for post in posts:
        sentiment_result = analyzer.analyze(post.get('text', ''))
        
        # Cập nhật document
        posts_collection.update_one(
            {'_id': post['_id']},
            {'$set': {
                'sentiment': sentiment_result['label'],
                'sentiment_score': sentiment_result['score'],
                'analyzed_at': datetime.now()
            }}
        )
        count += 1
        
        if count % 100 == 0:
            print(f"Analyzed {count} posts...")
    
    print(f"✅ Completed sentiment analysis for {count} posts")

analyze_all_posts()

Analyzed 100 posts...
Analyzed 200 posts...
Analyzed 300 posts...
Analyzed 400 posts...
Analyzed 500 posts...
Analyzed 600 posts...
Analyzed 700 posts...
Analyzed 800 posts...
Analyzed 900 posts...
Analyzed 1000 posts...
Analyzed 1100 posts...
Analyzed 1200 posts...
Analyzed 1300 posts...
✅ Completed sentiment analysis for 1300 posts


In [37]:
# cell 6: Topic-based Analysis
import pandas as pd
from collections import Counter

def analyze_by_topic():
    """Phân tích cảm xúc theo từng chủ đề"""
    pipeline = [
        {
            '$group': {
                '_id': '$topic',
                'total_posts': {'$sum': 1},
                'positive': {
                    '$sum': {'$cond': [{'$eq': ['$sentiment', 'positive']}, 1, 0]}
                },
                'negative': {
                    '$sum': {'$cond': [{'$eq': ['$sentiment', 'negative']}, 1, 0]}
                },
                'neutral': {
                    '$sum': {'$cond': [{'$eq': ['$sentiment', 'neutral']}, 1, 0]}
                },
                'avg_sentiment_score': {'$avg': '$sentiment_score'},
                'avg_likes': {'$avg': '$likes'}
            }
        },
        {'$sort': {'total_posts': -1}}
    ]
    
    results = list(posts_collection.aggregate(pipeline))
    
    # Chuyển sang DataFrame
    df = pd.DataFrame(results)
    df['positive_pct'] = (df['positive'] / df['total_posts'] * 100).round(2)
    df['negative_pct'] = (df['negative'] / df['total_posts'] * 100).round(2)
    df['neutral_pct'] = (df['neutral'] / df['total_posts'] * 100).round(2)
    
    return df

topic_analysis = analyze_by_topic()
print("\n📊 Sentiment Analysis by Topic:")
print(topic_analysis)

# Lưu vào MongoDB collection riêng
analysis_records = topic_analysis.to_dict('records')
analysis_collection.delete_many({})  # Clear old data
analysis_collection.insert_many(analysis_records)


📊 Sentiment Analysis by Topic:
                                _id  total_posts  positive  negative  neutral  \
0                            EdTech          441        99       171      171   
1                 AI trong giáo dục          264        43       170       51   
2                           học máy          253        45       153       55   
3                  trí tuệ nhân tạo          242        39       136       67   
4                      AI education          200        21        29      150   
5  artificial intelligence learning          200        52        23      125   

   avg_sentiment_score   avg_likes  positive_pct  negative_pct  neutral_pct  
0            -0.073454  497.315353         22.45         38.78        38.78  
1            -0.309848  489.193182         16.29         64.39        19.32  
2            -0.275494  532.252964         17.79         60.47        21.74  
3            -0.255372  498.152893         16.12         56.20        27.69  
4         

InsertManyResult(['EdTech', 'AI trong giáo dục', 'học máy', 'trí tuệ nhân tạo', 'AI education', 'artificial intelligence learning'], acknowledged=True)

In [38]:
# cell 7: Trend Analysis (CORRECTED VERSION)
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import re

class TrendAnalyzer:
    def __init__(self):
        self.df = pd.DataFrame(list(posts_collection.find()))
    
    def get_top_hashtags(self, limit=10):
        """Lấy top hashtags phổ biến"""
        all_hashtags = []
        
        # Kiểm tra xem có cột hashtags không
        if 'hashtags' in self.df.columns:
            for hashtags in self.df['hashtags'].dropna():
                if isinstance(hashtags, list):
                    all_hashtags.extend(hashtags)
        else:
            # Nếu không có hashtags, extract từ text
            for text in self.df['text'].dropna():
                hashtags = re.findall(r'#\w+', str(text))
                hashtags = [tag[1:] for tag in hashtags]  # Bỏ dấu #
                all_hashtags.extend(hashtags)
        
        if not all_hashtags:
            return [("No hashtags found", 0)]
        
        hashtag_counts = Counter(all_hashtags)
        return hashtag_counts.most_common(limit)
    
    def get_sentiment_trend(self, days=30):
        """Phân tích xu hướng cảm xúc theo thời gian"""
        # Lọc dữ liệu trong N ngày gần đây
        recent_date = datetime.now() - timedelta(days=days)
        recent_posts = self.df[self.df['created_at'] >= recent_date]
        
        # Group by date và sentiment
        daily_sentiment = recent_posts.groupby([
            recent_posts['created_at'].dt.date, 'sentiment'
        ]).size().unstack(fill_value=0)
        
        return daily_sentiment
    
    def get_engagement_stats(self):
        """Thống kê engagement"""
        stats = {
            'avg_likes': self.df['likes'].mean() if 'likes' in self.df.columns else 0,
            'avg_retweets': self.df['retweets'].mean() if 'retweets' in self.df.columns else 0,
            'avg_replies': self.df['replies'].mean() if 'replies' in self.df.columns else 0,
            'avg_score': self.df['score'].mean() if 'score' in self.df.columns else 0,
            'total_posts': len(self.df),
            'sentiment_distribution': self.df['sentiment'].value_counts().to_dict() if 'sentiment' in self.df.columns else {}
        }
        return stats
    
    def save_trends_to_db(self):
        """Lưu kết quả phân tích vào database"""
        trends_data = {
            'analysis_date': datetime.now(),
            'top_hashtags': dict(self.get_top_hashtags()),
            'engagement_stats': self.get_engagement_stats(),
            'total_posts_analyzed': len(self.df)
        }
        
        trends_collection.delete_many({})  # Clear old data
        trends_collection.insert_one(trends_data)
        print("Trends analysis saved to database")

trend_analyzer = TrendAnalyzer()

print("\nTop 10 Hashtags:")
for hashtag, count in trend_analyzer.get_top_hashtags(10):
    print(f"#{hashtag}: {count} mentions")

print("\nEngagement Statistics:")
stats = trend_analyzer.get_engagement_stats()
for key, value in stats.items():
    if isinstance(value, float):
        print(f"{key}: {value:.2f}")
    else:
        print(f"{key}: {value}")

trend_analyzer.save_trends_to_db()


Top 10 Hashtags:
#TríTuệNhânTạo: 262 mentions
#CôngNghệGiáoDục: 258 mentions
#AIEducation: 256 mentions
#GiáoDục: 252 mentions
#HọcTậpAI: 249 mentions
#EdTech: 248 mentions
#DeepLearning: 242 mentions
#MachineLearning: 241 mentions

Engagement Statistics:
avg_likes: 504.21
avg_retweets: 251.63
avg_replies: 100.44
avg_score: 5.30
total_posts: 1600
sentiment_distribution: {'negative': 682, 'neutral': 619, 'positive': 299}
Trends analysis saved to database


In [39]:
# cell 8: Dashboard Setup
import dash
from dash import dcc, html, Input, Output
import dash_bootstrap_components as dbc
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Load data từ MongoDB
def load_data():
    posts = list(posts_collection.find())
    df = pd.DataFrame(posts)
    
    if 'created_at' in df.columns:
        df['created_at'] = pd.to_datetime(df['created_at'])
        df['date'] = df['created_at'].dt.date
        df['month'] = df['created_at'].dt.to_period('M').astype(str)
    
    return df

df = load_data()
trend_data = trends_collection.find_one()

# Colors
COLORS = {
    'positive': '#2ecc71',
    'negative': '#e74c3c',
    'neutral': '#95a5a6',
    'background': '#f8f9fa',
    'card': '#ffffff'
}

In [40]:
# cell 9: Dashboard Layout
app.layout = dbc.Container([
    # Header
    dbc.Row([
        dbc.Col([
            html.H1("📊 Dashboard Phân tích Dữ liệu Xã hội", 
                   className="text-center mb-4 mt-4",
                   style={'color': '#2c3e50', 'fontWeight': 'bold'}),
            html.H5("Chủ đề: AI trong Giáo dục", 
                   className="text-center mb-4",
                   style={'color': '#7f8c8d'})
        ])
    ]),
    
    # Summary Cards
    dbc.Row([
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    html.H4("📝 Tổng Posts", className="card-title"),
                    html.H2(f"{len(df):,}", style={'color': '#3498db'})
                ])
            ], style={'backgroundColor': COLORS['card']})
        ], width=3),
        
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    html.H4("😊 Tích cực", className="card-title"),
                    html.H2(f"{len(df[df['sentiment']=='positive']):,}", 
                           style={'color': COLORS['positive']})
                ])
            ], style={'backgroundColor': COLORS['card']})
        ], width=3),
        
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    html.H4("😞 Tiêu cực", className="card-title"),
                    html.H2(f"{len(df[df['sentiment']=='negative']):,}", 
                           style={'color': COLORS['negative']})
                ])
            ], style={'backgroundColor': COLORS['card']})
        ], width=3),
        
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    html.H4("😐 Trung lập", className="card-title"),
                    html.H2(f"{len(df[df['sentiment']=='neutral']):,}", 
                           style={'color': COLORS['neutral']})
                ])
            ], style={'backgroundColor': COLORS['card']})
        ], width=3),
    ], className="mb-4"),
    
    # Charts Row 1
    dbc.Row([
        dbc.Col([
            dbc.Card([
                dbc.CardHeader("📈 Xu hướng thảo luận theo thời gian"),
                dbc.CardBody([
                    dcc.Graph(id='timeline-chart')
                ])
            ])
        ], width=8),
        
        dbc.Col([
            dbc.Card([
                dbc.CardHeader("🎯 Phân bố cảm xúc"),
                dbc.CardBody([
                    dcc.Graph(id='sentiment-pie')
                ])
            ])
        ], width=4),
    ], className="mb-4"),
    
    # Charts Row 2
    dbc.Row([
        dbc.Col([
            dbc.Card([
                dbc.CardHeader("🔥 Top 15 Hashtags"),
                dbc.CardBody([
                    dcc.Graph(id='hashtag-chart')
                ])
            ])
        ], width=6),
        
        dbc.Col([
            dbc.Card([
                dbc.CardHeader("📊 Cảm xúc theo chủ đề"),
                dbc.CardBody([
                    dcc.Graph(id='topic-sentiment-chart')
                ])
            ])
        ], width=6),
    ], className="mb-4"),
    
    # Charts Row 3
    dbc.Row([
        dbc.Col([
            dbc.Card([
                dbc.CardHeader("📅 Phân tích theo tháng"),
                dbc.CardBody([
                    dcc.Graph(id='monthly-trend')
                ])
            ])
        ], width=12),
    ], className="mb-4"),
    
    # Charts Row 4
    dbc.Row([
        dbc.Col([
            dbc.Card([
                dbc.CardHeader("💬 WordCloud - Từ khóa phổ biến"),
                dbc.CardBody([
                    html.Img(id='wordcloud-img', style={'width': '100%'})
                ])
            ])
        ], width=6),
        
        dbc.Col([
            dbc.Card([
                dbc.CardHeader("🎭 Phân tích chi tiết cảm xúc"),
                dbc.CardBody([
                    dcc.Graph(id='sentiment-detail')
                ])
            ])
        ], width=6),
    ], className="mb-4"),
    
], fluid=True, style={'backgroundColor': COLORS['background']})

In [41]:
# cell 10: Dashboard Callbacks
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import io
import base64

@app.callback(
    Output('timeline-chart', 'figure'),
    Input('timeline-chart', 'id')
)
def update_timeline(id):
    """Biểu đồ xu hướng theo thời gian"""
    daily = df.groupby(['date', 'sentiment']).size().unstack(fill_value=0)
    
    fig = go.Figure()
    
    for sentiment in ['positive', 'negative', 'neutral']:
        if sentiment in daily.columns:
            fig.add_trace(go.Scatter(
                x=daily.index,
                y=daily[sentiment],
                name=sentiment.capitalize(),
                mode='lines+markers',
                line=dict(color=COLORS[sentiment], width=2),
                fill='tonexty' if sentiment != 'positive' else None
            ))
    
    fig.update_layout(
        title="Xu hướng thảo luận theo ngày",
        xaxis_title="Ngày",
        yaxis_title="Số lượng posts",
        hovermode='x unified',
        template='plotly_white',
        height=400
    )
    
    return fig

@app.callback(
    Output('sentiment-pie', 'figure'),
    Input('sentiment-pie', 'id')
)
def update_sentiment_pie(id):
    """Biểu đồ tròn phân bố cảm xúc"""
    sentiment_counts = df['sentiment'].value_counts()
    
    fig = go.Figure(data=[go.Pie(
        labels=[s.capitalize() for s in sentiment_counts.index],
        values=sentiment_counts.values,
        marker=dict(colors=[COLORS[s] for s in sentiment_counts.index]),
        hole=0.4,
        textinfo='label+percent',
        textfont_size=12
    )])
    
    fig.update_layout(
        title="Tỷ lệ cảm xúc",
        height=400,
        template='plotly_white',
        showlegend=True
    )
    
    return fig

@app.callback(
    Output('hashtag-chart', 'figure'),
    Input('hashtag-chart', 'id')
)
def update_hashtag_chart(id):
    """Biểu đồ top hashtags"""
    top_hashtags = trend_data.get('top_hashtags', {}) if trend_data else {}
    
    if not top_hashtags:
        # Fallback: tính từ dataframe
        all_hashtags = []
        for hashtags in df['hashtags'].dropna():
            if isinstance(hashtags, list):
                all_hashtags.extend(hashtags)
        from collections import Counter
        top_hashtags = dict(Counter(all_hashtags).most_common(15))
    
    hashtags = list(top_hashtags.keys())[:15]
    counts = list(top_hashtags.values())[:15]
    
    fig = go.Figure(data=[
        go.Bar(
            x=counts,
            y=hashtags,
            orientation='h',
            marker=dict(
                color=counts,
                colorscale='Viridis',
                showscale=True
            ),
            text=counts,
            textposition='auto'
        )
    ])
    
    fig.update_layout(
        title="Top 15 Hashtags phổ biến nhất",
        xaxis_title="Số lượng",
        yaxis_title="Hashtag",
        height=400,
        template='plotly_white',
        yaxis={'categoryorder': 'total ascending'}
    )
    
    return fig

@app.callback(
    Output('topic-sentiment-chart', 'figure'),
    Input('topic-sentiment-chart', 'id')
)
def update_topic_sentiment(id):
    """Biểu đồ cảm xúc theo chủ đề"""
    topic_sentiment = df.groupby(['topic', 'sentiment']).size().unstack(fill_value=0)
    
    fig = go.Figure()
    
    for sentiment in ['positive', 'neutral', 'negative']:
        if sentiment in topic_sentiment.columns:
            fig.add_trace(go.Bar(
                name=sentiment.capitalize(),
                x=topic_sentiment.index,
                y=topic_sentiment[sentiment],
                marker_color=COLORS[sentiment]
            ))
    
    fig.update_layout(
        title="Phân bố cảm xúc theo chủ đề",
        xaxis_title="Chủ đề",
        yaxis_title="Số lượng posts",
        barmode='stack',
        height=400,
        template='plotly_white'
    )
    
    return fig

@app.callback(
    Output('monthly-trend', 'figure'),
    Input('monthly-trend', 'id')
)
def update_monthly_trend(id):
    """Biểu đồ phân tích theo tháng"""
    monthly = df.groupby('month').agg({
        'text': 'count',
        'sentiment_score': 'mean',
        'likes': 'sum'
    }).reset_index()
    
    monthly.columns = ['month', 'post_count', 'avg_sentiment', 'total_likes']
    
    # Tạo subplot với 2 trục y
    fig = make_subplots(
        rows=1, cols=1,
        specs=[[{"secondary_y": True}]]
    )
    
    # Số lượng posts
    fig.add_trace(
        go.Bar(
            x=monthly['month'],
            y=monthly['post_count'],
            name='Số posts',
            marker_color='#3498db',
            yaxis='y'
        ),
        secondary_y=False
    )
    
    # Điểm cảm xúc trung bình
    fig.add_trace(
        go.Scatter(
            x=monthly['month'],
            y=monthly['avg_sentiment'],
            name='Điểm cảm xúc TB',
            mode='lines+markers',
            line=dict(color='#e74c3c', width=3),
            marker=dict(size=8),
            yaxis='y2'
        ),
        secondary_y=True
    )
    
    fig.update_xaxes(title_text="Tháng")
    fig.update_yaxes(title_text="Số lượng posts", secondary_y=False)
    fig.update_yaxes(title_text="Điểm cảm xúc trung bình", secondary_y=True)
    
    fig.update_layout(
        title="Xu hướng theo tháng",
        height=400,
        template='plotly_white',
        hovermode='x unified'
    )
    
    return fig

@app.callback(
    Output('wordcloud-img', 'src'),
    Input('wordcloud-img', 'id')
)
def update_wordcloud(id):
    """Tạo WordCloud"""
    # Lấy tất cả text
    text = ' '.join(df['text'].dropna().astype(str))
    
    # Tạo wordcloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
        max_words=100,
        relative_scaling=0.5,
        min_font_size=10
    ).generate(text)
    
    # Convert to image
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    
    # Save to bytes
    buf = io.BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', dpi=100)
    buf.seek(0)
    plt.close()
    
    # Encode to base64
    encoded = base64.b64encode(buf.read()).decode('utf-8')
    
    return f'data:image/png;base64,{encoded}'

@app.callback(
    Output('sentiment-detail', 'figure'),
    Input('sentiment-detail', 'id')
)
def update_sentiment_detail(id):
    """Phân tích chi tiết điểm cảm xúc"""
    fig = go.Figure()
    
    for sentiment in ['positive', 'neutral', 'negative']:
        sentiment_data = df[df['sentiment'] == sentiment]['sentiment_score']
        
        fig.add_trace(go.Box(
            y=sentiment_data,
            name=sentiment.capitalize(),
            marker_color=COLORS[sentiment],
            boxmean='sd'
        ))
    
    fig.update_layout(
        title="Phân bố điểm cảm xúc (Sentiment Score)",
        yaxis_title="Điểm cảm xúc (-1 đến 1)",
        height=400,
        template='plotly_white',
        showlegend=True
    )
    
    return fig

# Chạy app
if __name__ == '__main__':
    print("\n🚀 Starting Dashboard...")
    print("📍 Dashboard URL: http://127.0.0.1:8050")
    app.run_server(debug=True, port=8050)


🚀 Starting Dashboard...
📍 Dashboard URL: http://127.0.0.1:8050


In [42]:
# Cell 11 Fixed - Statistical Analysis
import pandas as pd
from datetime import datetime
import numpy as np
from collections import Counter

# Load data from MongoDB
df = pd.DataFrame(list(posts_collection.find()))

class StatisticalAnalyzer:
    def __init__(self, df):
        self.df = df.copy()
        # Add day_of_week column if it doesn't exist
        if 'created_at' in self.df.columns and 'day_of_week' not in self.df.columns:
            self.df['day_of_week'] = pd.to_datetime(self.df['created_at']).dt.day_name()
    
    def get_basic_stats(self):
        """Thống kê cơ bản"""
        return {
            'total_posts': len(self.df),
            'unique_topics': self.df['topic'].nunique() if 'topic' in self.df.columns else 0,
            'date_range': {
                'start': str(self.df['created_at'].min()) if 'created_at' in self.df.columns else None,
                'end': str(self.df['created_at'].max()) if 'created_at' in self.df.columns else None
            }
        }
    
    def get_sentiment_stats(self):
        """Thống kê cảm xúc"""
        if 'sentiment' not in self.df.columns:
            return {'error': 'No sentiment data available'}
        
        sentiment_counts = self.df['sentiment'].value_counts()
        total = len(self.df)
        
        return {
            'distribution': sentiment_counts.to_dict(),
            'percentages': (sentiment_counts / total * 100).round(2).to_dict(),
            'avg_sentiment_score': float(self.df['sentiment_score'].mean()) if 'sentiment_score' in self.df.columns else None
        }
    
    def get_engagement_stats(self):
        """Thống kê tương tác"""
        engagement_cols = ['likes', 'retweets', 'replies', 'score']
        stats = {}
        
        for col in engagement_cols:
            if col in self.df.columns:
                stats[col] = {
                    'mean': float(self.df[col].mean()),
                    'median': float(self.df[col].median()),
                    'max': int(self.df[col].max()),
                    'min': int(self.df[col].min())
                }
        
        return stats
    
    def get_time_analysis(self):
        """Phân tích theo thời gian"""
        if 'created_at' not in self.df.columns:
            return {'error': 'No time data available'}
        
        # Ensure day_of_week exists
        if 'day_of_week' not in self.df.columns:
            self.df['day_of_week'] = pd.to_datetime(self.df['created_at']).dt.day_name()
        
        return {
            'posts_by_day': self.df['day_of_week'].value_counts().to_dict(),
            'posts_by_hour': pd.to_datetime(self.df['created_at']).dt.hour.value_counts().to_dict()
        }
    
    def get_top_hashtags(self, limit=10):
        """Top hashtags"""
        if 'hashtags' not in self.df.columns:
            return []
        
        all_hashtags = []
        for hashtags in self.df['hashtags'].dropna():
            if isinstance(hashtags, list):
                all_hashtags.extend(hashtags)
        
        return Counter(all_hashtags).most_common(limit)
    
    def generate_comprehensive_report(self):
        """Tạo báo cáo thống kê toàn diện"""
        report = {
            'basic_stats': self.get_basic_stats(),
            'sentiment_stats': self.get_sentiment_stats(),
            'time_analysis': self.get_time_analysis(),
            'engagement_stats': self.get_engagement_stats(),
            'top_hashtags': self.get_top_hashtags(),
            'generated_at': datetime.now()
        }
        return report

# Tạo báo cáo
stat_analyzer = StatisticalAnalyzer(df)
comprehensive_report = stat_analyzer.generate_comprehensive_report()

# In báo cáo
print("📊 COMPREHENSIVE SOCIAL MEDIA ANALYSIS REPORT")
print("=" * 50)

print(f"\n📈 Basic Statistics:")
basic = comprehensive_report['basic_stats']
print(f"Total Posts: {basic['total_posts']}")
print(f"Unique Topics: {basic['unique_topics']}")
print(f"Date Range: {basic['date_range']['start']} to {basic['date_range']['end']}")

print(f"\n💭 Sentiment Analysis:")
sentiment = comprehensive_report['sentiment_stats']
if 'error' not in sentiment:
    print("Distribution:")
    for sentiment_type, count in sentiment['distribution'].items():
        pct = sentiment['percentages'][sentiment_type]
        print(f"  {sentiment_type.title()}: {count} posts ({pct}%)")
    print(f"Average Sentiment Score: {sentiment['avg_sentiment_score']:.3f}")

print(f"\n📅 Time Analysis:")
time_analysis = comprehensive_report['time_analysis']
if 'error' not in time_analysis:
    print("Posts by Day of Week:")
    for day, count in sorted(time_analysis['posts_by_day'].items(), key=lambda x: x[1], reverse=True)[:3]:
        print(f"  {day}: {count} posts")

print(f"\n🔥 Top Hashtags:")
for hashtag, count in comprehensive_report['top_hashtags'][:5]:
    print(f"  #{hashtag}: {count} mentions")

print(f"\n📊 Engagement Statistics:")
engagement = comprehensive_report['engagement_stats']
for metric, stats in engagement.items():
    print(f"  {metric.title()}:")
    print(f"    Average: {stats['mean']:.2f}")
    print(f"    Max: {stats['max']}")

print("\n✅ Analysis completed successfully!")

📊 COMPREHENSIVE SOCIAL MEDIA ANALYSIS REPORT

📈 Basic Statistics:
Total Posts: 1600
Unique Topics: 6
Date Range: 2025-07-17 23:22:59.207000 to 2025-10-15 23:22:59.229000

💭 Sentiment Analysis:
Distribution:
  Negative: 682 posts (42.62%)
  Neutral: 619 posts (38.69%)
  Positive: 299 posts (18.69%)
Average Sentiment Score: -0.135

📅 Time Analysis:
Posts by Day of Week:
  Wednesday: 655 posts
  Tuesday: 244 posts
  Thursday: 154 posts

🔥 Top Hashtags:
  #TríTuệNhânTạo: 262 mentions
  #CôngNghệGiáoDục: 258 mentions
  #AIEducation: 256 mentions
  #GiáoDục: 252 mentions
  #HọcTậpAI: 249 mentions

📊 Engagement Statistics:
  Likes:
    Average: 504.21
    Max: 1000
  Retweets:
    Average: 251.63
    Max: 500
  Replies:
    Average: 100.44
    Max: 200
  Score:
    Average: 5.30
    Max: 386

✅ Analysis completed successfully!


In [43]:
# cell 12: Advanced Analysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
warnings.filterwarnings('ignore')

class AdvancedAnalyzer:
    def __init__(self, df):
        self.df = df
    
    def topic_modeling(self, n_topics=5):
        """Topic Modeling với LDA"""
        texts = self.df['text'].fillna('').tolist()
        
        # TF-IDF
        vectorizer = TfidfVectorizer(
            max_features=100,
            min_df=2,
            max_df=0.8,
            stop_words='english'
        )
        
        tfidf_matrix = vectorizer.fit_transform(texts)
        
        # LDA
        lda = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=42,
            max_iter=10
        )
        
        lda.fit(tfidf_matrix)
        
        # Lấy top words cho mỗi topic
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        
        for topic_idx, topic in enumerate(lda.components_):
            top_indices = topic.argsort()[-10:][::-1]
            top_words = [feature_names[i] for i in top_indices]
            topics.append({
                'topic_id': topic_idx,
                'top_words': top_words
            })
        
        return topics
    
    def sentiment_correlation(self):
        """Phân tích correlation giữa sentiment và engagement"""
        correlation = self.df[['sentiment_score', 'likes', 'retweets', 'replies']].corr()
        return correlation
    
    def time_series_forecast(self):
        """Dự báo xu hướng đơn giản"""
        daily = self.df.groupby('date').size().reset_index()
        daily.columns = ['date', 'count']
        
        # Simple moving average
        daily['MA_7'] = daily['count'].rolling(window=7).mean()
        daily['MA_30'] = daily['count'].rolling(window=30).mean()
        
        return daily
    
    def sentiment_by_engagement(self):
        """Phân tích cảm xúc theo mức độ engagement"""
        # Phân loại engagement
        self.df['engagement_level'] = pd.cut(
            self.df['likes'],
            bins=[0, 10, 50, 100, float('inf')],
            labels=['Low', 'Medium', 'High', 'Viral']
        )
        
        engagement_sentiment = self.df.groupby(['engagement_level', 'sentiment']).size().unstack(fill_value=0)
        
        return engagement_sentiment
    
    def keyword_extraction(self, top_n=20):
        """Trích xuất từ khóa quan trọng"""
        texts = ' '.join(self.df['text'].fillna(''))
        
        vectorizer = TfidfVectorizer(max_features=top_n, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform([texts])
        
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]
        
        keywords = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)
        
        return keywords

# Chạy phân tích nâng cao
advanced_analyzer = AdvancedAnalyzer(df)

print("\n🔬 PHÂN TÍCH NÂNG CAO\n")

print("📚 TOPIC MODELING (LDA):")
topics = advanced_analyzer.topic_modeling(n_topics=5)
for topic in topics:
    print(f"\n  Topic {topic['topic_id']}:")
    print(f"  Top words: {', '.join(topic['top_words'][:10])}")

print("\n📊 CORRELATION ANALYSIS:")
correlation = advanced_analyzer.sentiment_correlation()
print(correlation)

print("\n🔑 TOP 15 KEYWORDS (TF-IDF):")
keywords = advanced_analyzer.keyword_extraction(15)
for i, (word, score) in enumerate(keywords, 1):
    print(f"  {i}. {word}: {score:.4f}")

print("\n💪 SENTIMENT BY ENGAGEMENT LEVEL:")
engagement_sentiment = advanced_analyzer.sentiment_by_engagement()
print(engagement_sentiment)


🔬 PHÂN TÍCH NÂNG CAO

📚 TOPIC MODELING (LDA):

  Topic 0:
  Top words: ai, free, tools, growth, data, market, content, 2025, company, education

  Topic 1:
  Top words: https, com, www, png, intelligence, openai, news, 10, nse, models

  Topic 2:
  Top words: trong, edtech, học, giáo, dục, ai, họctậpai, đại, lớp, machinelearning

  Topic 3:
  Top words: học, tạo, trong, máy, tuệ, trí, nhân, côngnghệgiáodục, trường, machinelearning

  Topic 4:
  Top words: just, ai, like, tech, time, work, right, help, know, make

📊 CORRELATION ANALYSIS:
                 sentiment_score     likes  retweets   replies
sentiment_score         1.000000  0.023084 -0.052486  0.025731
likes                   0.023084  1.000000  0.039456 -0.046213
retweets               -0.052486  0.039456  1.000000  0.033473
replies                 0.025731 -0.046213  0.033473  1.000000

🔑 TOP 15 KEYWORDS (TF-IDF):
  1. ai: 0.5952
  2. https: 0.4096
  3. com: 0.3207
  4. trong: 0.2801
  5. học: 0.2799
  6. data: 0.1842
  7. l

In [44]:
# cell 13: Export Results
import json
from datetime import datetime

class ReportExporter:
    def __init__(self, df, report_data):
        self.df = df
        self.report_data = report_data
        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    def export_to_csv(self):
        """Export DataFrame to CSV"""
        filename = f'social_media_analysis_{self.timestamp}.csv'
        self.df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"✅ Exported to {filename}")
        return filename
    
    def export_to_excel(self):
        """Export to Excel with multiple sheets"""
        filename = f'social_media_report_{self.timestamp}.xlsx'
        
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            # Main data
            self.df.to_excel(writer, sheet_name='Raw Data', index=False)
            
            # Summary statistics
            summary = self.df.groupby('sentiment').agg({
                'text': 'count',
                'likes': 'sum',
                'sentiment_score': 'mean'
            })
            summary.to_excel(writer, sheet_name='Summary')
            
            # Topic analysis
            topic_df = pd.DataFrame(self.report_data['topic_analysis'])
            topic_df.to_excel(writer, sheet_name='Topic Analysis', index=False)
            
            # Daily trend
            daily_trend = self.df.groupby('date').agg({
                'text': 'count',
                'likes': 'sum',
                'sentiment_score': 'mean'
            })
            daily_trend.to_excel(writer, sheet_name='Daily Trend')
        
        print(f"✅ Exported to {filename}")
        return filename
    
    def export_to_json(self):
        """Export report to JSON"""
        filename = f'analysis_report_{self.timestamp}.json'
        
        report = {
            'metadata': {
                'generated_at': datetime.now().isoformat(),
                'total_records': len(self.df),
                'date_range': {
                    'start': self.df['created_at'].min().isoformat(),
                    'end': self.df['created_at'].max().isoformat()
                }
            },
            'analysis': self.report_data
        }
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2, default=str)
        
        print(f"✅ Exported to {filename}")
        return filename
    
    def create_presentation_summary(self):
        """Tạo file summary cho presentation"""
        filename = f'presentation_summary_{self.timestamp}.txt'
        
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("="*80 + "\n")
            f.write("SOCIAL MEDIA ANALYSIS SUMMARY\n")
            f.write("="*80 + "\n\n")
            
            f.write(f"Dataset Size: {len(self.df):,} posts\n")
            f.write(f"Date Range: {self.df['created_at'].min().date()} to {self.df['created_at'].max().date()}\n\n")
            
            f.write("SENTIMENT DISTRIBUTION:\n")
            sentiment_counts = self.df['sentiment'].value_counts()
            for sentiment, count in sentiment_counts.items():
                pct = count / len(self.df) * 100
                f.write(f"  - {sentiment.capitalize()}: {count:,} ({pct:.1f}%)\n")
            
            f.write("\nTOP 5 TOPICS:\n")
            for i, topic in enumerate(self.report_data['topic_analysis'][:5], 1):
                f.write(f"  {i}. {topic['topic']}: {topic['total_posts']:,} posts\n")
            
            f.write("\nTOP 10 HASHTAGS:\n")
            hashtag_items = list(self.report_data['hashtag_analysis'].items())[:10]
            for i, (tag, data) in enumerate(hashtag_items, 1):
                f.write(f"  {i}. #{tag}: {data['count']} mentions\n")
        
        print(f"✅ Created summary: {filename}")
        return filename

# Export results
exporter = ReportExporter(df, comprehensive_report)

print("\n📤 EXPORTING RESULTS...\n")
exporter.export_to_csv()
exporter.export_to_excel()
exporter.export_to_json()
exporter.create_presentation_summary()


📤 EXPORTING RESULTS...

✅ Exported to social_media_analysis_20251015_232813.csv


ModuleNotFoundError: No module named 'openpyxl'

In [None]:
# cell 14: Streamlit Dashboard (save as streamlit_dashboard.py)
"""
Chạy file này riêng với lệnh: streamlit run streamlit_dashboard.py
"""

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pymongo import MongoClient
import sys

# Page config
st.set_page_config(
    page_title="Social Media Analytics Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
st.markdown("""
    <style>
    .main {
        background-color: #f8f9fa;
    }
    .stMetric {
        background-color: white;
        padding: 15px;
        border-radius: 10px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    </style>
""", unsafe_allow_html=True)

# Load data
@st.cache_data
def load_data():
    client = MongoClient("mongodb://localhost:27017/")
    db = client['social_media_analysis']
    posts = list(db['posts'].find())
    df = pd.DataFrame(posts)
    
    if 'created_at' in df.columns:
        df['created_at'] = pd.to_datetime(df['created_at'])
        df['date'] = df['created_at'].dt.date
        df['month'] = df['created_at'].dt.to_period('M').astype(str)
    
    return df

df = load_data()

# Sidebar
st.sidebar.title("🎛️ Bộ lọc")

# Date range filter
if 'created_at' in df.columns:
    min_date = df['created_at'].min().date()
    max_date = df['created_at'].max().date()
    
    date_range = st.sidebar.date_input(
        "Chọn khoảng thời gian",
        value=(min_date, max_date),
        min_value=min_date,
        max_value=max_date
    )
    
    if len(date_range) == 2:
        mask = (df['created_at'].dt.date >= date_range[0]) & (df['created_at'].dt.date <= date_range[1])
        df_filtered = df[mask]
    else:
        df_filtered = df
else:
    df_filtered = df

# Sentiment filter
sentiment_filter = st.sidebar.multiselect(
    "Cảm xúc",
    options=['positive', 'negative', 'neutral'],
    default=['positive', 'negative', 'neutral']
)

df_filtered = df_filtered[df_filtered['sentiment'].isin(sentiment_filter)]

# Topic filter
if 'topic' in df_filtered.columns:
    topics = df_filtered['topic'].unique().tolist()
    selected_topics = st.sidebar.multiselect(
        "Chủ đề",
        options=topics,
        default=topics
    )
    df_filtered = df_filtered[df_filtered['topic'].isin(selected_topics)]

# Main content
st.title("📊 Dashboard Phân tích Dữ liệu Xã hội")
st.markdown("### Chủ đề: AI trong Giáo dục")

# Metrics
col1, col2, col3, col4 = st.columns(4)

with col1:
    st.metric("📝 Tổng Posts", f"{len(df_filtered):,}")

with col2:
    positive_count = len(df_filtered[df_filtered['sentiment'] == 'positive'])
    st.metric("😊 Tích cực", f"{positive_count:,}", delta=f"{positive_count/len(df_filtered)*100:.1f}%")

with col3:
    negative_count = len(df_filtered[df_filtered['sentiment'] == 'negative'])
    st.metric("😞 Tiêu cực", f"{negative_count:,}", delta=f"{negative_count/len(df_filtered)*100:.1f}%")

with col4:
    avg_sentiment = df_filtered['sentiment_score'].mean()
    st.metric("📈 Điểm TB", f"{avg_sentiment:.3f}")

st.markdown("---")

# Charts
tab1, tab2, tab3, tab4 = st.tabs(["📈 Xu hướng", "🎯 Cảm xúc", "#️⃣ Hashtags", "📊 Thống kê"])

with tab1:
    st.subheader("Xu hướng thảo luận theo thời gian")
    
    daily_data = df_filtered.groupby(['date', 'sentiment']).size().unstack(fill_value=0)
    
    fig = go.Figure()
    for sentiment in ['positive', 'negative', 'neutral']:
        if sentiment in daily_data.columns:
            fig.add_trace(go.Scatter(
                x=daily_data.index,
                y=daily_data[sentiment],
                name=sentiment.capitalize(),
                mode='lines+markers'
            ))
    
    fig.update_layout(height=400, hovermode='x unified')
    st.plotly_chart(fig, use_container_width=True)

with tab2:
    col1, col2 = st.columns(2)
    
    with col1:
        st.subheader("Phân bố cảm xúc")
       sentiment_counts = df_filtered['sentiment'].value_counts()
    
    fig = px.pie(
        values=sentiment_counts.values,
        names=sentiment_counts.index,
        color=sentiment_counts.index,
        color_discrete_map={
            'positive': '#2ecc71',
            'negative': '#e74c3c',
            'neutral': '#95a5a6'
        },
        hole=0.4
    )
    fig.update_layout(height=400)
    st.plotly_chart(fig, use_container_width=True)

with col2:
    st.subheader("Cảm xúc theo chủ đề")
    
    topic_sentiment = df_filtered.groupby(['topic', 'sentiment']).size().unstack(fill_value=0)
    
    fig = go.Figure()
    for sentiment in ['positive', 'neutral', 'negative']:
        if sentiment in topic_sentiment.columns:
            fig.add_trace(go.Bar(
                name=sentiment.capitalize(),
                x=topic_sentiment.index,
                y=topic_sentiment[sentiment]
            ))
    
    fig.update_layout(barmode='stack', height=400)
    st.plotly_chart(fig, use_container_width=True)

with tab3:
    st.subheader("Top Hashtags phổ biến")
    all_hashtags = []
for hashtags in df_filtered['hashtags'].dropna():
    if isinstance(hashtags, list):
        all_hashtags.extend(hashtags)

from collections import Counter
hashtag_counts = Counter(all_hashtags).most_common(20)

hashtag_df = pd.DataFrame(hashtag_counts, columns=['Hashtag', 'Count'])

fig = px.bar(
    hashtag_df,
    x='Count',
    y='Hashtag',
    orientation='h',
    color='Count',
    color_continuous_scale='Viridis'
)
fig.update_layout(height=600, yaxis={'categoryorder': 'total ascending'})
st.plotly_chart(fig, use_container_width=True)

with tab4:
    st.subheader("Thống kê chi tiết")
    col1, col2 = st.columns(2)

with col1:
    st.markdown("#### Thống kê theo tháng")
    monthly = df_filtered.groupby('month').agg({
        'text': 'count',
        'sentiment_score': 'mean',
        'likes': 'sum'
    }).reset_index()
    monthly.columns = ['Tháng', 'Số posts', 'Điểm CB', 'Tổng likes']
    st.dataframe(monthly, use_container_width=True)

with col2:
    st.markdown("#### Top Posts có engagement cao")
    top_posts = df_filtered.nlargest(10, 'likes')[['text', 'likes', 'sentiment', 'created_at']]
    top_posts['text'] = top_posts['text'].str[:100] + '...'
    st.dataframe(top_posts, use_container_width=True)
    
