In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
import os
import numpy as np
from collections import Counter
import re
from datetime import datetime, timedelta

# K·∫øt n·ªëi MongoDB
mongo_uri = os.environ.get('MONGO_URI')
client = MongoClient(mongo_uri)
db = client.get_database()
collection = db['pred_news']

# L·∫•y d·ªØ li·ªáu t·ª´ collection
data = list(collection.find())
df = pd.DataFrame(data)

# Th√™m c·ªôt th·ªùi gian v√† chuy·ªÉn ƒë·ªïi sang datetime
if 'published_date' in df.columns:
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
    # S·∫Øp x·∫øp theo th·ªùi gian
    df = df.sort_values('published_date')
else:
    print("Kh√¥ng t√¨m th·∫•y c·ªôt 'published_date', s·ª≠ d·ª•ng ch·ªâ s·ªë ƒë·ªÉ thay th·∫ø")
    df['published_date'] = pd.to_datetime('today') - pd.to_timedelta(np.arange(len(df)), 'D')

# T·∫°o h√†m ph√¢n t√≠ch xu h∆∞·ªõng
def analyze_trend(dataframe):
    # Ph√¢n t√≠ch xu h∆∞·ªõng g·∫ßn ƒë√¢y (30 ng√†y g·∫ßn nh·∫•t ho·∫∑c t·∫•t c·∫£ n·∫øu √≠t h∆°n)
    recent_days = 30
    today = pd.to_datetime('today')
    if 'published_date' in dataframe.columns:
        recent_df = dataframe[dataframe['published_date'] >= (today - timedelta(days=recent_days))]
        if len(recent_df) < 5:  # N·∫øu kh√¥ng ƒë·ªß d·ªØ li·ªáu, l·∫•y 30% d·ªØ li·ªáu g·∫ßn nh·∫•t
            recent_df = dataframe.iloc[-int(len(dataframe)*0.3):]
    else:
        recent_df = dataframe.iloc[-int(len(dataframe)*0.3):]
    
    # Ph√¢n t√≠ch c·∫£m x√∫c g·∫ßn ƒë√¢y
    sentiment_counts = recent_df['sentiment'].value_counts()
    sentiment_percentage = (sentiment_counts / sentiment_counts.sum() * 100).round(1)
    
    # Ph√¢n t√≠ch t·ª´ kh√≥a ph·ªï bi·∫øn
    if 'title' in recent_df.columns:
        titles = ' '.join(recent_df['title'].astype(str).fillna(''))
        words = re.findall(r'\b[A-Za-z√Ä-·ªπ][A-Za-z√Ä-·ªπ]+\b', titles)
        
        # Lo·∫°i b·ªè c√°c t·ª´ d·ª´ng trong ti·∫øng Vi·ªát
        vietnamese_stopwords = [
            'v√†', 'c·ªßa', 'c√≥', 'cho', 'c√°c', 'v·ªõi', 'l√†', 'ƒë∆∞·ª£c', 'trong', 'ƒë√£', 't·∫°i', 
            't·ª´', 'theo', 'nh·ªØng', 'ƒë·ªÉ', 'kh√¥ng', 'n√†y', 'ƒë·∫øn', 'v·ªÅ', 'c√≥ th·ªÉ', 'khi',
            's·∫Ω', 'ƒëang', 'nhi·ªÅu', 'nh∆∞', 'nƒÉm', 'tr√™n', 'nh∆∞ng', 'sau', 'ph·∫£i', 'c≈©ng',
            'm·ªôt', 'ƒë√¢y', 'l√†m', 'hai', 'c√≤n'
        ]
        
        filtered_words = [word.lower() for word in words if word.lower() not in vietnamese_stopwords and len(word) > 2]
        word_counts = Counter(filtered_words)
        
        # L·∫•y 10 t·ª´ ph·ªï bi·∫øn nh·∫•t
        common_words = word_counts.most_common(10)
    else:
        common_words = []
    
    # Ph√¢n t√≠ch xu h∆∞·ªõng c·∫£m x√∫c theo th·ªùi gian
    if len(recent_df) > 5 and 'published_date' in recent_df.columns:
        # Nh√≥m theo ng√†y v√† t√≠nh t·ª∑ l·ªá c·∫£m x√∫c
        recent_df['date'] = recent_df['published_date'].dt.date
        sentiment_by_date = recent_df.groupby('date')['sentiment'].value_counts(normalize=True).unstack().fillna(0)
        
        if not sentiment_by_date.empty:
            # T√≠nh xu h∆∞·ªõng (tƒÉng/gi·∫£m) cho m·ªói lo·∫°i c·∫£m x√∫c
            trend = {}
            for col in sentiment_by_date.columns:
                if len(sentiment_by_date[col]) >= 3:  # C·∫ßn √≠t nh·∫•t 3 ƒëi·ªÉm ƒë·ªÉ x√°c ƒë·ªãnh xu h∆∞·ªõng
                    values = sentiment_by_date[col].values
                    if np.mean(values[-3:]) > np.mean(values[:3]):
                        trend[col] = "tƒÉng"
                    elif np.mean(values[-3:]) < np.mean(values[:3]):
                        trend[col] = "gi·∫£m"
                    else:
                        trend[col] = "·ªïn ƒë·ªãnh"
                else:
                    trend[col] = "kh√¥ng ƒë·ªß d·ªØ li·ªáu"
        else:
            trend = {"positive": "kh√¥ng ƒë·ªß d·ªØ li·ªáu", "negative": "kh√¥ng ƒë·ªß d·ªØ li·ªáu", "neutral": "kh√¥ng ƒë·ªß d·ªØ li·ªáu"}
    else:
        trend = {"positive": "kh√¥ng ƒë·ªß d·ªØ li·ªáu", "negative": "kh√¥ng ƒë·ªß d·ªØ li·ªáu", "neutral": "kh√¥ng ƒë·ªß d·ªØ li·ªáu"}
    
    return {
        "sentiment_percentage": sentiment_percentage.to_dict(),
        "common_words": common_words,
        "sentiment_trend": trend
    }

# Ph√¢n t√≠ch d·ªØ li·ªáu
analysis_result = analyze_trend(df)

# T·∫°o danh s√°ch t·ª´ kh√≥a li√™n quan ƒë·∫øn c√°c c·ªï phi·∫øu/ng√†nh c·ª• th·ªÉ
stock_keywords = {
    "ng√¢n h√†ng": ["ng√¢n h√†ng", "vietcombank", "techcombank", "sacombank", "vietinbank", "vcb", "tcb", "stb"],
    "b·∫•t ƒë·ªông s·∫£n": ["b·∫•t ƒë·ªông s·∫£n", "ƒë·∫•t", "nh√†", "cƒÉn h·ªô", "vinhomes", "novaland", "ƒë·∫ßu t∆∞", "x√¢y d·ª±ng"],
    "ch·ª©ng kho√°n": ["ch·ª©ng kho√°n", "c·ªï phi·∫øu", "vn-index", "hnx", "upcom", "ssi", "vnindex"],
    "nƒÉng l∆∞·ª£ng": ["ƒëi·ªán", "d·∫ßu kh√≠", "nƒÉng l∆∞·ª£ng", "ƒëi·ªán l·ª±c", "pvn", "pvgas", "nhi·ªát ƒëi·ªán"],
    "c√¥ng ngh·ªá": ["fpt", "c√¥ng ngh·ªá", "ph·∫ßn m·ªÅm", "s·ªë h√≥a", "vnpay", "vi·ªÖn th√¥ng"],
    "th·ª±c ph·∫©m": ["masan", "vinamilk", "sabeco", "th·ª±c ph·∫©m", "ƒë·ªì u·ªëng"],
    "h√†ng kh√¥ng": ["vietjet", "vietnam airlines", "h√†ng kh√¥ng", "s√¢n bay", "bamboo", "vjc", "hvn", "adb"]
}

# H√†m ƒë∆∞a ra ph√¢n t√≠ch v√† l·ªùi khuy√™n ƒë·∫ßu t∆∞
def generate_investment_advice(analysis_result, df):
    sentiment_percentage = analysis_result["sentiment_percentage"]
    common_words = analysis_result["common_words"]
    sentiment_trend = analysis_result["sentiment_trend"]
    
    # T√¨m t·ª´ kh√≥a ng√†nh xu·∫•t hi·ªán trong danh s√°ch t·ª´ kh√≥a ph·ªï bi·∫øn
    mentioned_sectors = {}
    for sector, keywords in stock_keywords.items():
        mentions = 0
        for word, count in common_words:
            if any(keyword in word.lower() for keyword in keywords):
                mentions += count
        if mentions > 0:
            mentioned_sectors[sector] = mentions
    
    # T√¨m c√°c b√†i b√°o v·ªõi c·∫£m x√∫c t√≠ch c·ª±c/ti√™u c·ª±c cho t·ª´ng ng√†nh
    sector_sentiment = {}
    for sector, keywords in stock_keywords.items():
        sector_df = df[df['title'].astype(str).str.lower().apply(
            lambda x: any(keyword in x.lower() for keyword in keywords))]
        
        if len(sector_df) > 0:
            pos_pct = len(sector_df[sector_df['sentiment'] == 'positive']) / len(sector_df) * 100
            neg_pct = len(sector_df[sector_df['sentiment'] == 'negative']) / len(sector_df) * 100
            neu_pct = len(sector_df[sector_df['sentiment'] == 'neutral']) / len(sector_df) * 100
            
            sector_sentiment[sector] = {
                'positive': pos_pct, 
                'negative': neg_pct, 
                'neutral': neu_pct,
                'count': len(sector_df)
            }
    
    # T·∫°o b√°o c√°o ph√¢n t√≠ch
    advice_text = "# B√°o c√°o ph√¢n t√≠ch th·ªã tr∆∞·ªùng ch·ª©ng kho√°n d·ª±a tr√™n tin t·ª©c\n\n"
    
    # Ph√¢n t√≠ch t·ªïng quan
    advice_text += "## T·ªïng quan th·ªã tr∆∞·ªùng\n\n"
    
    # X√°c ƒë·ªãnh t√¢m l√Ω th·ªã tr∆∞·ªùng
    if 'positive' in sentiment_percentage and 'negative' in sentiment_percentage:
        total_sentiment_score = sentiment_percentage.get('positive', 0) - sentiment_percentage.get('negative', 0)
        if total_sentiment_score > 20:
            market_mood = "T√¢m l√Ω th·ªã tr∆∞·ªùng ƒëang r·∫•t t√≠ch c·ª±c"
        elif total_sentiment_score > 10:
            market_mood = "T√¢m l√Ω th·ªã tr∆∞·ªùng t√≠ch c·ª±c"
        elif total_sentiment_score < -20:
            market_mood = "T√¢m l√Ω th·ªã tr∆∞·ªùng ƒëang r·∫•t ti√™u c·ª±c"
        elif total_sentiment_score < -10:
            market_mood = "T√¢m l√Ω th·ªã tr∆∞·ªùng ti√™u c·ª±c"
        else:
            market_mood = "T√¢m l√Ω th·ªã tr∆∞·ªùng trung l·∫≠p"
    else:
        market_mood = "Kh√¥ng ƒë·ªß d·ªØ li·ªáu ƒë·ªÉ ƒë√°nh gi√° t√¢m l√Ω th·ªã tr∆∞·ªùng"
    
    advice_text += f"- {market_mood}\n"
    advice_text += f"- Tin t·ª©c t√≠ch c·ª±c: {sentiment_percentage.get('positive', 0):.1f}%\n"
    advice_text += f"- Tin t·ª©c ti√™u c·ª±c: {sentiment_percentage.get('negative', 0):.1f}%\n"
    advice_text += f"- Tin t·ª©c trung l·∫≠p: {sentiment_percentage.get('neutral', 0):.1f}%\n\n"
    
    # Ph√¢n t√≠ch xu h∆∞·ªõng
    advice_text += "## Xu h∆∞·ªõng th·ªã tr∆∞·ªùng\n\n"
    
    if 'positive' in sentiment_trend:
        if sentiment_trend['positive'] == "tƒÉng":
            advice_text += "- Tin t·ª©c t√≠ch c·ª±c ƒëang c√≥ xu h∆∞·ªõng tƒÉng üìà\n"
        elif sentiment_trend['positive'] == "gi·∫£m":
            advice_text += "- Tin t·ª©c t√≠ch c·ª±c ƒëang c√≥ xu h∆∞·ªõng gi·∫£m üìâ\n"
        else:
            advice_text += "- Tin t·ª©c t√≠ch c·ª±c ƒëang ·ªïn ƒë·ªãnh\n"
    
    if 'negative' in sentiment_trend:
        if sentiment_trend['negative'] == "tƒÉng":
            advice_text += "- Tin t·ª©c ti√™u c·ª±c ƒëang c√≥ xu h∆∞·ªõng tƒÉng üìà\n"
        elif sentiment_trend['negative'] == "gi·∫£m":
            advice_text += "- Tin t·ª©c ti√™u c·ª±c ƒëang c√≥ xu h∆∞·ªõng gi·∫£m üìâ\n"
        else:
            advice_text += "- Tin t·ª©c ti√™u c·ª±c ƒëang ·ªïn ƒë·ªãnh\n"
    
    advice_text += "\n## T·ª´ kh√≥a n·ªïi b·∫≠t g·∫ßn ƒë√¢y\n\n"
    for word, count in common_words[:7]:
        advice_text += f"- {word}: {count} l·∫ßn xu·∫•t hi·ªán\n"
    
    # Ph√¢n t√≠ch theo ng√†nh
    advice_text += "\n## Ph√¢n t√≠ch theo ng√†nh\n\n"
    
    if sector_sentiment:
        # S·∫Øp x·∫øp ng√†nh theo s·ªë l·∫ßn xu·∫•t hi·ªán
        sorted_sectors = sorted(sector_sentiment.items(), 
                               key=lambda x: (x[1]['positive'] - x[1]['negative'], x[1]['count']), 
                               reverse=True)
        
        for sector, stats in sorted_sectors:
            advice_text += f"### {sector.title()}\n"
            advice_text += f"- S·ªë tin t·ª©c: {stats['count']}\n"
            advice_text += f"- T√≠ch c·ª±c: {stats['positive']:.1f}%\n"
            advice_text += f"- Ti√™u c·ª±c: {stats['negative']:.1f}%\n"
            advice_text += f"- Trung l·∫≠p: {stats['neutral']:.1f}%\n"
            
            # ƒê∆∞a ra l·ªùi khuy√™n
            sentiment_balance = stats['positive'] - stats['negative']
            if sentiment_balance > 20 and stats['count'] >= 5:
                advice_text += f"- **Khuy·∫øn ngh·ªã:** C√¢n nh·∫Øc MUA/N·∫ÆM GI·ªÆ c·ªï phi·∫øu ng√†nh {sector} d·ª±a tr√™n tin t·ª©c t√≠ch c·ª±c\n"
            elif sentiment_balance < -20 and stats['count'] >= 5:
                advice_text += f"- **Khuy·∫øn ngh·ªã:** C√¢n nh·∫Øc B√ÅN/TR√ÅNH c·ªï phi·∫øu ng√†nh {sector} do tin t·ª©c ti√™u c·ª±c\n"
            elif stats['count'] >= 5:
                advice_text += f"- **Khuy·∫øn ngh·ªã:** THEO D√ïI ng√†nh {sector}, ch∆∞a c√≥ xu h∆∞·ªõng r√µ r√†ng\n"
            else:
                advice_text += f"- **Khuy·∫øn ngh·ªã:** C·∫ßn th√™m d·ªØ li·ªáu ƒë·ªÉ ƒë∆∞a ra nh·∫≠n ƒë·ªãnh ch√≠nh x√°c\n"
            
            advice_text += "\n"
    else:
        advice_text += "Kh√¥ng ƒë·ªß d·ªØ li·ªáu ƒë·ªÉ ph√¢n t√≠ch theo ng√†nh\n\n"
    
    # K·∫øt lu·∫≠n
    advice_text += "## K·∫øt lu·∫≠n v√† l·ªùi khuy√™n t·ªïng th·ªÉ\n\n"
    
    if 'positive' in sentiment_percentage and 'negative' in sentiment_percentage:
        total_sentiment_score = sentiment_percentage.get('positive', 0) - sentiment_percentage.get('negative', 0)
        
        if total_sentiment_score > 30:
            advice_text += "Th·ªã tr∆∞·ªùng ch·ª©ng kho√°n ƒëang trong tr·∫°ng th√°i R·∫§T T√çCH C·ª∞C. ƒê√¢y c√≥ th·ªÉ l√† th·ªùi ƒëi·ªÉm t·ªët ƒë·ªÉ xem x√©t MUA V√ÄO, ƒë·∫∑c bi·ªát l√† c√°c c·ªï phi·∫øu trong c√°c ng√†nh ƒë∆∞·ª£c ƒë·ªÅ c·∫≠p t√≠ch c·ª±c ·ªü tr√™n.\n\n"
        elif total_sentiment_score > 15:
            advice_text += "Th·ªã tr∆∞·ªùng ch·ª©ng kho√°n ƒëang trong tr·∫°ng th√°i T√çCH C·ª∞C. C√≥ th·ªÉ c√¢n nh·∫Øc GIA TƒÇNG v·ªã th·∫ø v√†o c√°c m√£ c√≥ tri·ªÉn v·ªçng t·ªët.\n\n"
        elif total_sentiment_score < -30:
            advice_text += "Th·ªã tr∆∞·ªùng ch·ª©ng kho√°n ƒëang trong tr·∫°ng th√°i R·∫§T TI√äU C·ª∞C. N√™n c√¢n nh·∫Øc B·∫¢O TO√ÄN V·ªêN, tr√°nh mua v√†o trong giai ƒëo·∫°n n√†y v√† c√≥ th·ªÉ xem x√©t B√ÅN b·ªõt c√°c c·ªï phi·∫øu r·ªßi ro cao.\n\n"
        elif total_sentiment_score < -15:
            advice_text += "Th·ªã tr∆∞·ªùng ch·ª©ng kho√°n ƒëang trong tr·∫°ng th√°i TI√äU C·ª∞C. N√™n TH·∫¨N TR·ªåNG v√† c√≥ chi·∫øn l∆∞·ª£c b·∫£o v·ªá danh m·ª•c ƒë·∫ßu t∆∞.\n\n"
        else:
            advice_text += "Th·ªã tr∆∞·ªùng ch·ª©ng kho√°n ƒëang trong tr·∫°ng th√°i TRUNG L·∫¨P. N√™n THEO D√ïI k·ªπ th·ªã tr∆∞·ªùng v√† ch·ªâ giao d·ªãch khi c√≥ t√≠n hi·ªáu r√µ r√†ng.\n\n"
    else:
        advice_text += "Kh√¥ng ƒë·ªß d·ªØ li·ªáu ƒë·ªÉ ƒë∆∞a ra l·ªùi khuy√™n t·ªïng th·ªÉ.\n\n"
    
    advice_text += "**L∆∞u √Ω:** ƒê√¢y ch·ªâ l√† ph√¢n t√≠ch d·ª±a tr√™n tin t·ª©c, nh√† ƒë·∫ßu t∆∞ n√™n k·∫øt h·ª£p v·ªõi ph√¢n t√≠ch k·ªπ thu·∫≠t v√† c∆° b·∫£n kh√°c tr∆∞·ªõc khi ƒë∆∞a ra quy·∫øt ƒë·ªãnh ƒë·∫ßu t∆∞.\n"
    
    return advice_text

# T·∫°o l·ªùi khuy√™n ƒë·∫ßu t∆∞
investment_advice = generate_investment_advice(analysis_result, df)

# Hi·ªÉn th·ªã l·ªùi khuy√™n
print(investment_advice)

# L∆∞u b√°o c√°o ra file
with open('bao_cao_dau_tu.md', 'w', encoding='utf-8') as f:
    f.write(investment_advice)

print("\nƒê√£ l∆∞u b√°o c√°o ƒë·∫ßu t∆∞ v√†o file 'bao_cao_dau_tu.md'")

# T·∫°o bi·ªÉu ƒë·ªì ph√¢n t√≠ch t√¢m l√Ω th·ªã tr∆∞·ªùng theo ng√†nh
if 'sector_sentiment' in locals() and len(sector_sentiment) > 0:
    sectors = list(sector_sentiment.keys())
    positive_values = [sector_sentiment[s]['positive'] for s in sectors]
    negative_values = [sector_sentiment[s]['negative'] for s in sectors]
    
    plt.figure(figsize=(12, 8))
    x = np.arange(len(sectors))
    width = 0.35
    
    plt.bar(x - width/2, positive_values, width, label='T√≠ch c·ª±c', color='green')
    plt.bar(x + width/2, negative_values, width, label='Ti√™u c·ª±c', color='red')
    
    plt.xlabel('Ng√†nh')
    plt.ylabel('T·ª∑ l·ªá (%)')
    plt.title('T√¢m l√Ω th·ªã tr∆∞·ªùng theo ng√†nh')
    plt.xticks(x, [sector.title() for sector in sectors], rotation=45, ha='right')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig('phan_tich_nganh.png', dpi=300)
    plt.show()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Kh√¥ng t√¨m th·∫•y c·ªôt 'published_date', s·ª≠ d·ª•ng ch·ªâ s·ªë ƒë·ªÉ thay th·∫ø
# B√°o c√°o ph√¢n t√≠ch th·ªã tr∆∞·ªùng ch·ª©ng kho√°n d·ª±a tr√™n tin t·ª©c

## T·ªïng quan th·ªã tr∆∞·ªùng

- T√¢m l√Ω th·ªã tr∆∞·ªùng ƒëang r·∫•t t√≠ch c·ª±c
- Tin t·ª©c t√≠ch c·ª±c: 53.3%
- Tin t·ª©c ti√™u c·ª±c: 30.0%
- Tin t·ª©c trung l·∫≠p: 16.7%

## Xu h∆∞·ªõng th·ªã tr∆∞·ªùng

- Tin t·ª©c t√≠ch c·ª±c ƒëang ·ªïn ƒë·ªãnh
- Tin t·ª©c ti√™u c·ª±c ƒëang ·ªïn ƒë·ªãnh

## T·ª´ kh√≥a n·ªïi b·∫≠t g·∫ßn ƒë√¢y

- phi·∫øu: 17 l·∫ßn xu·∫•t hi·ªán
- tƒÉng: 11 l·∫ßn xu·∫•t hi·ªán
- ch·ª©ng: 9 l·∫ßn xu·∫•t hi·ªán
- kho√°n: 9 l·∫ßn xu·∫•t hi·ªán
- tr·∫ßn: 8 l·∫ßn xu·∫•t hi·ªán
- k·ªãch: 4 l·∫ßn xu·∫•t hi·ªán
- kh·ªëi: 4 l·∫ßn xu·∫•t hi·ªán

## Ph√¢n t√≠ch theo ng√†nh

### NƒÉng L∆∞·ª£ng
- S·ªë tin t·ª©c: 2
- T√≠ch c·ª±c: 100.0%
- Ti√™u c·ª±c: 0.0%
- Trung l·∫≠p: 0.0%
- **Khuy·∫øn ngh·ªã:** C·∫ßn th√™m d·ªØ li·ªáu ƒë·ªÉ ƒë∆∞a ra nh·∫≠n ƒë·ªãnh ch√≠nh x√°c

### H√†ng Kh√¥ng
- S·ªë tin t·ª©c: 2
- T√≠ch c·ª±c: 100.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_df['date'] = recent_df['published_date'].dt.date
