In [3]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from wordcloud import STOPWORDS
from tqdm import tqdm
import re
import time

# ---------- CONFIG ---------- #
DB_PARAMS = "postgresql+psycopg2://postgres:Ilpmnl!69gg@localhost:5432/twt_snt"
END_DATE = time.strftime('%Y-%m-%d', time.gmtime(time.time()))
START_DATE = (datetime.now() - timedelta(days=45)).strftime('%Y-%m-%d')
engine = create_engine(DB_PARAMS)

# ---------- FUNCTIONS ---------- #
def fetch_articles(start_date, end_date, engine):
    query = """
        SELECT title, summary, url, date, pulled_date, published_date, term,
               vader_neg, vader_neu, vader_pos, vader_compound,
               roberta_neg, roberta_pos
        FROM articles_tbl
        WHERE date BETWEEN %s AND %s
        ORDER BY date DESC;
    """
    df = pd.read_sql(query, engine, params=(start_date, end_date))
    df['date'] = pd.to_datetime(df['date'])  # KEEP datetime64[ns]
    return df

def fetch_price_data(start_date, end_date, terms, engine):
    query = """
        SELECT date, term, open, high, low, close, adj_close, volume,
               close_ma_7, close_ma_21, close_ma_50, close_ma_100, close_ma_200
        FROM yahoo_price_tbl
        WHERE term = ANY(%s) AND date BETWEEN %s AND %s;
    """
    terms_list = terms if isinstance(terms, list) else [terms]
    df = pd.read_sql(query, engine, params=(terms_list, start_date, end_date))
    df['date'] = pd.to_datetime(df['date'])  # KEEP datetime64[ns]
    return df

def include_price_data(grouped_df, start_date, end_date, engine):
    terms = grouped_df['term'].unique().tolist()
    if not terms:
        return grouped_df
    print("🔎 Fetching price data in single query")
    try:
        price_df = fetch_price_data(start_date, end_date, terms, engine)
    except Exception as e:
        print(f"⚠️ Price data fetch failed: {e}")
        return grouped_df
    if price_df.empty:
        print("⚠️ No price data found for any terms.")
        return grouped_df
    return grouped_df.merge(price_df, on=['date', 'term'], how='left')

def assign_sentiment_labels(df):
    conditions = [
        df['vader_compound'] > 0.05,
        df['vader_compound'] < -0.05
    ]
    choices = ['Positive', 'Negative']
    df['sentiment_label'] = np.select(conditions, choices, default='Neutral')
    return df

def calculate_article_metrics(df):
    df['abs_vader_compound'] = df['vader_compound'].abs()
    article_counts = df.groupby(['date', 'term']).size().reset_index(name='article_count')
    sentiment_dist = (df.groupby(['date', 'term'])['sentiment_label']
                      .value_counts(normalize=True)
                      .unstack(fill_value=0)
                      .mul(100)
                      .reset_index())
    sentiment_cols = ['Positive', 'Neutral', 'Negative']
    sentiment_dist['majority_sentiment'] = sentiment_dist[sentiment_cols].idxmax(axis=1)
    df = df.merge(article_counts, on=['date', 'term'])
    df = df.merge(sentiment_dist, on=['date', 'term'])
    df[sentiment_cols] = df[sentiment_cols].fillna(0)
    df['impact_score'] = df['article_count'] * df['abs_vader_compound']
    return df

def create_grouped_df(df):
    grouped = df.groupby(['date', 'pulled_date', 'term'], as_index=False).agg({
        'title': 'first', 'summary': 'first', 'url': 'first',
        'vader_compound': 'mean', 'article_count': 'first',
        'Positive': 'first', 'Neutral': 'first', 'Negative': 'first',
        'abs_vader_compound': 'max', 'impact_score': 'sum',
        'vader_neg': 'first', 'vader_neu': 'first', 'vader_pos': 'first',
        'roberta_neg': 'first', 'roberta_pos': 'first',
        'majority_sentiment': 'first'
    })
    grouped['sentiment_3d_MA'] = grouped['vader_compound'].rolling(3, min_periods=1).mean()
    grouped['sentiment_momentum'] = grouped['vader_compound'] - grouped['sentiment_3d_MA']
    grouped['trend_reversal'] = np.where(grouped['sentiment_momentum'].diff() < 0, 'Reversal', 'No Change')
    return grouped

def perform_clustering(df):
    kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto')
    features = df[['vader_compound', 'article_count']].fillna(0)
    df['sentiment_cluster'] = kmeans.fit_predict(features)
    label_map = {
        0: "Low Volume, Positive Sentiment",
        1: "High Volume, Positive Sentiment",
        2: "High Volume, Negative Sentiment",
        3: "Low Volume, Negative Sentiment"
    }
    df['cluster_label'] = df['sentiment_cluster'].map(label_map).fillna("Mixed Sentiment")
    return df

def is_valid_word(word):
    return len(word) > 2 and word.lower() not in STOPWORDS and not re.search(r'^\d+$', word)

def extract_sentiment_keywords(df, text_column='combined_text', top_n=5):
    sentiments = ['Positive', 'Negative', 'Neutral']
    rows = []
    grouped = df.groupby(['date', 'term'])
    for (date, term), group in tqdm(grouped, desc="Sentiment keyword extraction"):
        row = {'date': date, 'term': term}
        for sentiment in sentiments:
            text = ' '.join(group[group['sentiment_label'] == sentiment][text_column].dropna().astype(str)).lower()
            words = [re.sub(r'[^a-zA-Z]', '', w) for w in text.split() if is_valid_word(w)]
            top = [w for w, _ in Counter(words).most_common(top_n)]
            row[f'top_keywords_{sentiment.lower()}'] = ', '.join(top)
        rows.append(row)
    return pd.DataFrame(rows)

def tfidf_keywords_exploded_with_sentiment(df, text_column='combined_text', top_n=10):
    rows = []
    grouped = df.groupby(['date', 'term'])
    for (date, term), group in tqdm(grouped, desc="TF-IDF keyword explosion"):
        corpus = group[text_column].dropna().astype(str)
        if corpus.empty:
            continue
        try:
            vec = TfidfVectorizer(
                stop_words='english',
                max_features=1000,
                token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z0-9_-]{2,}\b'
            )
            X = vec.fit_transform(corpus)
            keywords = vec.get_feature_names_out()
            tfidf_scores = X.sum(axis=0).A1
            score_df = pd.DataFrame({'keyword': keywords, 'tfidf_score': tfidf_scores})
            score_df = score_df[score_df['keyword'].apply(is_valid_word)]

            full_text = ' '.join(corpus).lower().split()
            word_counts = Counter(full_text)
            score_df['keyword_count'] = score_df['keyword'].map(word_counts).fillna(0).astype(int)

            sentiments = []
            for kw in score_df['keyword']:
                pos = group.loc[group['sentiment_label'] == 'Positive', text_column].dropna().str.contains(kw, case=False, na=False).sum()
                neg = group.loc[group['sentiment_label'] == 'Negative', text_column].dropna().str.contains(kw, case=False, na=False).sum()
                neu = group.loc[group['sentiment_label'] == 'Neutral', text_column].dropna().str.contains(kw, case=False, na=False).sum()
                total = max(pos, neg, neu)
                sentiments.append('Positive' if total == pos else 'Negative' if total == neg else 'Neutral')

            score_df['sentiment_label'] = sentiments
            score_df['date'] = pd.to_datetime(date)
            score_df['term'] = term
            score_df['week'] = score_df['date'].dt.isocalendar().week
            score_df['month'] = score_df['date'].dt.to_period('M').astype(str)

            rows.append(score_df.sort_values('tfidf_score', ascending=False).head(top_n))
        except Exception as e:
            print(f"⚠️ TF-IDF failed for ({date}, {term}): {str(e)[:100]}")
    return pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()

def add_additional_metrics(df):
    df['sentiment_disagreement'] = ((df['vader_compound'] > 0) & (df['roberta_neg'] > df['roberta_pos'])) | \
                                   ((df['vader_compound'] < 0) & (df['roberta_pos'] > df['roberta_neg']))
    df['sentiment_spike'] = (df['vader_compound'].diff().abs() > 0.5).astype(int)
    df['volume_spike'] = (df['article_count'] > df['article_count'].rolling(7).mean() + 2 * df['article_count'].rolling(7).std()).astype(int)
    df['vader_day_change'] = df['vader_compound'].diff()
    df['impact_day_change'] = df['impact_score'].diff()
    df['sentiment_consistent'] = df['majority_sentiment'].eq(df['majority_sentiment'].shift(1)) & df['majority_sentiment'].eq(df['majority_sentiment'].shift(2))
    return df

def flag_daily_top_terms(df):
    daily_top = df.groupby(['date', 'term'])['impact_score'].sum().reset_index()
    daily_top = daily_top.sort_values(['date', 'impact_score'], ascending=[True, False])
    daily_top = daily_top.groupby('date').head(10)
    daily_top['daily_top10_flag'] = True
    return df.merge(daily_top[['date', 'term', 'daily_top10_flag']], on=['date', 'term'], how='left').fillna(False)

def merge_most_impactful_article(df, article_df):
    impactful = article_df.loc[article_df.groupby(['date', 'term'])['impact_score'].idxmax()]
    df = df.merge(impactful[['date', 'term', 'summary', 'url', 'sentiment_label']],
                  on=['date', 'term'], how='left', suffixes=('', '_impactful'))
    df['impactful_summary_with_url'] = df.apply(
        lambda x: f"{x['summary']}\n\n{x['url']}" if pd.notnull(x['summary']) else "", axis=1)
    return df.rename(columns={'sentiment_label_impactful': 'impactful_article_sentiment'})

# ---------- MAIN ---------- #
if __name__ == "__main__":
    try:
        print("🚀 Fetching articles...")
        articles_df = fetch_articles(START_DATE, END_DATE, engine)
        articles_df = assign_sentiment_labels(articles_df)
        articles_df = calculate_article_metrics(articles_df)
        articles_df['combined_text'] = articles_df['title'].fillna('') + ' ' + \
                                       articles_df['summary'].fillna('') + ' ' + \
                                       articles_df['url'].fillna('')

        print("📊 Creating aggregated dataframe...")
        grouped_df = create_grouped_df(articles_df)
        grouped_df = perform_clustering(grouped_df)
        grouped_df = include_price_data(grouped_df, START_DATE, END_DATE, engine)
        grouped_df = add_additional_metrics(grouped_df)
        grouped_df = flag_daily_top_terms(grouped_df)
        grouped_df = merge_most_impactful_article(grouped_df, articles_df)

        print("🔠 Extracting top keywords...")
        keywords_df = extract_sentiment_keywords(articles_df, text_column='combined_text', top_n=5)
        grouped_df = grouped_df.merge(keywords_df, on=['date', 'term'], how='left')

        print("📅 Adding time dimensions...")
        grouped_df['week'] = grouped_df['date'].dt.isocalendar().week
        grouped_df['month'] = grouped_df['date'].dt.to_period('M').astype(str)
        grouped_df['weekday'] = grouped_df['date'].dt.day_name()

        print("📊 Exploding TF-IDF keywords...")
        exploded_keywords_df = tfidf_keywords_exploded_with_sentiment(articles_df, text_column='combined_text', top_n=10)
        exploded_keywords_df.to_csv("tfidf_keywords_exploded.csv", index=False)

        print("📂 Saving outputs...")
        # Convert to ISO string for Tableau compatibility
        #articles_df['date'] = articles_df['date'].dt.strftime('%Y-%m-%d')
        #grouped_df['date'] = grouped_df['date'].dt.strftime('%Y-%m-%d')
        #exploded_keywords_df['date'] = exploded_keywords_df['date'].dt.strftime('%Y-%m-%d')
        
        
        grouped_df['term'] = grouped_df['term'].str.strip().str.upper()
        articles_df['term'] = articles_df['term'].str.strip().str.upper()
        
        grouped_df['date'] = pd.to_datetime(grouped_df['date']).dt.date
  
        articles_df.to_csv("article_level_sentiment.csv", index=False)
        grouped_df.to_csv("aggregated_news_per_term_per_day.csv", index=False)

        print("✅ All outputs saved for Tableau 🎉")
    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        engine.dispose()


🚀 Fetching articles...
📊 Creating aggregated dataframe...
🔎 Fetching price data in single query
⚠️ Price data fetch failed: not all arguments converted during string formatting
🔠 Extracting top keywords...


Sentiment keyword extraction: 100%|██████████████████████████████████████████████████| 909/909 [00:10<00:00, 84.69it/s]


📅 Adding time dimensions...
📊 Exploding TF-IDF keywords...


TF-IDF keyword explosion: 100%|██████████████████████████████████████████████████████| 909/909 [18:38<00:00,  1.23s/it]


📂 Saving outputs...
✅ All outputs saved for Tableau 🎉


In [4]:
print(grouped_df.dtypes)

date                                  object
pulled_date                   datetime64[ns]
term                                  object
title                                 object
summary                               object
url                                   object
vader_compound                       float64
article_count                          int64
Positive                             float64
Neutral                              float64
Negative                             float64
abs_vader_compound                   float64
impact_score                         float64
vader_neg                            float64
vader_neu                            float64
vader_pos                            float64
roberta_neg                          float64
roberta_pos                          float64
majority_sentiment                    object
sentiment_3d_MA                      float64
sentiment_momentum                   float64
trend_reversal                        object
sentiment_