Topic modelling: discovering topics and aggregating them into daily features


**setup**

In [1]:
from google.colab import drive

# Google Drive is (re)mounted at this path inside the Colab VM.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/topic-modelling

/content/drive/MyDrive/topic-modelling


importing packages

In [3]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [4]:
# (resource path probed locally, package name downloaded when the probe fails)
_NLTK_REQUIREMENTS = (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
)

for resource_path, package_name in _NLTK_REQUIREMENTS:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Load data**

In [5]:
# Read the four Reddit CSV exports in one pass; the order of the paths matches
# the order of the names on the left-hand side.
eth_df, btc_df, cryptomarkets_df, combined_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "extracted_datasets/eth_reddit_data.csv",
        "extracted_datasets/btc_reddit_data.csv",
        "extracted_datasets/cryptomarkets_reddit_data.csv",
        "extracted_datasets/combined_reddit_data.csv",
    )
]


In [6]:

class RedditDataPreprocessor:
    def __init__(self, min_score=-5, min_text_length=20, max_text_length=5000, min_words=5, random_state=42):
        """
        Initialize the preprocessor with filtering parameters

        Parameters:
        - min_score: Minimum post score to keep (default: -5)
        - min_text_length: Minimum character length for combined text
        - max_text_length: Maximum character length for combined text
        - min_words: Minimum words after preprocessing
        - random_state: Random seed for sampling
        """
        self.min_score = min_score
        self.min_text_length = min_text_length
        self.max_text_length = max_text_length
        self.min_words = min_words
        self.random_state = random_state

        # initialize NLTK components
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # add crypto-specific stop words
        crypto_stopwords = {
            'crypto', 'cryptocurrency', 'btc', 'bitcoin', 'eth', 'ethereum',
            'coin', 'token', 'blockchain', 'would', 'could', 'one', 'get',
            'like', 'think', 'know', 'people', 'time', 'good', 'make',
            'price', 'market', 'trading', 'buy', 'sell', 'hold'
        }
        self.stop_words.update(crypto_stopwords)

    def is_crypto_relevant(self, text):
        """Check if text contains crypto/finance relevant content"""
        if pd.isna(text):
            return False

        text_lower = text.lower()

        # chatgpted crypto-relevant keywords
        crypto_keywords = {
            # core crypto terms
            'crypto', 'bitcoin', 'btc', 'ethereum', 'eth', 'blockchain', 'mining',
            'wallet', 'exchange', 'trading', 'price', 'market', 'investment',
            'portfolio', 'hodl', 'bull', 'bear', 'moon', 'dump', 'pump',

            # technical terms
            'defi', 'nft', 'token', 'coin', 'satoshi', 'hash', 'node',
            'altcoin', 'staking', 'yield', 'liquidity', 'dex', 'centralized',
            'decentralized', 'smart contract', 'gas fee', 'transaction',

            # market/regulatory terms
            'regulation', 'sec', 'institutional', 'adoption', 'volatility',
            'futures', 'options', 'leverage', 'margin', 'short', 'long',

            # popular coins/projects
            'doge', 'ada', 'sol', 'matic', 'link', 'dot', 'uni', 'aave',
            'chainlink', 'polygon', 'solana', 'cardano', 'binance', 'coinbase'
        }

        return any(keyword in text_lower for keyword in crypto_keywords)

    def is_spam_or_irrelevant(self, title, selftext):
        """Detect spam, promotional, or low-quality content"""
        combined_text = f"{title} {selftext}".lower() if not pd.isna(title) and not pd.isna(selftext) else ""

        # spam/promotional patterns
        spam_patterns = [
            r'check out my.*channel',
            r'subscribe.*youtube',
            r'follow me on',
            r'dm me',
            r'private message',
            r'\$\d+.*guaranteed',
            r'100% profit',
            r'risk free',
            r'click here',
            r'limited time',
            r'act now',
            r'free money',
            r'telegram.*group',
            r'discord.*server',
            r'referral.*link',
            r'use my code'
        ]

        for pattern in spam_patterns:
            if re.search(pattern, combined_text):
                return True

        # check for excessive links (likely spam)
        if combined_text.count('http') > 2:
            return True

        # check for very short, low-effort posts
        if len(combined_text.strip()) < 10:
            return True

        # check for posts that are just emojis or special characters
        if re.sub(r'[^a-zA-Z0-9\s]', '', combined_text).strip() == '':
            return True

        return False

    def preprocess_text(self, text):
        """Clean and preprocess text for topic modeling"""
        if pd.isna(text):
            return ""

        # convert to lowercase
        text = text.lower()

        # remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)

        # remove Reddit-specific formatting
        text = re.sub(r'/u/\S+', '', text)  # Remove user mentions
        text = re.sub(r'/r/\S+', '', text)  # Remove subreddit mentions
        text = re.sub(r'\[deleted\]|\[removed\]', '', text)

        # remove special characters but keep some punctuation
        text = re.sub(r'[^a-zA-Z\s\.\!\?]', ' ', text)

        # remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # tokenize
        tokens = word_tokenize(text)

        # remove stopwords, lemmatize, and filter short words
        tokens = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words and len(token) > 2 and token.isalpha()
        ]

        return ' '.join(tokens)

    def extract_date_from_filename(self, filename):
        """Extract date from filename pattern"""
        if pd.isna(filename):
            return None

        # pattern: reddit_data_eth_2025-08-07.csv
        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', filename)
        if date_match:
            return pd.to_datetime(date_match.group(1))
        return None

    def filter_relevant_posts(self, df):
        """Apply comprehensive filtering to remove irrelevant/low-quality posts"""
        print(f"Initial dataset size: {len(df):,}")

        # create a copy to avoid modifying original
        df_filtered = df.copy()

        # remove posts with missing essential data
        initial_size = len(df_filtered)
        df_filtered = df_filtered.dropna(subset=['score'])
        if len(df_filtered) < initial_size:
            print(f"Removed {initial_size - len(df_filtered)} posts with missing scores")

        # remove posts with very low scores (heavily downvoted)
        df_filtered = df_filtered[df_filtered['score'] >= self.min_score]
        print(f"After score filter (>= {self.min_score}): {len(df_filtered):,}")

        # combine title and selftext
        df_filtered['combined_text'] = (
            df_filtered['title'].fillna('') + ' ' + df_filtered['selftext'].fillna('')
        ).str.strip()

        # remove posts that are too short or too long
        text_lengths = df_filtered['combined_text'].str.len()
        df_filtered = df_filtered[
            (text_lengths >= self.min_text_length) &
            (text_lengths <= self.max_text_length)
        ]
        print(f"After text length filter ({self.min_text_length}-{self.max_text_length} chars): {len(df_filtered):,}")

        # remove deleted/removed posts
        deleted_mask = (
            (df_filtered['title'].isin(['[deleted]', '[removed]'])) |
            (df_filtered['selftext'].isin(['[deleted]', '[removed]'])) |
            (df_filtered['combined_text'].str.contains(r'\[deleted\]|\[removed\]', na=False))
        )
        df_filtered = df_filtered[~deleted_mask]
        print(f"After removing deleted posts: {len(df_filtered):,}")

        # apply relevance and spam filtering
        print("Filtering for crypto relevance and removing spam...")
        relevant_mask = df_filtered.apply(
            lambda row: (
                self.is_crypto_relevant(row['combined_text']) and
                not self.is_spam_or_irrelevant(row['title'], row['selftext'])
            ),
            axis=1
        )
        df_filtered = df_filtered[relevant_mask]
        print(f"After relevance and spam filter: {len(df_filtered):,}")

        return df_filtered

    def smart_sampling(self, df, max_posts=25000):
        """Apply smart sampling for large datasets"""
        if len(df) <= max_posts:
            return df

        print(f"Dataset large ({len(df):,} posts), applying engagement-based sampling...")

        # define high engagement threshold (top 30%)
        engagement_threshold = df['score'].quantile(0.7)

        high_engagement = df[df['score'] >= engagement_threshold]
        low_engagement = df[df['score'] < engagement_threshold]

        # keep all high engagement posts, sample from low engagement
        remaining_slots = max_posts - len(high_engagement)

        if remaining_slots > 0 and len(low_engagement) > 0:
            sample_size = min(remaining_slots, len(low_engagement))
            low_engagement_sample = low_engagement.sample(
                n=sample_size,
                random_state=self.random_state
            )

            df_sampled = pd.concat([high_engagement, low_engagement_sample], ignore_index=True)
        else:
            # if too many high engagement posts, sample from those too
            df_sampled = high_engagement.sample(n=max_posts, random_state=self.random_state)

        print(f"After smart sampling: {len(df_sampled):,}")
        print(f"  - High engagement posts: {len(high_engagement):,}")
        if 'low_engagement_sample' in locals():
            print(f"  - Low engagement sample: {len(low_engagement_sample):,}")

        return df_sampled

    def preprocess_dataset(self, df, apply_sampling=True, max_posts=25000):
        """Complete preprocessing pipeline for a dataset"""

        # apply filtering
        df_filtered = self.filter_relevant_posts(df)

        if len(df_filtered) == 0:
            print("Warning: No posts remaining after filtering!")
            return df_filtered

        # apply smart sampling if requested and dataset is large
        if apply_sampling:
            df_filtered = self.smart_sampling(df_filtered, max_posts)

        # extract dates from filename
        df_filtered['date'] = df_filtered['filename'].apply(self.extract_date_from_filename)

        # preprocess text
        print("Preprocessing text...")
        df_filtered['processed_text'] = df_filtered['combined_text'].apply(self.preprocess_text)

        # final filter: remove texts that are too short after preprocessing
        word_counts = df_filtered['processed_text'].apply(lambda x: len(x.split()) if x else 0)
        initial_size = len(df_filtered)
        df_filtered = df_filtered[word_counts >= self.min_words]

        if len(df_filtered) < initial_size:
            print(f"After preprocessing filter (>= {self.min_words} words): {len(df_filtered):,}")

        return df_filtered

    def save_preprocessed_data(self, df, filename):
        """Save preprocessed data to CSV"""
        df.to_csv(filename, index=False)
        print(f"Saved preprocessed data to: {filename}")

def preprocess_all_datasets(data_dir='extracted_datasets'):
    """Preprocess the ETH, BTC and CryptoMarkets Reddit datasets.

    Reads ``<data_dir>/<name>_reddit_data.csv`` for each dataset, runs
    RedditDataPreprocessor.preprocess_dataset on it, builds a combined
    dataset from the non-empty results and prints a summary.

    Parameters:
    - data_dir: folder holding the raw CSV exports

    Returns:
    - (results, preprocessor): ``results`` maps 'eth', 'btc', 'cryptomarkets'
      (and 'combined' when at least one dataset has rows) to DataFrames.
      Returns (None, None) when a CSV file is missing, so callers that unpack
      two values keep working.
    """
    # initialize preprocessor
    preprocessor = RedditDataPreprocessor(
        min_score=-5,
        min_text_length=20,
        max_text_length=5000,
        min_words=5,
        random_state=42
    )

    # dataset key -> label used in the printed report
    display_names = {'eth': 'ETH', 'btc': 'BTC', 'cryptomarkets': 'CryptoMarkets'}

    # load datasets
    print("Loading datasets...")
    try:
        raw_datasets = {
            name: pd.read_csv(f'{data_dir}/{name}_reddit_data.csv')
            for name in display_names
        }
    except FileNotFoundError as e:
        print(f"Error loading datasets: {e}")
        return None, None

    total_original = sum(len(df) for df in raw_datasets.values())

    print(f"Raw dataset sizes:")
    for name, df in raw_datasets.items():
        print(f"  {display_names[name]}: {len(df):,}")
    print(f"  Total: {total_original:,}")

    # preprocess each dataset
    results = {}

    for name, df in raw_datasets.items():
        print(f"\n{'='*60}")
        print(f"PREPROCESSING {name.upper()} DATASET")
        print(f"{'='*60}")

        preprocessed_df = preprocessor.preprocess_dataset(df, apply_sampling=True)

        if len(preprocessed_df) > 0:
            results[name] = preprocessed_df
        else:
            print(f"No data remaining for {name} after preprocessing")
            results[name] = pd.DataFrame()

    # counted before 'combined' is added, otherwise every post is counted twice
    total_processed = sum(len(df) for df in results.values())

    # create combined preprocessed dataset
    print(f"\n{'='*60}")
    print("CREATING COMBINED PREPROCESSED DATASET")
    print(f"{'='*60}")

    non_empty_frames = [df for df in results.values() if len(df) > 0]
    if non_empty_frames:
        combined_preprocessed = pd.concat(non_empty_frames, ignore_index=True)

        # apply smart sampling to combined dataset if it's too large
        if len(combined_preprocessed) > 30000:
            combined_preprocessed = preprocessor.smart_sampling(combined_preprocessed, max_posts=30000)

        results['combined'] = combined_preprocessed
    else:
        # pd.concat raises on an empty list, so skip it explicitly
        print("No preprocessed posts available; combined dataset not created")

    # print summary statistics
    print(f"\n{'='*60}")
    print("PREPROCESSING SUMMARY")
    print(f"{'='*60}")

    print(f"Total original posts: {total_original:,}")
    print(f"Total processed posts: {total_processed:,}")
    if total_original > 0:
        print(f"Overall reduction: {((total_original - total_processed) / total_original * 100):.1f}%")

    print(f"\nDataset breakdown:")
    for name, df in results.items():
        if len(df) > 0:
            print(f"  {name}: {len(df):,} posts")
            if 'date' in df.columns:
                date_range = f"{df['date'].min()} to {df['date'].max()}"
                print(f"    Date range: {date_range}")
            if 'subreddit' in df.columns:
                print(f"    Subreddits: {df['subreddit'].nunique()}")

    return results, preprocessor




In [7]:
# Run the full preprocessing pipeline; both names refer to the same dict of
# preprocessed DataFrames.
preprocessed_datasets, preprocessor = preprocess_all_datasets()
results = preprocessed_datasets


Loading datasets...
Raw dataset sizes:
  ETH: 24,330
  BTC: 25,168
  CryptoMarkets: 13,269
  Total: 62,767

PREPROCESSING ETH DATASET
Initial dataset size: 24,330
After score filter (>= -5): 24,257
After text length filter (20-5000 chars): 24,016
After removing deleted posts: 23,876
Filtering for crypto relevance and removing spam...
After relevance and spam filter: 20,802
Preprocessing text...
After preprocessing filter (>= 5 words): 16,745

PREPROCESSING BTC DATASET
Initial dataset size: 25,168
After score filter (>= -5): 24,993
After text length filter (20-5000 chars): 24,915
After removing deleted posts: 24,747
Filtering for crypto relevance and removing spam...
After relevance and spam filter: 22,276
Preprocessing text...
After preprocessing filter (>= 5 words): 20,658

PREPROCESSING CRYPTOMARKETS DATASET
Initial dataset size: 13,269
After score filter (>= -5): 13,115
After text length filter (20-5000 chars): 13,055
After removing deleted posts: 12,972
Filtering for crypto relevan

In [8]:

from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

class LDATopicModeler:
    def __init__(self, n_topics=10, random_state=42):
        """
        Initialize the topic modeler

        Parameters:
        - n_topics: Number of topics to discover (default: 10)
        - random_state: Random seed for reproducibility
        """
        self.n_topics = n_topics
        self.random_state = random_state
        self.lda_model = None
        self.vectorizer = None
        self.feature_names = None
        self.topic_names = None

    def fit_lda_model(self, texts, optimize_topics=True, max_features=1000):
        """
        Fit LDA model on preprocessed texts

        The document-term matrix holds raw term counts (CountVectorizer), not
        TF-IDF weights: LDA is a generative model over word counts, and
        scikit-learn's own topic-extraction example uses raw counts for LDA.
        The vectorizer settings (vocabulary size, document-frequency limits,
        n-gram range, stop words) are otherwise unchanged.

        Parameters:
        - texts: List of preprocessed text documents
        - optimize_topics: Whether to optimize number of topics via grid search
          (only applied when there are more than 100 documents)
        - max_features: Maximum vocabulary size for the vectorizer

        Returns:
        - the sparse document-term count matrix the model was fitted on

        Side effects:
        - sets self.vectorizer, self.feature_names, self.lda_model,
          self.topic_names and, when optimizing, self.n_topics
        """
        # imported locally because the notebook's import cell only brings in
        # TfidfVectorizer
        from sklearn.feature_extraction.text import CountVectorizer

        print(f"Fitting LDA model on {len(texts):,} documents...")

        # Create term-count vectorizer
        self.vectorizer = CountVectorizer(
            max_features=max_features,
            min_df=2,  # Ignore terms that appear in less than 2 documents
            max_df=0.8,  # Ignore terms that appear in more than 80% of documents
            ngram_range=(1, 2),  # Use both unigrams and bigrams
            stop_words='english'
        )

        # Fit and transform texts
        print("Creating document-term matrix...")
        doc_term_matrix = self.vectorizer.fit_transform(texts)
        self.feature_names = self.vectorizer.get_feature_names_out()

        print(f"Document-term matrix shape: {doc_term_matrix.shape}")

        # optimize number of topics if requested
        if optimize_topics and len(texts) > 100:
            print("Optimizing number of topics...")
            self.n_topics = self._optimize_topic_number(doc_term_matrix)

        # fit final LDA model
        print(f"Fitting LDA with {self.n_topics} topics...")
        self.lda_model = LatentDirichletAllocation(
            n_components=self.n_topics,
            random_state=self.random_state,
            max_iter=20,
            learning_method='batch',
            doc_topic_prior=None,  # use default symmetric prior
            topic_word_prior=None  # use default symmetric prior
        )

        self.lda_model.fit(doc_term_matrix)

        # generate topic names
        self.topic_names = self._generate_topic_names()

        print("LDA model fitting complete!")
        return doc_term_matrix

    def _optimize_topic_number(self, doc_term_matrix):
        """Pick the number of topics by cross-validated log-likelihood.

        Candidates are scored with LatentDirichletAllocation.score (approximate
        log-likelihood on the held-out fold, higher is better). A supervised
        metric such as mean squared error cannot be used here: it needs a
        ``predict`` method and target values, neither of which exists for LDA.

        Parameters:
        - doc_term_matrix: document-term matrix to search on

        Returns:
        - the candidate ``n_components`` with the best mean held-out score
        """
        topic_range = [5, 8, 10, 12, 15]

        # for very large datasets, limit topic range
        if doc_term_matrix.shape[0] > 20000:
            topic_range = [5, 8, 10]

        print(f"Testing topic numbers: {topic_range}")

        param_grid = {'n_components': topic_range}
        lda = LatentDirichletAllocation(
            random_state=self.random_state,
            max_iter=10,
            learning_method='batch'
        )

        # 3-fold cross-validation (the previous expression always evaluated to 3)
        cv_folds = 3

        grid_search = GridSearchCV(
            lda,
            param_grid,
            cv=cv_folds,
            scoring=None,  # fall back to the estimator's own score(): log-likelihood
            n_jobs=-1,
            verbose=0
        )

        grid_search.fit(doc_term_matrix)
        optimal_topics = grid_search.best_params_['n_components']

        print(f"Optimal number of topics: {optimal_topics}")
        return optimal_topics

    def _generate_topic_names(self, n_words=5):
        """Generate interpretable topic names based on top words"""
        if not self.lda_model or self.feature_names is None:
            return []

        topic_names = []

        for topic_idx, topic in enumerate(self.lda_model.components_):
            # get top words for this topic
            top_word_indices = topic.argsort()[-n_words:][::-1]
            top_words = [self.feature_names[i] for i in top_word_indices]

            # create descriptive name
            topic_name = f"Topic_{topic_idx}: {', '.join(top_words[:3])}"
            topic_names.append(topic_name)

        return topic_names

    def get_topic_details(self, n_words=10):
        """Get detailed information about each topic"""
        if not self.lda_model or self.feature_names is None:
            return []

        topic_details = []

        for topic_idx, topic in enumerate(self.lda_model.components_):
            # get top words with their weights
            top_word_indices = topic.argsort()[-n_words:][::-1]
            top_words_weights = [
                (self.feature_names[i], topic[i])
                for i in top_word_indices
            ]

            topic_details.append({
                'topic_id': topic_idx,
                'topic_name': self.topic_names[topic_idx] if self.topic_names else f"Topic_{topic_idx}",
                'top_words': top_words_weights
            })

        return topic_details

    def get_document_topics(self, texts):
        """Get topic probabilities for documents"""
        if not self.lda_model or not self.vectorizer:
            raise ValueError("Model not fitted. Call fit_lda_model first.")

        # transform texts using the fitted vectorizer
        doc_term_matrix = self.vectorizer.transform(texts)

        # get topic probabilities
        doc_topic_probs = self.lda_model.transform(doc_term_matrix)

        return doc_topic_probs

    def create_daily_topic_features(self, df):
        """
        Create daily aggregated topic features from preprocessed dataframe

        Parameters:
        - df: Preprocessed dataframe with 'processed_text', 'date', 'subreddit', 'score' columns

        Returns:
        - DataFrame with daily topic features
        """
        print("Creating daily topic features...")

        if 'processed_text' not in df.columns:
            raise ValueError("DataFrame must contain 'processed_text' column")

        # get topic probabilities for all documents
        topic_probs = self.get_document_topics(df['processed_text'])

        # add topic probabilities to dataframe
        topic_columns = [f'topic_{i}' for i in range(self.n_topics)]
        for i, col in enumerate(topic_columns):
            df[col] = topic_probs[:, i]

        # add dominant topic
        df['dominant_topic'] = topic_probs.argmax(axis=1)

        # calculate topic confidence (max probability)
        df['topic_confidence'] = topic_probs.max(axis=1)

        # group by date and subreddit for daily aggregation
        daily_features_list = []

        groupby_cols = ['date', 'subreddit'] if 'subreddit' in df.columns else ['date']

        for group_keys, group_df in df.groupby(groupby_cols):
            if isinstance(group_keys, tuple):
                date, subreddit = group_keys
            else:
                date, subreddit = group_keys, 'unknown'

            if pd.isna(date):
                continue

            # basic features
            features = {
                'date': date,
                'subreddit': subreddit,
                'post_count': len(group_df),
                'avg_score': group_df['score'].mean(),
                'total_score': group_df['score'].sum(),
                'score_std': group_df['score'].std() if len(group_df) > 1 else 0,
                'avg_topic_confidence': group_df['topic_confidence'].mean(),
            }

            # topic probability features (daily averages and volatility)
            for i, topic_col in enumerate(topic_columns):
                features[f'avg_{topic_col}'] = group_df[topic_col].mean()
                features[f'std_{topic_col}'] = group_df[topic_col].std() if len(group_df) > 1 else 0
                features[f'max_{topic_col}'] = group_df[topic_col].max()

            # dominant topic distribution
            dominant_counts = group_df['dominant_topic'].value_counts()
            for topic_idx in range(self.n_topics):
                count = dominant_counts.get(topic_idx, 0)
                features[f'dominant_topic_{topic_idx}_count'] = count
                features[f'dominant_topic_{topic_idx}_pct'] = count / len(group_df)

            # topic diversity metrics
            # shannon entropy of topic distribution
            topic_dist = group_df['dominant_topic'].value_counts(normalize=True)
            if len(topic_dist) > 1:
                entropy = -sum(p * np.log2(p) for p in topic_dist if p > 0)
                features['topic_diversity_entropy'] = entropy
            else:
                features['topic_diversity_entropy'] = 0

            # number of unique topics discussed
            features['unique_topics_count'] = group_df['dominant_topic'].nunique()

            # topic concentration (how focused the discussion is)
            features['topic_concentration'] = dominant_counts.iloc[0] / len(group_df) if len(dominant_counts) > 0 else 0

            daily_features_list.append(features)

        # convert to DataFrame
        daily_features_df = pd.DataFrame(daily_features_list)

        # sort by date
        daily_features_df = daily_features_df.sort_values('date').reset_index(drop=True)

        print(f"Created daily features: {daily_features_df.shape[0]} days, {daily_features_df.shape[1]} features")

        return daily_features_df

def run_topic_modeling_pipeline(datasets, output_dir='actual_topic_modelled'):
    """
    Main pipeline for topic modeling and daily feature creation

    Fits one LDATopicModeler on the 'combined' dataset (or, if that is absent
    or empty, on the largest other dataset), then builds and saves daily topic
    features for every non-empty dataset and a topic summary CSV.

    Parameters:
    - datasets: Dictionary of preprocessed DataFrames {'eth': df, 'btc': df, etc.}
    - output_dir: folder for the generated CSV files; created if missing

    Returns:
    - (results, modeler): ``results`` maps dataset name to its daily feature
      DataFrame (empty DataFrame if that dataset failed); (None, None) when no
      dataset contains any rows
    """
    # imported locally: `os` is not part of the notebook's import cells
    import os

    # check if we have data
    if not any(len(df) > 0 for df in datasets.values() if isinstance(df, pd.DataFrame)):
        print("No preprocessed data found. Please check your datasets.")
        return None, None

    # use combined dataset for training topics (if available), otherwise use largest dataset
    if 'combined' in datasets and len(datasets['combined']) > 0:
        training_data = datasets['combined']
        print(f"Using combined dataset for topic modeling: {len(training_data):,} posts")
    else:
        # find the largest individual dataset
        largest_name = max(
            [k for k in datasets.keys() if k != 'combined'],
            key=lambda x: len(datasets[x]) if isinstance(datasets[x], pd.DataFrame) else 0
        )
        training_data = datasets[largest_name]
        print(f"Using {largest_name} dataset for topic modeling: {len(training_data):,} posts")

    # initialize and fit topic model
    print(f"\n{'='*60}")
    print("TOPIC MODELING")
    print(f"{'='*60}")

    modeler = LDATopicModeler(n_topics=10, random_state=42)

    # fit LDA model
    modeler.fit_lda_model(
        training_data['processed_text'].tolist(),
        optimize_topics=True,
        max_features=1000
    )

    # display discovered topics
    print(f"\n{'='*60}")
    print("DISCOVERED TOPICS")
    print(f"{'='*60}")

    topic_details = modeler.get_topic_details()
    for topic in topic_details:
        print(f"\n{topic['topic_name']}")
        top_words = [f"{word}({weight:.3f})" for word, weight in topic['top_words'][:5]]
        print(f"  Top words: {', '.join(top_words)}")

    # make sure the output folder exists before any CSV is written; to_csv does
    # not create missing directories
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # create daily features for each dataset
    results = {}

    print(f"\n{'='*60}")
    print("CREATING DAILY TOPIC FEATURES")
    print(f"{'='*60}")

    for name, df in datasets.items():
        if len(df) == 0:
            print(f"\nSkipping {name}: no data")
            continue

        print(f"\nProcessing {name.upper()} dataset...")

        try:
            # Create daily topic features
            daily_features = modeler.create_daily_topic_features(df)

            # save results
            output_file = os.path.join(output_dir, f'{name}_daily_topic_features.csv')
            daily_features.to_csv(output_file, index=False)
            print(f"Saved to: {output_file}")

            # store results
            results[name] = daily_features

            # print summary stats
            if len(daily_features) > 0:
                print(f"  Shape: {daily_features.shape}")
                print(f"  Date range: {daily_features['date'].min()} to {daily_features['date'].max()}")
                print(f"  Avg posts per day: {daily_features['post_count'].mean():.1f}")

                # Show top topics by average probability; only numbered
                # avg_topic_<i> columns count (this excludes avg_topic_confidence)
                prefix = 'avg_topic_'
                topic_cols = [
                    col for col in daily_features.columns
                    if col.startswith(prefix) and col[len(prefix):].isdigit()
                ]
                if topic_cols:
                    avg_topic_probs = daily_features[topic_cols].mean().sort_values(ascending=False)
                    print(f"  Top topics: {', '.join(avg_topic_probs.head(3).index)}")

        except Exception as e:
            # one failing dataset should not stop the others; record it as empty
            print(f"Error processing {name}: {e}")
            results[name] = pd.DataFrame()

    # create summary report
    print(f"\n{'='*60}")
    print("TOPIC MODELING SUMMARY")
    print(f"{'='*60}")

    print(f"\nModel Details:")
    print(f"  Number of topics: {modeler.n_topics}")
    print(f"  Vocabulary size: {len(modeler.feature_names) if modeler.feature_names is not None else 'N/A'}")
    print(f"  Training documents: {len(training_data):,}")

    print(f"\nDaily Features Created:")
    total_days = 0
    for name, df in results.items():
        if len(df) > 0:
            days = len(df)
            total_days += days
            print(f"  {name}: {days} days of features")
        else:
            print(f"  {name}: No features created")

    print(f"\nTotal daily observations: {total_days}")

    # save topic model summary
    topic_summary = pd.DataFrame([
        {
            'topic_id': topic['topic_id'],
            'topic_name': topic['topic_name'],
            'top_3_words': ', '.join([word for word, _ in topic['top_words'][:3]])
        }
        for topic in topic_details
    ])

    summary_file = os.path.join(output_dir, 'lda_topic_summary.csv')
    topic_summary.to_csv(summary_file, index=False)
    print(f"\nSaved topic summary to: {summary_file}")

    return results, modeler



# Alternative: BERTopic implementation (requires additional installation)
def run_bertopic_modeling(texts, n_topics=10):
    """
    Alternative topic modeling using BERTopic

    Parameters:
    - texts: list of documents to model
    - n_topics: passed to BERTopic as ``nr_topics``

    Returns:
    - (topic_model, topics, probs, topic_info), or four Nones when BERTopic or
      sentence-transformers is not installed
    """
    try:
        from bertopic import BERTopic
        # imported only to check that the embedding backend is installed
        from sentence_transformers import SentenceTransformer
        print("Running BERTopic modeling...")

        # initialize BERTopic model
        # NOTE: BERTopic's constructor has no ``random_state`` argument; passing
        # one raises TypeError. For reproducible runs, supply a seeded UMAP
        # instance through ``umap_model`` instead.
        topic_model = BERTopic(
            embedding_model="all-MiniLM-L6-v2",
            nr_topics=n_topics,
            verbose=True
        )

        # fit model and get topics
        topics, probs = topic_model.fit_transform(texts)

        # get topic information
        topic_info = topic_model.get_topic_info()

        print(f"\nBERTopic Results:")
        print(f"Number of topics: {len(topic_info) - 1}")  # -1 because of outlier topic
        print(f"Number of documents: {len(texts)}")

        print(f"\nTop Topics:")
        for idx, row in topic_info.head(10).iterrows():
            if row['Topic'] != -1:  # Skip outlier topic
                print(f"  Topic {row['Topic']}: {row['Name']}")

        return topic_model, topics, probs, topic_info

    except ImportError:
        print("BERTopic not installed. Run: pip install bertopic sentence-transformers")
        return None, None, None, None



In [9]:
def analyze_topic_trends(daily_features_df, modeler, top_n_topics=5):
    """
    Analyze topic trends over time - streamlined for forecasting insights

    Prints the topics with the highest mean daily probability, followed by the
    covered date range and posting volume when those columns exist.

    Parameters:
    - daily_features_df: DataFrame with daily topic features
      (columns named 'avg_topic_<n>' are treated as topic probabilities)
    - modeler: Fitted topic modeler; only its `topic_names` list is read.
      May be None or lack names, in which case generic labels are printed
    - top_n_topics: Number of top topics to analyze

    Returns:
    - None (results are printed)
    """
    if len(daily_features_df) == 0:
        print("No data to analyze")
        return

    print(f"\n{'='*50}")
    print("TOPIC TREND ANALYSIS")
    print(f"{'='*50}")

    def _topic_index(col):
        """Integer suffix of an 'avg_topic_<n>' column, or None for anything else."""
        label = str(col)
        if not label.startswith('avg_topic_'):
            return None
        try:
            return int(label.split('_')[-1])
        except ValueError:
            # e.g. 'avg_topic_confidence' shares the prefix but is not a topic
            return None

    # get only numbered topic columns
    topic_cols = [col for col in daily_features_df.columns if _topic_index(col) is not None]

    if not topic_cols:
        print("No topic columns found")
        return

    # calculate average topic probabilities
    avg_topic_probs = daily_features_df[topic_cols].mean().sort_values(ascending=False)

    # the feature frame may come from a CSV produced by a different model than
    # `modeler`, so never assume every column index has a matching name
    topic_names = getattr(modeler, 'topic_names', None) or []

    print(f"\nTop {top_n_topics} Topics by Average Probability:")
    for i, (topic_col, avg_prob) in enumerate(avg_topic_probs.head(top_n_topics).items()):
        topic_idx = _topic_index(topic_col)
        if 0 <= topic_idx < len(topic_names):
            topic_name = topic_names[topic_idx]
        else:
            topic_name = f"Topic {topic_idx}"
        print(f"  {i+1}. {topic_name}: {avg_prob:.4f}")

    # show basic stats for forecasting context
    if 'date' in daily_features_df.columns:
        print(f"\nAnalysis covers: {daily_features_df['date'].min()} to {daily_features_df['date'].max()}")
        print(f"Total days: {len(daily_features_df)}")
        if 'post_count' in daily_features_df.columns:
            print(f"Average posts per day: {daily_features_df['post_count'].mean():.1f}")

def create_forecasting_features(daily_features_df, lag_days=(1, 3)):
    """
    Create minimal lagged features for time series forecasting

    Lags and rolling means are row based: "lag 1" is the previous observation
    after sorting by date, so gaps in the calendar are not filled in. When the
    frame holds more than one subreddit, every subreddit is lagged against its
    own history rather than against whichever row happens to precede it.

    Parameters:
    - daily_features_df: DataFrame with daily features (needs a 'date' column)
    - lag_days: Lag periods to create (a single int or any iterable of ints)

    Returns:
    - Date-sorted copy of the input with '<col>_lag_<n>' and '<col>_3day_avg'
      columns added; an empty input is returned unchanged
    """
    if len(daily_features_df) == 0:
        return daily_features_df

    print(f"Creating minimal forecasting features...")

    if isinstance(lag_days, int):
        lag_days = (lag_days,)

    # sort by date (stable sort keeps the incoming order of same-day rows)
    df = daily_features_df.sort_values('date', kind='mergesort').copy()

    # only create features for key columns that matter for forecasting
    key_features = [
        'post_count',           # volume of discussion
        'avg_score',            # sentiment proxy
        'avg_topic_0',          # top topics only
        'avg_topic_1',
        'avg_topic_4',
        'avg_topic_7'
    ]

    # filter to existing columns
    key_features = [col for col in key_features if col in df.columns]

    # several subreddits share each date in a combined frame, so a plain
    # positional shift would mix their series together
    per_subreddit = 'subreddit' in df.columns and df['subreddit'].nunique(dropna=False) > 1

    def _history(col):
        """Column `col`, grouped by subreddit when the frame mixes several."""
        if per_subreddit:
            return df.groupby('subreddit', dropna=False, sort=False)[col]
        return df[col]

    # create only short-term lags (1-3 days) for key features
    for lag in lag_days:
        for col in key_features:
            df[f'{col}_lag_{lag}'] = _history(col).shift(lag)

    # create 3-day rolling average only for post_count and avg_score
    important_cols = ['post_count', 'avg_score']
    important_cols = [col for col in important_cols if col in df.columns]

    for col in important_cols:
        if per_subreddit:
            df[f'{col}_3day_avg'] = _history(col).transform(
                lambda values: values.rolling(window=3, min_periods=1).mean()
            )
        else:
            df[f'{col}_3day_avg'] = df[col].rolling(window=3, min_periods=1).mean()

    print(f"Created forecasting features: {df.shape[1]} total columns")

    return df

In [10]:
# preprocessed_datasets = results

# # run the topic modeling pipeline
# results, modeler = run_topic_modeling_pipeline(preprocessed_datasets)

In [11]:

# # analyze trends for each crypto
# for name, daily_df in results.items():
#     if len(daily_df) > 0:
#         print(f"\n{'='*60}")
#         print(f"ANALYZING {name.upper()} TRENDS")
#         print(f"{'='*60}")

#         # Get trend analysis
#         analyze_topic_trends(daily_df, modeler)

#         # Create forecasting features with lags
#         forecasting_df = create_forecasting_features(daily_df)

#         # Save if needed
#         forecasting_df.to_csv(f'actual_topic_modelled/{name}_lda_forecasting_features.csv', index=False)

In [12]:
# Reload the per-dataset LDA forecasting feature tables from CSV, one frame per
# dataset name (eth, btc, cryptomarkets, combined).
# NOTE(review): the only visible code that writes '<name>_lda_forecasting_features.csv'
# is in the commented-out cell above, so these files presumably come from an
# earlier run — confirm they exist before running this cell on a fresh checkout.
eth_lda_forecast_df = pd.read_csv('actual_topic_modelled/eth_lda_forecasting_features.csv')
btc_lda_forecast_df = pd.read_csv('actual_topic_modelled/btc_lda_forecasting_features.csv')
cryptomarkets_lda_forecast_df = pd.read_csv('actual_topic_modelled/cryptomarkets_lda_forecasting_features.csv')
combined_lda_forecast_df = pd.read_csv('actual_topic_modelled/combined_lda_forecasting_features.csv')

In [13]:
# Load the saved topic summary (one row per topic: id, name, top three words)
# and display it.
# NOTE(review): the LDA pipeline defined earlier in this notebook writes its
# summary to 'actual_topic_modelled/lda_topic_summary.csv' with index=False,
# while this cell reads 'topic_summary.csv', whose displayed frame carries an
# 'Unnamed: 0' column — presumably a file left over from an earlier run; confirm
# which file is meant to be inspected here.
topic_summary = pd.read_csv('actual_topic_modelled/topic_summary.csv')
topic_summary

Unnamed: 0,topic_id,topic_name,top_3_words
0,0,"Topic_0: loan, taking, usd","loan, taking, usd"
1,1,"Topic_1: bch, cash, money","bch, cash, money"
2,2,"Topic_2: term, long, expect","term, long, expect"
3,3,"Topic_3: exactly, april, hit","exactly, april, hit"
4,4,"Topic_4: today, bought, bought today","today, bought, bought today"


In [14]:
# Print column names, dtypes and non-null counts of the reloaded BTC feature frame.
btc_lda_forecast_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 47 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   date                     33 non-null     object 
 1   subreddit                33 non-null     object 
 2   post_count               33 non-null     int64  
 3   avg_score                33 non-null     float64
 4   total_score              33 non-null     float64
 5   score_std                33 non-null     float64
 6   avg_topic_confidence     33 non-null     float64
 7   avg_topic_0              33 non-null     float64
 8   std_topic_0              33 non-null     float64
 9   max_topic_0              33 non-null     float64
 10  avg_topic_1              33 non-null     float64
 11  std_topic_1              33 non-null     float64
 12  max_topic_1              33 non-null     float64
 13  avg_topic_2              33 non-null     float64
 14  std_topic_2              33 

**BERTopic**

In [15]:
import pandas as pd
import numpy as np

# CLEAN SOLUTION: Just modify your existing CryptoTopicModeler class
class BERTopicCryptoModeler:
    """
    BERTopic-backed topic modeler exposing the same method names as the LDA
    modeler (fit_model, get_topic_details, get_document_topics,
    create_daily_topic_features) so the surrounding pipeline code can use either.
    """

    def __init__(self, n_topics=10, random_state=42):
        """
        BERTopic version - same interface as LDA

        Parameters:
        - n_topics: Requested number of topics; replaced by the number of
          topics actually found once fit_model succeeds
        - random_state: Stored on the instance only; fit_model does not pass
          it on to BERTopic
        """
        self.n_topics = n_topics
        self.random_state = random_state  # Store for later use
        self.topic_model = None
        self.topic_names = None

    def _valid_topic_ids(self):
        """Topic ids reported by the fitted model, without the -1 outlier bucket."""
        topic_info = self.topic_model.get_topic_info()
        return topic_info.loc[topic_info['Topic'] != -1, 'Topic'].tolist()

    def fit_model(self, texts, optimize_topics=True, max_features=1000):
        """
        Fit BERTopic model - same method name for compatibility

        Parameters:
        - texts: Documents to fit on
        - optimize_topics: When True BERTopic chooses the number of topics
          itself; when False the model is reduced to `self.n_topics`
        - max_features: Accepted for signature compatibility with the LDA
          modeler; not used here

        Returns:
        - A zero-filled placeholder array on success (callers of the LDA
          version receive a document-term matrix), or None when fitting failed
        """
        try:
            from bertopic import BERTopic
            texts = list(texts)
            print(f"Fitting BERTopic model on {len(texts):,} documents...")

            self.topic_model = BERTopic(
                embedding_model="all-MiniLM-L6-v2",
                nr_topics=self.n_topics if not optimize_topics else None,
                verbose=True,
                calculate_probabilities=True
            )

            # fit model
            topics, probs = self.topic_model.fit_transform(texts)

            # the model decides how many topics survive; -1 is the outlier bucket
            valid_topic_ids = self._valid_topic_ids()
            self.n_topics = len(valid_topic_ids)

            # generate topic names compatible with existing code
            self.topic_names = []
            for topic_id in valid_topic_ids:
                topic_words = self.topic_model.get_topic(topic_id)
                top_3_words = [word for word, _ in topic_words[:3]]
                self.topic_names.append(f"Topic_{topic_id}: {', '.join(top_3_words)}")

            print(f"BERTopic model fitting complete! Found {self.n_topics} topics")
            return np.zeros((len(texts), 100))  # dummy return for compatibility

        except ImportError:
            print("BERTopic not installed. Run: pip install bertopic sentence-transformers")
            return None
        except Exception as e:
            # do not leave a half-initialised model behind: later calls should
            # report "not fitted" instead of failing somewhere inside BERTopic
            self.topic_model = None
            self.topic_names = None
            print(f"Error fitting BERTopic model: {e}")
            return None

    def get_topic_details(self, n_words=10):
        """
        Get topic details - same format as LDA

        Returns:
        - List of dicts with 'topic_id' (sequential position), 'topic_name'
          and 'top_words' ((word, weight) pairs, at most n_words of them);
          an empty list when the model is not fitted
        """
        if self.topic_model is None or not self.topic_names:
            return []

        topic_details = []
        for i, topic_id in enumerate(self._valid_topic_ids()):
            topic_words = self.topic_model.get_topic(topic_id)
            top_words_weights = [(word, score) for word, score in topic_words[:n_words]]

            topic_details.append({
                'topic_id': i,
                'topic_name': self.topic_names[i],
                'top_words': top_words_weights
            })

        return topic_details

    def get_document_topics(self, texts):
        """
        Get topic probabilities - same format as LDA

        Parameters:
        - texts: Documents to score (any iterable of strings)

        Returns:
        - Array of shape (n_docs, self.n_topics); column i belongs to the
          i-th non-outlier topic

        Raises:
        - ValueError if the model has not been fitted
        """
        if self.topic_model is None:
            raise ValueError("Model not fitted. Call fit_model first.")

        # a plain list keeps positional access independent of any pandas index
        texts = list(texts)
        topics, probs = self.topic_model.transform(texts)

        n_docs = len(texts)
        doc_topic_probs = np.zeros((n_docs, self.n_topics))

        # map BERTopic IDs to sequential IDs
        topic_id_mapping = {
            bert_id: seq_id for seq_id, bert_id in enumerate(self._valid_topic_ids())
        }

        prob_matrix = None if probs is None else np.asarray(probs, dtype=float)

        if prob_matrix is not None and prob_matrix.ndim == 2 and prob_matrix.shape[0] == n_docs:
            # calculate_probabilities=True yields one column per topic id
            # (outlier excluded). Keeping the whole row means outlier documents
            # still carry their soft memberships instead of an all-zero row,
            # which argmax would silently report as topic 0.
            for bert_id, seq_id in topic_id_mapping.items():
                if 0 <= bert_id < prob_matrix.shape[1] and seq_id < self.n_topics:
                    doc_topic_probs[:, seq_id] = prob_matrix[:, bert_id]
            return doc_topic_probs

        # fallback: only one probability per document is available (or none at
        # all), so it is credited to the assigned topic
        for doc_idx, topic_id in enumerate(topics):
            seq_id = topic_id_mapping.get(topic_id)
            if seq_id is None or seq_id >= self.n_topics:
                continue
            if prob_matrix is None:
                doc_topic_probs[doc_idx, seq_id] = 1.0
            else:
                doc_topic_probs[doc_idx, seq_id] = float(np.max(prob_matrix[doc_idx]))

        return doc_topic_probs

    def create_daily_topic_features(self, df):
        """
        COMPLETE METHOD: Create daily topic features - same as LDA version

        Parameters:
        - df: DataFrame with 'processed_text', 'date' and 'score' columns and
          optionally 'subreddit'

        Returns:
        - DataFrame with one row per (date, subreddit) group, sorted by date;
          empty when no group has a usable date

        Raises:
        - ValueError if 'processed_text' is missing
        """
        print("Creating daily topic features...")

        if 'processed_text' not in df.columns:
            raise ValueError("DataFrame must contain 'processed_text' column")

        # get topic probabilities for all documents
        topic_probs = self.get_document_topics(df['processed_text'])

        # add topic probabilities to dataframe (make a copy to avoid modifying original)
        df_copy = df.copy()
        topic_columns = [f'topic_{i}' for i in range(self.n_topics)]
        for i, col in enumerate(topic_columns):
            df_copy[col] = topic_probs[:, i]

        # add dominant topic
        df_copy['dominant_topic'] = topic_probs.argmax(axis=1)
        df_copy['topic_confidence'] = topic_probs.max(axis=1)

        # group by date and subreddit for daily aggregation
        daily_features_list = []
        groupby_cols = ['date', 'subreddit'] if 'subreddit' in df_copy.columns else ['date']

        for group_keys, group_df in df_copy.groupby(groupby_cols):
            # pandas >= 2 yields a 1-tuple when grouping by a one-element list,
            # older versions a bare scalar; normalise before unpacking
            if not isinstance(group_keys, tuple):
                group_keys = (group_keys,)
            date = group_keys[0]
            subreddit = group_keys[1] if len(group_keys) > 1 else 'unknown'

            if pd.isna(date):
                continue

            # basic features
            features = {
                'date': date,
                'subreddit': subreddit,
                'post_count': len(group_df),
                'avg_score': group_df['score'].mean(),
                'total_score': group_df['score'].sum(),
                'score_std': group_df['score'].std() if len(group_df) > 1 else 0,
                'avg_topic_confidence': group_df['topic_confidence'].mean(),
            }

            # topic probability features
            for topic_col in topic_columns:
                features[f'avg_{topic_col}'] = group_df[topic_col].mean()
                features[f'std_{topic_col}'] = group_df[topic_col].std() if len(group_df) > 1 else 0
                features[f'max_{topic_col}'] = group_df[topic_col].max()

            # dominant topic distribution
            dominant_counts = group_df['dominant_topic'].value_counts()
            for topic_idx in range(self.n_topics):
                count = dominant_counts.get(topic_idx, 0)
                features[f'dominant_topic_{topic_idx}_count'] = count
                features[f'dominant_topic_{topic_idx}_pct'] = count / len(group_df)

            # topic diversity metrics
            topic_dist = group_df['dominant_topic'].value_counts(normalize=True)
            if len(topic_dist) > 1:
                entropy = -sum(p * np.log2(p) for p in topic_dist if p > 0)
                features['topic_diversity_entropy'] = entropy
            else:
                features['topic_diversity_entropy'] = 0

            features['unique_topics_count'] = group_df['dominant_topic'].nunique()
            features['topic_concentration'] = dominant_counts.iloc[0] / len(group_df) if len(dominant_counts) > 0 else 0

            daily_features_list.append(features)

        # convert to DataFrame; an empty result has no 'date' column to sort on
        daily_features_df = pd.DataFrame(daily_features_list)
        if not daily_features_df.empty:
            daily_features_df = daily_features_df.sort_values('date').reset_index(drop=True)

        print(f"Created daily features: {daily_features_df.shape[0]} days, {daily_features_df.shape[1]} features")
        return daily_features_df



def run_bertopic_pipeline(datasets):
    """
    Fit one BERTopic model and derive daily topic features for every dataset.

    The model is trained on the 'combined' dataset when it has rows, otherwise
    on the largest of the remaining datasets. Each non-empty dataset is then
    aggregated into daily features and written to
    'actual_topic_modelled/<name>_daily_bertopic_features.csv'.

    Parameters:
    - datasets: Dict mapping dataset name to a preprocessed DataFrame

    Returns:
    - (results, modeler) where results maps dataset name to its daily feature
      frame (an empty frame when processing that dataset failed), or
      (None, None) when no dataset contains any rows
    """

    def _banner(title):
        # three-line section header shared by all pipeline stages
        print(f"\n{'='*60}")
        print(title)
        print(f"{'='*60}")

    def _row_count(key):
        # non-DataFrame entries count as empty when looking for the largest dataset
        candidate = datasets[key]
        return len(candidate) if isinstance(candidate, pd.DataFrame) else 0

    # bail out unless at least one real DataFrame has rows
    has_rows = False
    for frame in datasets.values():
        if isinstance(frame, pd.DataFrame) and len(frame) > 0:
            has_rows = True
            break
    if not has_rows:
        print("No preprocessed data found.")
        return None, None

    # pick the training corpus
    use_combined = 'combined' in datasets and len(datasets['combined']) > 0
    if use_combined:
        training_data = datasets['combined']
        print(f"Using combined dataset for topic modeling: {len(training_data):,} posts")
    else:
        fallback_names = [key for key in datasets.keys() if key != 'combined']
        largest_name = max(fallback_names, key=_row_count)
        training_data = datasets[largest_name]
        print(f"Using {largest_name} dataset for topic modeling: {len(training_data):,} posts")

    _banner("BERTOPIC MODELING")

    modeler = BERTopicCryptoModeler(n_topics=10, random_state=42)
    training_texts = training_data['processed_text'].tolist()
    modeler.fit_model(training_texts, optimize_topics=False, max_features=1000)

    # show what the model found
    _banner("DISCOVERED TOPICS")

    for topic in modeler.get_topic_details():
        print(f"\n{topic['topic_name']}")
        top_words = [f"{word}({weight:.3f})" for word, weight in topic['top_words'][:5]]
        print(f"  Top words: {', '.join(top_words)}")

    # aggregate every dataset with the shared model
    results = {}
    _banner("CREATING DAILY TOPIC FEATURES")

    for name, df in datasets.items():
        if len(df) == 0:
            print(f"\nSkipping {name}: no data")
            continue

        print(f"\nProcessing {name.upper()} dataset...")

        try:
            daily_features = modeler.create_daily_topic_features(df)

            output_file = f'actual_topic_modelled/{name}_daily_bertopic_features.csv'
            daily_features.to_csv(output_file, index=False)
            print(f"Saved to: {output_file}")

            results[name] = daily_features

            if len(daily_features) > 0:
                first_day = daily_features['date'].min()
                last_day = daily_features['date'].max()
                print(f"  Shape: {daily_features.shape}")
                print(f"  Date range: {first_day} to {last_day}")
                print(f"  Avg posts per day: {daily_features['post_count'].mean():.1f}")

        except Exception as e:
            # one failing dataset must not stop the others; record it as empty
            print(f"Error processing {name}: {e}")
            results[name] = pd.DataFrame()

    return results, modeler






In [None]:
# Fit BERTopic on the preprocessed datasets, then summarise each dataset's
# topic trends and save its lagged forecasting features.
# NOTE(review): in this part of the notebook `preprocessed_datasets` is only
# assigned inside a commented-out cell — confirm it is defined before running
# this cell on a fresh kernel.
results, modeler = run_bertopic_pipeline(preprocessed_datasets)

# run_bertopic_pipeline returns (None, None) when there is nothing to model,
# so fall back to an empty mapping instead of calling .items() on None
for name, daily_df in (results or {}).items():
    if len(daily_df) > 0:
        analyze_topic_trends(daily_df, modeler)
        forecasting_df = create_forecasting_features(daily_df)
        forecasting_df.to_csv(f'actual_topic_modelled/{name}_bertopic_forecasting_features.csv', index=False)

Using combined dataset for topic modeling: 30,000 posts

BERTOPIC MODELING


2025-09-21 10:48:54,408 - BERTopic - Embedding - Transforming documents to embeddings.


Fitting BERTopic model on 30,000 documents...


Batches:   0%|          | 0/938 [00:00<?, ?it/s]

2025-09-21 10:55:59,258 - BERTopic - Embedding - Completed ✓
2025-09-21 10:55:59,260 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-21 10:57:29,503 - BERTopic - Dimensionality - Completed ✓
2025-09-21 10:57:29,507 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-21 13:12:44,828 - BERTopic - Cluster - Completed ✓
2025-09-21 13:12:44,855 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-09-21 13:12:47,602 - BERTopic - Representation - Completed ✓
2025-09-21 13:12:47,605 - BERTopic - Topic reduction - Reducing number of topics
2025-09-21 13:12:47,825 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-21 13:12:48,515 - BERTopic - Representation - Completed ✓
2025-09-21 13:12:48,522 - BERTopic - Topic reduction - Reduced number of topics from 1064 to 10


BERTopic model fitting complete! Found 9 topics

DISCOVERED TOPICS

Topic_0: year, money, loan
  Top words: year(0.019), money(0.016), loan(0.016), bought(0.016), next(0.014)

Topic_1: node, proof, work
  Top words: node(0.297), proof(0.203), work(0.191), enable(0.133), axiom(0.130)

Topic_2: sub, discussion, ticker
  Top words: sub(0.240), discussion(0.161), ticker(0.141), subreddit(0.133), name(0.123)

Topic_3: want, two, thing
  Top words: want(0.402), two(0.391), thing(0.312), red(0.248), brings(0.234)

Topic_4: goog, amzn, meta
  Top words: goog(0.386), amzn(0.385), meta(0.364), silver(0.341), bigger(0.302)

Topic_5: supply, xcv, total
  Top words: supply(0.350), xcv(0.216), total(0.190), genesis(0.187), flow(0.156)

Topic_6: que, pero, refuge
  Top words: que(0.336), pero(0.142), refuge(0.124), valeur(0.124), una(0.123)

Topic_7: announces, agency, chatgpt
  Top words: announces(0.741), agency(0.692), chatgpt(0.605), integration(0.573), across(0.506)

Topic_8: javascript, functio

Batches:   0%|          | 0/938 [00:00<?, ?it/s]

2025-09-21 13:19:46,660 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-09-21 13:19:46,765 - BERTopic - Dimensionality - Completed ✓
2025-09-21 13:19:46,768 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-09-21 13:19:48,929 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN


**Refactoring attempt: shared base class for the LDA and BERTopic modelers**

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')


class BaseTopicModeler:
    """
    Base class containing all shared functionality for topic modeling

    Subclasses provide fit_model, get_topic_details and get_document_topics;
    the daily aggregation implemented here only relies on get_document_topics
    and on `self.n_topics`.
    """

    def __init__(self, n_topics=10, random_state=42):
        """
        Initialize base topic modeler

        Parameters:
        - n_topics: Number of topics to discover
        - random_state: Random seed for reproducibility
        """
        self.n_topics = n_topics
        self.random_state = random_state
        self.topic_names = None

    def create_daily_topic_features(self, df):
        """
        Create daily aggregated topic features from preprocessed dataframe
        This method is identical for both LDA and BERTopic

        Parameters:
        - df: Preprocessed dataframe with 'processed_text', 'date' and 'score'
          columns and optionally 'subreddit'

        Returns:
        - DataFrame with one row per (date, subreddit) group, sorted by date;
          empty when no group has a usable date

        Raises:
        - ValueError if 'processed_text' is missing
        """
        print("Creating daily topic features...")

        if 'processed_text' not in df.columns:
            raise ValueError("DataFrame must contain 'processed_text' column")

        # Get topic probabilities for all documents
        topic_probs = np.asarray(self.get_document_topics(df['processed_text']))

        # Add topic probabilities to dataframe (make a copy)
        df_copy = df.copy()
        topic_columns = [f'topic_{i}' for i in range(self.n_topics)]
        for i, col in enumerate(topic_columns):
            df_copy[col] = topic_probs[:, i]

        # Add dominant topic and confidence
        df_copy['dominant_topic'] = topic_probs.argmax(axis=1)
        df_copy['topic_confidence'] = topic_probs.max(axis=1)

        # Group by date and subreddit for daily aggregation
        daily_features_list = []
        groupby_cols = ['date', 'subreddit'] if 'subreddit' in df_copy.columns else ['date']

        for group_keys, group_df in df_copy.groupby(groupby_cols):
            # pandas >= 2 yields a 1-tuple when grouping by a one-element list,
            # older versions a bare scalar; normalise before unpacking
            if not isinstance(group_keys, tuple):
                group_keys = (group_keys,)
            date = group_keys[0]
            subreddit = group_keys[1] if len(group_keys) > 1 else 'unknown'

            if pd.isna(date):
                continue

            group_size = len(group_df)

            # Basic features
            features = {
                'date': date,
                'subreddit': subreddit,
                'post_count': group_size,
                'avg_score': group_df['score'].mean(),
                'total_score': group_df['score'].sum(),
                'score_std': group_df['score'].std() if group_size > 1 else 0,
                'avg_topic_confidence': group_df['topic_confidence'].mean(),
            }

            # Topic probability features
            for topic_col in topic_columns:
                features[f'avg_{topic_col}'] = group_df[topic_col].mean()
                features[f'std_{topic_col}'] = group_df[topic_col].std() if group_size > 1 else 0
                features[f'max_{topic_col}'] = group_df[topic_col].max()

            # Dominant topic distribution
            dominant_counts = group_df['dominant_topic'].value_counts()
            for topic_idx in range(self.n_topics):
                count = dominant_counts.get(topic_idx, 0)
                features[f'dominant_topic_{topic_idx}_count'] = count
                features[f'dominant_topic_{topic_idx}_pct'] = count / group_size

            # Topic diversity metrics
            topic_dist = group_df['dominant_topic'].value_counts(normalize=True)
            if len(topic_dist) > 1:
                entropy = -sum(p * np.log2(p) for p in topic_dist if p > 0)
                features['topic_diversity_entropy'] = entropy
            else:
                features['topic_diversity_entropy'] = 0

            features['unique_topics_count'] = group_df['dominant_topic'].nunique()
            features['topic_concentration'] = dominant_counts.iloc[0] / group_size if len(dominant_counts) > 0 else 0

            daily_features_list.append(features)

        # Convert to DataFrame; an empty result has no 'date' column to sort on
        daily_features_df = pd.DataFrame(daily_features_list)
        if not daily_features_df.empty:
            daily_features_df = daily_features_df.sort_values('date').reset_index(drop=True)

        print(f"Created daily features: {daily_features_df.shape[0]} days, {daily_features_df.shape[1]} features")
        return daily_features_df

    # Abstract methods that must be implemented by subclasses
    def fit_model(self, texts, **kwargs):
        """Must be implemented by subclasses"""
        raise NotImplementedError("Subclasses must implement fit_model")

    def get_topic_details(self, n_words=10):
        """Must be implemented by subclasses"""
        raise NotImplementedError("Subclasses must implement get_topic_details")

    def get_document_topics(self, texts):
        """Must be implemented by subclasses"""
        raise NotImplementedError("Subclasses must implement get_document_topics")


class LDATopicModeler(BaseTopicModeler):
    """
    LDA-specific topic modeling implementation

    Vectorises documents with TF-IDF (uni- and bigrams) and fits scikit-learn's
    LatentDirichletAllocation on the resulting matrix.
    """

    def __init__(self, n_topics=10, random_state=42):
        """
        Parameters:
        - n_topics: Number of topics; may be replaced by the grid-search
          result when fit_model is called with optimize_topics=True
        - random_state: Seed passed to every LDA estimator that is created
        """
        super().__init__(n_topics, random_state)
        self.lda_model = None
        self.vectorizer = None
        self.feature_names = None

    def fit_model(self, texts, optimize_topics=True, max_features=1000):
        """
        Fit LDA model on preprocessed texts

        Parameters:
        - texts: Documents to fit on
        - optimize_topics: Search for the number of topics first (only done
          when more than 100 documents are supplied)
        - max_features: Vocabulary size limit for the vectorizer

        Returns:
        - The document-term matrix the model was fitted on
        """
        print(f"Fitting LDA model on {len(texts):,} documents...")

        # Create TF-IDF vectorizer
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            min_df=2,
            max_df=0.8,
            ngram_range=(1, 2),
            stop_words='english'
        )

        # Fit and transform texts
        print("Creating document-term matrix...")
        doc_term_matrix = self.vectorizer.fit_transform(texts)
        self.feature_names = self.vectorizer.get_feature_names_out()
        print(f"Document-term matrix shape: {doc_term_matrix.shape}")

        # Optimize number of topics if requested
        if optimize_topics and len(texts) > 100:
            print("Optimizing number of topics...")
            self.n_topics = self._optimize_topic_number(doc_term_matrix)

        # Fit final LDA model
        print(f"Fitting LDA with {self.n_topics} topics...")
        self.lda_model = LatentDirichletAllocation(
            n_components=self.n_topics,
            random_state=self.random_state,
            max_iter=20,
            learning_method='batch'
        )

        self.lda_model.fit(doc_term_matrix)
        self.topic_names = self._generate_topic_names()

        print("LDA model fitting complete!")
        return doc_term_matrix

    def _optimize_topic_number(self, doc_term_matrix):
        """
        Optimize number of topics using grid search

        Candidates are ranked by LatentDirichletAllocation.score (approximate
        log-likelihood on the held-out fold, higher is better).

        Returns:
        - The best-scoring n_components value
        """
        topic_range = [5, 8, 10, 12, 15]
        if doc_term_matrix.shape[0] > 20000:
            # fewer candidates on large corpora to keep the search affordable
            topic_range = [5, 8, 10]

        print(f"Testing topic numbers: {topic_range}")

        param_grid = {'n_components': topic_range}
        lda = LatentDirichletAllocation(
            random_state=self.random_state,
            max_iter=10,
            learning_method='batch'
        )

        # The fold count is a property of the data split, not of how many
        # candidates are tried (the old expression always evaluated to 3).
        cv_folds = 3

        # No `scoring` argument: GridSearchCV then uses the estimator's own
        # score(). A regression metric such as 'neg_mean_squared_error' needs
        # predict() and target values, neither of which exists for an
        # unsupervised LDA, so it cannot rank the candidates.
        grid_search = GridSearchCV(
            lda, param_grid, cv=cv_folds,
            n_jobs=-1, verbose=0
        )

        grid_search.fit(doc_term_matrix)
        optimal_topics = grid_search.best_params_['n_components']

        print(f"Optimal number of topics: {optimal_topics}")
        return optimal_topics

    def _generate_topic_names(self, n_words=3):
        """Generate interpretable topic names ('Topic_<i>: w1, w2, w3')"""
        if self.lda_model is None or self.feature_names is None:
            return []

        topic_names = []
        for topic_idx, topic in enumerate(self.lda_model.components_):
            # argsort is ascending: take the last n_words and reverse them
            top_word_indices = topic.argsort()[-n_words:][::-1]
            top_words = [self.feature_names[i] for i in top_word_indices]
            topic_names.append(f"Topic_{topic_idx}: {', '.join(top_words)}")

        return topic_names

    def get_topic_details(self, n_words=10):
        """
        Get detailed information about each topic

        Returns:
        - List of dicts with 'topic_id', 'topic_name' and 'top_words'
          ((word, weight) pairs, heaviest first); empty when not fitted
        """
        if self.lda_model is None or self.feature_names is None:
            return []

        topic_details = []
        for topic_idx, topic in enumerate(self.lda_model.components_):
            top_word_indices = topic.argsort()[-n_words:][::-1]
            top_words_weights = [
                (self.feature_names[i], topic[i])
                for i in top_word_indices
            ]

            topic_details.append({
                'topic_id': topic_idx,
                'topic_name': self.topic_names[topic_idx],
                'top_words': top_words_weights
            })

        return topic_details

    def get_document_topics(self, texts):
        """
        Get topic probabilities for documents

        Returns:
        - Array of shape (n_docs, n_topics) from the fitted LDA model

        Raises:
        - ValueError if the model has not been fitted
        """
        if self.lda_model is None or self.vectorizer is None:
            raise ValueError("Model not fitted. Call fit_model first.")

        doc_term_matrix = self.vectorizer.transform(texts)
        doc_topic_probs = self.lda_model.transform(doc_term_matrix)
        return doc_topic_probs


class BERTopicModeler(BaseTopicModeler):
    """
    BERTopic-specific topic modeling implementation
    """

    def __init__(self, n_topics=10, random_state=42):
        """
        Parameters:
        - n_topics: Requested number of topics; replaced by the number of
          topics actually found once fit_model succeeds
        - random_state: Stored by the base class; fit_model does not pass it
          on to BERTopic
        """
        super().__init__(n_topics, random_state)
        self.topic_model = None

    def fit_model(self, texts, optimize_topics=True, max_features=1000):
        """
        Fit BERTopic model on preprocessed texts

        Parameters:
        - texts: Documents to fit on
        - optimize_topics: When True BERTopic chooses the number of topics
          itself; when False the model is reduced to `self.n_topics`
        - max_features: Accepted for signature compatibility with the LDA
          modeler; not used here

        Returns:
        - A zero-filled placeholder array on success, or None when fitting failed
        """
        try:
            from bertopic import BERTopic
            texts = list(texts)
            print(f"Fitting BERTopic model on {len(texts):,} documents...")

            # Initialize BERTopic model
            self.topic_model = BERTopic(
                embedding_model="all-MiniLM-L6-v2",
                nr_topics=self.n_topics if not optimize_topics else None,
                verbose=True,
                calculate_probabilities=True
            )

            # Fit model
            topics, probs = self.topic_model.fit_transform(texts)

            # Get topic info and update n_topics (-1 is the outlier bucket)
            topic_info = self.topic_model.get_topic_info()
            valid_topics = topic_info[topic_info['Topic'] != -1]
            self.n_topics = len(valid_topics)

            # Generate topic names
            self.topic_names = self._generate_topic_names(valid_topics)

            print(f"BERTopic model fitting complete! Found {self.n_topics} topics")
            return np.zeros((len(texts), 100))  # Dummy return for compatibility

        except ImportError:
            print("BERTopic not installed. Run: pip install bertopic sentence-transformers")
            return None
        except Exception as e:
            # do not leave a half-initialised model behind: later calls should
            # report "not fitted" instead of failing somewhere inside BERTopic
            self.topic_model = None
            self.topic_names = None
            print(f"Error fitting BERTopic model: {e}")
            return None

    def _generate_topic_names(self, valid_topics):
        """Generate topic names for BERTopic ('Topic_<id>: w1, w2, w3')"""
        topic_names = []
        for _, row in valid_topics.iterrows():
            topic_id = row['Topic']
            topic_words = self.topic_model.get_topic(topic_id)
            top_3_words = [word for word, _ in topic_words[:3]]
            topic_names.append(f"Topic_{topic_id}: {', '.join(top_3_words)}")
        return topic_names

    def get_topic_details(self, n_words=10):
        """
        Get detailed information about each topic

        Returns:
        - List of dicts with 'topic_id' (sequential position), 'topic_name'
          and 'top_words' ((word, weight) pairs); empty when not fitted
        """
        if self.topic_model is None or not self.topic_names:
            return []

        topic_details = []
        topic_info = self.topic_model.get_topic_info()
        valid_topics = topic_info[topic_info['Topic'] != -1]

        for i, (_, row) in enumerate(valid_topics.iterrows()):
            topic_id = row['Topic']
            topic_words = self.topic_model.get_topic(topic_id)
            top_words_weights = [(word, score) for word, score in topic_words[:n_words]]

            topic_details.append({
                'topic_id': i,
                'topic_name': self.topic_names[i],
                'top_words': top_words_weights
            })

        return topic_details

    def get_document_topics(self, texts):
        """
        Get topic probabilities for documents

        Parameters:
        - texts: Documents to score (any iterable of strings)

        Returns:
        - Array of shape (n_docs, self.n_topics); column i belongs to the
          i-th non-outlier topic

        Raises:
        - ValueError if the model has not been fitted
        """
        if self.topic_model is None:
            raise ValueError("Model not fitted. Call fit_model first.")

        # a plain list keeps positional access independent of any pandas index
        texts = list(texts)
        topics, probs = self.topic_model.transform(texts)

        # Create probability matrix like LDA
        n_docs = len(texts)
        doc_topic_probs = np.zeros((n_docs, self.n_topics))

        # Map BERTopic IDs to sequential IDs
        topic_info = self.topic_model.get_topic_info()
        valid_topics = topic_info[topic_info['Topic'] != -1]['Topic'].tolist()
        topic_id_mapping = {bert_id: seq_id for seq_id, bert_id in enumerate(valid_topics)}

        prob_matrix = None if probs is None else np.asarray(probs, dtype=float)

        if prob_matrix is not None and prob_matrix.ndim == 2 and prob_matrix.shape[0] == n_docs:
            # calculate_probabilities=True yields one column per topic id
            # (outlier excluded). Keeping the whole row means outlier documents
            # still carry their soft memberships instead of an all-zero row,
            # which argmax would silently report as topic 0.
            for bert_id, seq_id in topic_id_mapping.items():
                if 0 <= bert_id < prob_matrix.shape[1] and seq_id < self.n_topics:
                    doc_topic_probs[:, seq_id] = prob_matrix[:, bert_id]
            return doc_topic_probs

        # Fallback: only one probability per document is available (or none at
        # all), so it is credited to the assigned topic
        for doc_idx, topic_id in enumerate(topics):
            seq_id = topic_id_mapping.get(topic_id)
            if seq_id is None or seq_id >= self.n_topics:
                continue
            if prob_matrix is None:
                doc_topic_probs[doc_idx, seq_id] = 1.0
            else:
                doc_topic_probs[doc_idx, seq_id] = float(np.max(prob_matrix[doc_idx]))

        return doc_topic_probs


def _algorithm_label(modeler_class):
    """Return the display name of the algorithm behind a topic-modeler class."""
    # 'BERTopicModeler' is literally 'BER' + 'TopicModeler', so stripping the
    # 'TopicModeler' suffix would shorten it to 'BER'. Known classes are
    # therefore named explicitly; anything else falls back to suffix stripping.
    known_labels = {'LDATopicModeler': 'LDA', 'BERTopicModeler': 'BERTopic'}
    class_name = modeler_class.__name__
    if class_name in known_labels:
        return known_labels[class_name]
    return class_name.replace('TopicModeler', '').replace('Modeler', '')


def run_topic_modeling_pipeline(datasets, modeler_class=LDATopicModeler,
                                n_topics=10, random_state=42,
                                output_dir='actual_topic_modelled'):
    """
    Generic pipeline that works with any topic modeler

    Fits one model on the 'combined' dataset (or, if that is missing or empty,
    on the largest other dataset), prints the discovered topics, then builds
    and saves daily topic features for every non-empty dataset.

    Parameters:
    - datasets: Dictionary of preprocessed DataFrames (each needs a
      'processed_text' column to be usable as training data)
    - modeler_class: LDATopicModeler or BERTopicModeler class
    - n_topics: Number of topics passed to the modeler (default: 10)
    - random_state: Random seed passed to the modeler (default: 42)
    - output_dir: Directory for the per-dataset CSV files; created if missing

    Returns:
    - (results, modeler): results maps dataset name to its daily feature
      DataFrame (an empty DataFrame if processing that dataset failed);
      modeler is the fitted modeler instance.
    - (None, None) if no dataset contains any rows.
    """
    import os  # local import: only needed to create the output directory

    # Check if we have data
    if not any(len(df) > 0 for df in datasets.values() if isinstance(df, pd.DataFrame)):
        print("No preprocessed data found.")
        return None, None

    # Select training data
    if 'combined' in datasets and len(datasets['combined']) > 0:
        training_data = datasets['combined']
        print(f"Using combined dataset for topic modeling: {len(training_data):,} posts")
    else:
        largest_name = max(
            [k for k in datasets.keys() if k != 'combined'],
            key=lambda x: len(datasets[x]) if isinstance(datasets[x], pd.DataFrame) else 0
        )
        training_data = datasets[largest_name]
        print(f"Using {largest_name} dataset for topic modeling: {len(training_data):,} posts")

    # Initialize modeler
    algorithm_name = _algorithm_label(modeler_class)
    print(f"\n{'='*60}")
    print(f"{algorithm_name.upper()} TOPIC MODELING")
    print(f"{'='*60}")

    modeler = modeler_class(n_topics=n_topics, random_state=random_state)

    # Fit model
    modeler.fit_model(
        training_data['processed_text'].tolist(),
        optimize_topics=True,
        max_features=1000
    )

    # Display topics
    print(f"\n{'='*60}")
    print("DISCOVERED TOPICS")
    print(f"{'='*60}")

    topic_details = modeler.get_topic_details()
    for topic in topic_details:
        print(f"\n{topic['topic_name']}")
        top_words = [f"{word}({weight:.3f})" for word, weight in topic['top_words'][:5]]
        print(f"  Top words: {', '.join(top_words)}")

    # Process each dataset
    results = {}
    print(f"\n{'='*60}")
    print("CREATING DAILY TOPIC FEATURES")
    print(f"{'='*60}")

    # Algorithm-specific filename suffix (the same for every dataset)
    algo_suffix = 'lda' if isinstance(modeler, LDATopicModeler) else 'bertopic'

    for name, df in datasets.items():
        # Non-DataFrame entries are tolerated by the checks above, so skip
        # them here as well instead of failing on len().
        if not isinstance(df, pd.DataFrame) or len(df) == 0:
            print(f"\nSkipping {name}: no data")
            continue

        print(f"\nProcessing {name.upper()} dataset...")

        try:
            daily_features = modeler.create_daily_topic_features(df)

            # Make sure the target directory exists; otherwise to_csv fails
            # and the computed features would be discarded by the except below.
            os.makedirs(output_dir, exist_ok=True)
            output_file = f'{output_dir}/{name}_daily_{algo_suffix}_features.csv'
            daily_features.to_csv(output_file, index=False)
            print(f"Saved to: {output_file}")

            results[name] = daily_features

            if len(daily_features) > 0:
                print(f"  Shape: {daily_features.shape}")
                print(f"  Date range: {daily_features['date'].min()} to {daily_features['date'].max()}")
                print(f"  Avg posts per day: {daily_features['post_count'].mean():.1f}")

        except Exception as e:
            # Best effort: one failing dataset must not abort the others.
            print(f"Error processing {name}: {e}")
            results[name] = pd.DataFrame()

    return results, modeler


# Convenience functions
def run_lda_pipeline(datasets):
    """
    Convenience wrapper: run the generic pipeline with LDATopicModeler

    Parameters:
    - datasets: Dictionary of preprocessed DataFrames

    Returns whatever run_topic_modeling_pipeline returns.
    """
    return run_topic_modeling_pipeline(datasets, modeler_class=LDATopicModeler)


def run_bertopic_pipeline(datasets):
    """
    Convenience wrapper: run the generic pipeline with BERTopicModeler

    Parameters:
    - datasets: Dictionary of preprocessed DataFrames

    Returns whatever run_topic_modeling_pipeline returns.
    """
    return run_topic_modeling_pipeline(datasets, modeler_class=BERTopicModeler)


# Usage examples:
"""
# Use LDA
results_lda, modeler_lda = run_lda_pipeline(datasets)

# Use BERTopic
results_bert, modeler_bert = run_bertopic_pipeline(datasets)

# Or use the generic pipeline directly
results, modeler = run_topic_modeling_pipeline(datasets, BERTopicModeler)
"""