In [1]:
"""
enhanced_thread_insights.py

Improved script to process Reddit-thread CSVs and produce richer per-thread and global insights.

Features:
- Robust parsing of CSVs with missing columns
- Threaded text composition (fallbacks if parent/body missing)
- Batch sentiment analysis (counts + percentages + raw labels)
- Topic modeling using LDA with configurable number of topics & words
- Human-readable topic labels (heuristic & optional summarizer fallback)
- Frequent unigrams/bigrams/trigrams extraction
- Named Entity Recognition (optional if transformers/nlp model available)
- Thread statistics: post/comment counts, avg/median comment length, unique authors, max depth
- Saves per-thread CSV and a global_summary.csv
- Safe error handling and informative logging

Notes:
- Requires: pandas, scikit-learn, nltk, transformers (optional), tqdm
- nltk stopwords will be downloaded if missing
- Transformers model downloads may occur on first run; code handles absence gracefully

Usage:
- Edit INPUT_FOLDER and OUTPUT_FOLDER to point to your directories and run.

"""

import os
import math
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import logging

# NLP / ML imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
# Define max token length for the sentiment model
MAX_SENT_LEN = 512

# Try to import transformers pipelines if available
try:
    from transformers import pipeline
    _HAS_TRANSFORMERS = True
except Exception:
    _HAS_TRANSFORMERS = False

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ensure NLTK stopwords
try:
    stopwords.words('english')
except Exception:
    nltk.download('stopwords')

# -------------------- Configuration --------------------
INPUT_FOLDER = r"D:/AIML/reddit mental health/tf_env/data/processed/clean_csv"  # update
OUTPUT_FOLDER = r"D:/AIML/reddit mental health/tf_env/data/processed/senti_output"  # update
GLOBAL_SUMMARY_PATH = os.path.join(OUTPUT_FOLDER, "global_summary.csv")
NUM_TOPICS = 6         # change to store more topics
NUM_TOPIC_WORDS = 8    # number of keywords per topic
USE_NER = True         # set False to skip named entity extraction
BATCH_SIZE = 32        # for batched sentiment
MIN_DOC_FOR_LDA = 2

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# -------------------- Transformers pipelines (optional) --------------------
sentiment_pipeline = None
ner_pipeline = None
summarizer_pipeline = None

if _HAS_TRANSFORMERS:
    try:
        sentiment_pipeline = pipeline('sentiment-analysis')
        logging.info('Loaded transformers sentiment pipeline')
    except Exception as e:
        logging.warning(f'Could not load sentiment pipeline: {e}. Sentiment will fallback to simple heuristics.')
    if USE_NER:
        try:
            ner_pipeline = pipeline('ner', aggregation_strategy='simple')
            logging.info('Loaded transformers NER pipeline')
        except Exception as e:
            logging.warning(f'Could not load NER pipeline: {e}. NER will be skipped.')
            ner_pipeline = None
    # optional summarizer for topic labels (comment out if you don't want extra downloads)
    try:
        summarizer_pipeline = pipeline('text2text-generation', model='google/flan-t5-small')
        logging.info('Loaded summarizer pipeline for topic labeling')
    except Exception:
        summarizer_pipeline = None
        logging.info('Summarizer pipeline not loaded; falling back to heuristic topic labels')

# -------------------- Helper functions --------------------

def safe_read_csv(path):
    """Read CSV with fallback encodings and simple repairs."""
    try:
        return pd.read_csv(path)
    except Exception:
        try:
            return pd.read_csv(path, encoding='utf-8', engine='python')
        except Exception:
            try:
                return pd.read_csv(path, encoding='latin1', engine='python')
            except Exception as e:
                logging.error(f"Failed to read {path}: {e}")
                return None


def thread_comments(df):
    """Compose threaded texts by attaching parent body to comment body when possible.
    Returns a list of texts and some metadata per comment row.
    """
    if df is None or df.empty:
        return []
    # ensure required columns exist
    if 'type' not in df.columns:
        # assume everything is comment-like
        candidates = df.to_dict('records')
        texts = [str(r.get('body', '')).strip() for r in candidates if r.get('body')]
        return texts

    comment_df = df[df['type'] == 'comment'] if 'type' in df.columns else df
    texts = []
    # create a lookup for id -> body
    body_lookup = {}
    if 'id' in df.columns and 'body' in df.columns:
        body_lookup = pd.Series(df['body'].values, index=df['id']).to_dict()

    # if depth exists, sort by it
    if 'depth' in comment_df.columns:
        comments = comment_df.sort_values('depth', na_position='last')
    else:
        comments = comment_df

    for _, row in comments.iterrows():
        body = str(row.get('body', '')).strip()
        if not body:
            continue
        parent_body = ''
        if 'parent_id' in row and row['parent_id'] in body_lookup:
            parent_body = str(body_lookup.get(row['parent_id'], '')).strip()
        # combine parent and body for context but avoid duplication
        if parent_body and parent_body not in body:
            combined = parent_body + ' ' + body
        else:
            combined = body
        texts.append(combined)
    return texts


def batch_sentiment(texts, pipeline_fn, batch_size=32):
    """Batch sentiment using transformers pipeline if available. Returns list of labels.
    Falls back to simple neutral labeling when pipeline not available.
    Automatically truncates text to MAX_SENT_LEN tokens to avoid model errors.
    """
    labels = []
    if not texts:
        return labels
    if pipeline_fn is None:
        return ['NEUTRAL'] * len(texts)

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # Truncate by words to avoid token overflow; safer for models without tokenizer access
        truncated_batch = []
        for t in batch:
            if not isinstance(t, str):
                t = str(t)
            words = t.split()
            if len(words) > MAX_SENT_LEN:
                words = words[:MAX_SENT_LEN]
            truncated_batch.append(' '.join(words))

        try:
            results = pipeline_fn(truncated_batch)
            for r in results:
                if isinstance(r, dict) and 'label' in r:
                    labels.append(r['label'])
                elif isinstance(r, list) and r and isinstance(r[0], dict) and 'label' in r[0]:
                    labels.append(r[0]['label'])
                else:
                    labels.append('NEUTRAL')
        except Exception as e:
            logging.warning(f"Sentiment pipeline error on batch starting at {i}: {e}")
            labels.extend(['NEUTRAL'] * len(batch))
    return labels


def analyze_sentiment_labels(labels):
    """Compute counts and percentages from label list."""
    if not labels:
        return {'positive_pct': 0.0, 'negative_pct': 0.0, 'neutral_pct': 100.0,
                'positive_count': 0, 'negative_count': 0, 'neutral_count': 0, 'total': 0}
    pos = sum(1 for l in labels if l.upper().startswith('POS'))
    neg = sum(1 for l in labels if l.upper().startswith('NEG'))
    other = len(labels) - pos - neg
    total = len(labels)
    return {
        'positive_pct': round(pos / total * 100, 3),
        'negative_pct': round(neg / total * 100, 3),
        'neutral_pct': round(other / total * 100, 3),
        'positive_count': pos,
        'negative_count': neg,
        'neutral_count': other,
        'total': total
    }


def extract_topics_lda(texts, num_topics=5, num_words=6, stop_words=None):
    """Extract LDA topics. Returns list of dicts: {'topic_id', 'keywords':[], 'keyword_str'}"""
    if not texts or len(texts) < MIN_DOC_FOR_LDA:
        return []
    if stop_words is None:
        stop_words = stopwords.words('english')
    try:
        vectorizer = CountVectorizer(stop_words=stop_words, max_df=0.95, min_df=2, ngram_range=(1,1))
        dtm = vectorizer.fit_transform(texts)
        lda = LatentDirichletAllocation(n_components=min(num_topics, dtm.shape[0]), random_state=42)
        lda.fit(dtm)
        topics = []
        feature_names = vectorizer.get_feature_names_out()
        for idx, comp in enumerate(lda.components_):
            top_idx = comp.argsort()[-num_words:][::-1]
            keywords = [feature_names[i] for i in top_idx]
            topics.append({'topic_id': idx + 1, 'keywords': keywords, 'keyword_str': ', '.join(keywords)})
        return topics
    except Exception as e:
        logging.warning(f"LDA failed: {e}")
        return []


def label_topic_readable(keyword_str, summarizer=None):
    """Create a short human-readable label for topic keywords.
    If summarizer pipeline present, use it; otherwise heuristic.
    """
    if not keyword_str:
        return ''
    if summarizer is not None:
        try:
            prompt = f"Create a short (6-12 word) descriptive title for these keywords: {keyword_str}"
            out = summarizer(prompt, max_length=30)
            if isinstance(out, list) and out and isinstance(out[0], dict):
                text = out[0].get('generated_text') or out[0].get('summary_text') or str(out[0])
                return text.strip()
        except Exception as e:
            logging.debug(f"Summarizer failed: {e}")
    # heuristic: pick top 3 keywords to form phrase
    parts = [p.strip() for p in keyword_str.split(',') if p.strip()]
    if not parts:
        return ''
    top = parts[:3]
    if len(top) == 1:
        return f"About {top[0]}"
    elif len(top) == 2:
        return f"About {top[0]} and {top[1]}"
    else:
        return f"About {top[0]}, {top[1]} and {top[2]}"


def top_ngrams(texts, n=10, ngram_range=(2,3), stop_words=None):
    """Return top n n-grams (bigrams/trigrams) as list of (ngram, count)"""
    if not texts:
        return []
    if stop_words is None:
        stop_words = stopwords.words('english')
    try:
        vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words, min_df=2)
        X = vec.fit_transform(texts)
        sums = X.sum(axis=0)
        freqs = [(word, int(sums[0, idx])) for word, idx in vec.vocabulary_.items()]
        freqs_sorted = sorted(freqs, key=lambda x: x[1], reverse=True)
        return freqs_sorted[:n]
    except Exception as e:
        logging.debug(f"top_ngrams failed: {e}")
        return []


def extract_named_entities(texts, ner_fn=None, top_n=10):
    """Return most common named entities using transformers NER pipeline if available."""
    if ner_fn is None or not texts:
        return []
    ents = []
    try:
        for i in range(0, len(texts), BATCH_SIZE):
            batch = texts[i:i + BATCH_SIZE]
            try:
                results = ner_fn(batch)
                # results is a list of lists (one per input) when aggregation_strategy not set, or aggregated dicts
                if isinstance(results, list):
                    for r in results:
                        if isinstance(r, list):
                            for e in r:
                                label = e.get('entity_group') or e.get('entity') or e.get('label')
                                word = e.get('word') or e.get('entity') or e.get('text')
                                if word:
                                    ents.append((word, label))
                        elif isinstance(r, dict):
                            word = r.get('word') or r.get('entity') or r.get('text')
                            label = r.get('entity_group') or r.get('entity') or r.get('label')
                            if word:
                                ents.append((word, label))
                else:
                    logging.debug('NER returned unexpected type, skipping')
            except Exception as e:
                logging.debug(f'NER batch error: {e}')
                continue
        # aggregate top entities
        if not ents:
            return []
        df = pd.DataFrame(ents, columns=['text', 'label'])
        grouped = df.groupby(['text', 'label']).size().reset_index(name='count')
        grouped = grouped.sort_values('count', ascending=False).head(top_n)
        return grouped.to_dict('records')
    except Exception as e:
        logging.debug(f"NER failed overall: {e}")
        return []

# -------------------- Main processing --------------------

def process_file(path):
    df = safe_read_csv(path)
    if df is None:
        return None
    # gather thread-level stats
    thread_stats = {}
    thread_stats['file'] = os.path.basename(path)
    thread_stats['rows'] = len(df)
    # count types
    if 'type' in df.columns:
        thread_stats['num_posts'] = int((df['type'] == 'post').sum())
        thread_stats['num_comments'] = int((df['type'] == 'comment').sum())
    else:
        thread_stats['num_posts'] = 0
        thread_stats['num_comments'] = len(df)

    # author / unique users
    if 'author' in df.columns:
        thread_stats['unique_authors'] = int(df['author'].nunique())
    else:
        thread_stats['unique_authors'] = None

    # depth
    if 'depth' in df.columns:
        try:
            thread_stats['max_depth'] = int(df['depth'].max(skipna=True))
        except Exception:
            thread_stats['max_depth'] = None
    else:
        thread_stats['max_depth'] = None

    # title
    if 'title' in df.columns and not df[df['type'] == 'post'].empty:
        try:
            thread_stats['title'] = str(df[df['type'] == 'post']['title'].iloc[0])
        except Exception:
            thread_stats['title'] = ''
    else:
        thread_stats['title'] = ''

    # threaded texts
    texts = thread_comments(df)
    # basic text stats
    lengths = [len(t.split()) for t in texts if isinstance(t, str) and t.strip()]
    if lengths:
        thread_stats['avg_comment_words'] = round(float(np.mean(lengths)), 2)
        thread_stats['median_comment_words'] = float(np.median(lengths))
        thread_stats['max_comment_words'] = int(np.max(lengths))
    else:
        thread_stats['avg_comment_words'] = 0.0
        thread_stats['median_comment_words'] = 0.0
        thread_stats['max_comment_words'] = 0

    # Sentiment labels (batched)
    labels = batch_sentiment(texts, sentiment_pipeline, batch_size=BATCH_SIZE)
    sent_summary = analyze_sentiment_labels(labels)
    thread_stats.update(sent_summary)

    # Topic extraction
    topics = extract_topics_lda(texts, num_topics=NUM_TOPICS, num_words=NUM_TOPIC_WORDS)
    # add readable labels
    for t in topics:
        t['readable'] = label_topic_readable(t['keyword_str'], summarizer=summarizer_pipeline)

    # Top ngrams (bigrams & trigrams)
    ngrams = top_ngrams(texts, n=10, ngram_range=(2,3))

    # NER
    ner_top = []
    if ner_pipeline is not None and USE_NER:
        ner_top = extract_named_entities(texts, ner_pipeline, top_n=10)

    # prepare per-thread output DataFrame
    # we'll store metrics as rows for easy reading
    rows = []
    rows.append({'metric': 'file', 'value': thread_stats['file']})
    rows.append({'metric': 'title', 'value': thread_stats['title']})
    rows.append({'metric': 'rows', 'value': thread_stats['rows']})
    rows.append({'metric': 'num_posts', 'value': thread_stats['num_posts']})
    rows.append({'metric': 'num_comments', 'value': thread_stats['num_comments']})
    rows.append({'metric': 'unique_authors', 'value': thread_stats['unique_authors']})
    rows.append({'metric': 'max_depth', 'value': thread_stats['max_depth']})
    rows.append({'metric': 'avg_comment_words', 'value': thread_stats['avg_comment_words']})
    rows.append({'metric': 'median_comment_words', 'value': thread_stats['median_comment_words']})
    rows.append({'metric': 'max_comment_words', 'value': thread_stats['max_comment_words']})

    rows.append({'metric': 'positive_count', 'value': thread_stats['positive_count']})
    rows.append({'metric': 'negative_count', 'value': thread_stats['negative_count']})
    rows.append({'metric': 'neutral_count', 'value': thread_stats['neutral_count']})
    rows.append({'metric': 'positive_pct', 'value': thread_stats['positive_pct']})
    rows.append({'metric': 'negative_pct', 'value': thread_stats['negative_pct']})
    rows.append({'metric': 'neutral_pct', 'value': thread_stats['neutral_pct']})

    # topics details
    for t in topics:
        rows.append({'metric': f"topic_{t['topic_id']}_keywords", 'value': t['keyword_str']})
        rows.append({'metric': f"topic_{t['topic_id']}_label", 'value': t['readable']})

    # top ngrams
    for idx, (gram, cnt) in enumerate(ngrams, 1):
        rows.append({'metric': f"top_ngram_{idx}", 'value': f"{gram} ({cnt})"})

    # ner
    for idx, rec in enumerate(ner_top, 1):
        rows.append({'metric': f"ner_{idx}", 'value': f"{rec.get('text')}|{rec.get('label')}|{rec.get('count')}"})

    out_df = pd.DataFrame(rows)
    return {'stats': thread_stats, 'topics': topics, 'ngrams': ngrams, 'ner': ner_top, 'out_df': out_df}


if __name__ == '__main__':
    files = [f for f in os.listdir(INPUT_FOLDER) if f.lower().endswith('.csv')]
    logging.info(f"Found {len(files)} CSV files in {INPUT_FOLDER}")
    summary = []
    for i, fname in enumerate(tqdm(files, desc='Processing files')):
        path = os.path.join(INPUT_FOLDER, fname)
        try:
            res = process_file(path)
            if res is None:
                continue
            thread_id = os.path.splitext(fname)[0]
            perthread_path = os.path.join(OUTPUT_FOLDER, f"{thread_id}_enhanced_insights.csv")
            # write per-thread CSV
            res['out_df'].to_csv(perthread_path, index=False)
            # append to summary
            s = res['stats'].copy()
            # attach top topic labels to summary for easy review
            s['top_topics_readable'] = '; '.join([t['readable'] for t in res['topics']]) if res['topics'] else ''
            s['top_topics_keywords'] = '; '.join([t['keyword_str'] for t in res['topics']]) if res['topics'] else ''
            s['top_ngrams'] = '; '.join([f"{g}({c})" for g, c in res['ngrams']]) if res['ngrams'] else ''
            # keep only selected keys in global summary to keep CSV tidy
            keep_keys = ['file', 'title', 'rows', 'num_posts', 'num_comments', 'unique_authors', 'max_depth',
                         'avg_comment_words', 'median_comment_words', 'max_comment_words',
                         'positive_count', 'negative_count', 'neutral_count', 'positive_pct', 'negative_pct', 'neutral_pct',
                         'top_topics_readable', 'top_topics_keywords', 'top_ngrams']
            summary.append({k: s.get(k, '') for k in keep_keys})
        except Exception as e:
            logging.exception(f"Failed processing {fname}: {e}")
            continue

    # save global summary
    if summary:
        summary_df = pd.DataFrame(summary)
        summary_df.to_csv(GLOBAL_SUMMARY_PATH, index=False)
        logging.info(f"Global summary saved to {GLOBAL_SUMMARY_PATH}")
    logging.info('All done!')


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
    PyTorch 2.7.1+cu126 with CUDA 1208 (you have 2.7.1+cpu)
    Python  3.9.13 (you have 3.11.0)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
2025-08-14 19:00:48,306 - INFO - Loaded transformers sentiment pipeline
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetune

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

2025-08-14 19:04:40,382 - INFO - Loaded summarizer pipeline for topic labeling
2025-08-14 19:04:40,421 - INFO - Found 683 CSV files in D:/AIML/reddit mental health/tf_env/data/processed/clean_csv
Processing files:   2%|▏         | 11/683 [21:10<23:14:00, 124.47s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors
Processing files: 100%|██████████| 683/683 [35:58:40<00:00, 189.63s/it]
2025-08-16 07:03:20,832 - INFO - Global summary saved to D:/AIML/reddit mental health/tf_env/data/processed/senti_output\global_summary.csv
2025-08-16 07:03:20,832 - INFO - All done!
