In [1]:
import os
import pandas as pd
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Initialize sentiment pipeline (downloads model on first run)
sentiment_pipeline = pipeline('sentiment-analysis')

# Paths (update if needed)
input_folder = "D:/AIML/reddit mental health/tf_env/data/processed/clean_csv"  # Folder with 683 CSVs
output_folder = "D:/AIML/reddit mental health/tf_env/data/processed/insights_output"  # For per-thread CSVs
summary_path = os.path.join(output_folder, "global_summary.csv")  # Global aggregate

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

def thread_comments(df):
    """Thread comments by parent_id and depth for context-aware text, with fallback if 'depth' missing."""
    comment_df = df[df['type'] == 'comment']
    if 'depth' in comment_df.columns:
        comments = comment_df.sort_values('depth')
    else:
        comments = comment_df  # Proceed without sorting if 'depth' missing
        print(f"Warning: 'depth' column missing in DataFrame—skipping sort.")
    threaded = []
    for _, row in comments.iterrows():
        parent_body = ''
        if 'parent_id' in row and 'id' in df.columns and 'body' in df.columns:
            parent_matches = df[df['id'] == row['parent_id']]['body']
            if not parent_matches.empty:
                parent_body = parent_matches.values[0]
        threaded_text = f"{parent_body} {row['body']}".strip() if parent_body else row.get('body', '')
        if threaded_text and isinstance(threaded_text, str):  # Skip empty or non-string
            threaded.append(threaded_text)
    return threaded

def analyze_sentiment(texts):
    """Get average sentiment scores, with handling for invalid results."""
    labels = []
    for text in texts:
        if not text or not isinstance(text, str):
            continue  # Skip invalid
        try:
            result = sentiment_pipeline(text[:512])  # Truncate for model limit
            if isinstance(result, list) and result:  # Check for expected output
                labels.append(result[0]['label'])
            else:
                labels.append('NEUTRAL')  # Fallback for unexpected output
        except Exception:
            labels.append('NEUTRAL')  # Fallback on error
    if not labels:
        return {'positive_pct': 0, 'negative_pct': 0, 'neutral_pct': 100}
    positive = labels.count('POSITIVE') / len(labels)
    negative = labels.count('NEGATIVE') / len(labels)
    neutral = 1 - positive - negative
    return {'positive_pct': positive * 100, 'negative_pct': negative * 100, 'neutral_pct': neutral * 100}

def extract_topics(texts, num_topics=3):
    """Simple topic modeling with LDA."""
    stop_words = stopwords.words('english')  # This is a list, as required
    vectorizer = CountVectorizer(stop_words=stop_words, max_df=0.95, min_df=2)
    if len(texts) < 2:
        return ["Insufficient text for topics"]
    dtm = vectorizer.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(dtm)
    topics = []
    for idx, topic in enumerate(lda.components_):
        top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]]
        topics.append(f"Topic {idx+1}: {' '.join(top_words)}")
    return topics

# Process each CSV and collect for summary (with progress logging)
summary_data = []
files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]
for i, filename in enumerate(files, 1):
    file_path = os.path.join(input_folder, filename)
    print(f"Processing {i}/{len(files)}: {filename}")
    try:
        df = pd.read_csv(file_path)
        threaded_texts = thread_comments(df)
        if not threaded_texts:
            print(f"Skipping {filename}: No valid threaded texts.")
            continue
        
        # Analyze
        sentiment_scores = analyze_sentiment(threaded_texts)
        topics = extract_topics(threaded_texts)
        
        # Per-thread output
        thread_id = filename.replace('.csv', '')
        output_df = pd.DataFrame({
            'metric': ['Positive %', 'Negative %', 'Neutral %'] + topics,
            'value': [sentiment_scores['positive_pct'], sentiment_scores['negative_pct'], sentiment_scores['neutral_pct']] + [''] * len(topics)
        })
        output_path = os.path.join(output_folder, f"{thread_id}_insights.csv")
        output_df.to_csv(output_path, index=False)
        
        # Collect for summary
        post_title = df[df['type'] == 'post']['title'].iloc[0] if 'title' in df.columns else 'Unknown'
        summary_data.append({
            'thread_id': thread_id,
            'title': post_title,
            **sentiment_scores,
            'top_topics': '; '.join(topics)
        })
    except Exception as e:
        print(f"Error processing {filename}: {e}. Skipping.")

# Save global summary
summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(summary_path, index=False)
print("Analysis complete! Per-thread insights in:", output_folder)
print("Global summary:", summary_path)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
    PyTorch 2.7.1+cu126 with CUDA 1208 (you have 2.7.1+cpu)
    Python  3.9.13 (you have 3.11.0)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Processing 1/683: 1ggfmyg.csv
Processing 2/683: 1ggjfl9.csv
Processing 3/683: 1ggnkgn.csv
Processing 4/683: 1gh41pn.csv
Processing 5/683: 1gh6wk4.csv
Processing 6/683: 1ghc9ti.csv
Processing 7/683: 1ghcfvs.csv
Processing 8/683: 1ghchxr.csv
Processing 9/683: 1ghcjv5.csv
Processing 10/683: 1gik043.csv
Processing 11/683: 1gin7az.csv
Processing 12/683: 1gjer2l.csv
Processing 13/683: 1gjh58v.csv
Processing 14/683: 1gjh61r.csv
Processing 15/683: 1gjhbm5.csv
Processing 16/683: 1gjhr1y.csv
Processing 17/683: 1gji5ur.csv
Processing 18/683: 1gjjnez.csv
Processing 19/683: 1gk1qen.csv
Processing 20/683: 1gk1s0u.csv
Processing 21/683: 1gkcrq3.csv
Processing 22/683: 1gkfrpz.csv
Processing 23/683: 1gkt1pv.csv
Processing 24/683: 1gkt43k.csv
Processing 25/683: 1gkzhol.csv
Processing 26/683: 1gl5j0a.csv
Processing 27/683: 1glscy4.csv
Processing 28/683: 1glst16.csv
Processing 29/683: 1gmcfil.csv
Processing 30/683: 1gmg0iy.csv
Processing 31/683: 1gmhx3y.csv
Processing 32/683: 1gmkobd.csv
Processing 33/683