# Imports

In [1]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     -- ------------------------------------- 2.1/33.5 MB 9.8 MB/s eta 0:00:04
     -------------- ------------------------ 12.1/33.5 MB 29.1 MB/s eta 0:00:01
     -------------------- ------------------ 17.3/33.5 MB 28.7 MB/s eta 0:00:01
     ------------------------- ------------- 21.5/33.5 MB 26.1 MB/s eta 0:00:01
     ---------------------------------- ---- 29.9/33.5 MB 29.2 MB/s eta 0:00:01
     --------------------------------------- 33.5/33.5 MB 30.0 MB/s eta 0:00:00
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from datetime import datetime, timedelta
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
import json
import pickle
from tqdm import tqdm
import spacy
from textblob import TextBlob
from collections import Counter
import community as community_louvain
import warnings
from gensim import similarities  

warnings.filterwarnings('ignore')

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Load spaCy model for NLP tasks
nlp = spacy.load('en_core_web_md')
# Set up matplotlib for better visualizations
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to C:\Users\Shahaf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shahaf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shahaf/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shahaf/nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


# =============================================
# Part 1: Data Loading (Using Existing CSV Files)
# =============================================


In [3]:
def load_data_from_csv(posts_csv_path, comments_csv_path):
    """
    Load Reddit data from existing CSV files
    
    Parameters:
    -----------
    posts_csv_path : str
        Path to the CSV file containing post data
    comments_csv_path : str
        Path to the CSV file containing comment data
        
    Returns:
    --------
    posts_df : pandas.DataFrame
        DataFrame containing posts
    comments_df : pandas.DataFrame
        DataFrame containing comments
    """
    print(f"Loading posts from {posts_csv_path}...")
    posts_df = pd.read_csv(posts_csv_path)
    
    print(f"Loading comments from {comments_csv_path}...")
    comments_df = pd.read_csv(comments_csv_path)
    
    # Convert timestamp columns to datetime
    timestamp_columns = ['created_utc']
    for col in timestamp_columns:
        if col in posts_df.columns:
            posts_df[col] = pd.to_datetime(posts_df[col])
        if col in comments_df.columns:
            comments_df[col] = pd.to_datetime(comments_df[col])
    
    # Ensure required columns exist
    required_post_columns = ['post_id', 'subreddit', 'title', 'body', 'author', 'created_utc', 'score']
    required_comment_columns = ['comment_id', 'post_id', 'subreddit', 'body', 'author', 'created_utc', 'score']
    
    # Check if required columns exist in post DataFrame
    missing_post_columns = [col for col in required_post_columns if col not in posts_df.columns]
    if missing_post_columns:
        # Try to find alternative column names or create placeholder columns
        for col in missing_post_columns:
            if col == 'post_id' and 'id' in posts_df.columns:
                posts_df['post_id'] = posts_df['id']
            elif col == 'body' and 'selftext' in posts_df.columns:
                posts_df['body'] = posts_df['selftext']
            else:
                print(f"Warning: Column '{col}' not found in posts CSV. Creating empty column.")
                posts_df[col] = None
    
    # Check if required columns exist in comment DataFrame
    missing_comment_columns = [col for col in required_comment_columns if col not in comments_df.columns]
    if missing_comment_columns:
        # Try to find alternative column names or create placeholder columns
        for col in missing_comment_columns:
            if col == 'comment_id' and 'id' in comments_df.columns:
                comments_df['comment_id'] = comments_df['id']
            else:
                print(f"Warning: Column '{col}' not found in comments CSV. Creating empty column.")
                comments_df[col] = None
    
    # Print data summary
    print(f"\nLoaded {len(posts_df)} posts from {posts_df['subreddit'].nunique()} subreddits.")
    print(f"Loaded {len(comments_df)} comments.")
    
    # Print the subreddits found in the data
    print("\nSubreddits in the dataset:")
    for subreddit, count in posts_df['subreddit'].value_counts().items():
        print(f"  - r/{subreddit}: {count} posts")
    
    return posts_df, comments_df



In [4]:
posts_csv_path = "merged_posts.csv" 
comments_csv_path = "merged_comments.csv" 
posts_df, comments_df = load_data_from_csv(posts_csv_path, comments_csv_path)

Loading posts from merged_posts.csv...
Loading comments from merged_comments.csv...

Loaded 150505 posts from 3 subreddits.
Loaded 3561862 comments.

Subreddits in the dataset:
  - r/Conservative: 132987 posts
  - r/centrist: 11059 posts
  - r/Liberal: 6459 posts


# =============================================
# Part 2: Data Preprocessing
# =============================================

In [22]:

def preprocess_text(text):
    """
    Preprocess text data for NLP analysis
    
    Parameters:
    -----------
    text : str
        Text to preprocess
        
    Returns:
    --------
    str
        Preprocessed text
    """
    if not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Rejoin tokens
    processed_text = ' '.join(tokens)
    
    return processed_text

def process_dataframes(posts_df, comments_df):
    """
    Process the post and comment DataFrames to add NLP features
    
    Parameters:
    -----------
    posts_df : pandas.DataFrame
        DataFrame containing posts
    comments_df : pandas.DataFrame
        DataFrame containing comments
        
    Returns:
    --------
    posts_df : pandas.DataFrame
        Processed posts DataFrame with only relevant columns
    comments_df : pandas.DataFrame
        Processed comments DataFrame with only relevant columns
    """
    print("Starting preprocessing...")
    
    # Define essential columns to keep for posts
    posts_essential_cols = [
        'id', 'post_id', 'title', 'selftext', 'author', 'subreddit', 
        'created_utc', 'score', 'num_comments', 'upvote_ratio'
    ]
    
    # Define essential columns to keep for comments
    comments_essential_cols = [
        'id', 'comment_id', 'body', 'author', 'subreddit', 
        'created_utc', 'score', 'parent_id', 'post_id'
    ]
    
    # Keep only columns that exist in the dataframes
    posts_cols_to_keep = [col for col in posts_essential_cols if col in posts_df.columns]
    comments_cols_to_keep = [col for col in comments_essential_cols if col in comments_df.columns]
    
    # Add any columns that are missing but required
    for col in ['post_id', 'subreddit', 'created_utc', 'score']:
        if col not in posts_cols_to_keep and col != 'post_id':  # Skip post_id as it might be derived from id
            posts_cols_to_keep.append(col)
            posts_df[col] = None
            print(f"Added missing required column '{col}' to posts dataframe")
    
    for col in ['comment_id', 'post_id', 'subreddit', 'created_utc', 'score']:
        if col not in comments_cols_to_keep and col != 'comment_id':  # Skip comment_id as it might be derived from id
            comments_cols_to_keep.append(col)
            comments_df[col] = None
            print(f"Added missing required column '{col}' to comments dataframe")
    
    # Create a copy with only the essential columns
    posts_df_slim = posts_df[posts_cols_to_keep].copy()
    comments_df_slim = comments_df[comments_cols_to_keep].copy()
    
    print(f"Reduced posts dataframe from {len(posts_df.columns)} to {len(posts_df_slim.columns)} columns")
    print(f"Reduced comments dataframe from {len(comments_df.columns)} to {len(comments_df_slim.columns)} columns")
    
    # Make sure post_id is consistently named
    if 'id' in posts_df_slim.columns and 'post_id' not in posts_df_slim.columns:
        posts_df_slim['post_id'] = posts_df_slim['id']
    
    if 'id' in comments_df_slim.columns and 'comment_id' not in comments_df_slim.columns:
        comments_df_slim['comment_id'] = comments_df_slim['id']
    
    # Make sure parent_id in comments is properly formatted
    if 'parent_id' in comments_df_slim.columns:
        # Extract post_id from parent_id if it's in format "t3_xxxxx"
        comments_df_slim['post_id'] = comments_df_slim['parent_id'].apply(
            lambda x: x.split('_')[1] if isinstance(x, str) and x.startswith('t3_') else x
        )
        # If parent_id starts with 't1_', it's a reply to another comment
        comments_df_slim['is_reply_to_comment'] = comments_df_slim['parent_id'].apply(
            lambda x: True if isinstance(x, str) and x.startswith('t1_') else False
        )
    
    # Process posts
    print("Processing posts text...")
    if 'title' in posts_df_slim.columns and 'selftext' in posts_df_slim.columns:
        # Process title
        tqdm.pandas(desc="Processing post titles")
        posts_df_slim['processed_title'] = posts_df_slim['title'].progress_apply(
            lambda x: preprocess_text(x) if pd.notna(x) else ""
        )
        
        # Process selftext (body)
        tqdm.pandas(desc="Processing post bodies")
        posts_df_slim['processed_body'] = posts_df_slim['selftext'].progress_apply(
            lambda x: preprocess_text(x) if pd.notna(x) else ""
        )
        
        # Create combined text field
        posts_df_slim['combined_text'] = posts_df_slim['processed_title'] + ' ' + posts_df_slim['processed_body']
    
    # Process comments
    print("Processing comments text...")
    if 'body' in comments_df_slim.columns:
        tqdm.pandas(desc="Processing comments")
        comments_df_slim['processed_body'] = comments_df_slim['body'].progress_apply(
            lambda x: preprocess_text(x) if pd.notna(x) else ""
        )
    
    # Add sentiment analysis
    try:
        print("Adding sentiment analysis...")
        # For posts
        if 'combined_text' in posts_df_slim.columns:
            tqdm.pandas(desc="Calculating post sentiment")
            posts_df_slim['sentiment'] = posts_df_slim['combined_text'].progress_apply(
                lambda x: TextBlob(x).sentiment.polarity if x else 0
            )
        
        # For comments
        if 'processed_body' in comments_df_slim.columns:
            tqdm.pandas(desc="Calculating comment sentiment")
            comments_df_slim['sentiment'] = comments_df_slim['processed_body'].progress_apply(
                lambda x: TextBlob(x).sentiment.polarity if x else 0
            )
    except Exception as e:
        print(f"Warning: Error in sentiment analysis: {e}")
        print("Continuing without sentiment analysis...")
    
    # Add named entity recognition (if spaCy model is available)
    try:
        print("Adding named entity recognition...")
        # Function to safely extract entities
        def extract_entities(text):
            if not text or not isinstance(text, str):
                return []
            try:
                # Limit text length to avoid memory issues
                text = text[:10000]  # Limit to 10k chars to prevent memory issues
                doc = nlp(text)
                return [(ent.text, ent.label_) for ent in doc.ents]
            except Exception as e:
                print(f"Error in entity extraction: {e}")
                return []
        
        # For posts
        if 'combined_text' in posts_df_slim.columns:
            tqdm.pandas(desc="Extracting post entities")
            posts_df_slim['entities'] = posts_df_slim['combined_text'].progress_apply(extract_entities)
        
        # For comments - only process a sample to save time
        if 'processed_body' in comments_df_slim.columns:
            sample_size = min(5000, len(comments_df_slim))
            if len(comments_df_slim) > sample_size:
                print(f"Processing entities for a sample of {sample_size} comments...")
                sample_idx = comments_df_slim.sample(sample_size).index
                comments_df_slim.loc[sample_idx, 'entities'] = comments_df_slim.loc[sample_idx, 'processed_body'].progress_apply(extract_entities)
            else:
                tqdm.pandas(desc="Extracting comment entities")
                comments_df_slim['entities'] = comments_df_slim['processed_body'].progress_apply(extract_entities)
    except Exception as e:
        print(f"Warning: Error in entity recognition: {e}")
        print("Continuing without entity recognition...")
    
    # Convert timestamps if needed
    for df in [posts_df_slim, comments_df_slim]:
        if 'created_utc' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['created_utc']):
            print("Converting timestamps...")
            try:
                df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
            except Exception as e:
                print(f"Warning: Error converting timestamps: {e}")
                print("Attempting alternative timestamp conversion...")
                try:
                    df['created_utc'] = pd.to_datetime(df['created_utc'], errors='coerce')
                except:
                    print("Failed to convert timestamps. Some analyses may not work correctly.")
    
    # Drop any columns that are no longer needed (optional)
    # For example, after processing, we could drop the original text columns to save memory
    if 'processed_title' in posts_df_slim.columns and 'processed_body' in posts_df_slim.columns:
        posts_cols_to_drop = []  # You could add 'title', 'selftext' here if you don't need them anymore
        posts_df_slim = posts_df_slim.drop(columns=[col for col in posts_cols_to_drop if col in posts_df_slim.columns])
    
    if 'processed_body' in comments_df_slim.columns:
        comments_cols_to_drop = []  # You could add 'body' here if you don't need it anymore
        comments_df_slim = comments_df_slim.drop(columns=[col for col in comments_cols_to_drop if col in comments_df_slim.columns])
    
    print("Preprocessing complete!")
    return posts_df_slim, comments_df_slim

In [7]:
# Step 2: Data Preprocessing
processed_posts_path = 'processed_posts.csv'
processed_comments_path = 'processed_comments.csv'

# Check if processed files already exist
import os
if os.path.exists(processed_posts_path) and os.path.exists(processed_comments_path):
    print("\n2. Loading existing processed data files...")
    try:
        posts_df = pd.read_csv(processed_posts_path)
        comments_df = pd.read_csv(processed_comments_path)
        
        # Convert timestamp columns to datetime
        if 'created_utc' in posts_df.columns:
            posts_df['created_utc'] = pd.to_datetime(posts_df['created_utc'])
        if 'created_utc' in comments_df.columns:
            comments_df['created_utc'] = pd.to_datetime(comments_df['created_utc'])
        
        # Convert entities column from string back to list if needed
        if 'entities' in posts_df.columns and posts_df['entities'].dtype == 'object':
            try:
                import ast
                posts_df['entities'] = posts_df['entities'].apply(
                    lambda x: ast.literal_eval(x) if isinstance(x, str) else []
                )
            except:
                print("Warning: Could not convert entities column back to list.")
        
        if 'entities' in comments_df.columns and comments_df['entities'].dtype == 'object':
            try:
                import ast
                comments_df['entities'] = comments_df['entities'].apply(
                    lambda x: ast.literal_eval(x) if isinstance(x, str) else []
                )
            except:
                print("Warning: Could not convert entities column back to list.")
        
        print(f"Loaded processed data: {len(posts_df)} posts and {len(comments_df)} comments.")
    except Exception as e:
        print(f"Error loading processed files: {e}")
        print("Will preprocess data from scratch...")
        posts_df, comments_df = process_dataframes(posts_df, comments_df)
        
        # Save processed data
        posts_df.to_csv(processed_posts_path, index=False)
        comments_df.to_csv(processed_comments_path, index=False)
        print(f"Saved processed data to {processed_posts_path} and {processed_comments_path}")
else:
    print("\n2. Processing and enriching data...")
    posts_df, comments_df = process_dataframes(posts_df, comments_df)
    
    # Save processed data
    posts_df.to_csv(processed_posts_path, index=False)
    comments_df.to_csv(processed_comments_path, index=False)
    print(f"Saved processed data to {processed_posts_path} and {processed_comments_path}")

# Initialize results dictionary
results = {
    'posts_df': posts_df,
    'comments_df': comments_df
}


2. Processing and enriching data...
Starting preprocessing...
Reduced posts dataframe from 126 to 10 columns
Reduced comments dataframe from 80 to 9 columns
Processing posts text...


Processing post titles: 100%|██████████| 150505/150505 [00:19<00:00, 7589.29it/s]
Processing post bodies: 100%|██████████| 150505/150505 [00:10<00:00, 14786.22it/s]


Processing comments text...


Processing comments: 100%|██████████| 3561862/3561862 [08:54<00:00, 6661.93it/s]


Adding sentiment analysis...


Calculating post sentiment: 100%|██████████| 150505/150505 [00:09<00:00, 15282.64it/s]
Calculating comment sentiment: 100%|██████████| 3561862/3561862 [03:37<00:00, 16393.35it/s]


Adding named entity recognition...


Extracting post entities: 100%|██████████| 150505/150505 [07:53<00:00, 317.88it/s]


Processing entities for a sample of 5000 comments...


Extracting post entities: 100%|██████████| 5000/5000 [00:18<00:00, 275.21it/s]


Preprocessing complete!
Saved processed data to processed_posts.csv and processed_comments.csv


# =============================================
# Part 3: Topic Modeling to Identify Claims/Stories
# =============================================

In [17]:


def create_topic_model(posts_df, num_topics=10, passes=10, min_posts=5):
    """
    Create a topic model using LDA to identify potential claims/stories
    
    Parameters:
    -----------
    posts_df : pandas.DataFrame
        DataFrame containing processed posts
    num_topics : int
        Number of topics to identify
    passes : int
        Number of passes for LDA
    min_posts : int
        Minimum number of posts required for topic modeling
        
    Returns:
    --------
    lda_model : gensim.models.LdaModel or None
        Trained LDA model or None if not enough data
    corpus : list
        Document-term matrix
    dictionary : gensim.corpora.Dictionary
        Dictionary mapping words to their IDs
    """
    # Check if there's enough data for topic modeling
    if len(posts_df) < min_posts:
        print(f"Warning: Not enough posts for topic modeling. Need at least {min_posts}.")
        return None, [], None
    
    # Check if the combined_text column exists
    if 'combined_text' not in posts_df.columns:
        print("Error: 'combined_text' column not found in posts DataFrame.")
        if 'processed_title' in posts_df.columns and 'processed_body' in posts_df.columns:
            print("Creating combined_text from processed_title and processed_body...")
            posts_df['combined_text'] = posts_df['processed_title'].fillna('') + ' ' + posts_df['processed_body'].fillna('')
        elif 'title' in posts_df.columns and 'selftext' in posts_df.columns:
            print("Creating combined_text from raw title and selftext...")
            posts_df['combined_text'] = posts_df['title'].fillna('') + ' ' + posts_df['selftext'].fillna('')
        else:
            print("Error: Cannot create combined_text. No suitable text columns found.")
            return None, [], None
    
    # Create texts for topic modeling
    texts = []
    for text in posts_df['combined_text']:
        if isinstance(text, str) and len(text.strip()) > 0:
            words = text.split()
            if len(words) >= 3:  # Only include texts with at least 3 words
                texts.append(words)
    
    # Check if there are enough texts with content
    if len(texts) < min_posts:
        print(f"Warning: Not enough posts with meaningful content for topic modeling. Found {len(texts)}, need at least {min_posts}.")
        return None, [], None
    
    print(f"Creating topic model with {len(texts)} documents...")
    
    # Create a corpus and dictionary
    dictionary = corpora.Dictionary(texts)
    
    # Check if dictionary has terms
    if len(dictionary) == 0:
        print("Error: Dictionary has no terms. Check text preprocessing.")
        return None, [], None
    
    # Filter out words that appear in less than 2 documents or more than 90% of documents
    dictionary.filter_extremes(no_below=2, no_above=0.9)
    
    # Check if dictionary still has terms after filtering
    if len(dictionary) == 0:
        print("Error: Dictionary has no terms after filtering extremes. Relaxing filter criteria...")
        # Try again with more relaxed criteria
        dictionary = corpora.Dictionary(texts)
        dictionary.filter_extremes(no_below=1, no_above=1.0)
        
        if len(dictionary) == 0:
            print("Error: Still no terms in dictionary after relaxing filters.")
            return None, [], None
    
    # Create document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    # Check if corpus is empty
    if not corpus or all(len(doc) == 0 for doc in corpus):
        print("Error: Empty corpus. No terms found in documents after preprocessing.")
        return None, [], None
    
    # Train LDA model
    try:
        lda_model = models.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=min(num_topics, len(texts) // 2),  # Ensure num_topics is not too large
            passes=passes,
            alpha='auto',
            eta='auto'
        )
        print(f"Successfully created topic model with {lda_model.num_topics} topics.")
        return lda_model, corpus, dictionary
    except Exception as e:
        print(f"Error training LDA model: {e}")
        return None, corpus, dictionary

def get_topic_distributions(lda_model, corpus, posts_df):
    """
    Get topic distributions for each post
    
    Parameters:
    -----------
    lda_model : gensim.models.LdaModel
        Trained LDA model (or None if topic modeling failed)
    corpus : list
        Document-term matrix
    posts_df : pandas.DataFrame
        DataFrame containing posts
        
    Returns:
    --------
    posts_df : pandas.DataFrame
        DataFrame with added topic distribution columns
    """
    # If LDA model is None, return the original DataFrame
    if lda_model is None or not corpus:
        print("Warning: No topic model available. Skipping topic distribution.")
        return posts_df
    
    # Get topic distributions for each document
    topic_distributions = []
    for bow in corpus:
        topics = lda_model.get_document_topics(bow)
        topic_dist = [0] * lda_model.num_topics
        for topic_id, prob in topics:
            topic_dist[topic_id] = prob
        topic_distributions.append(topic_dist)
    
    # Add topic distributions to DataFrame
    topic_df = pd.DataFrame(topic_distributions, 
                           columns=[f'topic_{i}' for i in range(lda_model.num_topics)])
    
    # If topic_df is empty, return the original DataFrame
    if topic_df.empty:
        print("Warning: No topic distributions calculated. Skipping topic distribution.")
        return posts_df
    
    # Reset index to ensure alignment
    posts_df = posts_df.reset_index(drop=True)
    topic_df = topic_df.reset_index(drop=True)
    
    # Check if the lengths match
    if len(posts_df) != len(topic_df):
        print(f"Warning: Length mismatch between posts_df ({len(posts_df)}) and topic_df ({len(topic_df)}).")
        print("Cannot merge topic distributions. Returning original DataFrame.")
        return posts_df
    
    # Concatenate DataFrames
    result_df = pd.concat([posts_df, topic_df], axis=1)
    
    # Add dominant topic column
    topic_columns = [f'topic_{i}' for i in range(lda_model.num_topics)]
    if all(col in result_df.columns for col in topic_columns):
        result_df['dominant_topic'] = result_df[topic_columns].idxmax(axis=1)
        print("Successfully added topic distributions and dominant_topic column.")
    else:
        print("Warning: Topic columns not properly added to DataFrame.")
    
    return result_df
    
def display_topics(lda_model, num_words=10):
    """
    Display the top words for each topic
    
    Parameters:
    -----------
    lda_model : gensim.models.LdaModel
        Trained LDA model
    num_words : int
        Number of top words to display per topic
    """
    for idx, topic in lda_model.print_topics(num_words=num_words):
        print(f"Topic {idx}: {topic}")

# Example usage
"""
# Create topic model
lda_model, corpus, dictionary = create_topic_model(posts_df, num_topics=15, passes=20)

# Display topics
display_topics(lda_model)

# Get topic distributions
posts_df = get_topic_distributions(lda_model, corpus, posts_df)

# Save model and data
lda_model.save('lda_model.model')
with open('lda_corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)
dictionary.save('lda_dictionary.dict')
"""

"\n# Create topic model\nlda_model, corpus, dictionary = create_topic_model(posts_df, num_topics=15, passes=20)\n\n# Display topics\ndisplay_topics(lda_model)\n\n# Get topic distributions\nposts_df = get_topic_distributions(lda_model, corpus, posts_df)\n\n# Save model and data\nlda_model.save('lda_model.model')\nwith open('lda_corpus.pkl', 'wb') as f:\n    pickle.dump(corpus, f)\ndictionary.save('lda_dictionary.dict')\n"

In [18]:
# Step 3: Topic Modeling (skip if too few posts)
print("\n3. Creating topic model to identify claims/stories...")
lda_model, corpus, dictionary = create_topic_model(posts_df, num_topics=15, passes=20, min_posts=5)

# If topic modeling failed, return early with just the processed data
if lda_model is None:
    print("Topic modeling failed. Returning processed data only.")
    results['lda_model'] = None
    results['corpus'] = None

# Display topics
print("\nIdentified Topics:")
for idx, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {idx}: {topic}")

# Get topic distributions
posts_df = get_topic_distributions(lda_model, corpus, posts_df)

# Save model and data
lda_model.save('lda_model.model')
with open('lda_corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)
dictionary.save('lda_dictionary.dict')

# Update results
results['posts_df'] = posts_df
results['lda_model'] = lda_model
results['corpus'] = corpus
results['dictionary'] = dictionary


3. Creating topic model to identify claims/stories...
Creating topic model with 138758 documents...
Successfully created topic model with 15 topics.

Identified Topics:
Topic 0: 0.278*"removed" + 0.185*"trump" + 0.029*"say" + 0.025*"president" + 0.022*"new" + 0.014*"call" + 0.013*"court" + 0.013*"order" + 0.012*"administration" + 0.011*"donald"
Topic 1: 0.058*"democrat" + 0.044*"republican" + 0.042*"vote" + 0.042*"election" + 0.027*"news" + 0.025*"voter" + 0.022*"harris" + 0.018*"win" + 0.016*"kamala" + 0.016*"show"
Topic 2: 0.068*"state" + 0.035*"federal" + 0.026*"law" + 0.021*"bill" + 0.017*"school" + 0.017*"public" + 0.015*"congress" + 0.013*"funding" + 0.013*"department" + 0.013*"blue"
Topic 3: 0.069*"year" + 0.046*"first" + 0.032*"policy" + 0.031*"next" + 0.030*"last" + 0.026*"week" + 0.023*"history" + 0.022*"term" + 0.022*"two" + 0.018*"month"
Topic 4: 0.042*"im" + 0.037*"right" + 0.034*"conservative" + 0.031*"liberal" + 0.030*"think" + 0.018*"feel" + 0.017*"left" + 0.015*"quest

# =============================================
# Part 4: Claim Identification and Tracking
# =============================================

In [19]:

def identify_potential_claims(posts_df, threshold=0.7):
    """
    Identify potential claims or news stories based on topic modeling and keyword detection
    
    Parameters:
    -----------
    posts_df : pandas.DataFrame
        DataFrame containing posts with topic distributions
    threshold : float
        Threshold for topic probability to consider a post as strongly related to a topic
        
    Returns:
    --------
    claims_df : pandas.DataFrame
        DataFrame containing identified claims
    """
    # Check if necessary columns from topic modeling exist
    topic_columns = [col for col in posts_df.columns if col.startswith('topic_')]
    
    # If no topic columns or dominant_topic column doesn't exist, we can't identify claims
    if not topic_columns or 'dominant_topic' not in posts_df.columns:
        print("Warning: Topic modeling columns not found. Cannot identify claims.")
        # Return an empty DataFrame with appropriate columns
        return pd.DataFrame(columns=['topic_id', 'representative_post_id', 'representative_title',
                                     'common_entities', 'post_count', 'subreddits', 'avg_sentiment',
                                     'earliest_post', 'latest_post'])
    
    # Find posts with high topic probability
    high_prob_mask = (posts_df[topic_columns] > threshold).any(axis=1)
    high_prob_posts = posts_df[high_prob_mask].copy()
    
    # If no posts have high topic probability, return empty DataFrame
    if high_prob_posts.empty:
        print("No posts with high topic probability found.")
        return pd.DataFrame(columns=['topic_id', 'representative_post_id', 'representative_title',
                                     'common_entities', 'post_count', 'subreddits', 'avg_sentiment',
                                     'earliest_post', 'latest_post'])
    
    # Group by dominant topic
    claims = []
    for topic in high_prob_posts['dominant_topic'].unique():
        try:
            # Get the topic index number (e.g., extract 5 from 'topic_5')
            topic_idx = int(topic.split('_')[1]) if isinstance(topic, str) else topic
        except (IndexError, ValueError):
            print(f"Warning: Invalid topic format: {topic}. Skipping.")
            continue
            
        topic_posts = high_prob_posts[high_prob_posts['dominant_topic'] == topic].copy()
        
        # Skip if no posts for this topic
        if topic_posts.empty:
            continue
            
        # Make sure required columns exist
        if 'post_id' not in topic_posts.columns:
            print(f"Warning: 'post_id' column not found. Skipping topic {topic}.")
            continue
            
        # Find most representative post for this topic (highest probability)
        topic_col = f'topic_{topic_idx}'
        if topic_col in topic_posts.columns:
            # Find post with highest probability for this topic
            try:
                representative_post = topic_posts.loc[topic_posts[topic_col].idxmax()]
            except:
                # If there's an error, just take the first post
                print(f"Warning: Could not find most representative post for topic {topic}. Using first post.")
                representative_post = topic_posts.iloc[0]
        else:
            # If topic column doesn't exist, just take the first post
            print(f"Warning: Topic column {topic_col} not found. Using first post as representative.")
            representative_post = topic_posts.iloc[0]
        
        # Extract common entities in this topic
        all_entities = []
        if 'entities' in topic_posts.columns:
            for ent_list in topic_posts['entities']:
                if isinstance(ent_list, list):
                    all_entities.extend([ent[0] for ent in ent_list if isinstance(ent, tuple) and len(ent) > 0])
        
        common_entities = Counter(all_entities).most_common(5) if all_entities else []
        
        # Get subreddits and sentiment
        subreddits = topic_posts['subreddit'].unique().tolist() if 'subreddit' in topic_posts.columns else []
        avg_sentiment = topic_posts['sentiment'].mean() if 'sentiment' in topic_posts.columns else 0
        
        # Get time range
        if 'created_utc' in topic_posts.columns:
            earliest_post = topic_posts['created_utc'].min()
            latest_post = topic_posts['created_utc'].max()
        else:
            earliest_post = None
            latest_post = None
        
        # Get title
        representative_title = representative_post.get('title', '') if hasattr(representative_post, 'get') else ''
        
        claim = {
            'topic_id': topic,
            'representative_post_id': representative_post.get('post_id', ''),
            'representative_title': representative_title,
            'common_entities': common_entities,
            'post_count': len(topic_posts),
            'subreddits': subreddits,
            'avg_sentiment': avg_sentiment,
            'earliest_post': earliest_post,
            'latest_post': latest_post
        }
        claims.append(claim)
    
    # If no claims were identified, return empty DataFrame
    if not claims:
        print("No claims identified.")
        return pd.DataFrame(columns=['topic_id', 'representative_post_id', 'representative_title',
                                     'common_entities', 'post_count', 'subreddits', 'avg_sentiment',
                                     'earliest_post', 'latest_post'])
    
    claims_df = pd.DataFrame(claims)
    return claims_df

def build_claim_similarity_matrix(posts_df, claims_df):
    """
    Build a similarity matrix between posts and identified claims
    
    Parameters:
    -----------
    posts_df : pandas.DataFrame
        DataFrame containing all posts
    claims_df : pandas.DataFrame
        DataFrame containing identified claims
        
    Returns:
    --------
    similarity_matrix : numpy.ndarray
        Matrix of similarity scores between posts and claims
    post_ids : list
        List of post IDs corresponding to rows in the similarity matrix
    claim_ids : list
        List of claim IDs corresponding to columns in the similarity matrix
    """
    # Check if claims_df is empty
    if claims_df.empty:
        print("Warning: No claims found. Cannot build similarity matrix.")
        return np.array([]), [], []
    
    # Get representative posts for each claim
    claim_post_ids = claims_df['representative_post_id'].tolist()
    claim_posts = posts_df[posts_df['post_id'].isin(claim_post_ids)]
    
    # Check if any representative posts were found
    if claim_posts.empty:
        print("Warning: No representative posts found for claims. Cannot build similarity matrix.")
        return np.array([]), [], []
    
    # Create TF-IDF vectors for all posts and claim posts
    # Ensure 'combined_text' column exists in both dataframes
    if 'combined_text' not in posts_df.columns:
        print("Error: 'combined_text' column not found in posts DataFrame.")
        return np.array([]), [], []
    
    if 'combined_text' not in claim_posts.columns:
        print("Error: 'combined_text' column not found in representative posts.")
        return np.array([]), [], []
    
    # Fill NaN values in combined_text to avoid errors
    posts_df_text = posts_df['combined_text'].fillna('')
    claim_posts_text = claim_posts['combined_text'].fillna('')
    
    try:
        # Create TF-IDF vectorizer
        tfidf_vectorizer = TfidfVectorizer(max_features=5000)
        
        # Fit and transform all posts to create the vocabulary
        tfidf_matrix = tfidf_vectorizer.fit_transform(posts_df_text)
        
        # Transform claim posts using the same vocabulary
        claim_tfidf_matrix = tfidf_vectorizer.transform(claim_posts_text)
        
        # Calculate cosine similarity between all posts and claim posts
        # Use sklearn's cosine_similarity instead of gensim's similarities
        similarity_matrix = cosine_similarity(tfidf_matrix, claim_tfidf_matrix)
        
        # Get post IDs and claim IDs for reference
        post_ids = posts_df['post_id'].tolist()
        
        # Map claim post IDs to claim IDs
        post_id_to_claim_id = {}
        for _, claim in claims_df.iterrows():
            post_id_to_claim_id[claim['representative_post_id']] = claim['topic_id']
        
        claim_ids = [post_id_to_claim_id.get(pid, None) for pid in claim_post_ids]
        
        return similarity_matrix, post_ids, claim_ids
    
    except Exception as e:
        print(f"Error building similarity matrix: {e}")
        return np.array([]), [], []

def track_claim_spread(posts_df, comments_df, similarity_matrix, post_ids, claim_ids, threshold=0.5):
    """
    Track how claims spread across different subreddits over time
    
    Parameters:
    -----------
    posts_df : pandas.DataFrame
        DataFrame containing all posts
    comments_df : pandas.DataFrame
        DataFrame containing all comments
    similarity_matrix : numpy.ndarray
        Matrix of similarity scores between posts and claims
    post_ids : list
        List of post IDs corresponding to rows in the similarity matrix
    claim_ids : list
        List of claim IDs corresponding to columns in the similarity matrix
    threshold : float
        Threshold for similarity score to consider a post as related to a claim
        
    Returns:
    --------
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
    """
    # Check if inputs are valid
    if similarity_matrix.size == 0 or not post_ids or not claim_ids:
        print("Warning: Empty similarity matrix or ID lists. Cannot track claim spread.")
        return pd.DataFrame(columns=['claim_id', 'post_id', 'subreddit', 'created_utc', 
                                     'similarity_score', 'sentiment', 'score', 'num_comments'])
    
    claim_spread = []
    
    # Create a mapping from post_id to index
    post_id_to_idx = {post_id: idx for idx, post_id in enumerate(post_ids)}
    
    # For each claim
    for claim_idx, claim_id in enumerate(claim_ids):
        if claim_idx >= similarity_matrix.shape[1]:
            print(f"Warning: claim_idx {claim_idx} is out of bounds for similarity_matrix with shape {similarity_matrix.shape}")
            continue
            
        # Find posts that are similar to this claim
        similar_posts_mask = similarity_matrix[:, claim_idx] > threshold
        similar_post_indices = np.where(similar_posts_mask)[0]
        
        for idx in similar_post_indices:
            if idx >= len(post_ids):
                print(f"Warning: post index {idx} is out of bounds for post_ids with length {len(post_ids)}")
                continue
                
            post_id = post_ids[idx]
            matching_posts = posts_df[posts_df['post_id'] == post_id]
            
            if matching_posts.empty:
                print(f"Warning: No post found with ID {post_id}")
                continue
                
            post = matching_posts.iloc[0]
            
            # Get values with error handling
            subreddit = post.get('subreddit', 'unknown')
            created_utc = post.get('created_utc', None)
            sentiment = post.get('sentiment', 0)
            score = post.get('score', 0)
            num_comments = post.get('num_comments', 0)
            
            spread_info = {
                'claim_id': claim_id,
                'post_id': post_id,
                'subreddit': subreddit,
                'created_utc': created_utc,
                'similarity_score': similarity_matrix[idx, claim_idx],
                'sentiment': sentiment,
                'score': score,
                'num_comments': num_comments
            }
            claim_spread.append(spread_info)
    
    # Create DataFrame
    if claim_spread:
        claim_spread_df = pd.DataFrame(claim_spread)
        
        # Sort by creation time if possible
        if 'created_utc' in claim_spread_df.columns:
            claim_spread_df = claim_spread_df.sort_values(['claim_id', 'created_utc'])
    else:
        # Create empty DataFrame with correct columns
        claim_spread_df = pd.DataFrame(columns=['claim_id', 'post_id', 'subreddit', 'created_utc', 
                                               'similarity_score', 'sentiment', 'score', 'num_comments'])
    
    return claim_spread_df

def analyze_correction_propagation(claim_spread_df, corrections_df):
    """
    Analyze how corrections propagate compared to original claims
    
    Parameters:
    -----------
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
    corrections_df : pandas.DataFrame
        DataFrame containing identified corrections
        
    Returns:
    --------
    propagation_metrics : dict
        Dictionary containing metrics about correction propagation
    """
    propagation_metrics = {}
    
    # For each claim
    for claim_id in claim_spread_df['claim_id'].unique():
        # Get claim spread data
        claim_data = claim_spread_df[claim_spread_df['claim_id'] == claim_id]
        
        # Get corrections for this claim
        claim_corrections = corrections_df[corrections_df['claim_id'] == claim_id]
        
        if claim_corrections.empty:
            continue
        
        # Calculate time metrics
        claim_first_appearance = claim_data['created_utc'].min()
        
        if not claim_corrections.empty:
            first_correction = claim_corrections['created_utc'].min()
            time_to_first_correction = (first_correction - claim_first_appearance).total_seconds() / 3600  # hours
        else:
            time_to_first_correction = None
        
        # Calculate subreddit overlap
        claim_subreddits = set(claim_data['subreddit'])
        correction_subreddits = set(claim_corrections['subreddit'])
        subreddit_overlap = len(claim_subreddits.intersection(correction_subreddits))
        subreddit_overlap_ratio = subreddit_overlap / len(claim_subreddits) if claim_subreddits else 0
        
        # Calculate engagement metrics
        avg_claim_score = claim_data['score'].mean()
        avg_correction_score = claim_corrections['score'].mean() if not claim_corrections.empty else 0
        relative_engagement = avg_correction_score / avg_claim_score if avg_claim_score != 0 else 0
        
        # Store metrics
        propagation_metrics[claim_id] = {
            'claim_first_appearance': claim_first_appearance,
            'first_correction': first_correction if not claim_corrections.empty else None,
            'time_to_first_correction': time_to_first_correction,
            'num_claim_posts': len(claim_data),
            'num_corrections': len(claim_corrections),
            'correction_ratio': len(claim_corrections) / len(claim_data),
            'claim_subreddits': claim_subreddits,
            'correction_subreddits': correction_subreddits,
            'subreddit_overlap': subreddit_overlap,
            'subreddit_overlap_ratio': subreddit_overlap_ratio,
            'avg_claim_score': avg_claim_score,
            'avg_correction_score': avg_correction_score,
            'relative_engagement': relative_engagement
        }
    
    return propagation_metrics

def visualize_correction_propagation(claim_spread_df, corrections_df, claim_id):
    """
    Visualize how a specific claim and its corrections propagate over time
    
    Parameters:
    -----------
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
    corrections_df : pandas.DataFrame
        DataFrame containing identified corrections
    claim_id : str
        ID of the claim to visualize
    """
    # Get data for this claim
    claim_data = claim_spread_df[claim_spread_df['claim_id'] == claim_id]
    claim_corrections = corrections_df[corrections_df['claim_id'] == claim_id]
    
    # Create a timeline
    plt.figure(figsize=(14, 8))
    
    # Plot claim posts
    for idx, (_, post) in enumerate(claim_data.iterrows()):
        plt.scatter(post['created_utc'], idx % 10, c='red', s=post['score']/10 + 50, alpha=0.7)
        plt.text(post['created_utc'], idx % 10 + 0.5, post['subreddit'], fontsize=8)
    
    # Plot corrections
    for idx, (_, correction) in enumerate(claim_corrections.iterrows()):
        plt.scatter(correction['created_utc'], (idx % 10) - 15, c='green', s=correction['score']/10 + 50, alpha=0.7)
        plt.text(correction['created_utc'], (idx % 10) - 14.5, correction['subreddit'], fontsize=8)
    
    # Add labels and title
    plt.xlabel('Time')
    plt.ylabel('Position (for visualization only)')
    plt.title(f'Propagation Timeline for Claim {claim_id}')
    
    # Add legend
    plt.scatter([], [], c='red', s=100, label='Original claim posts')
    plt.scatter([], [], c='green', s=100, label='Correction posts/comments')
    plt.legend()
    
    # Format x-axis as dates
    plt.gcf().autofmt_xdate()
    
    plt.tight_layout()
    plt.show()

# Example usage
"""
# Identify corrections
corrections_df = identify_corrections(posts_df, comments_df, claim_spread_df)

# Analyze correction propagation
propagation_metrics = analyze_correction_propagation(claim_spread_df, corrections_df)

# Visualize correction propagation for a specific claim
claim_id = claim_spread_df['claim_id'].iloc[0]  # Just get the first claim for example
visualize_correction_propagation(claim_spread_df, corrections_df, claim_id)

# Save correction data
corrections_df.to_csv('corrections.csv', index=False)
with open('propagation_metrics.json', 'w') as f:
    json.dump(propagation_metrics, f, default=str)  # default=str to handle datetime objects
"""

"\n# Identify corrections\ncorrections_df = identify_corrections(posts_df, comments_df, claim_spread_df)\n\n# Analyze correction propagation\npropagation_metrics = analyze_correction_propagation(claim_spread_df, corrections_df)\n\n# Visualize correction propagation for a specific claim\nclaim_id = claim_spread_df['claim_id'].iloc[0]  # Just get the first claim for example\nvisualize_correction_propagation(claim_spread_df, corrections_df, claim_id)\n\n# Save correction data\ncorrections_df.to_csv('corrections.csv', index=False)\nwith open('propagation_metrics.json', 'w') as f:\n    json.dump(propagation_metrics, f, default=str)  # default=str to handle datetime objects\n"

In [20]:
# Step 4: Claim Identification and Tracking
print("\n4. Identifying potential claims...")
claims_df = identify_potential_claims(posts_df, threshold=0.6)
# If no claims were identified, return early
if claims_df.empty:
    print("No claims identified. Returning processed data and topic model only.")
    results['claims_df'] = claims_df



4. Identifying potential claims...
No claims identified. Returning processed data and topic model only.


In [21]:
claims_df

Unnamed: 0,topic_id,representative_post_id,representative_title,common_entities,post_count,subreddits,avg_sentiment,earliest_post,latest_post


In [11]:
# =============================================
# Part 7: Linguistic Analysis
# =============================================

def analyze_linguistic_differences(posts_df, claim_spread_df):
    """
    Analyze linguistic differences in how the same information is presented in different communities
    
    Parameters:
    -----------
    posts_df : pandas.DataFrame
        DataFrame containing all posts
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
        
    Returns:
    --------
    linguistic_metrics : dict
        Dictionary containing metrics about linguistic differences
    """
    linguistic_metrics = {}
    
    # For each claim
    for claim_id in claim_spread_df['claim_id'].unique():
        # Get claim spread data
        claim_data = claim_spread_df[claim_spread_df['claim_id'] == claim_id]
        
        # Get post data for this claim
        claim_posts = posts_df[posts_df['post_id'].isin(claim_data['post_id'])]
        
        # Group by subreddit
        subreddit_metrics = {}
        for subreddit, subreddit_posts in claim_posts.groupby('subreddit'):
            # Skip if there are too few posts
            if len(subreddit_posts) < 3:
                continue
                
            # Calculate linguistic metrics
            metrics = {
                'num_posts': len(subreddit_posts),
                'avg_sentiment': subreddit_posts['sentiment'].mean(),
                'sentiment_std': subreddit_posts['sentiment'].std(),
                'avg_score': subreddit_posts['score'].mean(),
                'common_entities': []
            }
            
            # Calculate word frequencies
            all_text = ' '.join(subreddit_posts['combined_text'].tolist())
            tokens = word_tokenize(all_text)
            freq_dist = nltk.FreqDist(tokens)
            metrics['top_words'] = freq_dist.most_common(20)
            
            # Extract common entities
            all_entities = []
            for ent_list in subreddit_posts['entities']:
                all_entities.extend([ent[0] for ent in ent_list])
            metrics['common_entities'] = Counter(all_entities).most_common(10)
            
            subreddit_metrics[subreddit] = metrics
        
        linguistic_metrics[claim_id] = subreddit_metrics
    
    return linguistic_metrics

def compare_framing_across_communities(posts_df, claim_spread_df, claim_id):
    """
    Compare how the same information is framed across different communities
    
    Parameters:
    -----------
    posts_df : pandas.DataFrame
        DataFrame containing all posts
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
    claim_id : str
        ID of the claim to analyze
        
    Returns:
    --------
    framing_comparison : pandas.DataFrame
        DataFrame containing framing comparison metrics
    """
    # Get claim spread data
    claim_data = claim_spread_df[claim_spread_df['claim_id'] == claim_id]
    
    # Get post data for this claim
    claim_posts = posts_df[posts_df['post_id'].isin(claim_data['post_id'])]
    
    # Prepare framing metrics
    framing_data = []
    
    # Group by subreddit
    for subreddit, subreddit_posts in claim_posts.groupby('subreddit'):
        # Skip if there are too few posts
        if len(subreddit_posts) < 2:
            continue
            
        # Calculate sentiment metrics
        sentiment_mean = subreddit_posts['sentiment'].mean()
        sentiment_std = subreddit_posts['sentiment'].std()
        
        # Analyze title framing
        titles = ' '.join(subreddit_posts['title'].tolist())
        title_doc = nlp(titles)
        
        # Extract named entities
        entities = [ent.text for ent in title_doc.ents]
        top_entities = Counter(entities).most_common(5)
        
        # Extract noun phrases
        noun_phrases = [chunk.text for chunk in title_doc.noun_chunks]
        top_noun_phrases = Counter(noun_phrases).most_common(5)
        
        # Check for emotional language
        positive_words = sum(1 for token in title_doc if token.text.lower() in ['good', 'great', 'excellent', 'positive', 'success'])
        negative_words = sum(1 for token in title_doc if token.text.lower() in ['bad', 'terrible', 'awful', 'negative', 'failure'])
        emotional_ratio = (positive_words + negative_words) / len(title_doc) if len(title_doc) > 0 else 0
        
        # Store metrics
        framing = {
            'claim_id': claim_id,
            'subreddit': subreddit,
            'num_posts': len(subreddit_posts),
            'avg_sentiment': sentiment_mean,
            'sentiment_std': sentiment_std,
            'top_entities': top_entities,
            'top_noun_phrases': top_noun_phrases,
            'positive_words': positive_words,
            'negative_words': negative_words,
            'emotional_ratio': emotional_ratio
        }
        framing_data.append(framing)
    
    framing_df = pd.DataFrame(framing_data)
    return framing_df

def visualize_sentiment_comparison(framing_df):
    """
    Visualize sentiment comparison across subreddits
    
    Parameters:
    -----------
    framing_df : pandas.DataFrame
        DataFrame containing framing comparison metrics
    """
    # Sort by average sentiment
    framing_df = framing_df.sort_values('avg_sentiment')
    
    plt.figure(figsize=(12, 6))
    
    # Create sentiment bar chart
    bars = plt.bar(framing_df['subreddit'], framing_df['avg_sentiment'], 
                  yerr=framing_df['sentiment_std'],
                  color=plt.cm.RdBu(np.interp(framing_df['avg_sentiment'], [-1, 1], [0, 1])))
    
    # Add labels
    plt.axhline(y=0, color='k', linestyle='-', alpha=0.3)
    plt.xlabel('Subreddit')
    plt.ylabel('Average Sentiment')
    plt.title('Sentiment Comparison Across Political Subreddits')
    
    # Add post count as text
    for idx, row in framing_df.iterrows():
        plt.text(idx, row['avg_sentiment'] + (0.1 if row['avg_sentiment'] > 0 else -0.1), 
                f"n={row['num_posts']}", ha='center')
    
    plt.tight_layout()
    plt.xticks(rotation=45, ha='right')
    plt.show()

# Example usage
"""
# Analyze linguistic differences
linguistic_metrics = analyze_linguistic_differences(posts_df, claim_spread_df)

# Compare framing for a specific claim
claim_id = claim_spread_df['claim_id'].iloc[0]  # Just get the first claim for example
framing_df = compare_framing_across_communities(posts_df, claim_spread_df, claim_id)

# Visualize sentiment comparison
visualize_sentiment_comparison(framing_df)

# Save linguistic analysis data
with open('linguistic_metrics.json', 'w') as f:
    json.dump(linguistic_metrics, f, default=str)
framing_df.to_csv(f'framing_comparison_claim_{claim_id}.csv', index=False)
"""

"\n# Analyze linguistic differences\nlinguistic_metrics = analyze_linguistic_differences(posts_df, claim_spread_df)\n\n# Compare framing for a specific claim\nclaim_id = claim_spread_df['claim_id'].iloc[0]  # Just get the first claim for example\nframing_df = compare_framing_across_communities(posts_df, claim_spread_df, claim_id)\n\n# Visualize sentiment comparison\nvisualize_sentiment_comparison(framing_df)\n\n# Save linguistic analysis data\nwith open('linguistic_metrics.json', 'w') as f:\n    json.dump(linguistic_metrics, f, default=str)\nframing_df.to_csv(f'framing_comparison_claim_{claim_id}.csv', index=False)\n"

In [63]:
# =============================================
# Part 8: Visualize Overall Insights
# =============================================

def create_summary_dashboard(claim_spread_df, corrections_df, propagation_metrics, network_metrics):
    """
    Create a summary dashboard of key findings
    
    Parameters:
    -----------
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
    corrections_df : pandas.DataFrame
        DataFrame containing identified corrections
    propagation_metrics : dict
        Dictionary containing metrics about correction propagation
    network_metrics : dict
        Dictionary containing network analysis metrics
    """
    # Create a 2x2 grid of subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Plot 1: Claim spread by subreddit
    claim_counts = claim_spread_df.groupby('subreddit').size().sort_values(ascending=False)
    claim_counts.head(10).plot(kind='bar', ax=axes[0, 0], color='skyblue')
    axes[0, 0].set_title('Top 10 Subreddits by Claim Appearance')
    axes[0, 0].set_ylabel('Number of Posts')
    axes[0, 0].tick_params(axis='x', rotation=45)
    
    # Plot 2: Correction effectiveness
    correction_ratios = []
    for claim_id, metrics in propagation_metrics.items():
        correction_ratios.append({
            'claim_id': claim_id,
            'correction_ratio': metrics['correction_ratio'],
            'relative_engagement': metrics['relative_engagement'],
            'subreddit_overlap_ratio': metrics['subreddit_overlap_ratio']
        })
    
    correction_df = pd.DataFrame(correction_ratios)
    if not correction_df.empty:
        correction_df.plot.scatter(x='correction_ratio', y='relative_engagement', 
                                 s=correction_df['subreddit_overlap_ratio'] * 100 + 50,
                                 alpha=0.6, ax=axes[0, 1])
        axes[0, 1].set_title('Correction Effectiveness')
        axes[0, 1].set_xlabel('Correction Ratio (Corrections/Claims)')
        axes[0, 1].set_ylabel('Relative Engagement (Correction/Claim Score)')
        axes[0, 1].grid(True, linestyle='--', alpha=0.7)
    
    # Plot 3: Top information sources and receivers
    source_df = pd.DataFrame({
        'Subreddit': [s[0] for s in network_metrics['top_sources']],
        'Out Degree Centrality': [s[1] for s in network_metrics['top_sources']]
    })
    receiver_df = pd.DataFrame({
        'Subreddit': [r[0] for r in network_metrics['top_receivers']],
        'In Degree Centrality': [r[1] for r in network_metrics['top_receivers']]
    })
    
    source_df.set_index('Subreddit').plot(kind='bar', ax=axes[1, 0], color='coral')
    axes[1, 0].set_title('Top Information Sources')
    axes[1, 0].set_ylabel('Out Degree Centrality')
    axes[1, 0].tick_params(axis='x', rotation=45)
    
    receiver_df.set_index('Subreddit').plot(kind='bar', ax=axes[1, 1], color='lightgreen')
    axes[1, 1].set_title('Top Information Receivers')
    axes[1, 1].set_ylabel('In Degree Centrality')
    axes[1, 1].tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.suptitle('Misinformation Spread Analysis Dashboard', fontsize=20, y=1.02)
    plt.savefig('misinformation_dashboard.png', dpi=300, bbox_inches='tight')
    plt.show()

def generate_insights_report(claims_df, claim_spread_df, corrections_df, propagation_metrics, network_metrics, linguistic_metrics):
    """
    Generate a summary report of key insights
    
    Parameters:
    -----------
    claims_df : pandas.DataFrame
        DataFrame containing identified claims
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
    corrections_df : pandas.DataFrame
        DataFrame containing identified corrections
    propagation_metrics : dict
        Dictionary containing metrics about correction propagation
    network_metrics : dict
        Dictionary containing network analysis metrics
    linguistic_metrics : dict
        Dictionary containing metrics about linguistic differences
        
    Returns:
    --------
    insights : dict
        Dictionary containing key insights
    """
    insights = {
        'num_claims': len(claims_df),
        'num_posts': len(claim_spread_df),
        'num_corrections': len(corrections_df),
        'communities_analyzed': len(claim_spread_df['subreddit'].unique()),
        'key_findings': []
    }
    
    # Calculate average time to first correction
    correction_times = [metrics['time_to_first_correction'] for _, metrics in propagation_metrics.items() 
                       if metrics['time_to_first_correction'] is not None]
    if correction_times:
        avg_correction_time = sum(correction_times) / len(correction_times)
        insights['avg_time_to_correction'] = avg_correction_time
        
        insights['key_findings'].append(
            f"On average, it takes {avg_correction_time:.1f} hours for the first correction to appear after a claim is posted."
        )
    
    # Find subreddits most/least susceptible to corrections
    subreddit_correction_ratios = {}
    for claim_id, metrics in propagation_metrics.items():
        for subreddit in metrics['claim_subreddits']:
            if subreddit not in subreddit_correction_ratios:
                subreddit_correction_ratios[subreddit] = []
            
            # Check if this subreddit received corrections for this claim
            if subreddit in metrics['correction_subreddits']:
                subreddit_correction_ratios[subreddit].append(1)
            else:
                subreddit_correction_ratios[subreddit].append(0)
    
    # Calculate average correction ratios
    avg_correction_ratios = {
        subreddit: sum(ratios) / len(ratios) 
        for subreddit, ratios in subreddit_correction_ratios.items() 
        if len(ratios) >= 3  # Only consider subreddits with at least 3 claims
    }
    
    if avg_correction_ratios:
        # Most and least susceptible subreddits
        most_susceptible = max(avg_correction_ratios.items(), key=lambda x: x[1])
        least_susceptible = min(avg_correction_ratios.items(), key=lambda x: x[1])
        
        insights['most_susceptible_subreddit'] = most_susceptible[0]
        insights['least_susceptible_subreddit'] = least_susceptible[0]
        
        insights['key_findings'].append(
            f"r/{most_susceptible[0]} is most susceptible to corrections, with {most_susceptible[1]*100:.1f}% of claims receiving corrections."
        )
        insights['key_findings'].append(
            f"r/{least_susceptible[0]} is least susceptible to corrections, with only {least_susceptible[1]*100:.1f}% of claims receiving corrections."
        )
    
    # Find top information bridges
    if network_metrics.get('top_bridges'):
        top_bridge = network_metrics['top_bridges'][0]
        insights['top_information_bridge'] = top_bridge[0]
        
        insights['key_findings'].append(
            f"r/{top_bridge[0]} serves as the most important bridge for information flow between different political communities."
        )
    
    # Analyze linguistic differences
    sentiment_by_subreddit = {}
    for claim_id, subreddit_data in linguistic_metrics.items():
        for subreddit, metrics in subreddit_data.items():
            if subreddit not in sentiment_by_subreddit:
                sentiment_by_subreddit[subreddit] = []
            sentiment_by_subreddit[subreddit].append(metrics['avg_sentiment'])
    
    # Calculate average sentiment
    avg_sentiments = {
        subreddit: sum(sentiments) / len(sentiments)
        for subreddit, sentiments in sentiment_by_subreddit.items()
        if len(sentiments) >= 3  # Only consider subreddits with at least 3 claims
    }
    
    if avg_sentiments:
        # Most positive and negative subreddits
        most_positive = max(avg_sentiments.items(), key=lambda x: x[1])
        most_negative = min(avg_sentiments.items(), key=lambda x: x[1])
        
        insights['most_positive_subreddit'] = most_positive[0]
        insights['most_negative_subreddit'] = most_negative[0]
        
        insights['key_findings'].append(
            f"r/{most_positive[0]} presents information with the most positive sentiment (avg: {most_positive[1]:.2f})."
        )
        insights['key_findings'].append(
            f"r/{most_negative[0]} presents information with the most negative sentiment (avg: {most_negative[1]:.2f})."
        )
    
    return insights

# Example usage
"""
# Create summary dashboard
create_summary_dashboard(claim_spread_df, corrections_df, propagation_metrics, network_metrics)

# Generate insights report
insights = generate_insights_report(claims_df, claim_spread_df, corrections_df, 
                                  propagation_metrics, network_metrics, linguistic_metrics)

# Save insights
with open('misinformation_insights.json', 'w') as f:
    json.dump(insights, f, indent=2)

# Print key findings
print("Key Findings:")
for idx, finding in enumerate(insights['key_findings'], 1):
    print(f"{idx}. {finding}")
"""

'\n# Create summary dashboard\ncreate_summary_dashboard(claim_spread_df, corrections_df, propagation_metrics, network_metrics)\n\n# Generate insights report\ninsights = generate_insights_report(claims_df, claim_spread_df, corrections_df, \n                                  propagation_metrics, network_metrics, linguistic_metrics)\n\n# Save insights\nwith open(\'misinformation_insights.json\', \'w\') as f:\n    json.dump(insights, f, indent=2)\n\n# Print key findings\nprint("Key Findings:")\nfor idx, finding in enumerate(insights[\'key_findings\'], 1):\n    print(f"{idx}. {finding}")\n'

In [64]:
# =============================================
# Part 5: Network Analysis of Information Flow
# =============================================

def create_information_flow_network(claim_spread_df):
    """
    Create a network representation of information flow between subreddits
    
    Parameters:
    -----------
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
        
    Returns:
    --------
    G : networkx.DiGraph
        Directed graph representing information flow
    """
    G = nx.DiGraph()
    
    # Group by claim_id
    for claim_id, claim_data in claim_spread_df.groupby('claim_id'):
        # Sort by creation time
        claim_data = claim_data.sort_values('created_utc')
        
        # Add nodes (subreddits)
        for subreddit in claim_data['subreddit'].unique():
            if subreddit not in G:
                G.add_node(subreddit, type='subreddit')
        
        # Add edges based on temporal flow
        subreddits_seen = []
        for _, post in claim_data.iterrows():
            subreddit = post['subreddit']
            
            # Add edges from all previously seen subreddits to current subreddit
            for prev_subreddit in subreddits_seen:
                if prev_subreddit != subreddit:
                    # Check if edge already exists
                    if G.has_edge(prev_subreddit, subreddit):
                        # Increment weight
                        G[prev_subreddit][subreddit]['weight'] += 1
                    else:
                        # Create new edge
                        G.add_edge(prev_subreddit, subreddit, weight=1, claims=[claim_id])
            
            # Add current subreddit to seen list if not already there
            if subreddit not in subreddits_seen:
                subreddits_seen.append(subreddit)
    
    return G

def analyze_network(G):
    """
    Analyze the information flow network
    
    Parameters:
    -----------
    G : networkx.DiGraph
        Directed graph representing information flow
        
    Returns:
    --------
    network_metrics : dict
        Dictionary containing network analysis metrics
    """
    # Calculate basic network metrics
    metrics = {
        'num_nodes': G.number_of_nodes(),
        'num_edges': G.number_of_edges(),
        'density': nx.density(G),
        'is_strongly_connected': nx.is_strongly_connected(G)
    }
    
    # Centrality measures
    metrics['out_degree_centrality'] = nx.out_degree_centrality(G)
    metrics['in_degree_centrality'] = nx.in_degree_centrality(G)
    metrics['betweenness_centrality'] = nx.betweenness_centrality(G)
    
    # Community detection
    # Convert to undirected for community detection
    G_undirected = G.to_undirected()
    partition = community_louvain.best_partition(G_undirected)
    metrics['communities'] = partition
    
    # Identify key nodes
    sorted_out_degree = sorted(metrics['out_degree_centrality'].items(), 
                              key=lambda x: x[1], reverse=True)
    sorted_in_degree = sorted(metrics['in_degree_centrality'].items(), 
                             key=lambda x: x[1], reverse=True)
    sorted_betweenness = sorted(metrics['betweenness_centrality'].items(), 
                               key=lambda x: x[1], reverse=True)
    
    metrics['top_sources'] = sorted_out_degree[:5]  # Top 5 sources of information
    metrics['top_receivers'] = sorted_in_degree[:5]  # Top 5 receivers of information
    metrics['top_bridges'] = sorted_betweenness[:5]  # Top 5 information bridges
    
    return metrics

def visualize_information_flow(G, metrics, output_file='information_flow_network.png'):
    """
    Visualize the information flow network
    
    Parameters:
    -----------
    G : networkx.DiGraph
        Directed graph representing information flow
    metrics : dict
        Dictionary containing network analysis metrics
    output_file : str
        Path to save the visualization
    """
    plt.figure(figsize=(16, 12))
    
    # Set up node positions using spring layout
    pos = nx.spring_layout(G, k=0.3, iterations=50)
    
    # Get node attributes for visualization
    communities = metrics['communities']
    node_colors = [communities[node] for node in G.nodes()]
    
    # Scale node sizes based on betweenness centrality
    betweenness = metrics['betweenness_centrality']
    node_sizes = [betweenness[node] * 5000 + 100 for node in G.nodes()]
    
    # Draw nodes
    nx.draw_networkx_nodes(G, pos, 
                          node_color=node_colors, 
                          node_size=node_sizes,
                          alpha=0.8, 
                          cmap=plt.cm.tab20)
    
    # Draw edges with weights as width
    edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
    nx.draw_networkx_edges(G, pos, 
                          width=[w * 0.5 for w in edge_weights], 
                          alpha=0.5, 
                          edge_color='gray',
                          arrowsize=15)
    
    # Draw node labels
    nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')
    
    plt.title('Information Flow Network Between Political Subreddits', fontsize=20)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.show()

# Example usage
"""
# Create information flow network
G = create_information_flow_network(claim_spread_df)

# Analyze network
network_metrics = analyze_network(G)

# Visualize network
visualize_information_flow(G, network_metrics)

# Save network data
with open('information_flow_network.pkl', 'wb') as f:
    pickle.dump({'graph': G, 'metrics': network_metrics}, f)
"""

"\n# Create information flow network\nG = create_information_flow_network(claim_spread_df)\n\n# Analyze network\nnetwork_metrics = analyze_network(G)\n\n# Visualize network\nvisualize_information_flow(G, network_metrics)\n\n# Save network data\nwith open('information_flow_network.pkl', 'wb') as f:\n    pickle.dump({'graph': G, 'metrics': network_metrics}, f)\n"

In [65]:
# =============================================
# Part 6: Analyze Correction Propagation
# =============================================

def identify_corrections(posts_df, comments_df, claim_spread_df, similarity_threshold=0.3):
    """
    Identify posts and comments that might be corrections to misinformation
    
    Parameters:
    -----------
    posts_df : pandas.DataFrame
        DataFrame containing all posts
    comments_df : pandas.DataFrame
        DataFrame containing all comments
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
    similarity_threshold : float
        Threshold for similarity to consider a post/comment as related to a claim
        
    Returns:
    --------
    corrections_df : pandas.DataFrame
        DataFrame containing identified corrections
    """
    # Keywords that might indicate a correction
    correction_keywords = [
        'correction', 'false', 'debunk', 'fact check', 'misleading', 'incorrect',
        'wrong', 'not true', 'misinformation', 'fake news', 'disinformation'
    ]
    
    # Create a regex pattern for the keywords
    pattern = '|'.join(correction_keywords)
    
    # Find potential corrections in posts
    post_corrections = []
    for claim_id in claim_spread_df['claim_id'].unique():
        # Get posts related to this claim
        claim_posts = claim_spread_df[claim_spread_df['claim_id'] == claim_id]
        claim_post_ids = claim_posts['post_id'].tolist()
        
        # Find posts that might be corrections
        for _, post in posts_df.iterrows():
            # Skip posts that are already part of the claim spread
            if post['post_id'] in claim_post_ids:
                continue
                
            # Check if post contains correction keywords
            title_text = str(post.get('title', ''))
            body_text = str(post.get('body', ''))
            combined_text = title_text + ' ' + body_text
            
            if re.search(pattern, combined_text, re.IGNORECASE):
                # Calculate similarity to claim posts
                claim_texts = []
                for pid in claim_post_ids:
                    claim_post = posts_df[posts_df['post_id'] == pid]
                    if not claim_post.empty and 'combined_text' in claim_post.columns:
                        text = claim_post['combined_text'].iloc[0]
                        if isinstance(text, str) and text.strip():
                            claim_texts.append(text)
                
                post_text = post.get('combined_text', '')
                
                # Skip if post text is empty
                if not isinstance(post_text, str) or not post_text.strip():
                    continue
                    
                # Calculate similarities
                similarities = []
                for claim_text in claim_texts:
                    if not claim_text.strip():
                        continue
                    try:
                        # Use simple TF-IDF vectorizer for this comparison
                        tfidf = TfidfVectorizer().fit_transform([post_text, claim_text])
                        similarity = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
                        similarities.append(similarity)
                    except Exception as e:
                        print(f"Error calculating similarity: {e}")
                        continue
                
                # If any similarity is above threshold, consider it a correction
                if similarities and max(similarities) > similarity_threshold:
                    correction = {
                        'claim_id': claim_id,
                        'correction_type': 'post',
                        'correction_id': post['post_id'],
                        'subreddit': post.get('subreddit', ''),
                        'author': post.get('author', ''),
                        'created_utc': post.get('created_utc'),
                        'score': post.get('score', 0),
                        'sentiment': post.get('sentiment', 0),
                        'max_similarity': max(similarities) if similarities else 0
                    }
                    post_corrections.append(correction)
    
    # Find potential corrections in comments
    comment_corrections = []
    for claim_id in claim_spread_df['claim_id'].unique():
        # Get posts related to this claim
        claim_posts = claim_spread_df[claim_spread_df['claim_id'] == claim_id]
        claim_post_ids = claim_posts['post_id'].tolist()
        
        # Get comments for these posts
        if 'post_id' in comments_df.columns:
            related_comments = comments_df[comments_df['post_id'].isin(claim_post_ids)]
        else:
            related_comments = pd.DataFrame()  # Empty DataFrame if post_id not in comments
        
        # Find comments that might be corrections
        for _, comment in related_comments.iterrows():
            # Check if comment contains correction keywords
            comment_body = str(comment.get('body', ''))
            if re.search(pattern, comment_body, re.IGNORECASE):
                correction = {
                    'claim_id': claim_id,
                    'correction_type': 'comment',
                    'correction_id': comment.get('comment_id', ''),
                    'post_id': comment.get('post_id', ''),
                    'subreddit': comment.get('subreddit', ''),
                    'author': comment.get('author', ''),
                    'created_utc': comment.get('created_utc'),
                    'score': comment.get('score', 0),
                    'sentiment': comment.get('sentiment', 0)
                }
                comment_corrections.append(correction)
    
    # Combine post and comment corrections
    all_corrections = post_corrections + comment_corrections
    
    if all_corrections:
        corrections_df = pd.DataFrame(all_corrections)
    else:
        # Create empty DataFrame with correct columns if no corrections found
        corrections_df = pd.DataFrame(columns=[
            'claim_id', 'correction_type', 'correction_id', 'post_id', 
            'subreddit', 'author', 'created_utc', 'score', 'sentiment', 'max_similarity'
        ])
    
    return corrections_df

def analyze_correction_propagation(claim_spread_df, corrections_df):
    """
    Analyze how corrections propagate compared to original claims
    
    Parameters:
    -----------
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
    corrections_df : pandas.DataFrame
        DataFrame containing identified corrections
        
    Returns:
    --------
    propagation_metrics : dict
        Dictionary containing metrics about correction propagation
    """
    propagation_metrics = {}
    
    # For each claim
    for claim_id in claim_spread_df['claim_id'].unique():
        # Get claim spread data
        claim_data = claim_spread_df[claim_spread_df['claim_id'] == claim_id]
        
        # Get corrections for this claim
        claim_corrections = corrections_df[corrections_df['claim_id'] == claim_id]
        
        # Calculate time metrics
        if 'created_utc' in claim_data.columns and len(claim_data) > 0:
            claim_first_appearance = claim_data['created_utc'].min()
            
            if not claim_corrections.empty and 'created_utc' in claim_corrections.columns:
                first_correction = claim_corrections['created_utc'].min()
                # Calculate time difference in hours
                try:
                    time_to_first_correction = (first_correction - claim_first_appearance).total_seconds() / 3600
                except Exception as e:
                    print(f"Error calculating time difference: {e}")
                    time_to_first_correction = None
            else:
                first_correction = None
                time_to_first_correction = None
        else:
            claim_first_appearance = None
            first_correction = None
            time_to_first_correction = None
        
        # Calculate subreddit overlap
        if 'subreddit' in claim_data.columns and len(claim_data) > 0:
            claim_subreddits = set(claim_data['subreddit'])
            
            if not claim_corrections.empty and 'subreddit' in claim_corrections.columns:
                correction_subreddits = set(claim_corrections['subreddit'])
                subreddit_overlap = len(claim_subreddits.intersection(correction_subreddits))
                subreddit_overlap_ratio = subreddit_overlap / len(claim_subreddits) if claim_subreddits else 0
            else:
                correction_subreddits = set()
                subreddit_overlap = 0
                subreddit_overlap_ratio = 0
        else:
            claim_subreddits = set()
            correction_subreddits = set()
            subreddit_overlap = 0
            subreddit_overlap_ratio = 0
        
        # Calculate engagement metrics
        if 'score' in claim_data.columns and len(claim_data) > 0:
            avg_claim_score = claim_data['score'].mean()
            
            if not claim_corrections.empty and 'score' in claim_corrections.columns:
                avg_correction_score = claim_corrections['score'].mean()
                relative_engagement = avg_correction_score / avg_claim_score if avg_claim_score != 0 else 0
            else:
                avg_correction_score = 0
                relative_engagement = 0
        else:
            avg_claim_score = 0
            avg_correction_score = 0
            relative_engagement = 0
        
        # Store metrics
        propagation_metrics[claim_id] = {
            'claim_first_appearance': claim_first_appearance,
            'first_correction': first_correction,
            'time_to_first_correction': time_to_first_correction,
            'num_claim_posts': len(claim_data),
            'num_corrections': len(claim_corrections),
            'correction_ratio': len(claim_corrections) / len(claim_data) if len(claim_data) > 0 else 0,
            'claim_subreddits': list(claim_subreddits),  # Convert to list for JSON serialization
            'correction_subreddits': list(correction_subreddits),  # Convert to list for JSON serialization
            'subreddit_overlap': subreddit_overlap,
            'subreddit_overlap_ratio': subreddit_overlap_ratio,
            'avg_claim_score': avg_claim_score,
            'avg_correction_score': avg_correction_score,
            'relative_engagement': relative_engagement
        }
    
    return propagation_metrics

def visualize_correction_propagation(claim_spread_df, corrections_df, claim_id):
    """
    Visualize how a specific claim and its corrections propagate over time
    
    Parameters:
    -----------
    claim_spread_df : pandas.DataFrame
        DataFrame containing information about claim spread
    corrections_df : pandas.DataFrame
        DataFrame containing identified corrections
    claim_id : str
        ID of the claim to visualize
    """
    # Get data for this claim
    claim_data = claim_spread_df[claim_spread_df['claim_id'] == claim_id]
    claim_corrections = corrections_df[corrections_df['claim_id'] == claim_id]
    
    # Skip if no data
    if claim_data.empty:
        print(f"No claim data available for claim {claim_id}")
        return
    
    # Create a timeline
    plt.figure(figsize=(14, 8))
    
    # Plot claim posts
    for idx, (_, post) in enumerate(claim_data.iterrows()):
        # Check if score is available
        score = post.get('score', 50)  # Default value if score is missing
        if not isinstance(score, (int, float)):
            score = 50
        
        # Ensure created_utc is datetime
        if not isinstance(post.get('created_utc'), pd.Timestamp):
            continue
            
        plt.scatter(post['created_utc'], idx % 10, c='red', s=score/10 + 50, alpha=0.7)
        plt.text(post['created_utc'], idx % 10 + 0.5, post.get('subreddit', 'unknown'), fontsize=8)
    
    # Plot corrections if any exist
    if not claim_corrections.empty:
        for idx, (_, correction) in enumerate(claim_corrections.iterrows()):
            # Check if score is available
            score = correction.get('score', 50)  # Default value if score is missing
            if not isinstance(score, (int, float)):
                score = 50
            
            # Ensure created_utc is datetime
            if not isinstance(correction.get('created_utc'), pd.Timestamp):
                continue
                
            plt.scatter(correction['created_utc'], (idx % 10) - 15, c='green', s=score/10 + 50, alpha=0.7)
            plt.text(correction['created_utc'], (idx % 10) - 14.5, correction.get('subreddit', 'unknown'), fontsize=8)
    
    # Add labels and title
    plt.xlabel('Time')
    plt.ylabel('Position (for visualization only)')
    plt.title(f'Propagation Timeline for Claim {claim_id}')
    
    # Add legend
    plt.scatter([], [], c='red', s=100, label='Original claim posts')
    plt.scatter([], [], c='green', s=100, label='Correction posts/comments')
    plt.legend()
    
    # Format x-axis as dates
    plt.gcf().autofmt_xdate()
    
    plt.tight_layout()
    plt.show()

# Example usage
"""
# Identify corrections
corrections_df = identify_corrections(posts_df, comments_df, claim_spread_df)

# Analyze correction propagation
propagation_metrics = analyze_correction_propagation(claim_spread_df, corrections_df)

# Visualize correction propagation for a specific claim
claim_id = claim_spread_df['claim_id'].iloc[0]  # Just get the first claim for example
visualize_correction_propagation(claim_spread_df, corrections_df, claim_id)

# Save correction data
corrections_df.to_csv('corrections.csv', index=False)
with open('propagation_metrics.json', 'w') as f:
    json.dump(propagation_metrics, f, default=str)  # default=str to handle datetime objects
"""

"\n# Identify corrections\ncorrections_df = identify_corrections(posts_df, comments_df, claim_spread_df)\n\n# Analyze correction propagation\npropagation_metrics = analyze_correction_propagation(claim_spread_df, corrections_df)\n\n# Visualize correction propagation for a specific claim\nclaim_id = claim_spread_df['claim_id'].iloc[0]  # Just get the first claim for example\nvisualize_correction_propagation(claim_spread_df, corrections_df, claim_id)\n\n# Save correction data\ncorrections_df.to_csv('corrections.csv', index=False)\nwith open('propagation_metrics.json', 'w') as f:\n    json.dump(propagation_metrics, f, default=str)  # default=str to handle datetime objects\n"

In [66]:
def main_pipeline(posts_csv_path, comments_csv_path):
    """
    Execute the full misinformation tracking pipeline using existing CSV files
    
    Parameters:
    -----------
    posts_csv_path : str
        Path to the CSV file containing Reddit posts
    comments_csv_path : str
        Path to the CSV file containing Reddit comments
        
    Returns:
    --------
    dict
        Dictionary containing all analysis results
    """
    print("Starting misinformation tracking pipeline with existing CSV data...")
    
    # Step 1: Load Data from CSV
    print("\n1. Loading data from CSV files...")
    # Try to handle potential encoding issues
    try:
        posts_df = pd.read_csv(posts_csv_path)
    except UnicodeDecodeError:
        print("UTF-8 encoding failed for posts, trying latin-1...")
        posts_df = pd.read_csv(posts_csv_path, encoding='latin-1')
    
    try:
        comments_df = pd.read_csv(comments_csv_path)
    except UnicodeDecodeError:
        print("UTF-8 encoding failed for comments, trying latin-1...")
        comments_df = pd.read_csv(comments_csv_path, encoding='latin-1')
    
    # Print data summary
    print(f"\nLoaded {len(posts_df)} posts from {posts_df['subreddit'].nunique()} subreddits.")
    print(f"Loaded {len(comments_df)} comments.")
    
    print("\nSubreddits in the dataset:")
    for subreddit, count in posts_df['subreddit'].value_counts().items():
        print(f"  - r/{subreddit}: {count} posts")
    
    # Step 2: Data Preprocessing
    processed_posts_path = 'processed_posts.csv'
    processed_comments_path = 'processed_comments.csv'
    
    # Check if processed files already exist
    import os
    if os.path.exists(processed_posts_path) and os.path.exists(processed_comments_path):
        print("\n2. Loading existing processed data files...")
        try:
            posts_df = pd.read_csv(processed_posts_path)
            comments_df = pd.read_csv(processed_comments_path)
            
            # Convert timestamp columns to datetime
            if 'created_utc' in posts_df.columns:
                posts_df['created_utc'] = pd.to_datetime(posts_df['created_utc'])
            if 'created_utc' in comments_df.columns:
                comments_df['created_utc'] = pd.to_datetime(comments_df['created_utc'])
            
            # Convert entities column from string back to list if needed
            if 'entities' in posts_df.columns and posts_df['entities'].dtype == 'object':
                try:
                    import ast
                    posts_df['entities'] = posts_df['entities'].apply(
                        lambda x: ast.literal_eval(x) if isinstance(x, str) else []
                    )
                except:
                    print("Warning: Could not convert entities column back to list.")
            
            if 'entities' in comments_df.columns and comments_df['entities'].dtype == 'object':
                try:
                    import ast
                    comments_df['entities'] = comments_df['entities'].apply(
                        lambda x: ast.literal_eval(x) if isinstance(x, str) else []
                    )
                except:
                    print("Warning: Could not convert entities column back to list.")
            
            print(f"Loaded processed data: {len(posts_df)} posts and {len(comments_df)} comments.")
        except Exception as e:
            print(f"Error loading processed files: {e}")
            print("Will preprocess data from scratch...")
            posts_df, comments_df = process_dataframes(posts_df, comments_df)
            
            # Save processed data
            posts_df.to_csv(processed_posts_path, index=False)
            comments_df.to_csv(processed_comments_path, index=False)
            print(f"Saved processed data to {processed_posts_path} and {processed_comments_path}")
    else:
        print("\n2. Processing and enriching data...")
        posts_df, comments_df = process_dataframes(posts_df, comments_df)
        
        # Save processed data
        posts_df.to_csv(processed_posts_path, index=False)
        comments_df.to_csv(processed_comments_path, index=False)
        print(f"Saved processed data to {processed_posts_path} and {processed_comments_path}")
    
    # Initialize results dictionary
    results = {
        'posts_df': posts_df,
        'comments_df': comments_df
    }
    
    # Step 3: Topic Modeling (skip if too few posts)
    print("\n3. Creating topic model to identify claims/stories...")
    lda_model, corpus, dictionary = create_topic_model(posts_df, num_topics=15, passes=20, min_posts=5)
    
    # If topic modeling failed, return early with just the processed data
    if lda_model is None:
        print("Topic modeling failed. Returning processed data only.")
        return results
    
    # Display topics
    print("\nIdentified Topics:")
    for idx, topic in lda_model.print_topics(num_words=10):
        print(f"Topic {idx}: {topic}")
    
    # Get topic distributions
    posts_df = get_topic_distributions(lda_model, corpus, posts_df)
    
    # Save model and data
    lda_model.save('lda_model.model')
    with open('lda_corpus.pkl', 'wb') as f:
        pickle.dump(corpus, f)
    dictionary.save('lda_dictionary.dict')
    
    # Update results
    results['posts_df'] = posts_df
    results['lda_model'] = lda_model
    results['corpus'] = corpus
    results['dictionary'] = dictionary
    
    # Step 4: Claim Identification and Tracking
    print("\n4. Identifying potential claims...")
    claims_df = identify_potential_claims(posts_df, threshold=0.6)
    
    # If no claims were identified, return early
    if claims_df.empty:
        print("No claims identified. Returning processed data and topic model only.")
        results['claims_df'] = claims_df
        return results
    
    print("\n5. Building similarity matrix...")
    similarity_matrix, post_ids, claim_ids = build_claim_similarity_matrix(posts_df, claims_df)
    
    print("\n6. Tracking claim spread...")
    claim_spread_df = track_claim_spread(posts_df, comments_df, similarity_matrix, post_ids, claim_ids)
    
    # Save claim data
    claims_df.to_csv('identified_claims.csv', index=False)
    claim_spread_df.to_csv('claim_spread.csv', index=False)
    
    # Update results
    results['claims_df'] = claims_df
    results['claim_spread_df'] = claim_spread_df
    
    # Step 5: Network Analysis
    print("\n7. Creating information flow network...")
    G = create_information_flow_network(claim_spread_df)
    
    print("\n8. Analyzing network...")
    network_metrics = analyze_network(G)
    
    print("\n9. Visualizing information flow network...")
    visualize_information_flow(G, network_metrics)
    
    # Save network data
    with open('information_flow_network.pkl', 'wb') as f:
        pickle.dump({'graph': G, 'metrics': network_metrics}, f)
    
    # Update results
    results['network_graph'] = G
    results['network_metrics'] = network_metrics
    
    # Step 6: Correction Analysis
    print("\n10. Identifying corrections...")
    corrections_df = identify_corrections(posts_df, comments_df, claim_spread_df)
    
    print("\n11. Analyzing correction propagation...")
    propagation_metrics = analyze_correction_propagation(claim_spread_df, corrections_df)
    
    # Skip visualization if no claims
    if len(claim_spread_df) > 0:
        print("\n12. Visualizing correction propagation...")
        # Just visualize the first claim as an example
        claim_id = claim_spread_df['claim_id'].iloc[0]
        visualize_correction_propagation(claim_spread_df, corrections_df, claim_id)
    
    # Save correction data
    corrections_df.to_csv('corrections.csv', index=False)
    with open('propagation_metrics.json', 'w') as f:
        json.dump(propagation_metrics, f, default=str)
    
    # Update results
    results['corrections_df'] = corrections_df
    results['propagation_metrics'] = propagation_metrics
    
    # Continue with remaining steps if there are claims...
    # (linguistic analysis, summaries, etc.)
    
    print("\nPipeline completed successfully!")
    return results

In [None]:
# Run the processing pipeline



In [None]:

# Run the pipeline
results = main_pipeline(posts_csv_path, comments_csv_path)

# You can access the processed dataframes
processed_posts = results['posts_df']
processed_comments = results['comments_df']

# Display a sample of the processed data
processed_posts.head()

Starting misinformation tracking pipeline with existing CSV data...

1. Loading data from CSV files...

Loaded 150505 posts from 3 subreddits.
Loaded 3561862 comments.

Subreddits in the dataset:
  - r/Conservative: 132987 posts
  - r/centrist: 11059 posts
  - r/Liberal: 6459 posts

2. Processing and enriching data...
Processing posts...


Processing post titles: 100%|██████████| 150505/150505 [01:18<00:00, 1907.44it/s]
Processing post bodies: 100%|██████████| 150505/150505 [00:40<00:00, 3751.41it/s]


Processing comments...


Processing comments: 100%|██████████| 3561862/3561862 [38:26<00:00, 1544.37it/s] 


Adding sentiment analysis...


Calculating post sentiment: 100%|██████████| 150505/150505 [00:21<00:00, 7051.11it/s] 
Calculating comment sentiment:  10%|▉         | 343923/3561862 [66:17:39<13:15, 4046.63it/s]     

In [28]:
merged_posts_df = pd.read_csv(posts_csv_path)
merged_comments_df = pd.read_csv(comments_csv_path)

# I want to save head for each in csv
merged_posts_df.head(10000).to_csv('merged_posts_head.csv', index=False)
merged_comments_df.head(10000).to_csv('merged_comments_head.csv', index=False)