In [1]:
import praw
import pandas as pd
import time
import openai
import os
import re
import random
from datetime import datetime
import json

# Reddit API Setup
# Credentials are read from environment variables so that real secrets never
# have to be typed into (and saved with) the notebook. The original placeholder
# strings are kept as fallbacks, so the cell still runs when nothing is set.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "reddit client id"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "reddit secret key"),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "MyAcademicApp2/0.1 by myredditusername"),
)

# OpenAI API Setup
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "api key here")
)

# NOTE(review): this only confirms the client objects were constructed; it does
# not prove the credentials are valid - confirm with a real request if needed.
print("API connections initialized")

API connections initialized


In [2]:
def search_ukraine_financial_posts(start_date=1640995200, end_date=1735689600, posts_per_year=2):
    """
    Search r/conspiracy for Ukraine funding/aid posts, spread evenly across years.

    Args:
        start_date: Earliest creation time (Unix epoch seconds) to accept.
        end_date: Latest creation time (Unix epoch seconds) to accept.
        posts_per_year: Maximum number of posts kept for each yearly bucket.

    Returns:
        A list of submission objects with at most ``posts_per_year`` entries
        per yearly bucket and no submission appearing more than once.

    Notes:
        The search call itself is not restricted by date; results are filtered
        on ``created_utc`` afterwards, so a yearly bucket can end up short when
        the top 20 relevance hits for every term fall outside it.
    """
    subreddit = reddit.subreddit("conspiracy")
    posts = []
    # The same submission is often returned for several search terms; remember
    # what was already collected so it is never added twice.
    seen_ids = set()

    # Yearly buckets (epoch seconds, UTC) used to get an even distribution
    year_ranges = [
        (1640995200, 1672531200),  # 2022
        (1672531200, 1704067200),  # 2023
        (1704067200, 1735689600),  # 2024
    ]

    # Search terms focused on financial aspects
    search_terms = [
        "Ukraine funding",
        "Ukraine aid",
        "Ukraine money",
        "Ukraine financial",
        "Ukraine billions"
    ]

    for year_start, year_end in year_ranges:
        # Honour the caller's overall window by clipping each bucket to it
        range_start = max(year_start, start_date)
        range_end = min(year_end, end_date)
        if range_start >= range_end:
            continue  # bucket lies completely outside the requested window

        year_posts = []

        for term in search_terms:
            if len(year_posts) >= posts_per_year:
                break
            try:
                for submission in subreddit.search(term, sort="relevance", limit=20):
                    if len(year_posts) >= posts_per_year:
                        break
                    if submission.id in seen_ids:
                        continue
                    # Keep only posts created inside this bucket
                    if not (range_start <= submission.created_utc <= range_end):
                        continue
                    # Verify relevance with a simple keyword check
                    if contains_funding_keywords(submission.title + " " + submission.selftext):
                        seen_ids.add(submission.id)
                        year_posts.append(submission)
                        print(f"Found post: {submission.title} (Date: {time.ctime(submission.created_utc)})")
            except Exception as e:
                # Best effort: a failing term must not abort the whole collection
                print(f"Error searching for '{term}': {e}")
                continue

        # Add this year's posts to our collection
        posts.extend(year_posts)

        # Be nice to the API - don't make too many requests too quickly
        time.sleep(2)

    return posts
# Relevance filter applied to posts before they are collected
def contains_funding_keywords(text):
    """
    Return True when ``text`` mentions Ukraine (case-insensitive).

    Despite the name, this is the deliberately less restrictive version: it
    only requires the word "ukraine" and does not check for finance-related
    terms, so that more of the discussion is included.

    Args:
        text: Text to inspect; ``None`` or non-string values count as no match.

    Returns:
        bool: True if "ukraine" occurs in the text, False otherwise.
    """
    if not isinstance(text, str):
        return False
    return 'ukraine' in text.lower()
def extract_post_data(submission):
    """
    Flatten a submission into a plain dict holding the fields used downstream.

    The author is reported as "[deleted]" when the submission has no author.
    """
    poster = submission.author

    record = {'id': submission.id, 'title': submission.title}
    record['author'] = poster.name if poster else "[deleted]"

    # These attributes are copied over under their own names
    for field in ('created_utc', 'score', 'upvote_ratio', 'num_comments'):
        record[field] = getattr(submission, field)

    record['text'] = submission.selftext
    record['permalink'] = submission.permalink
    return record

In [3]:
def get_all_comments(submission, limit=100):
    """
    Collect up to ``limit`` comments of a submission as a flat list.

    "More comments" placeholders are expanded with ``replace_more(limit=5)`` to
    keep the number of extra requests bounded. Any error is printed and results
    in an empty list.
    """
    collected = []
    try:
        forest = submission.comments
        # Expand the tree only partially to prevent excessive API calls
        forest.replace_more(limit=5)
        # Flatten, then cap the number of comments returned
        collected = forest.list()[:limit]
    except Exception as e:
        print(f"Error getting comments for post {submission.id}: {e}")
        collected = []
    return collected

def extract_comment_data(comment, submission_id):
    """
    Flatten a comment into a plain dict, tagged with the id of its submission.

    The author is reported as "[deleted]" when the comment has no author, and
    ``contains_ukraine`` records whether the body mentions "ukraine"
    (case-insensitive).
    """
    writer = comment.author
    if writer:
        author_name = writer.name
    else:
        author_name = "[deleted]"

    text = comment.body
    mentions_ukraine = 'ukraine' in text.lower()

    return dict(
        id=comment.id,
        submission_id=submission_id,
        author=author_name,
        parent_id=comment.parent_id,
        created_utc=comment.created_utc,
        score=comment.score,
        body=text,
        contains_ukraine=mentions_ukraine,
    )

def get_parent_author(comment, all_comments, submission):
    """
    Resolve the username a comment is replying to.

    A ``t3_`` parent is the submission itself; a ``t1_`` parent is looked up
    among ``all_comments``. "[deleted]" is returned when the parent has no
    author, is not in ``all_comments``, or has an unrecognised prefix.
    """
    fullname = comment.parent_id
    bare_id = fullname[3:]  # id without the 't1_' / 't3_' prefix

    # Reply to the submission itself
    if fullname.startswith('t3_'):
        original_poster = submission.author
        return original_poster.name if original_poster else "[deleted]"

    # Reply to another comment: take the first comment with a matching id
    if fullname.startswith('t1_'):
        parent = next((c for c in all_comments if c.id == bare_id), None)
        if parent is not None:
            return parent.author.name if parent.author else "[deleted]"

    return "[deleted]"

In [44]:
def _parse_classification_lines(response_content):
    """Return every JSON object found in a model reply (one candidate per line)."""
    parsed = []
    for raw_line in response_content.split('\n'):
        candidate = raw_line.strip()
        if not candidate:
            continue
        # Remove markdown code block markers / stray backticks if present
        if candidate.startswith('```') and candidate.endswith('```'):
            candidate = candidate[3:-3].strip()
        candidate = candidate.strip('`')

        try:
            obj = json.loads(candidate)
        except json.JSONDecodeError:
            # Salvage an object embedded in extra text, e.g. a trailing comma
            obj = None
            json_match = re.search(r'\{.*?\}', candidate)
            if json_match:
                try:
                    obj = json.loads(json_match.group(0))
                except json.JSONDecodeError:
                    obj = None

        if isinstance(obj, dict):
            parsed.append(obj)
    return parsed


def classify_sentiment(texts, batch_size=10):
    """
    Classify each text's stance toward Ukraine with the chat model.

    Texts are sent in batches of ``batch_size``. Inside a batch the model
    numbers its answers from 0, so every answer's "Index" is shifted by the
    batch offset before it is returned; the returned "Index" therefore always
    refers to the position in ``texts``.

    Args:
        texts: List of strings to classify.
        batch_size: Number of texts per API call (must be at least 1).

    Returns:
        List of dicts with keys "Index" (position in ``texts``) and "Sentiment"
        (one of "Strongly Pro-Ukraine", "Mildly Pro-Ukraine",
        "Mildly Anti-Ukraine", "Strongly Anti-Ukraine", "Neutral", as produced
        by the model). Texts whose batch failed, or for which the model gave no
        usable answer, are simply absent from the list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if not texts:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    results = []

    # The instructions are identical for every batch, so build them once
    system_prompt = """
        Context: You are analyzing Reddit posts and comments related to Ukraine from conspiracy-oriented communities.
        Task: For each text, determine the author's stance toward Ukraine in the conflict:
            "Strongly Pro-Ukraine" - Clearly supports Ukraine, condemns Russia's actions, or shows strong support for Ukrainian resistance.
            "Mildly Pro-Ukraine" - Generally sympathetic to Ukraine but with some reservations or qualifications.
            "Mildly Anti-Ukraine" - Somewhat critical of Ukraine or implies Ukrainian government/Western allies share blame.
            "Strongly Anti-Ukraine" - Explicitly against Ukraine, suggests Ukraine is corrupt/illegitimate, or strongly supports Russian position.
            "Neutral" - ONLY use when truly balanced or completely unclear (use sparingly).
        Classification Indicators:
            Pro-Ukraine signals:
                Criticizes Russian aggression or Putin
                Supports military/financial aid to Ukraine
                Refers to "Russian propaganda" or "Putin's lies"
                Sympathizes with Ukrainian civilians/refugees
                Mentions "defending democracy" or "Ukrainian sovereignty"
                Condemns the invasion or occupation
            Anti-Ukraine signals:
                Mentions "NATO expansion" as cause of conflict
                References "US puppet government" in Ukraine
                Talks about "bioweapons labs" in Ukraine
                Suggests Ukraine is controlled by "globalists"
                Refers to "Western propaganda" about the war
                Questions the legitimacy of Ukrainian government
                Portrays Russia as justified or defending itself
                Criticizes financial or military support for Ukraine
        Guidelines:
            Consider context, tone, and implied positions, you may take creative freedom here
            If comment presents conspiracy theories against Ukraine, classify as Anti-Ukraine
            Avoid defaulting to "Neutral" when any sentiment can be detected
            Look for subtle ideological indicators specific to conspiracy communities
            Constraints: Provide answers in JSON format with keys "Index" and "Sentiment". Return one JSON object per line.
            Example: {"Index": 0, "Sentiment": "Strongly Anti-Ukraine"}

        """

    # Process in batches to minimize API calls
    for batch_start in range(0, len(texts), batch_size):
        batch = texts[batch_start:batch_start + batch_size]

        # Create prompt with texts numbered from 0 within the batch
        classification_prompt = ""
        for idx, text in enumerate(batch):
            cleaned_text = str(text or "").replace('\n', ' ')
            classification_prompt += f"{idx}: {cleaned_text}\n\n"

        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Classify these texts regarding support for Ukraine:\n{classification_prompt}"}
                ]
            )
            response_content = response.choices[0].message.content or ""

            for item in _parse_classification_lines(response_content):
                local_index = item.get('Index')
                if isinstance(local_index, str) and local_index.strip().isdigit():
                    local_index = int(local_index)
                # An answer that does not point at a text of this batch cannot
                # be attributed to anything, so it is dropped.
                if (isinstance(local_index, bool) or not isinstance(local_index, int)
                        or not 0 <= local_index < len(batch)):
                    print(f"Skipping classification with unusable index: {item}")
                    continue
                # Translate the batch-local index into a position in `texts`
                item['Index'] = batch_start + local_index
                results.append(item)

            # Be nice to the API again, always be nice to the API
            time.sleep(1)

        except Exception as e:
            # Best effort: a failed batch is reported and the run continues
            print(f"Error calling OpenAI API: {e}")

    return results

In [45]:
# Aggregates the classified posts and comments into one profile per user
def _new_user_profile():
    """Return an empty per-user profile with all counters at zero."""
    return {
        'post_count': 0,
        'comment_count': 0,
        'sentiment_counts': {
            'Strongly Pro-Ukraine': 0,
            'Mildly Pro-Ukraine': 0,
            'Mildly Anti-Ukraine': 0,
            'Strongly Anti-Ukraine': 0,
            'Neutral': 0
        },
        'avg_score': 0,
        'total_score': 0,
        'interacts_with': set()
    }


def _record_sentiment(profile, sentiment):
    """Count ``sentiment`` for the profile; labels outside the known set are ignored."""
    counts = profile['sentiment_counts']
    if isinstance(sentiment, str):
        sentiment = sentiment.strip()
        if sentiment in counts:
            counts[sentiment] += 1


def aggregate_user_sentiment(posts_data, comments_data, classification_data):
    """
    Aggregate post and comment-level sentiment classifications to user level.

    Args:
        posts_data: Iterable of dicts with at least 'id', 'author', 'score'.
        comments_data: Iterable of dicts with at least 'id', 'author', 'score'
            and optionally 'parent_author'.
        classification_data: Dict mapping "post_<id>" / "comment_<id>" to a
            sentiment label.

    Returns:
        Dict mapping username to a profile dict with the keys 'post_count',
        'comment_count', 'sentiment_counts', 'avg_score', 'total_score',
        'interacts_with' (set of parent authors replied to),
        'dominant_sentiment' ('Unknown' when nothing was classified) and
        'sentiment_strength' (share of the dominant label, 0 when nothing was
        classified). Items authored by "[deleted]" are skipped.
    """
    user_profiles = {}

    # Process posts first
    for post in posts_data:
        username = post['author']
        if username == "[deleted]":
            continue

        if username not in user_profiles:
            user_profiles[username] = _new_user_profile()
        profile = user_profiles[username]

        profile['post_count'] += 1
        profile['total_score'] += post['score']

        # Add sentiment if available
        post_key = f"post_{post['id']}"
        if post_key in classification_data:
            _record_sentiment(profile, classification_data[post_key])

    # Process comments
    for comment in comments_data:
        username = comment['author']

        # Deleted accounts cannot be told apart, so their comments are skipped.
        # TODO: consider tracking each deleted-account comment as its own user.
        if username == "[deleted]":
            continue

        if username not in user_profiles:
            user_profiles[username] = _new_user_profile()
        profile = user_profiles[username]

        profile['comment_count'] += 1
        profile['total_score'] += comment['score']

        # Remember whom this user replied to (becomes an edge later);
        # replies to oneself are not recorded.
        if 'parent_author' in comment and comment['parent_author'] != username:
            profile['interacts_with'].add(comment['parent_author'])

        # Add sentiment if available
        comment_key = f"comment_{comment['id']}"
        if comment_key in classification_data:
            _record_sentiment(profile, classification_data[comment_key])

    # Calculate metrics for each user
    for profile in user_profiles.values():
        total_items = profile['post_count'] + profile['comment_count']
        if total_items > 0:
            profile['avg_score'] = profile['total_score'] / total_items

        sentiment_counts = profile['sentiment_counts']
        total_classified = sum(sentiment_counts.values())
        if total_classified > 0:
            # On a tie, max() keeps the label listed first in the counts dict
            dominant = max(sentiment_counts, key=sentiment_counts.get)
            profile['dominant_sentiment'] = dominant
            profile['sentiment_strength'] = sentiment_counts[dominant] / total_classified
        else:
            profile['dominant_sentiment'] = 'Unknown'
            profile['sentiment_strength'] = 0

    return user_profiles

# Builds the node list and edge list for the network from the user profiles
def create_network_data(user_profiles):
    """
    Create node and edge tables for Gephi from user profiles.

    Args:
        user_profiles: Dict mapping username to a profile dict as produced by
            the aggregation step.

    Returns:
        (nodes_df, edges_df): two pandas DataFrames. Both always carry their
        full set of columns, even when there are no rows, so that downstream
        column access and the written CSV headers stay valid.
    """
    node_columns = [
        'id', 'post_count', 'comment_count', 'total_count', 'avg_score',
        'total_score', 'sentiment', 'sentiment_strength', 'strongly_pro_count',
        'mildly_pro_count', 'mildly_anti_count', 'strongly_anti_count',
        'neutral_count',
    ]
    edge_columns = ['source', 'target', 'type', 'weight']

    # Create nodes
    nodes = []
    for username, profile in user_profiles.items():
        total_items = profile['post_count'] + profile['comment_count']
        if total_items <= 0:
            continue  # only include users with activity
        counts = profile['sentiment_counts']
        nodes.append({
            'id': username,
            'post_count': profile['post_count'],
            'comment_count': profile['comment_count'],
            'total_count': total_items,
            'avg_score': profile['avg_score'],
            'total_score': profile['total_score'],
            'sentiment': profile['dominant_sentiment'],
            'sentiment_strength': profile['sentiment_strength'],
            'strongly_pro_count': counts['Strongly Pro-Ukraine'],
            'mildly_pro_count': counts['Mildly Pro-Ukraine'],
            'mildly_anti_count': counts['Mildly Anti-Ukraine'],
            'strongly_anti_count': counts['Strongly Anti-Ukraine'],
            'neutral_count': counts['Neutral'],
        })

    # Create edges (user interactions)
    edges = []
    for username, profile in user_profiles.items():
        # 'interacts_with' is a set; sort it so the edge list has a stable
        # order from run to run.
        for target in sorted(profile['interacts_with'], key=str):
            # Skip if target is not in our user profiles (e.g., deleted)
            if target not in user_profiles or target == "[deleted]":
                continue

            # Directed edge from commenter to the author they replied to
            edges.append({
                'source': username,
                'target': target,
                'type': 'reply',
                'weight': 1
            })

    return (
        pd.DataFrame(nodes, columns=node_columns),
        pd.DataFrame(edges, columns=edge_columns),
    )

In [47]:
def analyze_ukraine_funding_network(subreddit_name="conspiracy", posts_per_year=3):
    """
    End-to-end process to analyze the Ukraine funding stance network.

    Steps: collect posts, pull their comments, classify every post and comment,
    aggregate the labels per user, build node/edge tables and write four CSV
    files (nodes, edges, posts, comments) to the working directory.

    Args:
        subreddit_name: Currently unused; the search step is not given a
            subreddit by this function. Kept for interface compatibility.
        posts_per_year: Maximum number of posts collected per yearly bucket.

    Returns:
        (nodes_df, edges_df): the node and edge DataFrames that were saved.

    NOTE(review): mapping classifications back to items assumes that
    `classify_sentiment` reports "Index" relative to the full list of texts
    passed in - confirm against that function.
    """
    print("Starting Ukraine funding network analysis...")

    # 1. Collect posts about Ukraine funding
    print("Searching for posts about Ukraine funding...")
    posts = search_ukraine_financial_posts(posts_per_year=posts_per_year)
    print(f"Found {len(posts)} posts")

    posts_data = [extract_post_data(post) for post in posts]

    # 2. Extract comments from posts
    print("Extracting comments from posts...")
    all_comments = []

    for post in posts:
        print(f"Processing comments for post: {post.title[:40]}...")
        post_comments = get_all_comments(post)

        for comment in post_comments:
            # Skip deleted authors
            if comment.author is None:
                continue

            comment_data = extract_comment_data(comment, post.id)
            # Record whom the comment replies to (used for the network edges)
            comment_data['parent_author'] = get_parent_author(comment, post_comments, post)
            all_comments.append(comment_data)

    print(f"Extracted {len(all_comments)} comments")

    # 3. Prepare texts for classification
    print("Preparing texts for classification...")
    texts_to_classify = []
    item_ids = []
    item_types = []  # Track if it's a post or comment

    # First, include the posts themselves
    for post_data in posts_data:
        texts_to_classify.append(post_data['title'] + " " + post_data['text'])
        item_ids.append(post_data['id'])
        item_types.append('post')

    # Then include ALL comments from the collected posts
    for comment_data in all_comments:
        if comment_data['author'] != "[deleted]":
            texts_to_classify.append(comment_data['body'])
            item_ids.append(comment_data['id'])
            item_types.append('comment')

    print(f"Found {len(texts_to_classify)} items to classify ({item_types.count('post')} posts, {item_types.count('comment')} comments)")

    # 4. Classify with GPT-4
    print(f"Classifying {len(texts_to_classify)} items with GPT-4 in batches (this may take a while)...")
    classifications = classify_sentiment(texts_to_classify, batch_size=10)

    # 5. Map classifications to item IDs
    print("Processing classification results...")
    classification_data = {}
    for result in classifications:
        if 'Index' not in result or 'Sentiment' not in result:
            continue
        index = result['Index']
        # Reject anything that is not a valid position; a negative value would
        # otherwise silently index from the end of the list.
        if isinstance(index, bool) or not isinstance(index, int):
            continue
        if not 0 <= index < len(item_ids):
            continue
        # Store with type prefix to distinguish posts from comments
        classification_data[f"{item_types[index]}_{item_ids[index]}"] = result['Sentiment']

    # Count what was actually classified instead of assuming every post was
    classified_posts = sum(1 for key in classification_data if key.startswith('post_'))
    classified_comments = len(classification_data) - classified_posts
    print(f"Successfully classified {len(classification_data)} items ({classified_posts} posts, {classified_comments} comments)")

    # 6. Aggregate to user level
    print("Aggregating classifications to user level...")
    user_profiles = aggregate_user_sentiment(posts_data, all_comments, classification_data)

    # 7. Create network data
    print("Creating network data...")
    nodes_df, edges_df = create_network_data(user_profiles)

    # 8. Save to CSV for Gephi
    print("Saving results to CSV files...")
    nodes_df.to_csv('ukraine_funding_nodes.csv', index=False)
    edges_df.to_csv('ukraine_funding_edges.csv', index=False)

    # Also save the raw data for reference
    pd.DataFrame(posts_data).to_csv('ukraine_funding_posts.csv', index=False)
    pd.DataFrame(all_comments).to_csv('ukraine_funding_all_comments.csv', index=False)

    print(f"Analysis complete! Created network with {len(nodes_df)} nodes and {len(edges_df)} edges")
    print("CSV files saved: ukraine_funding_nodes.csv, ukraine_funding_edges.csv")

    return nodes_df, edges_df

# Main execution: run the full pipeline and print a short summary
if __name__ == "__main__":
    nodes, edges = analyze_ukraine_funding_network(posts_per_year=3)

    # Display summary stats
    print("\nNetwork Summary:")
    print(f"Total users (nodes): {len(nodes)}")
    print(f"Total interactions (edges): {len(edges)}")

    # Display sentiment distribution
    print("\nSentiment Distribution:")
    # When nothing was collected the node table may have no 'sentiment' column
    # at all, so check before indexing it.
    if len(nodes) > 0 and 'sentiment' in nodes.columns:
        sentiment_counts = nodes['sentiment'].value_counts()
        for sentiment, count in sentiment_counts.items():
            print(f"{sentiment}: {count} users ({count/len(nodes)*100:.1f}%)")
    else:
        print("No users found - nothing to summarize")

Starting Ukraine funding network analysis...
Searching for posts about Ukraine funding...
Found post: Russia says they have proof of US-funded bio labs in Ukraine doing research on (1) Bat coronavirus and (2) Using migratory birds to spread biological weapons. Russia has called for a UN meeting on Friday to reveal what they have found in Ukraine. Will be interesting… (Date: Thu Mar 10 23:12:01 2022)
Found post: Live feed from UN Security Council meeting: Threats to international peace and security. Russia puts forward its case that the US has been funding/conducting dangerous pathogen research in Ukraine. (Date: Fri Mar 11 16:35:32 2022)
Found post: The film the WEF, NWO, Big Tech, & America doesn’t want you to see. “Ukraine on Fire” EXPOSES the 2014 Ukraine coup funded in part by George Soros and the US government. Much more is exposed. (Date: Sat Mar 12 14:16:26 2022)
Found post: In case you missed: Zelensky cancelled the 2024 elections in Ukraine and said he will remain President un

In [None]:
# NOTE(review): leftover scratch cell - it was never executed (In [None]), it
# reads `texts`, which is only defined in a later cell, and `skip` is not used
# anywhere else in this notebook. Consider deleting it.
if len(texts) < 10:
    skip = True


In [5]:
import pandas as pd
import openai

# Assumes the OpenAI `client` object was created in an earlier cell

# Load your data
df = pd.read_csv("ukraine_funding_all_comments.csv")

# Group all comments by user and combine into a single text block.
# Empty bodies are read back from CSV as NaN (a float), which str.join cannot
# handle, so they are replaced with empty strings first.
grouped = (
    df.assign(body=df["body"].fillna("").astype(str))
    .groupby("author")["body"]
    .apply(lambda x: "\n".join(x))
    .reset_index()
)
authors = grouped["author"].tolist()
texts = grouped["body"].tolist()

# Batch settings
batch_size = 5

# One JSON line per author, in the same order as `authors`
results = []

# The instructions are the same for every batch, so build them once
system_prompt = """
Context: You are analyzing Reddit posts and comments related to Ukraine from conspiracy-oriented communities.
Task: For each text, determine the author's stance toward Ukraine in the conflict:
    "Strongly Pro-Ukraine"
    "Mildly Pro-Ukraine"
    "Mildly Anti-Ukraine"
    "Strongly Anti-Ukraine"
    "Neutral"

Classification Indicators:
    Pro-Ukraine signals:
        Criticizes Russian aggression or Putin
        Supports military/financial aid to Ukraine
        Refers to 'Russian propaganda' or 'Putin's lies'
        Sympathizes with Ukrainian civilians/refugees
        Mentions 'defending democracy' or 'Ukrainian sovereignty'
        Condemns the invasion or occupation
    Anti-Ukraine signals:
        Mentions 'NATO expansion' as cause
        References 'US puppet government' in Ukraine
        Talks about 'bioweapons labs' in Ukraine
        Suggests Ukraine is controlled by 'globalists'
        Refers to 'Western propaganda'
        Questions legitimacy of Ukrainian government
        Portrays Russia as justified
        Criticizes aid to Ukraine

Guidelines:
    Use 'Neutral' only when you cannot detect any stance.
    Return one JSON object per line in the form:
    {"Index": 0, "Sentiment": "Strongly Anti-Ukraine"}
    """

for start_idx in range(0, len(texts), batch_size):
    batch_texts = texts[start_idx:start_idx+batch_size]

    classification_prompt = ""
    for idx, text in enumerate(batch_texts):
        cleaned_text = text.replace('\n', ' ')
        classification_prompt += f"{idx}: {cleaned_text}\n\n"

    # Sentiment reported by the model for each batch-local index
    sentiments_by_index = {}

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Classify these texts regarding support for Ukraine:\n{classification_prompt}"}
            ],
            temperature=0
        )

        # Parse the raw response; each line should be a JSON object
        response_content = (response.choices[0].message.content or "").strip()
        for line in response_content.splitlines():
            try:
                parsed = json.loads(line.strip().strip('`'))
            except json.JSONDecodeError:
                continue  # e.g. markdown fences or commentary
            if not isinstance(parsed, dict):
                continue
            local_index = parsed.get("Index")
            if (isinstance(local_index, int) and not isinstance(local_index, bool)
                    and 0 <= local_index < len(batch_texts)):
                sentiments_by_index[local_index] = parsed.get("Sentiment")

    except Exception as e:
        print("Error in batch:", e)

    # Always emit exactly one line per author of this batch, in order, so that
    # `results` stays aligned with `authors` even when a batch fails or the
    # model skips an item. Missing answers are written with a null sentiment.
    for idx in range(len(batch_texts)):
        results.append(json.dumps({"Index": idx, "Sentiment": sentiments_by_index.get(idx)}))

# 'results' now holds the classification lines. You can parse them further or save them.
for r in results:
    print(r)


{"Index": 0, "Sentiment": "Neutral"}
{"Index": 1, "Sentiment": "Strongly Pro-Ukraine"}
{"Index": 2, "Sentiment": "Neutral"}
{"Index": 3, "Sentiment": "Mildly Anti-Ukraine"}
{"Index": 4, "Sentiment": "Neutral"}
{"Index": 0, "Sentiment": "Neutral"}
{"Index": 1, "Sentiment": "Neutral"}
{"Index": 2, "Sentiment": "Neutral"}
{"Index": 3, "Sentiment": "Neutral"}
{"Index": 4, "Sentiment": "Mildly Anti-Ukraine"}
{"Index": 0, "Sentiment": "Neutral"}
{"Index": 1, "Sentiment": "Neutral"}
{"Index": 2, "Sentiment": "Strongly Pro-Ukraine"}
{"Index": 3, "Sentiment": "Neutral"}
{"Index": 4, "Sentiment": "Neutral"}
{"Index": 0, "Sentiment": "Strongly Pro-Ukraine"}
{"Index": 1, "Sentiment": "Strongly Anti-Ukraine"}
{"Index": 2, "Sentiment": "Neutral"}
{"Index": 3, "Sentiment": "Neutral"}
{"Index": 4, "Sentiment": "Mildly Anti-Ukraine"}
{"Index": 0, "Sentiment": "Neutral"}
{"Index": 1, "Sentiment": "Neutral"}
{"Index": 2, "Sentiment": "Mildly Anti-Ukraine"}
{"Index": 3, "Sentiment": "Neutral"}
{"Index": 4

In [8]:
import json
import pandas as pd

# "authors" is the list of Reddit usernames in the same order you classified
# "results" is the list of strings (some may be valid JSON, some may not)
# Both come from the classification cell and must still be in the kernel.

sentiments_by_author = {}
valid_index = 0  # We'll move through "authors" only when a line is successfully parsed

for line in results:
    line = line.strip()
    if not line:
        continue  # skip any blank lines

    try:
        data = json.loads(line)
    except json.JSONDecodeError:
        # skip any lines that aren't valid JSON
        continue

    if not isinstance(data, dict):
        # e.g. a bare number is valid JSON but not a classification
        continue

    if valid_index >= len(authors):
        # More classification lines than authors: the rest cannot be attributed
        print(f"Warning: ignoring extra classification lines after {len(authors)} authors")
        break

    sentiment = data.get("Sentiment")
    if isinstance(sentiment, str):
        sentiments_by_author[authors[valid_index]] = sentiment
    valid_index += 1

# Load the node data
df_nodes = pd.read_csv("ukraine_funding_nodes.csv")

# Fill the sentiment column by matching user IDs to the sentiment dictionary.
# Users without a classification are labelled "Neutral" as well, so "Neutral"
# here also covers "not classified".
df_nodes["sentiment"] = df_nodes["id"].map(sentiments_by_author).fillna("Neutral")

# Save the updated CSV
df_nodes.to_csv("ukraine_funding_nodes_updated.csv", index=False)
print("Done. Updated CSV: 'ukraine_funding_nodes_updated.csv'")


Done. Updated CSV: 'ukraine_funding_nodes_updated.csv'
