# Reddit Network Analysis

This notebook processes a Reddit dataset to create a network where:
- Nodes are Reddit users
- Edges connect users with similar posting patterns across subreddits (based on cosine similarity)

The dataset used is "reddit_opinion_ru_ua.csv" from Kaggle (downloaded locally).
https://www.kaggle.com/datasets/asaniczka/public-opinion-russia-ukraine-war-updated-daily

In [42]:
# Import all required libraries
import pandas as pd
import numpy as np
import networkx as nx
from itertools import combinations
import os
from sklearn.metrics.pairwise import cosine_similarity

## Data Loading and Preprocessing

In [41]:
def load_and_clean_data(filepath):
    """
    Read the Reddit CSV and keep one row per (author_name, subreddit) pair.

    Progress and resulting shapes are printed along the way. Any failure is
    reported on stdout and then re-raised unchanged.

    Args:
        filepath (str): Path to the CSV data file

    Returns:
        pandas.DataFrame: Rows of the CSV with duplicate user-subreddit
            combinations dropped (the first occurrence of each pair is kept)

    Raises:
        FileNotFoundError: If nothing exists at ``filepath``
    """
    try:
        if os.path.exists(filepath):
            # Read the full dataset into memory
            print(f"Loading data from {filepath}...")
            raw_frame = pd.read_csv(filepath)
            print(f"Original data shape: {raw_frame.shape}")

            # Collapse repeated user-subreddit rows down to a single row each
            pair_columns = ['author_name', 'subreddit']
            deduplicated = raw_frame.drop_duplicates(subset=pair_columns)
            print(f"Data shape after cleaning: {deduplicated.shape}")

            return deduplicated

        # Fail with a clear message instead of letting read_csv complain
        raise FileNotFoundError(f"Data file not found: {filepath}")

    except Exception as err:
        print(f"Error in load_and_clean_data: {str(err)}")
        raise

# Load the CSV (relative path, so it is resolved against the notebook's working directory)
# and keep one row per (author_name, subreddit) pair
df_clean = load_and_clean_data("reddit_opinion_ru_ua.csv")

Loading data from reddit_opinion_ru_ua.csv...
Original data shape: (4928897, 24)
Data shape after cleaning: (736526, 24)


In [None]:
def filter_by_date(df, start_date=None, end_date=None):
    """
    Filter dataframe to include only posts within a specific date range.

    The caller's DataFrame is never modified: when 'post_created_time' is not
    already a datetime column, the conversion is done on a new frame.

    Args:
        df (pandas.DataFrame): DataFrame with 'post_created_time' column
        start_date (str, datetime, optional): Keep posts on or after this date
        end_date (str, datetime, optional): Keep posts on or before this date

    Returns:
        pandas.DataFrame: Filtered dataframe whose 'post_created_time' column
            is datetime-typed
    """
    # Convert on a new frame (assign) rather than writing into the caller's
    # object; writing in place silently changed the input and can trigger
    # SettingWithCopyWarning when the input is itself a slice.
    if not pd.api.types.is_datetime64_any_dtype(df['post_created_time']):
        df = df.assign(post_created_time=pd.to_datetime(df['post_created_time']))

    original_count = len(df)

    # Apply date filters (both bounds are inclusive)
    if start_date is not None:
        if isinstance(start_date, str):
            start_date = pd.to_datetime(start_date)
        df = df[df['post_created_time'] >= start_date]

    if end_date is not None:
        if isinstance(end_date, str):
            end_date = pd.to_datetime(end_date)
        df = df[df['post_created_time'] <= end_date]

    # Report on filtering
    print(f"Date filtering:")
    if start_date is not None:
        print(f"  Start date: {start_date}")
    if end_date is not None:
        print(f"  End date: {end_date}")
    print(f"  Original records: {original_count}")
    # Guard the percentage so an empty input reports 0.0% instead of raising
    retained_pct = len(df) / original_count * 100 if original_count else 0.0
    print(f"  Filtered records: {len(df)} ({retained_pct:.1f}% retained)")

    return df
from datetime import timedelta

# Anchor the window on the newest post in the cleaned data so this cell runs on a
# fresh kernel instead of relying on a `max_date` left over from an earlier session.
# NOTE(review): the original definition of `max_date` is not in this notebook; it is
# assumed here to be the latest 'post_created_time' — confirm.
max_date = pd.to_datetime(df_clean['post_created_time']).max()

# Keep only the most recent day of posts (increase `days`, e.g. to 30, for a longer window)
cutoff_date = max_date - timedelta(days=1)
print(f"Using posts on or after: {cutoff_date}")

# Filter the dataframe
df_recent = filter_by_date(df_clean, start_date=cutoff_date)

Using posts on or after: 2025-04-03 11:26:19
Date filtering:
  Start date: 2025-04-03 11:26:19
  Original records: 736526
  Filtered records: 3687 (0.5% retained)


In [49]:
# Count the distinct authors left after the date filter.
# NOTE(review): the saved output of this cell (120635) is larger than the 3687 rows the
# filter reported and the 3565 users create_user_vectors reports for df_recent — the
# output looks stale; re-run this cell after filtering.
total_users = df_recent['author_name'].nunique()
print(f"Total unique users in the filtered dataset: {total_users}")

Total unique users in the filtered dataset: 120635


## Create User-Subreddit Vectors

In [66]:
def create_user_vectors(df):
    """
    Generate vectors for each user where each element represents the
    fraction of their rows in a particular subreddit.

    Element ``i`` of every vector corresponds to ``all_subreddits[i]``; entries
    for subreddits a user has no rows in are 0. An empty input yields an empty
    dictionary and an empty subreddit list.

    Args:
        df (pandas.DataFrame): DataFrame with 'author_name' and 'subreddit' columns

    Returns:
        dict: Dictionary mapping users to their subreddit distribution vectors
        list: List of all subreddits (to know which index corresponds to which subreddit)
    """
    try:
        # Count rows per user-subreddit combination
        user_subreddit_counts = df.groupby(['author_name', 'subreddit']).size().reset_index(name='count')

        # Get total rows per user
        user_total_posts = user_subreddit_counts.groupby('author_name')['count'].sum()

        # Get list of all subreddits (for vector indices)
        all_subreddits = sorted(df['subreddit'].unique())
        subreddit_to_idx = {sr: i for i, sr in enumerate(all_subreddits)}
        n_subreddits = len(all_subreddits)

        print(f"Creating vectors for {len(user_total_posts)} users across {n_subreddits} subreddits")

        # Start every user at all zeros, in the same (sorted) order as the totals index
        user_vectors = {user: np.zeros(n_subreddits) for user in user_total_posts.index}

        # Fill in the fractions with a single pass over the counts table.
        # Re-filtering the whole table once per user made this quadratic.
        totals = user_total_posts.to_dict()
        count_rows = zip(
            user_subreddit_counts['author_name'],
            user_subreddit_counts['subreddit'],
            user_subreddit_counts['count'],
        )
        for user, subreddit, count in count_rows:
            user_vectors[user][subreddit_to_idx[subreddit]] = count / totals[user]

        # Print some stats; every vector has one slot per subreddit, so the
        # dimension is known even when there are no users to sample from
        print(f"Created vectors for {len(user_vectors)} users")
        print(f"Example vector dimensions: {n_subreddits}")

        return user_vectors, all_subreddits

    except Exception as e:
        print(f"Error in create_user_vectors: {str(e)}")
        raise

# Build one subreddit-distribution vector per user from the date-filtered posts
user_vectors, all_subreddits = create_user_vectors(df_recent)  # pass df_clean instead to skip the date filter

Creating vectors for 3565 users across 25 subreddits
Created vectors for 3565 users
Example vector dimensions: 25


## Generate Network Based on Cosine Similarity

In [None]:
def generate_similarity_network(user_vectors, threshold=0.1, output_file='network_similarity.txt'):
    """
    Generate a network where edges connect users with cosine similarity above threshold.

    Every unordered pair of users is compared once. Each qualifying pair is
    written to ``output_file`` as one line, "<user1> <user2>". Users whose
    vector is all zeros never get an edge, and users with no qualifying pair
    do not appear in the file at all.

    Args:
        user_vectors (dict): Dictionary mapping users to their subreddit distribution vectors
        threshold (float): Minimum cosine similarity to create an edge (0.0-1.0)
        output_file (str): Path to output edge list file (overwritten if it exists)

    Returns:
        str: Path to created edge list file
    """
    try:
        users = list(user_vectors.keys())
        total_users = len(users)
        total_edges = 0

        # Per-user quantities are computed once here rather than once per pair.
        # The values are the same ones the pairwise formula would use.
        vectors = [user_vectors[user] for user in users]
        norms = [np.linalg.norm(vec) for vec in vectors]
        is_zero = [bool(np.all(vec == 0)) for vec in vectors]

        print(f"Generating similarity network with {total_users} users (threshold: {threshold})")

        with open(output_file, 'w') as f:
            # Generate all user pairs and check similarity
            for i, user1 in enumerate(users):
                # Print progress
                if (i+1) % max(1, total_users // 10) == 0 or i+1 == total_users:
                    print(f"Progress: {i+1}/{total_users} users processed ({(i+1)/total_users*100:.1f}%)")

                # An all-zero vector has no defined cosine similarity, so this
                # user cannot form an edge with anyone
                if is_zero[i]:
                    continue

                vec1 = vectors[i]
                norm1 = norms[i]

                for j in range(i+1, total_users):
                    # Skip if either vector is zero (to avoid division by zero)
                    if is_zero[j]:
                        continue

                    # Calculate cosine similarity
                    similarity = np.dot(vec1, vectors[j]) / (norm1 * norms[j])

                    # Create edge if similarity is above threshold
                    if similarity >= threshold:
                        f.write(f"{user1} {users[j]}\n")
                        total_edges += 1

        print(f"Network created with {total_edges} edges and written to '{output_file}'")
        return output_file

    except Exception as e:
        print(f"Error in generate_similarity_network: {str(e)}")
        raise

# Write the edge list: one line per user pair whose cosine similarity is >= 0.5
# (output_file is left at its default, 'network_similarity.txt')
edge_list_file = generate_similarity_network(user_vectors, threshold=0.5)

Generating similarity network with 3565 users (threshold: 0.5)
Progress: 356/3565 users processed (10.0%)
Progress: 712/3565 users processed (20.0%)
Progress: 1068/3565 users processed (30.0%)
Progress: 1424/3565 users processed (39.9%)
Progress: 1780/3565 users processed (49.9%)
Progress: 2136/3565 users processed (59.9%)
Progress: 2492/3565 users processed (69.9%)
Progress: 2848/3565 users processed (79.9%)
Progress: 3204/3565 users processed (89.9%)
Progress: 3560/3565 users processed (99.9%)
Progress: 3565/3565 users processed (100.0%)
Network created with 850163 edges and written to 'network_similarity.txt'


Run note: with threshold 0.9 on a 1-day window of posts, generation took about 2 minutes and produced 755,740 edges.

## Load Network

In [55]:
def load_network(edge_list_file):
    """
    Build a network graph from an edge list file on disk.

    The node and edge counts of the loaded graph are printed. Any failure is
    reported on stdout and then re-raised unchanged.

    Args:
        edge_list_file (str): Path to the edge list file

    Returns:
        networkx.Graph: The loaded network graph

    Raises:
        FileNotFoundError: If the file does not exist or has zero size
    """
    try:
        # The size is only queried once the file is known to exist
        has_content = os.path.exists(edge_list_file) and os.path.getsize(edge_list_file) != 0
        if not has_content:
            raise FileNotFoundError(f"Edge list file not found or empty: {edge_list_file}")

        # Parse the edge list into a graph
        graph = nx.read_edgelist(edge_list_file)
        print(f"Network loaded from {edge_list_file}")

        node_count = graph.number_of_nodes()
        print(f"  Number of nodes: {node_count}")
        edge_count = graph.number_of_edges()
        print(f"  Number of edges: {edge_count}")

        return graph

    except Exception as err:
        print(f"Error in load_network: {str(err)}")
        raise

# Read the edge list written by generate_similarity_network back in as a graph.
# Only users that appear in at least one edge are in the file, so the node count
# can be lower than the number of user vectors.
G = load_network(edge_list_file)

Network loaded from network_similarity.txt
  Number of nodes: 3543
  Number of edges: 755740


## Analyze Network

In [56]:
def analyze_network(G):
    """
    Print connected-component statistics for a network graph.

    Reports the number of connected components and, when there is at least
    one, the size of the largest component and its share of all nodes. Any
    failure is reported on stdout and then re-raised unchanged.

    Args:
        G (networkx.Graph): The network graph to analyze

    Returns:
        networkx.Graph: The same graph object that was passed in
    """
    try:
        print(f"\nNetwork Analysis:")

        # How many disconnected pieces the graph falls into
        num_components = nx.number_connected_components(G)
        print(f"  Number of connected components: {num_components}")

        # Nothing more to report when there are no components
        if not num_components > 0:
            return G

        # Only the size of the biggest component is needed, not its members
        largest_cc_size = max(len(component) for component in nx.connected_components(G))
        print(f"  Size of largest connected component: {largest_cc_size} nodes")

        node_share = largest_cc_size / G.number_of_nodes() * 100
        print(f"  Percentage of nodes in largest component: {node_share:.2f}%")

        return G

    except Exception as err:
        print(f"Error in analyze_network: {str(err)}")
        raise

# Print connected-component statistics; analyze_network returns the same graph it was given
G = analyze_network(G)


Network Analysis:
  Number of connected components: 42
  Size of largest connected component: 666 nodes
  Percentage of nodes in largest component: 18.80%


In [70]:
# Network density as computed by nx.density, printed to four decimal places
density = nx.density(G)
print(f"Network density: {density:.4f}")

Network density: 0.1204
