# Reddit Network Analysis

This notebook processes a Reddit dataset to create a network, built from a 5% sample of each subreddit's users, where:
- Nodes are Reddit users
- Edges connect users who have posted to the same subreddit

The dataset used is "reddit_opinion_ru_ua.csv", downloaded locally from Kaggle:
https://www.kaggle.com/datasets/asaniczka/public-opinion-russia-ukraine-war-updated-daily

In [8]:
# Import all required libraries
import pandas as pd
import numpy as np
import networkx as nx
from itertools import combinations
import os

## Data Loading and Preprocessing

In [9]:
def load_and_clean_data(filepath):
    """
    Load Reddit data and reduce it to one row per user-subreddit combination.

    Rows whose ``author_name`` or ``subreddit`` is missing are discarded before
    de-duplication. Such rows cannot be attributed to a user node, and a missing
    author would otherwise travel through the pipeline as NaN and end up as a
    literal "nan" user once the values are formatted as text.

    Args:
        filepath (str): Path to the CSV data file

    Returns:
        pandas.DataFrame: Dataframe with every original column, no missing
            author/subreddit values, and unique user-subreddit combinations
            (first occurrence kept, original row index preserved)

    Raises:
        FileNotFoundError: If ``filepath`` does not exist
        KeyError: If the file has no ``author_name`` or ``subreddit`` column
    """
    key_columns = ['author_name', 'subreddit']

    try:
        # Check if file exists
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Data file not found: {filepath}")

        # Load the dataset
        # NOTE(review): pandas' default NA parsing turns text such as "NA" or
        # "null" into NaN, so a user literally named that way is treated as
        # missing below -- confirm whether that matters for this dataset.
        print(f"Loading data from {filepath}...")
        df = pd.read_csv(filepath)
        print(f"Original data shape: {df.shape}")

        # Discard rows that cannot be tied to both a user and a subreddit
        df_valid = df.dropna(subset=key_columns)
        n_missing = len(df) - len(df_valid)
        if n_missing:
            print(f"Dropped {n_missing} rows with missing author_name or subreddit")

        # Remove duplicate user-subreddit combinations
        df_unique = df_valid.drop_duplicates(subset=key_columns)
        print(f"Data shape after cleaning: {df_unique.shape}")

        return df_unique

    except Exception as e:
        # Report the failure in the notebook output, then let it propagate
        print(f"Error in load_and_clean_data: {str(e)}")
        raise

# Read the locally downloaded Kaggle CSV and keep one row per user-subreddit pair
df_clean = load_and_clean_data(filepath="reddit_opinion_ru_ua.csv")

Loading data from reddit_opinion_ru_ua.csv...
Original data shape: (4928897, 24)
Data shape after cleaning: (736526, 24)


## Sample Users by Subreddit

In [10]:
def sample_users_by_subreddit(df, sample_rate=0.05, random_seed=42):
    """
    Group users by subreddit and sample a fraction of the users of each subreddit.

    Every subreddit contributes at least one user and at most all of its users,
    so a ``sample_rate`` above 1.0 returns the complete user list instead of
    failing inside NumPy. Sampling is without replacement.

    Randomness comes from a private ``numpy.random.RandomState`` seeded with
    ``random_seed``. It yields the same draws as calling
    ``np.random.seed(random_seed)`` followed by ``np.random.choice``, but leaves
    NumPy's global random state untouched, so other cells are not affected by
    running this function.

    Args:
        df (pandas.DataFrame): Cleaned dataframe with unique user-subreddit
            combinations; must contain ``subreddit`` and ``author_name`` columns
        sample_rate (float): Fraction of users to sample from each subreddit (0.0-1.0)
        random_seed (int): Random seed for reproducibility

    Returns:
        pandas.Series: Series indexed by subreddit whose values are lists of
            the sampled users for that subreddit
    """
    try:
        # Dedicated generator: reproducible without reseeding the global RNG
        rng = np.random.RandomState(random_seed)

        # Group users by subreddit
        subreddit_users = df.groupby('subreddit')['author_name'].apply(list)

        def _sample(users):
            # At least one user, never more than the subreddit actually has
            size = min(len(users), max(1, int(len(users) * sample_rate)))
            return rng.choice(users, size=size, replace=False).tolist()

        # Sample users from each subreddit (subreddits are visited in index order)
        subreddit_users_sampled = subreddit_users.apply(_sample)

        print(f"Sampled {sample_rate*100}% of users from each subreddit")
        print("Example of sampled subreddits:")
        # head() already limits the preview to the first five subreddits
        for subreddit, users in subreddit_users_sampled.head().items():
            print(f"  {subreddit}: {len(users)} users")

        return subreddit_users_sampled

    except Exception as e:
        # Report the failure in the notebook output, then let it propagate
        print(f"Error in sample_users_by_subreddit: {str(e)}")
        raise

# Sample users by subreddit
sampled_users = sample_users_by_subreddit(df_clean, sample_rate=0.05)
# Sum of the per-subreddit sample sizes: a user sampled in several subreddits
# is counted once per subreddit, so this is an upper bound on distinct users
print("Total nodes in the sampled users:", sum(len(users) for users in sampled_users))
# Distinct users across all subreddit samples
print("Unique users in the sample:", len({user for users in sampled_users for user in users}))

Sampled 5.0% of users from each subreddit
Example of sampled subreddits:
  ANormalDayInRussia: 506 users
  ArtForUkraine: 16 users
  AskARussian: 1047 users
  AskReddit: 200 users
  CombatFootage: 2727 users
Total nodes in the sampled users: 36805


## Generate Network Edge List

In [11]:
def generate_network(sampled_users, output_file='network.txt'):
    """
    Generate a network edge list where edges connect users who have posted in the same subreddit.

    For every subreddit, each unordered pair of its sampled users is written as
    one line ``"<user1> <user2>"``. Pairs are streamed straight to the file
    rather than collected in a list first, so memory use does not grow with the
    number of pairs in a subreddit. The file is written as UTF-8.

    A pair of users that share several subreddits is written once per shared
    subreddit; the reported edge count is the number of lines written, not the
    number of distinct pairs.

    Args:
        sampled_users (pandas.Series): Series of lists containing sampled users for each subreddit
        output_file (str): Path to the output edge list file (overwritten if it exists)

    Returns:
        str: Path to the created edge list file
    """
    try:
        total_edges = 0
        total_subreddits = len(sampled_users)
        # Report roughly every tenth of the subreddits (every one if fewer than ten)
        progress_step = max(1, total_subreddits // 10)

        with open(output_file, 'w', encoding='utf-8') as f:
            for processed, (subreddit, users) in enumerate(sampled_users.items(), start=1):
                # Stream all user-user pairs (edges) for this subreddit to the file
                # NOTE(review): names are joined with a single space, so a name
                # containing whitespace would not survive a whitespace-delimited reader
                for u1, u2 in combinations(users, 2):
                    f.write(f"{u1} {u2}\n")
                    total_edges += 1

                if processed % progress_step == 0 or processed == total_subreddits:
                    print(f"Progress: {processed}/{total_subreddits} subreddits processed ({processed/total_subreddits*100:.1f}%)")

        print(f"Network edge list created with {total_edges} edges and written to '{output_file}'")
        return output_file

    except Exception as e:
        # Report the failure in the notebook output, then let it propagate
        print(f"Error in generate_network: {str(e)}")
        raise

# Write every within-subreddit user pair to the default edge list file
edge_list_file = generate_network(sampled_users, output_file='network.txt')

Progress: 4/48 subreddits processed (8.3%)
Progress: 8/48 subreddits processed (16.7%)
Progress: 12/48 subreddits processed (25.0%)
Progress: 16/48 subreddits processed (33.3%)
Progress: 20/48 subreddits processed (41.7%)
Progress: 24/48 subreddits processed (50.0%)
Progress: 28/48 subreddits processed (58.3%)
Progress: 32/48 subreddits processed (66.7%)
Progress: 36/48 subreddits processed (75.0%)
Progress: 40/48 subreddits processed (83.3%)
Progress: 44/48 subreddits processed (91.7%)
Progress: 48/48 subreddits processed (100.0%)
Network edge list created with 64766831 edges and written to 'network.txt'


## Load Network

In [12]:
def load_network(edge_list_file):
    """
    Read an edge list file into a graph and print its node and edge counts.

    Args:
        edge_list_file (str): Path to the edge list file

    Returns:
        networkx.Graph: Graph built from the edge list

    Raises:
        FileNotFoundError: If the file does not exist or has a size of zero bytes
    """
    try:
        # The size is only queried when the path exists
        has_content = os.path.exists(edge_list_file) and os.path.getsize(edge_list_file) != 0
        if not has_content:
            raise FileNotFoundError(f"Edge list file not found or empty: {edge_list_file}")

        graph = nx.read_edgelist(edge_list_file)

        # Summarise what was loaded
        summary = (
            f"Network loaded from {edge_list_file}",
            f"  Number of nodes: {graph.number_of_nodes()}",
            f"  Number of edges: {graph.number_of_edges()}",
        )
        for line in summary:
            print(line)

        return graph

    except Exception as err:
        # Surface the problem in the notebook output before re-raising it
        print(f"Error in load_network: {str(err)}")
        raise

# Build the graph from the edge list written in the previous step
G = load_network(edge_list_file=edge_list_file)

Network loaded from network.txt
  Number of nodes: 35712
  Number of edges: 64757354


## Analyze Network

In [13]:
def analyze_network(G):
    """
    Perform basic connected-component analysis on a network graph.

    Prints the number of connected components and, when the graph has at least
    one component, the size of the largest one and the share of all nodes it
    contains. The components are enumerated in a single pass and only their
    sizes are kept, so the graph is traversed once rather than once for the
    count and again for the largest component.

    Args:
        G (networkx.Graph): The network graph to analyze

    Returns:
        networkx.Graph: The same graph object that was passed in
    """
    try:
        print(f"\nNetwork Analysis:")

        # Connected components analysis: one traversal, keeping sizes only
        component_sizes = [len(component) for component in nx.connected_components(G)]
        num_components = len(component_sizes)
        print(f"  Number of connected components: {num_components}")

        # A graph with no components has no nodes; skipping avoids dividing by zero
        if num_components > 0:
            largest_cc_size = max(component_sizes)
            print(f"  Size of largest connected component: {largest_cc_size} nodes")
            print(f"  Percentage of nodes in largest component: {largest_cc_size/G.number_of_nodes()*100:.2f}%")

        return G

    except Exception as e:
        # Report the failure in the notebook output, then let it propagate
        print(f"Error in analyze_network: {str(e)}")
        raise

# Print the connected-component summary; the function hands back the graph it was given
G = analyze_network(G=G)


Network Analysis:
  Number of connected components: 6
  Size of largest connected component: 35609 nodes
  Percentage of nodes in largest component: 99.71%
