# Reddit Network Analysis

This notebook processes a Reddit dataset to create a network where:
- Nodes are Reddit users
- Edges connect users with similar posting patterns across subreddits (based on cosine similarity)

The dataset used is "reddit_opinion_ru_ua.csv" from Kaggle (downloaded locally).
https://www.kaggle.com/datasets/asaniczka/public-opinion-russia-ukraine-war-updated-daily

In [2]:
# Import all required libraries
import pandas as pd
import numpy as np
import networkx as nx
from itertools import combinations
import os
from sklearn.metrics.pairwise import cosine_similarity

## Data Loading and Preprocessing

In [3]:
def load_and_clean_data(filepath):
    """
    Read the Reddit CSV export and keep one row per (author_name, subreddit) pair.

    Progress and the before/after shapes are printed. Any failure is reported
    on stdout and then re-raised unchanged.

    Args:
        filepath (str): Path to the CSV data file

    Returns:
        pandas.DataFrame: Rows of the file with duplicate user-subreddit
        combinations removed (the first occurrence of each pair is kept)

    Raises:
        FileNotFoundError: If ``filepath`` does not exist
    """
    dedup_keys = ['author_name', 'subreddit']

    try:
        # Fail with an explicit message before handing the path to pandas
        file_is_missing = not os.path.exists(filepath)
        if file_is_missing:
            raise FileNotFoundError(f"Data file not found: {filepath}")

        print(f"Loading data from {filepath}...")
        raw_frame = pd.read_csv(filepath)
        print(f"Original data shape: {raw_frame.shape}")

        # One row per user-subreddit combination
        deduplicated = raw_frame.drop_duplicates(subset=dedup_keys)
        print(f"Data shape after cleaning: {deduplicated.shape}")
    except Exception as exc:
        print(f"Error in load_and_clean_data: {exc}")
        raise
    else:
        return deduplicated

# Load the Kaggle export and keep one row per (author_name, subreddit) pair.
# The path is relative, so the CSV must be in the notebook's working directory.
df_clean = load_and_clean_data("reddit_opinion_ru_ua.csv")

Loading data from reddit_opinion_ru_ua.csv...
Original data shape: (5168018, 24)
Data shape after cleaning: (766897, 24)


In [4]:
def analyze_post_dates(df):
    """
    Find the earliest and latest post timestamps in a dataframe.

    The 'post_created_time' column is parsed on a temporary Series, so the
    caller's dataframe is left unmodified.

    Args:
        df (pandas.DataFrame): DataFrame with a 'post_created_time' column

    Returns:
        tuple: (min_date, max_date) as pandas Timestamps (NaT for empty input)
    """
    # Parse without assigning back: writing into `df` would silently change the
    # caller's frame and can trigger SettingWithCopyWarning on sliced frames
    post_times = pd.to_datetime(df['post_created_time'])

    return post_times.min(), post_times.max()


In [5]:
def filter_by_date(df, start_date=None, end_date=None):
    """
    Filter dataframe to include only posts within a specific date range.

    The caller's dataframe is never modified: if 'post_created_time' is not
    already a datetime column, it is parsed into a new frame first. A short
    report of the filtering is printed.

    Args:
        df (pandas.DataFrame): DataFrame with 'post_created_time' column
        start_date (str, datetime, optional): Keep posts on or after this date
        end_date (str, datetime, optional): Keep posts on or before this date

    Returns:
        pandas.DataFrame: Filtered dataframe whose 'post_created_time' column
        is datetime-typed
    """
    # `is_datetime64_any_dtype` also recognises timezone-aware columns, which
    # the narrower `is_datetime64_dtype` check reports as non-datetime
    if not pd.api.types.is_datetime64_any_dtype(df['post_created_time']):
        # assign() returns a new frame, leaving the caller's column untouched
        df = df.assign(post_created_time=pd.to_datetime(df['post_created_time']))

    original_count = len(df)

    # Apply date filters (both bounds are inclusive)
    if start_date is not None:
        if isinstance(start_date, str):
            start_date = pd.to_datetime(start_date)
        df = df[df['post_created_time'] >= start_date]

    if end_date is not None:
        if isinstance(end_date, str):
            end_date = pd.to_datetime(end_date)
        df = df[df['post_created_time'] <= end_date]

    # An empty input has nothing to retain; avoid dividing by zero
    retained_pct = len(df) / original_count * 100 if original_count else 0.0

    # Report on filtering
    print("Date filtering:")
    if start_date is not None:
        print(f"  Start date: {start_date}")
    if end_date is not None:
        print(f"  End date: {end_date}")
    print(f"  Original records: {original_count}")
    print(f"  Filtered records: {len(df)} ({retained_pct:.1f}% retained)")

    return df
from datetime import timedelta

# Length of the "recent" window, in days, counted back from the newest post
RECENT_WINDOW_DAYS = 10

# Get the minimum and maximum post dates from the dataset
min_date, max_date = analyze_post_dates(df_clean)
# Date threshold: keep only the most recent RECENT_WINDOW_DAYS days
cutoff_date = max_date - timedelta(days=RECENT_WINDOW_DAYS)

# Filter the dataframe
df_recent = filter_by_date(df_clean, start_date=cutoff_date)

Date filtering:
  Start date: 2025-04-19 11:00:47
  Original records: 766897
  Filtered records: 41732 (5.4% retained)


## Create User-Subreddit Vectors

In [7]:
def create_user_vectors(df):
    """
    Generate vectors for each user where each element represents the
    fraction of their posts in a particular subreddit.

    Rows with a missing 'author_name' or 'subreddit' are ignored. An empty
    dataframe yields an empty dictionary and an empty subreddit list.

    Args:
        df (pandas.DataFrame): DataFrame with 'author_name' and 'subreddit' columns

    Returns:
        dict: Dictionary mapping users to their subreddit distribution vectors
            (float arrays of length ``len(all_subreddits)``; each sums to 1)
        list: List of all subreddits (to know which index corresponds to which subreddit)
    """
    try:
        # Count posts per user-subreddit combination (sorted by user, then subreddit)
        user_subreddit_counts = df.groupby(['author_name', 'subreddit']).size().reset_index(name='count')

        # Get list of all subreddits (for vector indices). Missing labels are
        # dropped: groupby ignores them anyway, and sorting NaN together with
        # strings raises TypeError.
        all_subreddits = sorted(df['subreddit'].dropna().unique())

        # Row index per user, in order of first appearance in the grouped counts
        user_codes, users = pd.factorize(user_subreddit_counts['author_name'])
        # Column index per subreddit
        subreddit_codes = pd.Index(all_subreddits).get_indexer(user_subreddit_counts['subreddit'])

        print(f"Creating vectors for {len(users)} users across {len(all_subreddits)} subreddits")

        # Scatter all counts into one (users x subreddits) matrix in a single
        # step instead of looping over the rows in Python
        count_matrix = np.zeros((len(users), len(all_subreddits)))
        count_matrix[user_codes, subreddit_codes] = user_subreddit_counts['count'].to_numpy()

        # Turn counts into per-user fractions (every user has at least one post)
        fraction_matrix = count_matrix / count_matrix.sum(axis=1, keepdims=True)

        user_vectors = dict(zip(users, fraction_matrix))

        # Print some stats
        print(f"Created vectors for {len(user_vectors)} users")
        # Every vector has one slot per subreddit; this also works with no users
        print(f"Example vector dimensions: {len(all_subreddits)}")

        return user_vectors, all_subreddits

    except Exception as e:
        print(f"Error in create_user_vectors: {str(e)}")
        raise

# Create user vectors from the date-filtered frame.
# NOTE(review): df_recent derives from df_clean, which load_and_clean_data
# de-duplicated on (author_name, subreddit). Every per-pair count is therefore
# 1, so each vector is uniform over the subreddits the user appears in rather
# than a true post-frequency distribution. Confirm this is intended.
user_vectors, all_subreddits = create_user_vectors(df_recent)  # or df_clean if not sampling

Creating vectors for 35649 users across 34 subreddits
Created vectors for 35649 users
Example vector dimensions: 34


## Generate Network Based on Cosine Similarity

In [14]:
def generate_similarity_network_vectorized(user_vectors, threshold=0.1, output_file='network_similarity_vec.txt'):
    """
    Generate a network where edges connect users whose cosine similarity is
    at or above ``threshold``, and write it to ``output_file`` as an edge list.

    Summary statistics of the full similarity matrix (self-similarities
    included) are printed. Users without any qualifying edge do not appear in
    the output file.

    Args:
        user_vectors (dict): Maps each user to a 1-D numeric vector; all
            vectors must have the same length
        threshold (float): Minimum cosine similarity (inclusive) for an edge
        output_file (str): Path of the edge list to write, one
            "user1 user2" pair per line

    Returns:
        str: ``output_file``
    """
    # Convert dict to ordered lists for consistent indexing
    users = list(user_vectors.keys())
    similarity_matrix = None

    print(f"Computing similarities for {len(users)} users...")

    if users:
        vectors = np.array([user_vectors[user] for user in users])

        # Normalize vectors so that the dot product below is the cosine similarity
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Avoid division by zero: all-zero vectors stay all-zero
        norms[norms == 0] = 1.0
        normalized_vectors = vectors / norms

        # Calculate all pairwise similarities at once (N x N, held in memory)
        similarity_matrix = np.dot(normalized_vectors, normalized_vectors.T)

        print("Similarity matrix statistics:")
        print(f"  Minimum: {np.min(similarity_matrix):.6f}")
        print(f"  Maximum: {np.max(similarity_matrix):.6f}")
        print(f"  Mean: {np.mean(similarity_matrix):.6f}")
        print(f"  Median: {np.median(similarity_matrix):.6f}")

    # Stream edges to disk one matrix row at a time. Only columns right of the
    # diagonal are scanned (upper triangle), so each pair is written once and
    # no index arrays or edge list covering all pairs are held in memory.
    edge_count = 0
    # UTF-8 is the default encoding networkx.read_edgelist reads with
    with open(output_file, 'w', encoding='utf-8') as f:
        for i in range(len(users) - 1):
            neighbours = np.flatnonzero(similarity_matrix[i, i + 1:] >= threshold) + i + 1
            if neighbours.size:
                source = users[i]
                f.writelines(f"{source} {users[j]}\n" for j in neighbours)
                edge_count += neighbours.size

    print(f"Network created with {edge_count} edges and written to '{output_file}'")
    return output_file
# Build the edge list: connect every pair of users whose cosine similarity is
# at least 0.8 and write the pairs to the function's default output file
edge_list_file_vec = generate_similarity_network_vectorized(user_vectors, threshold=0.8)

Computing similarities for 35649 users...
Similarity matrix statistics:
  Minimum: 0.000000
  Maximum: 1.000000
  Mean: 0.132662
  Median: 0.000000
Network created with 58059686 edges and written to 'network_similarity_vec.txt'


## Load Network

In [15]:
def load_network(edge_list_file):
    """
    Build a networkx graph from an edge list file on disk.

    The node and edge counts of the loaded graph are printed. Any failure is
    reported on stdout and then re-raised unchanged.

    Args:
        edge_list_file (str): Path to the edge list file

    Returns:
        networkx.Graph: The loaded network graph

    Raises:
        FileNotFoundError: If the file does not exist or has zero size
    """
    try:
        # A missing file and a zero-byte file are reported the same way
        has_content = os.path.exists(edge_list_file) and os.path.getsize(edge_list_file) != 0
        if not has_content:
            raise FileNotFoundError(f"Edge list file not found or empty: {edge_list_file}")

        graph = nx.read_edgelist(edge_list_file)

        print(f"Network loaded from {edge_list_file}")
        print(f"  Number of nodes: {graph.number_of_nodes()}")
        print(f"  Number of edges: {graph.number_of_edges()}")
    except Exception as exc:
        print(f"Error in load_network: {exc}")
        raise
    else:
        return graph

# Load the network from the edge list written by the similarity step.
# The file contains only edges, so users without any edge are not in G
# (the saved outputs show 35596 nodes for 35649 user vectors).
G = load_network(edge_list_file_vec)

Network loaded from network_similarity_vec.txt
  Number of nodes: 35596
  Number of edges: 58059686


## Analyze Network

In [12]:
def analyze_network(G):
    """
    Perform basic analysis on a network graph.

    Prints the number of connected components and, when the graph is not
    empty, the size of the largest component and the share of nodes in it.

    Args:
        G (networkx.Graph): The network graph to analyze

    Returns:
        networkx.Graph: The same graph object that was passed in
    """
    try:
        print("\nNetwork Analysis:")

        # Connected components analysis: walk the components once, tracking
        # both the count and the largest size, rather than traversing the
        # graph separately for each figure
        num_components = 0
        largest_cc_size = 0
        for component in nx.connected_components(G):
            num_components += 1
            largest_cc_size = max(largest_cc_size, len(component))

        print(f"  Number of connected components: {num_components}")

        # An empty graph has no components, and no node count to divide by
        if num_components > 0:
            print(f"  Size of largest connected component: {largest_cc_size} nodes")
            print(f"  Percentage of nodes in largest component: {largest_cc_size/G.number_of_nodes()*100:.2f}%")

        return G

    except Exception as e:
        print(f"Error in analyze_network: {str(e)}")
        raise

# Analyze the network. analyze_network returns the graph it was given, so the
# reassignment leaves G unchanged.
# NOTE(review): this cell's execution count (In [12]) is lower than the cells
# that generate and load the network (In [14], In [15]), and its saved output
# reports a 35644-node component while the load cell reports 35596 nodes. The
# output appears to be from an earlier run; re-run with Restart & Run All.
G = analyze_network(G)


Network Analysis:
  Number of connected components: 2
  Size of largest connected component: 35644 nodes
  Percentage of nodes in largest component: 99.99%


In [13]:
# Network density: the share of possible user pairs that are actually connected.
# NOTE(review): the saved value below (0.1473) does not match the 35596 nodes
# and 58059686 edges printed by the load cell, which would give roughly 0.09.
# The output looks stale (In [13] ran before In [14]/[15]); re-run to confirm.
density = nx.density(G)
print(f"Network density: {density:.4f}")

Network density: 0.1473
