# Reddit Network Analysis

This notebook processes a Reddit dataset to create a network where:
- Nodes are Reddit users
- Edges connect users with similar posting patterns across subreddits (based on cosine similarity)

The dataset used is "reddit_opinion_ru_ua.csv" from Kaggle (downloaded locally).
https://www.kaggle.com/datasets/asaniczka/public-opinion-russia-ukraine-war-updated-daily

In [49]:
# Import all required libraries
import pandas as pd
import numpy as np
import os

## Data Loading and Preprocessing

In [50]:
def load_and_clean_data(filepath):
    """
    Load Reddit data and remove duplicate user-subreddit combinations.
    
    Args:
        filepath (str): Path to the CSV data file
        
    Returns:
        pandas.DataFrame: Cleaned dataframe with unique user-subreddit combinations
    """
    try:
        # Check if file exists
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Data file not found: {filepath}")
            
        # Load the dataset
        print(f"Loading data from {filepath}...")
        df = pd.read_csv(filepath)
        print(f"Original data shape: {df.shape}")
        
        # Remove exact duplicates
        df_unique = df.drop_duplicates().copy()
        print(f"Data shape after removing exact duplicates: {df_unique.shape}")
        
        return df_unique
    
    except Exception as e:
        print(f"Error in load_and_clean_data: {str(e)}")
        raise

# Load and clean the dataset
df_clean = load_and_clean_data("reddit_opinion_ru_ua.csv")

Loading data from reddit_opinion_ru_ua.csv...
Original data shape: (5168018, 24)
Data shape after removing exact duplicates: (5168018, 24)


In [51]:
def analyze_post_dates(df):
    # Convert post_created_time to datetime
    df['post_created_time'] = pd.to_datetime(df['post_created_time'])
    
    # Get the earliest and latest dates
    min_date = df['post_created_time'].min()
    max_date = df['post_created_time'].max()
    
    return min_date, max_date


In [52]:
def filter_by_date(df, start_date=None, end_date=None):
    """
    Filter dataframe to include only posts within a specific date range.
    
    Args:
        df (pandas.DataFrame): DataFrame with 'post_created_time' column
        start_date (str, datetime, optional): Keep posts on or after this date
        end_date (str, datetime, optional): Keep posts on or before this date
        
    Returns:
        pandas.DataFrame: Filtered dataframe
    """
    # Make sure post_created_time is datetime
    if not pd.api.types.is_datetime64_dtype(df['post_created_time']):
        df['post_created_time'] = pd.to_datetime(df['post_created_time'])
    
    original_count = len(df)
    
    # Apply date filters
    if start_date is not None:
        if isinstance(start_date, str):
            start_date = pd.to_datetime(start_date)
        df = df[df['post_created_time'] >= start_date]
    
    if end_date is not None:
        if isinstance(end_date, str):
            end_date = pd.to_datetime(end_date)
        df = df[df['post_created_time'] <= end_date]
    
    # Report on filtering
    print(f"Date filtering:")
    if start_date is not None:
        print(f"  Start date: {start_date}")
    if end_date is not None:
        print(f"  End date: {end_date}")
    print(f"  Original records: {original_count}")
    print(f"  Filtered records: {len(df)} ({len(df)/original_count*100:.1f}% retained)")
    
    return df
from datetime import timedelta
# Get the maximum and minimum date from the dataset
min_date, max_date = analyze_post_dates(df_clean)
# Calculate a date threshold (e.g., most recent 30 days)
cutoff_date = max_date - timedelta(days=30)

# Filter the dataframe
df_recent = filter_by_date(df_clean, start_date=cutoff_date)

Date filtering:
  Start date: 2025-03-30 11:00:47
  Original records: 5168018
  Filtered records: 285804 (5.5% retained)


## Create User-Subreddit Vectors

In [55]:
def create_user_vectors(df):
    """
    Generate vectors for each user where each element represents the 
    fraction of their posts in a particular subreddit.
    
    Args:
        df (pandas.DataFrame): DataFrame with 'author_name' and 'subreddit' columns
        
    Returns:
        dict: Dictionary mapping users to their subreddit distribution vectors
        list: List of all subreddits (to know which index corresponds to which subreddit)
    """
    try:
        # Count posts per user-subreddit combination
        user_subreddit_counts = df.groupby(['author_name', 'subreddit']).size().reset_index(name='count')
        
        # Get total posts per user
        user_total_posts = user_subreddit_counts.groupby('author_name')['count'].sum()
        
        # Keep only users that have a significant number of comments (no lurkers)
        lurker_threshold = 1
        user_total_posts = user_total_posts[user_total_posts > lurker_threshold]
        user_subreddit_counts = user_subreddit_counts[user_subreddit_counts['author_name'].isin(user_total_posts.index)].copy()

        
        # Get list of all subreddits (for vector indices)
        all_subreddits = sorted(df['subreddit'].unique())
        subreddit_to_idx = {sr: i for i, sr in enumerate(all_subreddits)}
        
        print(f"Creating vectors for {len(user_total_posts)} users across {len(all_subreddits)} subreddits")
        
        # Create user vectors
        user_vectors = {}
        
        for idx, row in user_subreddit_counts.iterrows():
            user = row['author_name']
            subreddit = row['subreddit']
            count = row['count']
            
            # Initialize vector of zeros
            if user not in user_vectors:
                user_vectors[user] = np.zeros(len(all_subreddits))
            
            sr_idx = subreddit_to_idx[subreddit]
            user_vectors[user][sr_idx] = count/user_total_posts[user]
            
        
        # Print some stats
        print(f"Created vectors for {len(user_vectors)} users")
        print(f"Example vector dimensions: {len(next(iter(user_vectors.values())))}")
        
        return user_vectors, all_subreddits
    
    except Exception as e:
        print(f"Error in create_user_vectors: {str(e)}")
        raise

# Create user vectors
user_vectors, all_subreddits = create_user_vectors(df_recent)  # or df_clean if not sampling

Creating vectors for 35626 users across 35 subreddits
Created vectors for 35626 users
Example vector dimensions: 35


## Generate Network Based on Dot Product

In [76]:
def deduplicate(vectors, threshold = 1, runs=5):
    prev = -1
    iteration = 0
    curr = vectors.copy()
    
    while len(curr) != prev and iteration < runs: # Run until the number of nodes stops changing or until we hit max runs
        print(f"\tDeduplication iteration: {iteration}")
        prev = len(curr)
        
        users = list(curr.keys())
        vecs = np.array([curr[user] for user in users])
        norms = np.linalg.norm(vecs, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        normalized_vecs = vecs/norms
        sim_matrix = np.dot(normalized_vecs, normalized_vecs.T)
  
        todrop = set()
        
        for i in range(len(users)):
            if users[i] in todrop:
                continue
            for j in range(i+1, len(users)):
                if users[j] in todrop:
                    continue
                elif sim_matrix[i, j] >= threshold:
                    todrop.add(users[j])
        
        curr = {user: vector for user,vector in curr.items() if user not in todrop}
        iteration+=1
    return curr
        
                    

def generate_similarity_network_vectorized(user_vectors_2, threshold=0.1, output_file='network_similarity_vec.csv'):
    """
    Generate a network where edges connect users with cosine similarity above threshold.
    Uses vectorized operations for better performance.
    """
    import numpy as np

    print(f"Number of users before deduplication: {len(user_vectors_2)}")
    vectors = deduplicate(user_vectors_2, 1)
    print(f"Number of users after deduplication: {len(vectors)}")
    
    # Convert dict to ordered lists for consistent indexing
    users = list(vectors.keys())
    vectors = np.array([vectors[user] for user in users])
    
    # Normalize vectors (to prepare for cosine similarity calculation)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Avoid division by zero
    norms[norms == 0] = 1.0
    normalized_vectors = vectors / norms
    

    
    print(f"Computing similarities for {len(users)} users...")
    
    # Calculate all pairwise similarities at once
    similarity_matrix = np.dot(normalized_vectors, normalized_vectors.T)
    # digging into the similarity_matrix
    min_similarity = np.min(similarity_matrix)
    max_similarity = np.max(similarity_matrix)
    mean_similarity = np.mean(similarity_matrix)
    median_similarity = np.median(similarity_matrix)

    print(f"Similarity matrix statistics:")
    print(f"  Minimum: {min_similarity:.6f}")
    print(f"  Maximum: {max_similarity:.6f}")
    print(f"  Mean: {mean_similarity:.6f}")
    print(f"  Median: {median_similarity:.6f}")

    # Create edge list where similarity > threshold
    edges = []
    # Only look at upper triangle of similarity matrix (avoid duplicates)
    rows, cols = np.triu_indices_from(similarity_matrix, k=1)
    for i, j in zip(rows, cols):
        if similarity_matrix[i, j] >= threshold:
            edges.append((users[i], users[j]))
    
    # Write edges to file
    with open(output_file, 'w') as f:
        for u1, u2 in edges:
            f.write(f"{u1} {u2}\n")
    
    print(f"Network created with {len(edges)} edges and written to '{output_file}'")
    return output_file
# Generate the network edge list based on similarity using vectorized approach
edge_list_file_vec = generate_similarity_network_vectorized(user_vectors, threshold=0.75)

Number of users before deduplication: 35626
0
1
Number of users after deduplication: 12321
Computing similarities for 12321 users...
Similarity matrix statistics:
  Minimum: 0.000000
  Maximum: 1.000000
  Mean: 0.215621
  Median: 0.002879
Network created with 5355399 edges and written to 'network_similarity_vec.csv'
