In [1]:
# Get preprocessing methods from make_network.ipynb to keep consistent.
RANDOM_STATE = 5664
import os
import pandas as pd
def load_and_clean_data(filepath):
    """
    Load Reddit data and remove duplicate user-subreddit combinations.
    
    Args:
        filepath (str): Path to the CSV data file
        
    Returns:
        pandas.DataFrame: Cleaned dataframe with unique user-subreddit combinations
    """
    try:
        # Check if file exists
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Data file not found: {filepath}")
            
        # Load the dataset
        print(f"Loading data from {filepath}...")
        df = pd.read_csv(filepath)
        print(f"Original data shape: {df.shape}")
        
        # Remove exact duplicates
        df_unique = df.drop_duplicates().copy()
        
        print(f"Data shape after removing exact duplicates: {df_unique.shape}")
        
        return df_unique
    
    except Exception as e:
        print(f"Error in load_and_clean_data: {str(e)}")
        raise
    
def analyze_post_dates(df):
    # Convert post_created_time to datetime
    df['post_created_time'] = pd.to_datetime(df['post_created_time'])
    
    # Get the earliest and latest dates
    min_date = df['post_created_time'].min()
    max_date = df['post_created_time'].max()
    
    return min_date, max_date

def filter_by_date(df, start_date=None, end_date=None):
    """
    Filter dataframe to include only posts within a specific date range.
    
    Args:
        df (pandas.DataFrame): DataFrame with 'post_created_time' column
        start_date (str, datetime, optional): Keep posts on or after this date
        end_date (str, datetime, optional): Keep posts on or before this date
        
    Returns:
        pandas.DataFrame: Filtered dataframe
    """
    # Make sure post_created_time is datetime
    if not pd.api.types.is_datetime64_dtype(df['post_created_time']):
        df['post_created_time'] = pd.to_datetime(df['post_created_time'])
    
    original_count = len(df)
    
    # Apply date filters
    if start_date is not None:
        if isinstance(start_date, str):
            start_date = pd.to_datetime(start_date)
        df = df[df['post_created_time'] >= start_date]
    
    if end_date is not None:
        if isinstance(end_date, str):
            end_date = pd.to_datetime(end_date)
        df = df[df['post_created_time'] <= end_date]
    
    # Report on filtering
    print(f"Date filtering:")
    if start_date is not None:
        print(f"  Start date: {start_date}")
    if end_date is not None:
        print(f"  End date: {end_date}")
    print(f"  Original records: {original_count}")
    print(f"  Filtered records: {len(df)} ({len(df)/original_count*100:.1f}% retained)")
    
    return df

## Load data

In [2]:
from datetime import timedelta

df_clean = load_and_clean_data("reddit_opinion_ru_ua.csv")
min_date, max_date = analyze_post_dates(df_clean)
cutoff_date = max_date - timedelta(days=5) # Get enough to be meaningful, but not too many to be impractical to run

df_recent = filter_by_date(df_clean, start_date=cutoff_date)

Loading data from reddit_opinion_ru_ua.csv...
Original data shape: (5168018, 24)
Data shape after removing exact duplicates: (5168018, 24)
Date filtering:
  Start date: 2025-04-24 11:00:47
  Original records: 5168018
  Filtered records: 50011 (1.0% retained)


In [3]:
todrop = ["comment_id", "created_time","post_id","user_is_verified","user_account_created_time", "user_awardee_karma", "user_awarder_karma", "user_comment_karma", "user_link_karma", "post_created_time"]
df_recent.drop(todrop, axis=1).copy()
df_recent

Unnamed: 0,comment_id,score,self_text,subreddit,created_time,post_id,author_name,controversiality,ups,downs,...,user_link_karma,user_comment_karma,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received,post_created_time
0,mpn18ju,1,I'd have to agree that it's hard to shop for m...,AskARussian,2025-04-29 11:08:21,1kaa04k,rsaachit,0,1,0,...,1462.0,810.0,2272.0,8,hello!! I’m currently trying to come up with g...,Gifts for Russian man - from an American,0.78,8,0,2025-04-28 23:37:58
1,mpn188l,1,"They don't, so you don't have to worry about t...",europe,2025-04-29 11:08:16,1kajrb4,potatolulz,0,1,0,...,11179.0,594349.0,605528.0,590,,Zelensky dismisses Putin’s declaration of a 72...,0.98,590,0,2025-04-29 09:27:03
2,mpn16la,1,Trump is just buying time. He won't do shit t...,UkraineWarVideoReport,2025-04-29 11:07:55,1kajrn6,Many-Cartographer-45,0,1,0,...,1.0,2491.0,2492.0,134,,The real Putin is now clear to Trump - and his...,0.95,134,0,2025-04-29 09:27:45
3,mpn14md,1,They are being randomly called up for military...,worldnews,2025-04-29 11:07:29,1kaipov,Corka,0,1,0,...,789.0,136268.0,137057.0,2185,,"Russia has lost over 950,000 soldiers since Fe...",0.97,2185,0,2025-04-29 08:07:53
4,mpn141c,1,Most of your assumptions here are wrong.\n\nFi...,worldnews,2025-04-29 11:07:21,1kaipov,LeSygneNoir,0,1,0,...,32188.0,180480.0,212668.0,2185,,"Russia has lost over 950,000 soldiers since Fe...",0.97,2185,0,2025-04-29 08:07:53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53486,morpkzi,23,Krasnov,europe,2025-04-24 11:16:13,1k6pk64,davidd679,0,23,0,...,1.0,1342.0,1343.0,4204,,Trump’s Plan to Sell Out Ukraine to Russia. Hi...,0.98,4204,0,2025-04-24 11:12:13
53525,morp8gl,1,Would we give up a section of the US to secure...,geopolitics,2025-04-24 11:13:37,1k6phqk,UnusualAir1,0,1,0,...,163647.0,183516.0,347163.0,265,It should be argued that Trump himself is prol...,Trump says Zelenskyy is prolonging war in Ukra...,0.89,265,0,2025-04-24 11:08:13
53528,morp7gg,7,good to see Poland and Ukraine coming together...,worldnews,2025-04-24 11:13:25,1k6ph7j,Appropriate_Age_8918,0,7,0,...,70.0,725.0,795.0,116,,Poland and Ukraine jointly condemn vandalism o...,0.92,116,0,2025-04-24 11:07:22
53588,morofnx,2,Pinged EUROPE ([subscribe](https://reddit.com/...,neoliberal,2025-04-24 11:07:22,1k6pgt2,groupbot,0,2,0,...,3.0,137119.0,137122.0,99,Poland and Ukraine have issued a joint stateme...,Poland and Ukraine jointly condemn vandalism o...,0.98,99,0,2025-04-24 11:06:42


In [4]:
# Get basic information
print("Number of comments:", len(df_recent))
print("All subreddits:")
pd.DataFrame(df_recent.subreddit.explode().unique())

Number of comments: 50011
All subreddits:


Unnamed: 0,0
0,AskARussian
1,europe
2,UkraineWarVideoReport
3,worldnews
4,UkraineRussiaReport
5,conspiracy
6,ukraine
7,ANormalDayInRussia
8,UkrainianConflict
9,russiawarinukraine


## Set up for topic modeling and sentiment analysis for comments, posts, and post titles

In [5]:
# nltk.download("stopwords")
# nltk.download("punkt_tab")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem.porter import *

def preprocess_one_doc(text: str, lower: bool, punct: bool, stop: bool, stem: bool):
    puncts = set(string.punctuation)
    puncts.add("...") # punct does not contain ellipses
    puncts.add("…")
    puncts.add("===")
    puncts.add("—")
    puncts.add("–")
    puncts.add("“")
    puncts.add("”")
    puncts.add("’")
    puncts.add("‘")
    stops = set(stopwords.words("english"))
    # stops.add("")
    
    
    # Lowercase the words depending on sentiment or topic modeling
    if lower:
        step0 = text.lower()
    else:
        step0 = text
    step1 = word_tokenize(step0)
    
    
    if punct:
        step2 = "".join([ch for ch in " ".join(step1) if ch not in puncts]).split()
    else:
        step2 = step1
        
        
    
    if stop:
        # Remove stopwords
        step3 = [token for token in step2
                    if token not in stops # drop stopwords
                    # and len(token) > 2 # drop words of insignificant length
                    and (not token.startswith("http"))] # drop links
    else:
        step3 = step2
        
    if stem:
        stemmer = PorterStemmer()
        step4 = [stemmer.stem(i) for i in step3]
    else:
        step4 = step3
        
    return step4

from gensim import corpora, models

def make_dictionary(alltexts):
    d = corpora.Dictionary(alltexts)
    d.filter_extremes(no_below=5, no_above=0.3) # Keep words that are in more than 5 documents, but not in more than a third of all documents
    d.compactify()
    return d

from langdetect import detect, DetectorFactory
DetectorFactory.seed = RANDOM_STATE

def filter_english(text):
    try:
        return detect(text) == "en"
    except:
        return False

def clean_column(df, text_column_name):
    # Drop all missing values
    dfc = df.copy()
    dfc.dropna(subset=[text_column_name], inplace=True)
    
    # Filter non-english text
    is_english = dfc[text_column_name].apply(filter_english)
    dfc = dfc[is_english]
    return dfc

def make_all_components(df, text_column_name):
    dfc = clean_column(df, text_column_name)
    
    # Create with standard preprocessing
    preprocessed = dfc[text_column_name].apply(lambda x: preprocess_one_doc(x, lower=True, stop=True, punct=True, stem=True)) # Preprocess all docs
    dictionary = make_dictionary(preprocessed.tolist()) # Use list of lists of strings
    corpus = [dictionary.doc2bow(text) for text in preprocessed.tolist()] # bag of words representation of documents
    return preprocessed, dictionary, corpus

In [6]:
# Create with standard preprocessing
print("Processing comments")
preprocessed_comments, dictionary_comments, corpus_comments = make_all_components(df_recent, "self_text")
print("Processing post content")
preprocessed_post_content, dictionary_post_content, corpus_post_content = make_all_components(df_recent, "post_self_text")
print("Processing titles")
preprocessed_title, dictionary_title, corpus_title = make_all_components(df_recent, "post_title")

Processing comments
Processing post content
Processing titles


## Evaluate to find best number of topics

In [7]:
# Taken from https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

def eval_model(model, text, corp, dic):
    # Compute Perplexity
    perp = model.log_perplexity(corp)
    # a measure of how good the model is. lower the better.
    
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model, texts=text, dictionary=dic, coherence='c_v')
    coherence = coherence_model_lda.get_coherence()
    
    return perp, coherence

def plot_evals(perps, coherences, ks):
    
    fig = plt.figure("Perplexity and Coherence Analysis", figsize=(8, 8))
    axgrid = fig.add_gridspec(1, 2)
    
    ax1 = fig.add_subplot(axgrid[0, 0])
    ax1.plot(ks, perps)
    ax1.set_title("Number of topics vs Perplexity Score")
    ax1.set_xlabel("Number of Topics")
    ax1.set_ylabel("Perplexity Score")
    ax1.grid()
    
    ax2 = fig.add_subplot(axgrid[0, 1])
    ax2.plot(ks, coherences)
    ax2.set_title("Number of topics vs Coherence Score")
    ax2.set_xlabel("Number of Topics")
    ax2.set_ylabel("Coherence Score")
    ax2.grid()
    
    fig.tight_layout()
    plt.show()
    
    
def full_model_test_loop(text, corpus, dictionary, ks=[1,5,10,20]):
    # Run with a smaller, default set of ks first to reduce the range to search
    perps = []
    coherences = []
    for k in ks:
        ldamodel = models.ldamodel.LdaModel(corpus, num_topics=k, id2word=dictionary, passes=20, random_state=RANDOM_STATE)
        scores = eval_model(ldamodel, text.tolist(), corpus, dictionary)
        perps.append(scores[0])
        coherences.append(scores[1])
    plot_evals(perps, coherences, ks)

In [8]:
# # Run this to find the best k. Commented to save time. If the number of comments changes, this has to be rerun. Start with no k list.
# print("Comments eval:")
# full_model_test_loop(preprocessed_comments, corpus_comments, dictionary_comments, [3,4,5,6,7]) # 1st run best around 5
# print("Post Content eval:")
# full_model_test_loop(preprocessed_post_content, corpus_post_content, dictionary_post_content, [18,19,20,21,22]) # 1st run best around 20 (elbow)
# print("Post Title eval:")
# full_model_test_loop(preprocessed_title, corpus_title, dictionary_title, [18,19,20,21,22]) # 1st run best around 20

# 60-90 minutes.

## Topic Modeling

In [9]:
k_comment = 5
k_post_content = 21
k_title = 20

In [10]:
ldamodel_comments = models.ldamodel.LdaModel(corpus_comments, num_topics=k_comment, id2word=dictionary_comments, passes=20, random_state=RANDOM_STATE)
ldamodel_post_content = models.ldamodel.LdaModel(corpus_post_content, num_topics=k_post_content, id2word=dictionary_post_content, passes=20, random_state=RANDOM_STATE)
ldamodel_title = models.ldamodel.LdaModel(corpus_title, num_topics=k_title, id2word=dictionary_title, passes=20, random_state=RANDOM_STATE)

# 8 min 30 sec

In [11]:
cp,cc = eval_model(ldamodel_comments, preprocessed_comments, corpus_comments, dictionary_comments)
pcp, pcc = eval_model(ldamodel_post_content, preprocessed_post_content, corpus_post_content, dictionary_post_content)
tp, tc = eval_model(ldamodel_title, preprocessed_title, corpus_title, dictionary_title)

model_names = ["comments", "post_content", "title"]
ks = [k_comment, k_post_content, k_title]
ps = [cp, pcp, tp]
cs = [cc, pcc, tc]
labels = ["Model","K-value","Perplexity","Coherence"]
table = pd.DataFrame({
    labels[0]: model_names,
    labels[1]: ks,
    labels[2]: ps,
    labels[3]: cs,
})
table

Unnamed: 0,Model,K-value,Perplexity,Coherence
0,comments,5,-7.58263,0.617682
1,post_content,21,-9.404658,0.476215
2,title,20,-12.437024,0.523333


In [12]:
print("Comment topics:")
ldamodel_comments.show_topics()

Comment topics:


[(0,
  '0.067*"trump" + 0.038*"putin" + 0.014*"ukrain" + 0.013*"zelenski" + 0.013*"deal" + 0.013*"peac" + 0.012*"say" + 0.011*"presid" + 0.010*"said" + 0.010*"want"'),
 (1,
  '0.010*"year" + 0.010*"like" + 0.009*"drone" + 0.008*"use" + 0.007*"time" + 0.007*"russian" + 0.006*"one" + 0.006*"also" + 0.005*"day" + 0.005*"bomb"'),
 (2,
  '0.038*"ukrain" + 0.038*"russia" + 0.019*"war" + 0.014*"us" + 0.014*"russian" + 0.011*"gt" + 0.011*"would" + 0.009*"europ" + 0.008*"ukrainian" + 0.007*"crimea"'),
 (3,
  '0.034*"nt" + 0.020*"like" + 0.014*"get" + 0.014*"would" + 0.013*"go" + 0.012*"know" + 0.011*"one" + 0.011*"think" + 0.010*"good" + 0.009*"russia"'),
 (4,
  '0.017*"peopl" + 0.015*"russian" + 0.012*"nt" + 0.009*"like" + 0.009*"american" + 0.009*"countri" + 0.007*"one" + 0.007*"think" + 0.006*"us" + 0.006*"even"')]

In [13]:
print("Post content topics:")
ldamodel_post_content.show_topics()

Post content topics:


[(3,
  '0.063*"soldier" + 0.031*"wound" + 0.029*"gt" + 0.027*"hey" + 0.025*"one" + 0.021*"zelenskyy" + 0.019*"colombian" + 0.017*"kuzin" + 0.015*"drone" + 0.014*"kill"'),
 (4,
  '0.040*"would" + 0.032*"stock" + 0.029*"member" + 0.022*"congress" + 0.022*"lawmak" + 0.021*"us" + 0.020*"fund" + 0.020*"elect" + 0.020*"act" + 0.020*"offic"'),
 (8,
  '0.041*"fighter" + 0.027*"air" + 0.024*"forc" + 0.022*"product" + 0.021*"new" + 0.018*"aircraft" + 0.016*"close" + 0.016*"continu" + 0.016*"bomb" + 0.013*"reach"'),
 (9,
  '0.059*"name" + 0.027*"like" + 0.025*"peopl" + 0.023*"would" + 0.022*"mean" + 0.019*"nt" + 0.017*"also" + 0.014*"someth" + 0.012*"case" + 0.012*"chatgpt"'),
 (20,
  '0.019*"china" + 0.018*"weapon" + 0.017*"power" + 0.017*"use" + 0.013*"global" + 0.013*"would" + 0.013*"domin" + 0.013*"europ" + 0.012*"like" + 0.011*"someth"'),
 (19,
  '0.021*"iran" + 0.017*"need" + 0.017*"could" + 0.016*"would" + 0.016*"one" + 0.015*"2024" + 0.013*"nation" + 0.013*"attack" + 0.013*"us" + 0.012*"n

In [13]:
print("Title topics:")
ldamodel_title.show_topics()

Title topics:


[(2,
  '0.133*"russia" + 0.082*"hour" + 0.078*"fight" + 0.065*"offic" + 0.062*"time" + 0.050*"violat" + 0.043*"first" + 0.042*"offici" + 0.034*"admit" + 0.032*"troop"'),
 (4,
  '0.225*"truce" + 0.190*"ceasefir" + 0.078*"ua" + 0.074*"sinc" + 0.074*"pov" + 0.047*"may" + 0.041*"announc" + 0.026*"territori" + 0.022*"includ" + 0.018*"order"'),
 (19,
  '0.119*"claim" + 0.078*"air" + 0.060*"place" + 0.058*"back" + 0.056*"play" + 0.048*"within" + 0.048*"gener" + 0.046*"jet" + 0.043*"await" + 0.040*"true"'),
 (15,
  '0.105*"attack" + 0.093*"moscow" + 0.080*"along" + 0.072*"join" + 0.065*"show" + 0.049*"pope" + 0.049*"nsfw" + 0.046*"negoti" + 0.039*"may" + 0.034*"9"'),
 (11,
  '0.099*"trump" + 0.081*"peopl" + 0.077*"russia" + 0.076*"plan" + 0.074*"propos" + 0.051*"away" + 0.046*"reject" + 0.045*"one" + 0.044*"launch" + 0.043*"warn"'),
 (18,
  '0.245*"putin" + 0.128*"war" + 0.126*"trump" + 0.116*"declar" + 0.061*"end" + 0.048*"call" + 0.045*"start" + 0.033*"donald" + 0.032*"say" + 0.026*"deal"'),

## Sentiment Analysis

In [15]:
#TODO plot histogram of scores, plot distribution of compound scores

In [36]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
sid = SentimentIntensityAnalyzer()


def find_all_sentiments(document):
    sentences = sent_tokenize(document)
    document_scores = {"pos":0, "neu":0, "neg":0, "compound":0}
    for sentence in sentences:
        sentence_scores = sid.polarity_scores(sentence)
        document_scores["compound"] += sentence_scores["compound"]
        document_scores["neg"] += sentence_scores["neg"]
        document_scores["neu"] += sentence_scores["neu"]
        document_scores["pos"] += sentence_scores["pos"]    
    num_sent = len(sentences)
    document_scores["compound"] /= num_sent
    document_scores["neg"] /= num_sent
    document_scores["neu"] /= num_sent
    document_scores["pos"] /= num_sent
    return document_scores


    
def find_all_topic_sentiments(corp, documents, model):
    dominant_topics = []
    document_scores = []
    corpdoc = zip(corp, documents) # Link each corpus "bag" representation with the full document
    for bag, document in corpdoc:
        topics = model.get_document_topics(bag)
        dominant_topic = max(topics, key=lambda x: x[1])[0]
        dominant_topics.append(dominant_topic)
        document_scores.append(find_all_sentiments(document))
        
    document_scores_df = pd.DataFrame(document_scores) # Each set of sentiments represents a document
    document_scores_df["text"] = documents.reset_index(drop=True) # Ensure alignment and drop nans
    document_scores_df["topic"] = dominant_topics # Also add the dominant topic
    
    topic_sentiments = document_scores_df.groupby("topic")[["pos","neu","neg","compound"]].mean()
        
    return topic_sentiments, document_scores_df.drop("topic", axis=1)

In [None]:
#TODO Index alignment to ensure df text and scores are aligned

In [37]:
clean_comments = clean_column(df_recent, "self_text")["self_text"]
comment_topic_sentiments, comment_document_sentiments = find_all_topic_sentiments(corpus_comments, clean_comments, ldamodel_comments)
# 3 minutes

In [38]:
comment_document_sentiments

Unnamed: 0,pos,neu,neg,compound,text
0,0.2140,0.726000,0.060000,0.557400,I'd have to agree that it's hard to shop for m...
1,0.4170,0.583000,0.000000,0.753500,"They don't, so you don't have to worry about t..."
2,0.2660,0.660667,0.073333,0.249300,Trump is just buying time. He won't do shit t...
3,0.0000,0.962750,0.037250,-0.096625,They are being randomly called up for military...
4,0.0528,0.835800,0.111350,-0.139580,Most of your assumptions here are wrong.\n\nFi...
...,...,...,...,...,...
38166,0.3100,0.690000,0.000000,0.202300,
38167,0.0000,1.000000,0.000000,0.000000,
38168,0.1750,0.755000,0.070000,0.255300,
38169,0.0000,0.328000,0.672000,-0.624900,


In [None]:
clean_column(df_recent, "self_text")["self_text"]

In [39]:
comment_topic_sentiments

Unnamed: 0_level_0,pos,neu,neg,compound
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.06621,0.814469,0.106611,-0.066809
1,0.109539,0.766219,0.120293,-0.036291
2,0.140546,0.677626,0.169508,-0.039746
3,0.15508,0.668022,0.161753,-0.053668
4,0.120145,0.768581,0.106382,0.014652
5,0.110581,0.764752,0.119601,-0.022583
6,0.14591,0.741989,0.103407,0.036854
7,0.087981,0.805794,0.10089,-0.046051
8,0.109032,0.802655,0.085213,0.030286
9,0.091021,0.805793,0.099886,-0.029056
