In [74]:
# Get preprocessing methods from make_network.ipynb to keep consistent.
RANDOM_STATE = 5664
import os
import pandas as pd
def load_and_clean_data(filepath):
    """
    Read the Reddit CSV export and reduce it to usable, de-duplicated rows.

    Rows are de-duplicated on the ('author_name', 'subreddit') pair first, and
    afterwards every row that is missing 'self_text', 'post_self_text' or
    'post_title' is discarded. Progress is reported with print().

    Args:
        filepath (str): Location of the CSV file to read

    Returns:
        pandas.DataFrame: One row per user-subreddit pair, with all three text
        columns present

    Raises:
        FileNotFoundError: If ``filepath`` does not exist
        Exception: Any other failure is printed and then re-raised unchanged
    """
    pair_columns = ['author_name', 'subreddit']
    text_columns = ["self_text", "post_self_text", "post_title"]

    try:
        # Fail early with a clear message instead of pandas' own error
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Data file not found: {filepath}")

        print(f"Loading data from {filepath}...")
        raw = pd.read_csv(filepath)
        print(f"Original data shape: {raw.shape}")

        # De-duplicate first, then drop rows lacking any text field
        cleaned = (
            raw.drop_duplicates(subset=pair_columns)
            .copy()
            .dropna(subset=text_columns)
        )
        print(f"Data shape after cleaning: {cleaned.shape}")
        return cleaned

    except Exception as err:
        print(f"Error in load_and_clean_data: {err}")
        raise
    
def analyze_post_dates(df):
    """
    Report the time span covered by the 'post_created_time' column.

    Side effect: the column in ``df`` is overwritten in place with its
    pd.to_datetime() conversion.

    Args:
        df (pandas.DataFrame): DataFrame with a 'post_created_time' column

    Returns:
        tuple: (earliest timestamp, latest timestamp)
    """
    timestamps = pd.to_datetime(df['post_created_time'])
    df['post_created_time'] = timestamps
    return timestamps.min(), timestamps.max()

def filter_by_date(df, start_date=None, end_date=None):
    """
    Filter dataframe to include only posts within a specific date range.

    Both bounds are inclusive and optional; a summary of the filtering is
    printed. If 'post_created_time' is not already a datetime column it is
    converted in place on the frame that was passed in.

    Args:
        df (pandas.DataFrame): DataFrame with 'post_created_time' column
        start_date (str, datetime, optional): Keep posts on or after this date
        end_date (str, datetime, optional): Keep posts on or before this date

    Returns:
        pandas.DataFrame: Filtered dataframe (empty if the input is empty)
    """
    # Make sure post_created_time is datetime
    if not pd.api.types.is_datetime64_dtype(df['post_created_time']):
        df['post_created_time'] = pd.to_datetime(df['post_created_time'])

    original_count = len(df)

    # Apply date filters (string bounds are parsed, anything else is compared as given)
    if start_date is not None:
        if isinstance(start_date, str):
            start_date = pd.to_datetime(start_date)
        df = df[df['post_created_time'] >= start_date]

    if end_date is not None:
        if isinstance(end_date, str):
            end_date = pd.to_datetime(end_date)
        df = df[df['post_created_time'] <= end_date]

    # An empty input frame would otherwise divide by zero in the summary below
    retained_pct = len(df) / original_count * 100 if original_count else 0.0

    # Report on filtering
    print(f"Date filtering:")
    if start_date is not None:
        print(f"  Start date: {start_date}")
    if end_date is not None:
        print(f"  End date: {end_date}")
    print(f"  Original records: {original_count}")
    print(f"  Filtered records: {len(df)} ({retained_pct:.1f}% retained)")

    return df

## Load data

In [75]:
from datetime import timedelta

# Load the raw export, then find the time span it covers.
# Note: analyze_post_dates also converts df_clean['post_created_time'] to datetime in place.
df_clean = load_and_clean_data("reddit_opinion_ru_ua.csv")
min_date, max_date = analyze_post_dates(df_clean)
# Cutoff is 10 days before the most recent post in the data (not before "today").
cutoff_date = max_date - timedelta(days=10)

# Keep only rows whose post was created on or after the cutoff.
df_recent = filter_by_date(df_clean, start_date=cutoff_date)

Loading data from reddit_opinion_ru_ua.csv...
Original data shape: (5168018, 24)
Data shape after cleaning: (118477, 24)
Date filtering:
  Start date: 2025-04-19 10:05:00
  Original records: 118477
  Filtered records: 7297 (6.2% retained)


In [76]:
# Identifier, timestamp and per-user karma columns that are hidden in the preview below.
todrop = [
    "comment_id",
    "created_time",
    "post_id",
    "user_is_verified",
    "user_account_created_time",
    "user_awardee_karma",
    "user_awarder_karma",
    "user_comment_karma",
    "user_link_karma",
    "post_created_time",
]
# Display only: the result is not assigned, so df_recent keeps all of its columns.
df_recent.drop(columns=todrop)

Unnamed: 0,score,self_text,subreddit,author_name,controversiality,ups,downs,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received
0,1,I'd have to agree that it's hard to shop for m...,AskARussian,rsaachit,0,1,0,2272.0,8,hello!! I’m currently trying to come up with g...,Gifts for Russian man - from an American,0.78,8,0
9,1,It looks like a complicated procedure. In a te...,UkraineWarVideoReport,Spare-Sandwich8848,0,1,0,740.0,155,Video was edited by the source.\n\nThis video ...,NSFW/NSFL: 2 Russian soldiers shot themselves....,0.97,155,0
11,1,"It is atmospheric phenomena, we're due another...",conspiracy,South-Rabbit-4064,0,1,0,92672.0,214,"The massive blackout that hit Spain, Portugal,...",Spain and Portugal Blackout due to 'Atmospheri...,0.82,214,0
43,1,Nope. Plasma penetration. Viewable on multip...,conspiracy,fr33lancr,0,1,0,15888.0,214,"The massive blackout that hit Spain, Portugal,...",Spain and Portugal Blackout due to 'Atmospheri...,0.82,214,0
49,1,"It's not the simple men, but the women who don...",AskARussian,Basic_Ad_2235,0,1,0,10827.0,8,hello!! I’m currently trying to come up with g...,Gifts for Russian man - from an American,0.78,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98590,2,J'ai une offre de thése en IA mais je ne suis ...,france,No-Psychology-7771,0,2,0,100.0,0,"Bonjour à tous,\n\nJe suis récemment diplômé e...",Quel domaine IT choisir en tant que jeune dipl...,0.20,0,0
98633,20,Bots are mad,conspiracy,mykidsnever_call,0,20,0,11222.0,341,[**Guardian**](https://archive.is/2tA6J) — The...,Trump pretty much accused Fauci of crimes agai...,0.72,341,0
98643,78,Trump loves to take credit for operation warp ...,conspiracy,moanysopran0,0,78,0,17167.0,341,[**Guardian**](https://archive.is/2tA6J) — The...,Trump pretty much accused Fauci of crimes agai...,0.72,341,0
98674,-10,Why does Trump love him much?!!,conspiracy,General-Priority-479,0,-10,0,14141.0,341,[**Guardian**](https://archive.is/2tA6J) — The...,Trump pretty much accused Fauci of crimes agai...,0.72,341,0


In [77]:
# Quick overview: row count and the distinct subreddits present in the recent slice
print(f"Number of posts: {len(df_recent)}")
print("All subreddits:")
pd.DataFrame(df_recent["subreddit"].explode().unique())

Number of posts: 7297
All subreddits:


Unnamed: 0,0
0,AskARussian
1,UkraineWarVideoReport
2,conspiracy
3,RussiaUkraineWar2022
4,CombatFootage
5,UkraineRussiaReport
6,FreedomofRussia
7,EndlessWar
8,ukraine
9,UkraineConflict


## Set up for topic modeling and sentiment analysis for comments, posts, and post titles

In [78]:
# nltk.download("stopwords")
# nltk.download("punkt_tab")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem.porter import *

# Characters deleted when punct=True: ASCII punctuation plus a few typographic
# marks that string.punctuation lacks. Removal works character by character, so
# multi-character sequences such as "..." or "===" need no separate entry
# ("." and "=" already cover them).
_PUNCT_DELETE_TABLE = str.maketrans("", "", string.punctuation + "…—–“”’")

# Resources that are identical for every document; built on first use and reused
# so they are not recreated once per document.
_PREPROCESS_RESOURCES = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    if "stops" not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stops"] = frozenset(stopwords.words("english"))
    return _PREPROCESS_RESOURCES["stops"]


def _porter_stemmer():
    """Return a shared PorterStemmer instance, creating it only once."""
    if "stemmer" not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stemmer"] = PorterStemmer()
    return _PREPROCESS_RESOURCES["stemmer"]


def preprocess_one_doc(text: str, lower: bool, punct: bool, stop: bool, stem: bool):
    """
    Tokenize one document and optionally normalise the tokens.

    The steps always run in this order: lowercase -> word_tokenize ->
    punctuation removal -> stopword/link removal -> stemming.

    Args:
        text (str): Raw document text
        lower (bool): Lowercase the text before tokenizing
        punct (bool): Delete punctuation characters (tokens are re-split on whitespace afterwards)
        stop (bool): Drop English stopwords AND tokens starting with "http"
        stem (bool): Apply Porter stemming to each remaining token

    Returns:
        list[str]: The processed tokens
    """
    # Lowercase the words depending on sentiment or topic modeling
    tokens = word_tokenize(text.lower() if lower else text)

    if punct:
        # Same result as filtering the joined string character by character.
        tokens = " ".join(tokens).translate(_PUNCT_DELETE_TABLE).split()

    if stop:
        # NOTE(review): stopwords are matched after punctuation has been stripped, so
        # contraction pieces such as "nt" / "ca" survive (they show up in the topic
        # output) - consider extending the stopword set if that is unwanted.
        # NOTE(review): link removal only happens when stop=True - confirm that is intended.
        stops = _english_stopwords()
        tokens = [
            token for token in tokens
            if token not in stops               # drop stopwords
            and not token.startswith("http")    # drop links
        ]

    if stem:
        stemmer = _porter_stemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    return tokens

from gensim import corpora, models

def make_dictionary(alltexts, no_below=5, no_above=0.3):
    """
    Build a gensim Dictionary from tokenized documents and prune rare/common tokens.

    Args:
        alltexts (list[list[str]]): One token list per document
        no_below (int): Keep tokens that occur in at least this many documents
        no_above (float): Keep tokens that occur in at most this fraction of
            all documents (0.3 = 30%)

    Returns:
        gensim.corpora.Dictionary: The pruned dictionary with contiguous token ids.
        gensim's own default cap on vocabulary size (keep_n) still applies.
    """
    d = corpora.Dictionary(alltexts)
    d.filter_extremes(no_below=no_below, no_above=no_above)
    d.compactify()  # close id gaps left by the removed tokens
    return d

#TODO: Filter out non-English text
#TODO: Dropna here instead of at cleanup to preserve data?
def make_all_components(df, text_column_name):
    """
    Produce the three inputs the topic models need for one text column.

    Every document is run through preprocess_one_doc with lowercasing, punctuation
    removal, stopword removal and stemming all enabled.

    Args:
        df (pandas.DataFrame): Source frame
        text_column_name (str): Name of the column holding the raw text

    Returns:
        tuple: (Series of token lists, gensim Dictionary, list of bag-of-words documents)
    """
    def _standard_preprocess(document):
        return preprocess_one_doc(document, lower=True, stop=True, punct=True, stem=True)

    preprocessed = df[text_column_name].apply(_standard_preprocess)
    token_lists = preprocessed.tolist()  # list of lists of strings
    dictionary = make_dictionary(token_lists)
    # Bag-of-words representation of every document
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
    return preprocessed, dictionary, corpus

In [79]:
# Create with standard preprocessing
# One (token lists, dictionary, bag-of-words corpus) triple per text field:
# comment body, post body and post title. Each field gets its own dictionary.
preprocessed_comments, dictionary_comments, corpus_comments = make_all_components(df_recent, "self_text")
preprocessed_post_content, dictionary_post_content, corpus_post_content = make_all_components(df_recent, "post_self_text")
preprocessed_title, dictionary_title, corpus_title = make_all_components(df_recent, "post_title")

In [80]:
# Inspect the per-comment token lists returned by make_all_components.
preprocessed_comments

0        [agre, hard, shop, men, unless, nich, hobbi, l...
9        [look, like, complic, procedur, technic, sens,...
11       [atmospher, phenomena, due, anoth, carrington,...
43       [nope, plasma, penetr, viewabl, multipl, sourc...
49                         [simpl, men, women, nt, imagin]
                               ...                        
98590    [jai, une, offr, de, thése, en, ia, mai, je, n...
98633                                           [bot, mad]
98643    [trump, love, take, credit, oper, warp, speed,...
98674                                  [trump, love, much]
98764              [complet, protect, individu, state, go]
Name: self_text, Length: 7297, dtype: object

## Evaluate to find best number of topics

In [81]:
# Taken from https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

def eval_model(model, text, dic, corp):
    """
    Score a fitted LDA model on the corpus it was built from.

    Args:
        model: Fitted gensim LDA model
        text (list[list[str]]): Tokenized documents, used for the coherence score
        dic: gensim Dictionary matching ``corp``
        corp: Bag-of-words corpus

    Returns:
        tuple: (perp, coherence)
            perp is the value of model.log_perplexity(corp). gensim documents this
            as a per-word likelihood bound on a log scale rather than the perplexity
            itself (perplexity = 2 ** -bound), so values closer to zero indicate a
            better fit.
            coherence is the 'c_v' coherence score.
    """
    perp = model.log_perplexity(corp)

    cv_scorer = CoherenceModel(model=model, texts=text, dictionary=dic, coherence='c_v')
    return perp, cv_scorer.get_coherence()

def plot_evals(perps, coherences, ks):
    """
    Draw perplexity and coherence against the number of topics, side by side.

    Args:
        perps (list): Perplexity score for each entry of ``ks``
        coherences (list): Coherence score for each entry of ``ks``
        ks (list): Numbers of topics that were evaluated (x axis of both panels)
    """
    fig = plt.figure("Perplexity and Coherence Analysis", figsize=(8, 8))
    left_ax, right_ax = fig.subplots(1, 2)

    panels = (
        (left_ax, perps, "Number of topics vs Perplexity Score", "Perplexity Score"),
        (right_ax, coherences, "Number of topics vs Coherence Score", "Coherence Score"),
    )
    for ax, values, title, ylabel in panels:
        ax.plot(ks, values)
        ax.set_title(title)
        ax.set_xlabel("Number of Topics")
        ax.set_ylabel(ylabel)
        ax.grid()

    fig.tight_layout()
    plt.show()
    
    
def full_model_test_loop(text, corpus, dictionary, ks=None, passes=20):
    """
    Fit one LDA model per candidate topic count and plot how each one scores.

    Args:
        text: Tokenized documents (pandas Series or any iterable of token lists)
        corpus: Bag-of-words corpus built from ``text``
        dictionary: gensim Dictionary matching ``corpus``
        ks (iterable of int, optional): Topic counts to try. Defaults to
            1-10, 20 and 30. Pass a shorter list for a quicker sweep.
        passes (int): Training passes per model (default 20)

    Returns:
        None. The scores are shown through plot_evals.
    """
    if ks is None:
        ks = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30]
    else:
        ks = list(ks)

    # Convert once; the documents are the same for every k
    docs = text.tolist() if hasattr(text, "tolist") else list(text)

    perps = []
    coherences = []
    for k in ks:
        ldamodel = models.ldamodel.LdaModel(
            corpus,
            num_topics=k,
            id2word=dictionary,
            passes=passes,
            random_state=RANDOM_STATE,
        )
        perp, coherence = eval_model(ldamodel, docs, dictionary, corpus)
        perps.append(perp)
        coherences.append(coherence)
    plot_evals(perps, coherences, ks)

In [82]:
# print("Comments eval:")
# full_model_test_loop(preprocessed_comments, corpus_comments, dictionary_comments)
# print("Post Content eval:")
# full_model_test_loop(preprocessed_post_content, corpus_post_content, dictionary_post_content)
# print("Post Title eval:")
# full_model_test_loop(preprocessed_title , corpus_title, dictionary_title)

# 22 minutes to run all

## Topic Modeling

In [83]:
# Fit one LDA model per text field, all with 20 passes and the shared RANDOM_STATE seed.
# Topic counts differ per field: 10 (comments), 5 (post content), 20 (titles).
# NOTE(review): presumably these were picked from the evaluation plots above - confirm and record the reasoning.
ldamodel_comments = models.ldamodel.LdaModel(corpus_comments, num_topics=10, id2word=dictionary_comments, passes=20, random_state=RANDOM_STATE)
ldamodel_post_content = models.ldamodel.LdaModel(corpus_post_content, num_topics=5, id2word=dictionary_post_content, passes=20, random_state=RANDOM_STATE)
ldamodel_title = models.ldamodel.LdaModel(corpus_title, num_topics=20, id2word=dictionary_title, passes=20, random_state=RANDOM_STATE)

# TODO: call eval_model to get the perplexity and coherence of the top k model

In [84]:
# List the comment model's topics as (topic id, weighted top-terms string) pairs.
print("Comment topics:")
ldamodel_comments.show_topics()

Comment topics:


[(0,
  '0.044*"russia" + 0.028*"ukrain" + 0.021*"war" + 0.020*"us" + 0.019*"would" + 0.014*"countri" + 0.012*"nt" + 0.011*"want" + 0.011*"russian" + 0.009*"trump"'),
 (1,
  '0.022*"like" + 0.022*"fuck" + 0.015*"love" + 0.014*"peopl" + 0.011*"trump" + 0.009*"differ" + 0.009*"said" + 0.009*"thing" + 0.009*"want" + 0.008*"see"'),
 (2,
  '0.015*"like" + 0.015*"peopl" + 0.013*"russian" + 0.012*"nt" + 0.012*"go" + 0.012*"also" + 0.010*"good" + 0.009*"moscow" + 0.008*"ask" + 0.007*"see"'),
 (3,
  '0.039*"и" + 0.038*"в" + 0.034*"news" + 0.032*"не" + 0.019*"что" + 0.018*"это" + 0.018*"на" + 0.017*"fox" + 0.017*"na" + 0.016*"ai"'),
 (4,
  '0.063*"nt" + 0.018*"peopl" + 0.017*"think" + 0.014*"get" + 0.014*"say" + 0.013*"ca" + 0.011*"even" + 0.010*"govern" + 0.010*"would" + 0.010*"work"'),
 (5,
  '0.015*"drone" + 0.011*"would" + 0.010*"ukrainian" + 0.010*"happen" + 0.010*"militari" + 0.008*"ukrain" + 0.008*"never" + 0.007*"way" + 0.007*"time" + 0.006*"year"'),
 (6,
  '0.018*"thank" + 0.016*"shit" +

In [85]:
# List the post-content model's topics as (topic id, weighted top-terms string) pairs.
print("Post content topics:")
ldamodel_post_content.show_topics()

Post content topics:


[(0,
  '0.019*"banksi" + 0.015*"nt" + 0.012*"artist" + 0.009*"rob" + 0.007*"oper" + 0.007*"work" + 0.007*"never" + 0.007*"one" + 0.006*"would" + 0.006*"art"'),
 (1,
  '0.014*"like" + 0.013*"would" + 0.012*"russia" + 0.010*"nt" + 0.009*"think" + 0.007*"peopl" + 0.007*"lawn" + 0.006*"one" + 0.006*"someth" + 0.006*"know"'),
 (2,
  '0.009*"russia" + 0.008*"would" + 0.007*"ukrainian" + 0.007*"amp" + 0.006*"one" + 0.006*"ukrain" + 0.006*"like" + 0.005*"year" + 0.005*"want" + 0.005*"time"'),
 (3,
  '0.035*"de" + 0.023*"la" + 0.020*"et" + 0.019*"le" + 0.018*"soldier" + 0.014*"un" + 0.013*"l" + 0.010*"colombian" + 0.008*"roman" + 0.008*"en"'),
 (4,
  '0.020*"ukrain" + 0.017*"war" + 0.017*"trump" + 0.015*"us" + 0.010*"video" + 0.009*"inform" + 0.008*"sourc" + 0.008*"russia" + 0.007*"presid" + 0.007*"militari"')]

In [86]:
# List topics from the title model as (topic id, weighted top-terms string) pairs.
# The ids in the output are not consecutive, so this appears to be a subset of the 20 topics.
print("Title topics:")
ldamodel_title.show_topics()

Title topics:


[(18,
  '0.095*"ukrain" + 0.081*"russia" + 0.053*"fight" + 0.045*"war" + 0.043*"trump" + 0.041*"forc" + 0.036*"fighter" + 0.036*"zelenskyy" + 0.032*"europ" + 0.024*"jet"'),
 (10,
  '0.140*"think" + 0.112*"russia" + 0.093*"north" + 0.083*"war" + 0.071*"korean" + 0.037*"kursk" + 0.027*"ru" + 0.027*"pov" + 0.024*"warfar" + 0.020*"fought"'),
 (6,
  '0.058*"x" + 0.057*"reach" + 0.055*"still" + 0.044*"zelenski" + 0.043*"routin" + 0.039*"amaz" + 0.036*"1280" + 0.029*"deploy" + 0.025*"soldier" + 0.024*"probabl"'),
 (5,
  '0.146*"trump" + 0.115*"ukrain" + 0.082*"news" + 0.064*"war" + 0.061*"accus" + 0.056*"end" + 0.051*"us" + 0.046*"peopl" + 0.041*"russia" + 0.034*"back"'),
 (3,
  '0.064*"moscow" + 0.049*"ukrainian" + 0.048*"region" + 0.034*"troop" + 0.030*"pov" + 0.026*"forc" + 0.022*"offici" + 0.022*"militari" + 0.021*"accord" + 0.021*"russia"'),
 (12,
  '0.076*"say" + 0.066*"ukrain" + 0.062*"crimea" + 0.038*"new" + 0.035*"struck" + 0.032*"occupi" + 0.029*"wound" + 0.028*"univers" + 0.026*"ar

## Sentiment Analysis

In [95]:
# Mean comment length in characters
comment_texts = df_recent["self_text"]
sum(len(comment) for comment in comment_texts) / len(comment_texts)

164.58681649993147

In [None]:
#TODO adapt to generic scores, include average pos, neg, neu scores, plot histogram of scores, plot distribution of compound scores

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
sid = SentimentIntensityAnalyzer()

def find_compound_sentiment(document):
    """
    Average the VADER compound score over the sentences of one document.

    Args:
        document (str): Raw (unpreprocessed) text

    Returns:
        float: Mean of the per-sentence "compound" scores, or 0.0 when the
        document contains no sentences (e.g. empty or whitespace-only text)
    """
    sentences = sent_tokenize(document)
    if not sentences:
        # Nothing to score: report neutral instead of dividing by zero
        return 0.0

    total = sum(sid.polarity_scores(sentence)["compound"] for sentence in sentences)
    return total / len(sentences)
    
def _mean_or_nan(values):
    """Arithmetic mean of ``values``, or NaN when there is nothing to average."""
    return sum(values) / len(values) if values else float("nan")


def find_all_topic_sentiments(corp, raws, model, refs):
    """
    Assign every document to its dominant topic and average sentiment and refs per topic.

    Args:
        corp: Bag-of-words documents, in the same order as ``raws``
        raws: Raw document texts, scored with find_compound_sentiment
        model: Fitted LDA model; its ``num_topics`` sets how many topics are reported
        refs: One numeric value per document, in the same order as ``corp``

    Returns:
        list: [topic_sentiments, document_scores, topic_refs]
            topic_sentiments (dict): topic id -> mean sentiment of its documents
            document_scores (list): sentiment score of every document
            topic_refs (dict): topic id -> mean of ``refs`` over its documents
        Topics that no document is assigned to get NaN in both dictionaries.
    """
    dominant_topics = []
    document_scores = []
    for bag, raw in zip(corp, raws):
        topics = model.get_document_topics(bag)
        # max() keeps the first of equally probable topics; None if no topic was returned
        dominant = max(topics, key=lambda pair: pair[1])[0] if topics else None
        dominant_topics.append(dominant)
        document_scores.append(find_compound_sentiment(raw))

    # Take the topic count from the model itself rather than from an outside variable
    num_topics = model.num_topics
    sentiments_by_topic = {topic: [] for topic in range(num_topics)}
    refs_by_topic = {topic: [] for topic in range(num_topics)}
    for topic, sentiment, ref in zip(dominant_topics, document_scores, refs):
        if topic in sentiments_by_topic:
            sentiments_by_topic[topic].append(sentiment)
            refs_by_topic[topic].append(ref)

    topic_sentiments = {topic: _mean_or_nan(vals) for topic, vals in sentiments_by_topic.items()}
    topic_refs = {topic: _mean_or_nan(vals) for topic, vals in refs_by_topic.items()}

    return [topic_sentiments, document_scores, topic_refs]

In [None]:
# NOTE(review): `corpus`, `dataset` and `ldamodel` are not defined in the cells shown in this
# notebook (the objects built above are e.g. corpus_comments / df_recent / ldamodel_comments).
# This call looks carried over from another notebook - confirm the intended arguments before running.
topic_scores, dataset_scores, topic_nrefs = find_all_topic_sentiments(corpus, dataset["content"], ldamodel, dataset["nrefs"])