In [1]:
# Get preprocessing methods from make_network.ipynb to keep consistent.
RANDOM_STATE = 5664
import os
import pandas as pd
def load_and_clean_data(filepath):
    """
    Read the Reddit CSV export and discard rows that are exact duplicates.

    Args:
        filepath (str): Location of the CSV data file

    Returns:
        pandas.DataFrame: Independent copy of the data in which every fully
            identical row appears only once

    Raises:
        FileNotFoundError: If nothing exists at ``filepath``. Any other error
            raised while loading is printed and re-raised unchanged.
    """
    try:
        file_is_missing = not os.path.exists(filepath)
        if file_is_missing:
            raise FileNotFoundError(f"Data file not found: {filepath}")

        print(f"Loading data from {filepath}...")
        raw_frame = pd.read_csv(filepath)
        print(f"Original data shape: {raw_frame.shape}")

        # Only rows identical in every column are dropped; copy() detaches the result
        deduplicated = raw_frame.drop_duplicates()
        deduplicated = deduplicated.copy()
        print(f"Data shape after removing exact duplicates: {deduplicated.shape}")
    except Exception as error:
        # Report the failure for the notebook reader, then let it propagate
        print(f"Error in load_and_clean_data: {error}")
        raise
    else:
        return deduplicated
    
def analyze_post_dates(df):
    """
    Report the time span covered by the posts in ``df``.

    The 'post_created_time' column is parsed with ``pd.to_datetime`` and the
    parsed values are written back into ``df``, so the caller's frame is
    modified as a side effect.

    Args:
        df (pandas.DataFrame): DataFrame with a 'post_created_time' column

    Returns:
        tuple: (earliest, latest) post creation timestamps
    """
    timestamps = pd.to_datetime(df['post_created_time'])
    df['post_created_time'] = timestamps

    return timestamps.min(), timestamps.max()

def filter_by_date(df, start_date=None, end_date=None):
    """
    Filter dataframe to include only posts within a specific date range.

    If 'post_created_time' is not already a datetime column it is converted
    with ``pd.to_datetime`` and written back into ``df`` (the caller's frame)
    before filtering. A summary of the filtering is printed.

    Args:
        df (pandas.DataFrame): DataFrame with 'post_created_time' column
        start_date (str, datetime, optional): Keep posts on or after this date
        end_date (str, datetime, optional): Keep posts on or before this date

    Returns:
        pandas.DataFrame: Filtered dataframe
    """
    # Make sure post_created_time is datetime
    if not pd.api.types.is_datetime64_dtype(df['post_created_time']):
        df['post_created_time'] = pd.to_datetime(df['post_created_time'])

    original_count = len(df)

    # Apply date filters (string bounds are parsed; other values are compared as given)
    if start_date is not None:
        if isinstance(start_date, str):
            start_date = pd.to_datetime(start_date)
        df = df[df['post_created_time'] >= start_date]

    if end_date is not None:
        if isinstance(end_date, str):
            end_date = pd.to_datetime(end_date)
        df = df[df['post_created_time'] <= end_date]

    # An empty input would otherwise divide by zero in the report below
    retained_pct = len(df) / original_count * 100 if original_count else 0.0

    # Report on filtering
    print("Date filtering:")
    if start_date is not None:
        print(f"  Start date: {start_date}")
    if end_date is not None:
        print(f"  End date: {end_date}")
    print(f"  Original records: {original_count}")
    print(f"  Filtered records: {len(df)} ({retained_pct:.1f}% retained)")

    return df

## Load data

In [2]:
from datetime import timedelta

# Analysis inputs, named so they are easy to find and tune.
DATA_PATH = "reddit_opinion_ru_ua.csv"
RECENT_WINDOW_DAYS = 10  # keep posts created within this many days of the newest post

df_clean = load_and_clean_data(DATA_PATH)
min_date, max_date = analyze_post_dates(df_clean)
cutoff_date = max_date - timedelta(days=RECENT_WINDOW_DAYS)

# Only the most recent window is analysed in the rest of the notebook
df_recent = filter_by_date(df_clean, start_date=cutoff_date)

Loading data from reddit_opinion_ru_ua.csv...
Original data shape: (5168018, 24)
Data shape after removing exact duplicates: (5168018, 24)
Date filtering:
  Start date: 2025-04-19 11:00:47
  Original records: 5168018
  Filtered records: 95660 (1.9% retained)


In [None]:
# Metadata columns that none of the later cells read
todrop = ["comment_id", "created_time","post_id","user_is_verified","user_account_created_time", "user_awardee_karma", "user_awarder_karma", "user_comment_karma", "user_link_karma", "post_created_time"]
# drop() returns a new frame, so the result must be assigned for the columns to
# actually go away; errors="ignore" keeps this cell safe to re-run once they are gone.
df_recent = df_recent.drop(columns=todrop, errors="ignore")
df_recent.head()

In [None]:
# Summarise the filtered data: number of rows and the distinct subreddits present
print("Number of posts:", df_recent.shape[0])
print("All subreddits:")
pd.DataFrame(df_recent["subreddit"].explode().unique())

## Set up for topic modeling and sentiment analysis for comments, posts, and post titles

In [None]:
import nltk
# Fetch the NLTK resources used below: the stopword lists and "punkt_tab"
# (presumably the tokenizer data needed by word_tokenize -- TODO confirm).
nltk.download("stopwords")
nltk.download("punkt_tab")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# NOTE(review): this wildcard import is what brings PorterStemmer into scope;
# an explicit `from nltk.stem.porter import PorterStemmer` would be clearer.
from nltk.stem.porter import *

# Filled on first use so the stopword list is read and the stemmer is built
# once per kernel instead of once per document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (punctuation chars, stopwords, stemmer), building them on first use."""
    if not _PREPROCESS_RESOURCES:
        # string.punctuation is ASCII only; add the Unicode ellipsis, dashes and
        # curly quotes. Punctuation is stripped character by character, so only
        # single characters belong in this set.
        puncts = frozenset(string.punctuation) | frozenset("…—–“”’‘")
        stops = frozenset(stopwords.words("english"))
        stemmer = PorterStemmer()
        _PREPROCESS_RESOURCES.update(puncts=puncts, stops=stops, stemmer=stemmer)
    return (
        _PREPROCESS_RESOURCES["puncts"],
        _PREPROCESS_RESOURCES["stops"],
        _PREPROCESS_RESOURCES["stemmer"],
    )


def preprocess_one_doc(text: str, lower: bool, punct: bool, stop: bool, stem: bool):
    """
    Tokenize one document and apply the requested normalisation steps in order.

    Args:
        text (str): Raw document text
        lower (bool): Lowercase the text before tokenizing
        punct (bool): Strip punctuation characters from the tokens; tokens that
            consist only of punctuation disappear
        stop (bool): Drop English stopwords and tokens starting with "http".
            Link tokens are only dropped when this flag is set, and the check
            runs after punctuation stripping.
        stem (bool): Apply Porter stemming to the remaining tokens

    Returns:
        list[str]: The processed tokens
    """
    puncts, stops, stemmer = _get_preprocess_resources()

    # Lowercase the words depending on sentiment or topic modeling
    tokens = word_tokenize(text.lower() if lower else text)

    if punct:
        # Re-join so punctuation can be removed per character, then re-split
        joined = " ".join(tokens)
        tokens = "".join(ch for ch in joined if ch not in puncts).split()

    if stop:
        tokens = [
            token for token in tokens
            if token not in stops  # drop stopwords
            and not token.startswith("http")  # drop links
        ]

    if stem:
        tokens = [stemmer.stem(token) for token in tokens]

    return tokens

from gensim import corpora, models

def make_dictionary(alltexts, no_below=5, no_above=0.3):
    """
    Build a gensim dictionary from tokenized documents and prune rare/common tokens.

    Args:
        alltexts (list[list[str]]): One token list per document
        no_below (int): Keep tokens that appear in at least this many documents
        no_above (float): Keep tokens that appear in no more than this fraction
            of all documents (0.3 = 30%)

    Returns:
        gensim.corpora.Dictionary: The filtered, compacted dictionary
    """
    d = corpora.Dictionary(alltexts)
    d.filter_extremes(no_below=no_below, no_above=no_above)
    d.compactify()
    return d

from langdetect import detect, DetectorFactory
DetectorFactory.seed = RANDOM_STATE

def filter_english(text):
    """
    Tell whether langdetect classifies ``text`` as English.

    Args:
        text (str): Text to classify

    Returns:
        bool: True when the detected language is "en"; False for any other
            language or when detection fails
    """
    try:
        return detect(text) == "en"
    except Exception:
        # Detection failures count as "not English". Exception (rather than a
        # bare except) lets KeyboardInterrupt/SystemExit still stop a long run.
        return False

def clean_column(df, text_column_name):
    """
    Keep only the rows whose text column is present and detected as English.

    Args:
        df (pandas.DataFrame): Source data; it is not modified
        text_column_name (str): Name of the text column to check

    Returns:
        pandas.DataFrame: New frame with the surviving rows and their original
            index labels
    """
    # Rows without any text cannot be language-checked
    with_text = df.dropna(subset=[text_column_name])

    # One boolean per remaining row: does langdetect call it English?
    english_mask = with_text[text_column_name].apply(filter_english)
    return with_text[english_mask]

def make_all_components(df, text_column_name):
    """
    Produce everything the LDA models need for one text column.

    Args:
        df (pandas.DataFrame): Source data
        text_column_name (str): Column holding the documents

    Returns:
        tuple: (preprocessed, dictionary, corpus) -- the Series of token lists
            (indexed like the cleaned rows), the dictionary built from them,
            and the bag-of-words representation of each document
    """
    english_rows = clean_column(df, text_column_name)

    def _standard_preprocess(doc):
        # Standard preprocessing: every normalisation step switched on
        return preprocess_one_doc(doc, lower=True, stop=True, punct=True, stem=True)

    preprocessed = english_rows[text_column_name].apply(_standard_preprocess)
    token_lists = preprocessed.tolist()  # list of lists of strings

    dictionary = make_dictionary(token_lists)
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
    return preprocessed, dictionary, corpus

In [None]:
# Build token lists, dictionary and bag-of-words corpus for each text field,
# all with the standard preprocessing.
(
    preprocessed_comments,
    dictionary_comments,
    corpus_comments,
) = make_all_components(df_recent, "self_text")
(
    preprocessed_post_content,
    dictionary_post_content,
    corpus_post_content,
) = make_all_components(df_recent, "post_self_text")
(
    preprocessed_title,
    dictionary_title,
    corpus_title,
) = make_all_components(df_recent, "post_title")

In [None]:
# Inspect the token lists produced for the comments
preprocessed_comments

## Evaluate to find best number of topics

In [None]:
# Taken from https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

def eval_model(model, text, dic, corp):
    """
    Score a trained LDA model on a corpus.

    Args:
        model: Trained gensim LDA model
        text (list[list[str]]): Tokenized documents, used for coherence
        dic: Dictionary the corpus was built with
        corp: Bag-of-words corpus to evaluate on

    Returns:
        tuple: (perp, coherence) -- the value of ``model.log_perplexity(corp)``
            and the 'c_v' coherence score
    """
    # gensim documents log_perplexity as a per-word likelihood bound
    # (perplexity = 2 ** -bound), so values closer to zero mean a better fit.
    perp = model.log_perplexity(corp)

    scorer = CoherenceModel(model=model, texts=text, dictionary=dic, coherence='c_v')
    return perp, scorer.get_coherence()

def plot_evals(perps, coherences, ks):
    """
    Draw perplexity and coherence against the number of topics, side by side.

    Args:
        perps (list): Perplexity score for each entry of ``ks``
        coherences (list): Coherence score for each entry of ``ks``
        ks (list): Topic counts that were evaluated
    """
    fig = plt.figure("Perplexity and Coherence Analysis", figsize=(8, 8))
    layout = fig.add_gridspec(1, 2)

    # (grid column, y values, title, y label) for each panel, left to right
    panels = (
        (0, perps, "Number of topics vs Perplexity Score", "Perplexity Score"),
        (1, coherences, "Number of topics vs Coherence Score", "Coherence Score"),
    )
    for column, values, panel_title, y_label in panels:
        axis = fig.add_subplot(layout[0, column])
        axis.plot(ks, values)
        axis.set_title(panel_title)
        axis.set_xlabel("Number of Topics")
        axis.set_ylabel(y_label)
        axis.grid()

    fig.tight_layout()
    plt.show()
    
    
def full_model_test_loop(text, corpus, dictionary):
    """
    Train an LDA model for a fixed set of topic counts and plot their scores.

    Args:
        text (pandas.Series): Token lists per document (``.tolist()`` is called on it)
        corpus (list): Bag-of-words corpus
        dictionary: Dictionary the corpus was built with
    """
    topic_counts = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30]
    documents = text.tolist()

    perplexities, coherence_scores = [], []
    for num_topics in topic_counts:
        lda = models.ldamodel.LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=20,
            random_state=RANDOM_STATE,
        )
        perplexity, coherence = eval_model(lda, documents, dictionary, corpus)
        perplexities.append(perplexity)
        coherence_scores.append(coherence)

    plot_evals(perplexities, coherence_scores, topic_counts)

In [None]:
# print("Comments eval:")
# full_model_test_loop(preprocessed_comments, corpus_comments, dictionary_comments)
# print("Post Content eval:")
# full_model_test_loop(preprocessed_post_content, corpus_post_content, dictionary_post_content)
# print("Post Title eval:")
# full_model_test_loop(preprocessed_title , corpus_title, dictionary_title)

# 22 minutes to run all

## Topic Modeling

In [None]:
# One LDA model per text field; each field uses its own number of topics.
ldamodel_comments = models.ldamodel.LdaModel(
    corpus_comments,
    num_topics=10,
    id2word=dictionary_comments,
    passes=20,
    random_state=RANDOM_STATE,
)
ldamodel_post_content = models.ldamodel.LdaModel(
    corpus_post_content,
    num_topics=5,
    id2word=dictionary_post_content,
    passes=20,
    random_state=RANDOM_STATE,
)
ldamodel_title = models.ldamodel.LdaModel(
    corpus_title,
    num_topics=20,
    id2word=dictionary_title,
    passes=20,
    random_state=RANDOM_STATE,
)

# 6 min 30 sec

In [None]:
# Call eval_model to get the perplexity and coherence of the model for each topic count k and present them in a table
def evaluate_models_and_present_table(text, corpus, dictionary, ks):
    """
    Train one LDA model per topic count and tabulate its evaluation scores.

    Args:
        text (list[list[str]]): Tokenized documents
        corpus (list): Bag-of-words corpus
        dictionary: Dictionary the corpus was built with
        ks (iterable of int): Topic counts to evaluate

    Returns:
        pandas.DataFrame: One row per topic count with the columns
            "Topics", "Perplexity" and "Coherence"; the table is also printed
    """
    column_names = ("Topics", "Perplexity", "Coherence")
    rows = []

    for num_topics in ks:
        print(f"Evaluating model with {num_topics} topics...")
        lda = models.ldamodel.LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=20,
            random_state=RANDOM_STATE
        )
        scores = eval_model(lda, text, dictionary, corpus)
        rows.append(dict(zip(column_names, (num_topics, *scores))))

    results_df = pd.DataFrame(rows)
    print(results_df)
    return results_df

# Example usage
# NOTE: a fresh 20-pass LDA model is trained for every entry of `ks`, so this cell is slow.
ks = [2, 5, 10, 15, 20]  # List of topic numbers to evaluate
results_table = evaluate_models_and_present_table(preprocessed_comments.tolist(), corpus_comments, dictionary_comments, ks)

In [None]:
print("Comment topics:")
# Last expression of the cell, so the notebook displays the returned topics
ldamodel_comments.show_topics()

In [None]:
print("Post content topics:")
# Last expression of the cell, so the notebook displays the returned topics
ldamodel_post_content.show_topics()

In [None]:
print("Title topics:")
# show_topics() returns only 10 topics by default, but the title model is trained
# with 20 topics; num_topics=-1 asks for all of them.
ldamodel_title.show_topics(num_topics=-1)

## Sentiment Analysis

In [None]:
import seaborn as sns
# Helpers that plot a histogram of the pos/neu/neg scores and the distribution of the compound scores
def plot_scores(scores):
    """
    Draw side-by-side histograms of the neg/pos/neu sentiment scores.

    Args:
        scores (pandas.DataFrame): Frame with "neg", "pos" and "neu" columns

    Returns:
        matplotlib Axes the histogram was drawn on
    """
    # Long form: one row per (score type, value) so seaborn can colour by type
    long_form = scores.melt(value_vars=["neg", "pos", "neu"])
    ax = sns.histplot(
        long_form,
        x="value",
        hue="variable",
        multiple="dodge",
        stat="percent",
        common_bins=True,
        common_norm=True,
        bins=20,
    )
    ax.set_ylim(0, 50)
    return ax

def plot_sentiment_distribution(scores, title):
    """
    Show the neg/pos/neu score histogram in its own labelled figure.

    Args:
        scores (pandas.DataFrame): Frame with "neg", "pos" and "neu" columns
        title (str): Text appended to the figure title
    """
    plt.figure(figsize=(12, 6))
    ax = plot_scores(scores)
    ax.set_title('Distribution of Sentiment Scores for ' + title)
    ax.set_xlabel('Score')
    ax.set_ylabel('Percent')
    plt.show()
def plot_compound_distribution(scores, title):
    """
    Show a histogram of compound sentiment scores, as percentages.

    Args:
        scores (pandas.DataFrame or pandas.Series): Either a frame with a
            "compound" column or the compound scores themselves
        title (str): Text appended to the figure title
    """
    # Accept the bare Series as well: indexing a Series with "compound" would raise KeyError
    compound = scores if isinstance(scores, pd.Series) else scores["compound"]

    plt.figure(figsize=(12, 6))
    # stat="percent" so the bars match the 'Percent' label and the 0-50 axis range
    ax = sns.histplot(compound, bins=20, stat="percent")
    ax.set_ylim(0, 50)
    plt.title('Distribution of Compound Sentiment Scores for ' + title)
    plt.xlabel('Compound Score')
    plt.ylabel('Percent')
    plt.show()

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

# The analyzer loads the VADER lexicon, a separate NLTK resource, when it is
# constructed; fetch it first so the notebook also runs on a fresh environment.
nltk.download("vader_lexicon")
sid = SentimentIntensityAnalyzer()


def find_all_sentiments(document):
    """
    Average the VADER polarity scores over the sentences of one document.

    Args:
        document (str): Raw document text

    Returns:
        dict: Mean "pos", "neu", "neg" and "compound" scores across the
            document's sentences; all zeros when no sentence is found
    """
    score_names = ("pos", "neu", "neg", "compound")
    totals = dict.fromkeys(score_names, 0.0)

    sentences = sent_tokenize(document)
    if not sentences:
        # Nothing to score (e.g. empty text); avoids dividing by zero below
        return totals

    for sentence in sentences:
        sentence_scores = sid.polarity_scores(sentence)
        for name in score_names:
            totals[name] += sentence_scores[name]

    num_sent = len(sentences)
    return {name: total / num_sent for name, total in totals.items()}


    
def find_all_topic_sentiments(corp, documents, model):
    """
    Score every document's sentiment and average the scores per dominant topic.

    Args:
        corp (list): Bag-of-words representation of each document
        documents (iterable of str): Raw texts, in the same order as ``corp``
        model: Trained LDA model providing ``get_document_topics``

    Returns:
        tuple: (topic_sentiments, document_sentiments) -- the mean
            pos/neu/neg/compound scores per dominant topic, and one row per
            document with its scores and text

    Raises:
        ValueError: If ``corp`` and ``documents`` differ in length
    """
    score_columns = ["pos", "neu", "neg", "compound"]

    # Keep plain values only. A pandas Series would be aligned on its own index
    # when assigned to the freshly built frame below, misplacing or blanking texts.
    texts = list(documents)
    bags = list(corp)
    if len(bags) != len(texts):
        raise ValueError(
            f"corp has {len(bags)} entries but documents has {len(texts)}"
        )

    dominant_topics = []
    document_scores = []
    for bag, document in zip(bags, texts):  # Link each bag with its full document
        topics = model.get_document_topics(bag)
        dominant_topic = max(topics, key=lambda x: x[1])[0]
        dominant_topics.append(dominant_topic)
        document_scores.append(find_all_sentiments(document))

    # Explicit columns keep the frame usable when there are no documents at all
    document_scores_df = pd.DataFrame(document_scores, columns=score_columns)
    document_scores_df["text"] = texts
    document_scores_df["topic"] = dominant_topics  # Also add the dominant topic

    topic_sentiments = document_scores_df.groupby("topic")[score_columns].mean()

    return topic_sentiments, document_scores_df.drop("topic", axis=1)

In [None]:
# Raw comment texts in the same row order as corpus_comments. preprocessed_comments
# carries the index labels of the rows that survived cleaning, so selecting by that
# index avoids re-running language detection on every comment and guarantees the
# texts line up with the bags.
clean_comments = df_recent.loc[preprocessed_comments.index, "self_text"]
assert len(clean_comments) == len(corpus_comments), "texts and corpus are out of sync"
comment_topic_sentiments, comment_document_sentiments = find_all_topic_sentiments(corpus_comments, clean_comments, ldamodel_comments)
# 3 minutes

In [None]:
# Per-comment sentiment scores together with the comment text
comment_document_sentiments

In [None]:
# Plot the sentiment distributions for all comments
plot_sentiment_distribution(comment_document_sentiments[['neg', 'pos', 'neu']], 'All Comments')
# plot_compound_distribution looks up the "compound" column itself, so pass a frame
plot_compound_distribution(comment_document_sentiments[['compound']], 'All Comments')

In [None]:
# Reuse the Series computed earlier instead of re-running language detection on every comment
clean_comments

In [None]:
# Mean sentiment scores of the comments, grouped by their dominant topic
comment_topic_sentiments