In [None]:
%load_ext autoreload 
%autoreload 2 
# all imported modules will be automatically reloaded

In [None]:
# Import packages
from models.reddit_scraper import RedditScraper
from config.settings import USER_AGENT
from utils.analysis import *
from utils.network_builder import *
from collections import Counter
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3_unweighted, venn3
import numpy as np


# Get post data

In [None]:
# Scrape the subreddits under study and convert each batch of posts to a DataFrame
scraper = RedditScraper(USER_AGENT)  # scraper configured with the project's user agent

subs_of_interest = ["AskMen", "AskWomen", "TooAfraidToAsk"]  # order fixes the positions in `dfs`

dfs = []  # one posts DataFrame per subreddit, in the order of `subs_of_interest`

for sub in subs_of_interest:
    # limit=100 and cache=True are forwarded to the scraper as-is
    posts = scraper.get_subreddit_posts(sub, limit=100, cache=True)
    dfs.append(create_posts_dataframe(posts))

# Give each frame an explicit name (positions follow `subs_of_interest`)
AskMen_df, AskWomen_df, TooAfraidToAsk_df = dfs

subs_of_interest_dfs = [AskMen_df, AskWomen_df, TooAfraidToAsk_df]

# Author Analysis

**Hypothesis:** The overlap in authors between AskMen and TooAfraidToAsk will be larger than the overlap between AskMen and AskWomen. Since the majority of Reddit users are men, a more general subreddit like TooAfraidToAsk will share more authors with AskMen than AskWomen does.


In [None]:
## Filter DataFrames
def filter_df(df:pd.DataFrame) -> pd.DataFrame:
    """Return the rows of `df` whose "author" is neither "[deleted]" nor "AutoModerator".

    The input frame is not modified; a filtered frame is returned.
    """
    excluded_authors = ["[deleted]", "AutoModerator"]  # removed accounts and the moderation bot
    keep_row = ~df["author"].isin(excluded_authors)
    return df[keep_row]

# Run the author filter over all three subreddit frames and rebind the names
AskMen_df, AskWomen_df, TooAfraidToAsk_df = (
    filter_df(frame) for frame in (AskMen_df, AskWomen_df, TooAfraidToAsk_df)
)


## Get unique authors

def get_unique_authors(df:pd.DataFrame) -> set:
    """Collect the distinct values of the "author" column of `df` into a set."""
    unique_authors = {author for author in df["author"]}
    return unique_authors

# Build the set of distinct authors for each (already filtered) subreddit frame
author_men, author_women, author_tooafraidtoask = (
    get_unique_authors(frame) for frame in (AskMen_df, AskWomen_df, TooAfraidToAsk_df)
)

# Report how many distinct authors each subreddit has
print(f"""
    Number of unique authors in AskMen: {len(author_men)}
    Number of unique authors in AskWomen: {len(author_women)}
    Number of unique authors in TooAfraidToAsk: {len(author_tooafraidtoask)}
    The number of unqiue authors in all three Subreddits is similiar.
""")

## Jaccard Similarity
def jaccard_similarity(set1, set2):
    """
    Jaccard similarity of two sets: |set1 ∩ set2| / |set1 ∪ set2|.

    Inputs:
    - set1: a set
    - set2: a set (or any iterable accepted by set.union / set.intersection)

    Returns a float in [0, 1]. When both inputs are empty the ratio is
    undefined (0 / 0); in that case 0.0 is returned, i.e. "no shared
    elements", instead of raising ZeroDivisionError.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Both sets are empty: nothing is shared, avoid dividing by zero
        return 0.0
    return len(set1.intersection(set2)) / union_size

# Pairwise Jaccard similarity of the three author sets, printed to four decimals
print(f""" 
    Jaccard Similarity:
    Men & Women: {jaccard_similarity(author_men, author_women):.04f}
    Women & TooAfraidToAsk: {jaccard_similarity(author_women, author_tooafraidtoask):.04f}
    Men & TooAfraidToAsk: {jaccard_similarity(author_tooafraidtoask, author_men):.04f}""")

# Venn diagram of the three author sets; the set order matches the label order
v=venn3(subsets = (author_women, author_tooafraidtoask, author_men), set_labels = ("AskWomen", "TooAfraidToAsk", "AskMen"))
plt.show()

# Written interpretation of the results above (fixed text, not computed from the data)
# NOTE(review): the text mentions the last 1000 posts, while the scraping cell requests
# limit=100 per subreddit; confirm which figure is intended.
print("""
The Jaccard Similarity shows that the distance between each Subreddit when considering the authors of each SubReddit is similiar but very small since there is barely any overlap between the authors of the Subreddits.
The reason why the overlap is small may be because the Reddit API only returns the last 1000 posts and hence for AskMen and TooAfraidToAsk, we only have data from the last 2 weeks (for AskWomen we can go back a bit further).
People in these "Question" Subreddits may post less frequently and thus we don't observe a big overlap between the authors of the Subreddits.
""")

In [None]:
def recurring_authors(df:pd.DataFrame, sort:str) -> float:
    """
    Calculate the fraction or the number of recurring authors in a dataframe.

    A recurring author is one whose name appears more than once in the
    "author" column of `df`.

    Inputs:
    - df: DataFrame with an "author" column (one row per post)
    - sort: "fraction" -> recurring authors / distinct authors (0.0 if `df` has no authors)
            "count"    -> number of recurring authors

    Raises:
    - ValueError: if `sort` is neither "fraction" nor "count". Previously an
      error string was returned, which only failed later and confusingly when
      the caller formatted it as a number.
    """
    if sort not in ("fraction", "count"):
        raise ValueError("Invalid sort parameter. Choose 'fraction' or 'count'")

    author_counts = Counter(df["author"])
    recurring_authors_count = sum(1 for count in author_counts.values() if count > 1)

    if sort == "count":
        return recurring_authors_count
    if not author_counts:
        # No authors at all: the fraction is undefined, report 0.0 instead of dividing by zero
        return 0.0
    return recurring_authors_count / len(author_counts)

# Share of authors with more than one post, per subreddit, formatted as a percentage.
# The closing sentence is fixed interpretation text, not derived from the numbers.
print(f"""
    Fraction of recurring authors:
    AskMen:{recurring_authors(AskMen_df, sort="fraction"):.02%}
    AskWomen:{recurring_authors(AskWomen_df, sort="fraction"):.02%}
    TooAfraidToAsk:{recurring_authors(TooAfraidToAsk_df, sort="fraction"):.02%}

    The fraction of reoccuring authors is low in each SubReddit which supports the hypothesis that the reason we don't oberserve big overlaps may be driven by the fact that people post less frequently in "Question" Subreddits.""")

In [None]:
# Histogram of posts-per-author for recurring authors (more than one post),
# one panel per subreddit
fig, ax = plt.subplots(3, 1, figsize=(10, 15))

# For each subreddit: the post counts of the authors who posted more than once
list_counts = []
for sub_df in [AskMen_df, AskWomen_df, TooAfraidToAsk_df]:
    author_counts = Counter(sub_df["author"])
    recurring_counts = [count for count in author_counts.values() if count > 1]
    list_counts.append(recurring_counts)

for index, counts in enumerate(list_counts):
    panel = ax[index]
    panel.set_xlabel('Number of Posts by Author')
    panel.set_ylabel('Count of Authors')
    panel.set_title(f'Distribution of Recurring Authors by Number of Posts ({subs_of_interest[index]})')
    if not counts:
        # No author posted more than once: max() would raise on the empty list,
        # so label the empty panel instead of drawing a histogram
        panel.text(0.5, 0.5, 'No recurring authors', ha='center', va='center', transform=panel.transAxes)
        continue
    most_posts = max(counts)
    # Bin edges at k - 0.5 so that every integer post count sits in the middle of its own bar
    count_bins = np.arange(2, most_posts + 2) - 0.5
    panel.hist(counts, bins=count_bins, edgecolor='black')
    panel.set_xticks(np.arange(2, most_posts + 1))
plt.subplots_adjust(hspace=0.5)
plt.tight_layout()
plt.show()

print("These barplots show that even for reoccuring authors, most authors have only posted twice. This supports the hypothesis that the reason we don't observe big overlaps may be driven by the fact that people may not post frequently in \"Question\" Subreddits.")

# Comment Depth Analysis
**Hypothesis:** AskMen and TooAfraidToAsk may, on average, have shorter comment trees due to a focus on direct advice and minimal follow-up engagement, whereas AskWomen encourages more elaborate discussions.

In [None]:
# Fetch posts and their comments for every subreddit that is compared below.
# All three fetches have to run: the comment-depth cell reads askmen_*, askwomen_* and
# tata_*, so leaving two of them commented out fails with NameError on a fresh kernel.
askmen_comments_posts, askmen_comments_df = get_comments_df("AskMen", sort="hot",user_agent=USER_AGENT, limit=100)
# NOTE(review): only 5 posts are requested here versus 100 for the other two subreddits;
# confirm this sample size is intended before comparing the groups.
tata_comments_posts, tata_comments_df = get_comments_df("TooAfraidToAsk", sort="hot",user_agent=USER_AGENT, limit=5)

askwomen_comments_posts, askwomen_comments_df = get_comments_df("AskWomen", sort="hot",user_agent=USER_AGENT, limit=100)

In [None]:
def comment_depth(df,posts,limit=100):
    """
    Calculate the depth of the comment tree for each of the first `limit` posts.

    Inputs:
    - df: DataFrame containing comments; must have a 'post_id' column
    - posts: list of dictionaries containing post information; each needs an 'id' key
    - limit: maximum number of posts (taken from the start of `posts`) to process

    Returns:
    - list with one depth per processed post, in the order of `posts`. The depth is
      the longest path length of the graph built by `usercomment_tree`
      (presumably a directed acyclic graph, as `nx.dag_longest_path_length`
      requires; verify against `usercomment_tree`).

    The previous loop ran over range(0, limit-1), which skipped the last requested
    post and raised IndexError when `posts` held fewer than `limit` entries.
    Slicing handles both cases.
    """
    depth_list = []
    for post in posts[:limit]:
        # Comments that belong to this post only
        post_comments = df[df['post_id'] == post['id']]
        comment_tree = usercomment_tree(post_comments, include_root=True)
        depth_list.append(nx.dag_longest_path_length(comment_tree))
    return depth_list

# Comment-tree depths per post, one list per subreddit
depth_askmen = comment_depth(
    df=askmen_comments_df, posts=askmen_comments_posts, limit=100
)
depth_askwomen = comment_depth(
    df=askwomen_comments_df, posts=askwomen_comments_posts, limit=100
)
depth_tata = comment_depth(
    df=tata_comments_df, posts=tata_comments_posts, limit=5
)

In [None]:
def mean_list(list):
    """Arithmetic mean of the values in `list`: their sum divided by their number."""
    total = sum(list)
    return total / len(list)

# Mean comment-tree depth per subreddit, printed to two decimals
print(f"Mean comment depth for AskMen: {mean_list(depth_askmen):.02f}")
print(f"Mean comment depth for AskWomen: {mean_list(depth_askwomen):.02f}")
print(f"Mean comment depth for TooAfraidToAsk: {mean_list(depth_tata):.02f}")

In [None]:
# One-way ANOVA on the three lists of comment-tree depths.
# H0: the mean comment depth is the same for all three subreddits.
from scipy.stats import f_oneway
f_stat, p_val = f_oneway(depth_askmen,depth_askwomen,depth_tata)

# Decide at the 5% significance level and report the p-value to four decimals
if p_val < 0.05:
    print(f"Reject the null hypothesis that the mean comment depth is the same for all Subreddits (p-value: {p_val:.04f})")
else:
    print(f"Fail to reject the null hypothesis that the mean comment depth is the same for all Subreddits (p-value: {p_val:.04f})")

In [None]:
# Post hoc pairwise comparison (to be read only if the ANOVA rejected H0)
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Flatten the depths into one list and build a parallel list of subreddit labels
all_depths = [*depth_askmen, *depth_askwomen, *depth_tata]
labels = [
    sub_name
    for sub_name, sub_depths in (
        ('AskMen', depth_askmen),
        ('AskWomen', depth_askwomen),
        ('TooAfraidToAsk', depth_tata),
    )
    for _ in sub_depths
]

# Tukey's HSD at the 5% level over the labelled depths
tukey = pairwise_tukeyhsd(endog=all_depths, groups=labels, alpha=0.05)
print(tukey)

In [None]:
# Visualise the distribution of comment-tree depths, one panel per subreddit
fig, ax = plt.subplots(1, 3, figsize=(30, 10))

# (depths, bar colour, legend label) for each panel, left to right
depth_panels = [
    (depth_askmen, 'blue', 'AskMen'),
    (depth_askwomen, 'red', 'AskWomen'),
    (depth_tata, 'green', 'TooAfraidToAsk'),
]

for a, (depths, colour, sub_label) in zip(ax, depth_panels):
    # One bin per depth level. The bin count must be at least 1: using the raw
    # maximum breaks the plot when every post has depth 0.
    n_bins = max(1, max(depths, default=1))
    sns.histplot(depths, bins=n_bins, color=colour, alpha=0.5, label=sub_label, ax=a)
    a.set_xlabel('Comment tree depth')
    a.set_title(f'Comment tree depth per post ({sub_label})')
    a.legend()

plt.tight_layout()
plt.show()

# Clean text data

In [None]:
# Transform the data into headline + body text and label
# NOTE(review): `posts_df` is not created in any cell visible above; confirm it is
# built (e.g. from the per-subreddit frames) before this cell runs on a fresh kernel.
# Missing titles/bodies are replaced by '' first: concatenating with NaN yields NaN,
# which the string conversion would turn into the literal text "nan" and so drop
# the title of every post without a body.
posts_df['text'] = (
    posts_df['title'].fillna('').astype(str)
    + ' '
    + posts_df['selftext'].fillna('').astype(str)
)

# convert text to string
posts_df['text'] = posts_df['text'].astype(str)

# Tokenise the data in two variants: with and without the stop-word option.
# NOTE(review): the options are passed as the strings "True"/"False"; if
# preprocess_text_hyphen tests truthiness instead of comparing with the string,
# "False" would behave like True. Verify against its implementation.

posts_df['tokenised_text_both'] = posts_df['text'].map(lambda x: preprocess_text_hyphen(x, option_stopwords="True", option_lemmatise="True", shortword=2))
posts_df['tokenised_text_lemmatise'] = posts_df['text'].map(lambda x: preprocess_text_hyphen(x, option_stopwords="False", option_lemmatise="True", shortword=2))

# Keeping hyphens so words such as ex-boyfriend or make-up are kept together. Also keeps words such as full-time or self-esteem together

# Turn columns into a list of lists
def column_to_list(df, column1, column2):
    """
    Turn two columns of `df` into a list of [column1 value, column2 value] pairs,
    one pair per row, e.g. [text, label] pairs for a text-processing step.
    """
    selected_columns = df.loc[:, [column1, column2]]
    return selected_columns.to_numpy().tolist()

# [tokenised text, subreddit] pairs for each preprocessing variant
tokenised_text_both = column_to_list(posts_df, 'tokenised_text_both', 'subreddit')
tokenised_text_lemmatise = column_to_list(posts_df, 'tokenised_text_lemmatise', 'subreddit')

# One frame per subreddit
# NOTE(review): the last label is spelled 'TooAfraidtoask' while the scraper was called
# with "TooAfraidToAsk"; confirm which spelling the 'subreddit' column contains.
askwomen_df = posts_df.loc[posts_df['subreddit'] == 'AskWomen']
askmen_df = posts_df.loc[posts_df['subreddit'] == 'AskMen']
tooafraid_df = posts_df.loc[posts_df['subreddit'] == 'TooAfraidtoask']

# Tokenisation variant used for the corpus below
tokenised_list = tokenised_text_both

# Separate each [text, label] pair into two parallel lists
corpus_text = [text for text, _ in tokenised_list]
corpus_label = [label for _, label in tokenised_list]

# Plot post similarities (t-SNE)

In [None]:
# Plot t-SNE similarity of posts (corpus built from the 'tokenised_text_both' variant)
# Token pattern: two or more word characters with at most one hyphen inside, so
# hyphenated words stay one token; min_df=2 is passed to the vectorizer as the
# minimum document frequency.
vectorizer = TfidfVectorizer(min_df=2, token_pattern=r"(?u)\b\w+[-]?\w+\b") # allow for hyphens in words
tfidf_matrix = vectorizer.fit_transform(corpus_text)
feature_names = vectorizer.get_feature_names_out()

# NOTE(review): no random seed is set in this cell; if plot_similarities does not fix
# one internally the t-SNE layout may differ between runs. Confirm.
fig_doc, ax_doc = plot_similarities(tfidf_matrix, corpus_label, "Post Similarities without stop words (t-SNE of post vectors)",label_color=False)

The t-SNE plot shows a lot of overlap between all three Subreddits, indicating that similar topics may be discussed across Subreddits.
However, there may be a topic or group of topics that is discussed only in AskMen and TooAfraidToAsk: one cluster on the left side contains AskMen and TooAfraidToAsk posts but few AskWomen posts. This is in line with our hypothesis that AskMen and TooAfraidToAsk are more similar to each other than to AskWomen.

In [None]:
# Tokenisation variant used here: 'tokenised_text_lemmatise' (stop-word option "False")
tokenised_list = tokenised_text_lemmatise

# Split the [text, label] pairs into two parallel lists
corpus_text = [doc[0] for doc in tokenised_list]
corpus_label = [doc[1] for doc in tokenised_list]

# Plot t-SNE similarity of posts, using the same vectorizer settings as the previous plot
vectorizer = TfidfVectorizer(min_df=2, token_pattern=r"(?u)\b\w+[-]?\w+\b") # allow for hyphens in words
tfidf_matrix = vectorizer.fit_transform(corpus_text)
feature_names = vectorizer.get_feature_names_out()

# NOTE(review): no random seed is set in this cell; if plot_similarities does not fix
# one internally the t-SNE layout may differ between runs. Confirm.
fig_doc, ax_doc = plot_similarities(tfidf_matrix, corpus_label, "Document Similarities with stop words (t-SNE of document vectors)",label_color=False)

# Better clustering when not removing stop words -> subreddits use different stop words

When stop words are not removed, the clustering is more distinct and in line with our initial hypothesis: we can now differentiate between an AskWomen cluster and a mixed AskMen and TooAfraidToAsk cluster. This supports our hypothesis that stop words may help to differentiate between subreddits because they are used in different ways.

We then carry out a chi-squared test to examine whether the relative frequency of "and" differs statistically across the three subreddits, and find that AskWomen uses "and" relatively less often than AskMen and TooAfraidToAsk. This is against our hypothesis and may be driven by men writing more informally than women and thus using "and" more often.

# Chi2 and post-hoc chi2 for word frequency

In [None]:
from scipy.stats import chi2_contingency

# Quantify difference in the use of "and"

# Per post: count of "and", total word count, and the share of "and" in percent.
# extract_term is expected to add the 'count_and_' column that is read below
# (presumably named from the term plus the empty suffix; verify against extract_term).
posts_df = extract_term("and", posts_df["tokenised_text_lemmatise"], posts_df, "")
posts_df["count_text"] = posts_df["tokenised_text_lemmatise"].str.split().str.len()
posts_df["fraction_and_post"] = (posts_df['count_and_'] / posts_df['count_text']) * 100

# Sum counts based on subreddit
subreddit_counts = posts_df.groupby('subreddit')[['count_and_', 'count_text']].sum()

# Calculate fraction of counts in each subreddit as percentage of overall word count
subreddit_counts['fraction_and'] = (subreddit_counts['count_and_'] / subreddit_counts['count_text']) * 100
subreddit_counts['fraction_and'] = subreddit_counts['fraction_and'].round(2)

# Human-readable column names (reassigned rather than renamed in place)
subreddit_counts = subreddit_counts.rename(columns={'count_and_': 'Count of "and"', 'count_text': 'Word count', 'fraction_and': 'Fraction of "and" (%)'})

print(subreddit_counts[['Count of "and"', 'Word count', 'Fraction of "and" (%)']])

# Create a contingency table: rows = subreddits, columns = ("and", every other word).
# The cells must not overlap, so the second column is the word count minus the "and"
# count; using the full word count would count every "and" twice.
and_counts = subreddit_counts['Count of "and"']
other_counts = subreddit_counts['Word count'] - and_counts
contingency_table = np.column_stack([and_counts, other_counts])

# Perform the chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-square statistic: {chi2:.2f}")
# Four significant digits: two decimals would show every small p-value as 0.00
print(f"P-value: {p:.4g}")

# Box plot of the per-post count of "and" for each subreddit

fig, ax1 = plt.subplots(figsize=(12, 10))

sns.boxplot(x='subreddit', y='count_and_', data=posts_df, ax=ax1)
ax1.set_ylabel('Count of "and"')
plt.show()

# Post-hoc pairwise comparisons chi2 test

# Subset the per-subreddit totals for each pair of subreddits (rows selected by index label)
# NOTE(review): the label 'TooAfraidtoask' differs in case from the "TooAfraidToAsk"
# name passed to the scraper; confirm it matches the 'subreddit' values, otherwise
# .loc raises KeyError.
askmen_vs_askwomen = subreddit_counts.loc[['AskMen', 'AskWomen']]
askmen_vs_tooafraid = subreddit_counts.loc[['AskMen', 'TooAfraidtoask']]
askwomen_vs_tooafraid = subreddit_counts.loc[['TooAfraidtoask', 'AskWomen']]

# Function to create a contingency table and perform chi2 test
def perform_chi_square(df):
    """
    Chi-square test on the "and" counts of the subreddits in `df`.

    Inputs:
    - df: DataFrame with one row per subreddit whose first two columns are the
      count of "and" and the total word count, optionally followed by the
      percentage column (named 'fraction_and' or 'Fraction of "and" (%)').

    Returns:
    - (chi2, p, dof, expected) as returned by scipy's chi2_contingency.

    The percentage column is dropped under either name: the notebook renames it
    before this function is called, so dropping only 'fraction_and' raised
    KeyError. The table columns are ("and", every other word) so that no word
    is counted in two cells.
    """
    counts = df.drop(columns=['fraction_and', 'Fraction of "and" (%)'], errors='ignore')
    and_counts = counts.iloc[:, 0]
    other_counts = counts.iloc[:, 1] - and_counts  # total words minus "and"
    contingency_table = np.column_stack([and_counts, other_counts])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    return chi2, p, dof, expected

# Perform chi-square tests for each pair
chi2_am_aw, p_am_aw, _, _ = perform_chi_square(askmen_vs_askwomen)
chi2_am_ta, p_am_ta, _, _ = perform_chi_square(askmen_vs_tooafraid)
chi2_aw_ta, p_aw_ta, _, _ = perform_chi_square(askwomen_vs_tooafraid)

# Set the Bonferroni-corrected significance level because we are doing 3 post-hoc tests
alpha = 0.05
corrected_alpha = alpha / 3
# Print the results. p-values use four significant digits: with two decimals every
# small p-value printed as 0.00.
print(f"AskMen vs AskWomen: Chi-square = {chi2_am_aw:.2f}, p-value = {p_am_aw:.4g}")
print(f"AskMen vs TooAfraidToAsk: Chi-square = {chi2_am_ta:.2f}, p-value = {p_am_ta:.4g}")
print(f"AskWomen vs TooAfraidToAsk: Chi-square = {chi2_aw_ta:.2f}, p-value = {p_aw_ta:.4g}")

# Check if p-values are below the corrected alpha threshold.
# Four decimals: two decimals printed the 0.0167 threshold as 0.02.
print(f"\nCorrected alpha threshold: {corrected_alpha:.4f}")
print(f"AskMen vs AskWomen statistically different? {p_am_aw < corrected_alpha}")
print(f"AskMen vs TooAfraidToAsk statistically different? {p_am_ta < corrected_alpha}")
print(f"AskWomen vs TooAfraidToAsk  statistically different? {p_aw_ta < corrected_alpha}")
