In [None]:
# Plot terms over time

def extract_term(term:str,string:pd.Series,df:pd.DataFrame,name:str)->pd.DataFrame:
    """
    Count whole-word occurrences of a term in a text column and add them to a dataframe.
    Needed for freq_top_terms_ts function.

    Parameters:
    - term: literal word to count (regex metacharacters in it are escaped)
    - string: column with text of pd.DataFrame (anything pd.Series accepts)
    - df: dataframe that receives the new column; it is modified in place
    - name: suffix of the new column, which is named count_{term}_{name}

    Returns:
    - the same df object that was passed in, with the count column added
    """
    import re  # local import: this cell may run before the notebook's import cell

    string_series=pd.Series(string)
    # \b on both sides makes sure that we only count the word and not part of a word;
    # re.escape keeps characters such as "." or "+" in the term from acting as regex syntax
    pattern=rf"\b{re.escape(term)}\b"
    df[f"count_{term}_{name}"]=string_series.str.count(pattern)
    return df

def freq_top_terms_ts(df: pd.DataFrame, time_column: str, title_column: str, text_column: str, top_words: dict, resampling: str) -> pd.DataFrame:
    """
    Compute the relative frequency over time of the top terms of a subreddit.
    If Reddit data stored as json need to convert to DataFrame first with create_posts_dataframe function.
    The result is the input of plot_freq_top_terms_ts.

    Parameters:
    - df: posts dataframe; it is not modified (a copy of the three columns is used)
    - time_column: column holding the post time as epoch seconds
    - title_column: column holding the post title
    - text_column: column holding the post body
    - top_words: terms to track (iterated, so the dict keys are used)
    - resampling: pandas offset alias passed to DataFrame.resample

    Returns:
    - dataframe indexed by resampled time with the summed columns total_title,
      total_text, total_words, count_{word}_total and the ratio frequency_{word}
      (count_{word}_total / total_words) for every word in top_words
    """
    # Copy first so that the datetime conversion below does not alter the caller's dataframe
    df = df[[time_column, title_column, text_column]].copy() # keep only relevant columns
    df[time_column] = pd.to_datetime(df[time_column], unit="s") # convert to datetime

    # NOTE(review): preprocess_text is defined outside this cell; the code below assumes
    # it returns a string for every row - confirm
    df[f"{text_column}_processed"] = df[text_column].apply(preprocess_text) # preprocess text
    df[f"{title_column}_processed"] = df[title_column].apply(preprocess_text) # preprocess title

    # Count each word in title and body and keep only the combined count
    for word in top_words:
        df = extract_term(word, df[f"{title_column}_processed"], df, "title")
        df = extract_term(word, df[f"{text_column}_processed"], df, "text")
        df[f"count_{word}_total"] = df[f"count_{word}_title"] + df[f"count_{word}_text"]
        df = df.drop(columns=[f"count_{word}_title", f"count_{word}_text"])

    # Calculate total word count for each post
    df["total_title"] = df[f"{title_column}_processed"].str.split().str.len()
    df["total_text"] = df[f"{text_column}_processed"].str.split().str.len()
    df["total_words"] = df["total_title"] + df["total_text"]

    # Group by time and sum counts. Only the numeric columns are summed: summing the
    # text columns would concatenate every post of a period into one string.
    summed_columns = ["total_title", "total_text", "total_words"]
    summed_columns += [f"count_{word}_total" for word in top_words]
    df_grouped = df[[time_column] + summed_columns].resample(resampling, on=time_column).sum()

    # Calculate frequency for each word in top_words
    for word in top_words:
        df_grouped[f"frequency_{word}"] = df_grouped[f"count_{word}_total"] / df_grouped["total_words"]

    return df_grouped


def plot_freq_top_terms_ts(df_grouped:pd.DataFrame, top_words:dict, title:str) -> None:
    """
    Draw one time-series line per top term and show the figure.
    Expects the frequency_{word} columns produced by the freq_top_terms_ts function.

    Parameters:
    - df_grouped: dataframe with a frequency_{word} column for every word in top_words
    - top_words: terms to draw (iterated, so the dict keys are used)
    - title: title of the plot
    """
    fig, ax = plt.subplots(figsize=(15, 10))

    # One line per term, labelled with the term itself
    for term in top_words:
        term_frequency = df_grouped[f"frequency_{term}"]
        term_frequency.plot(ax=ax, label=term)

    # Titles, axis labels and percent formatting (a value of 1.0 is shown as 100%)
    ax.set_title(title)
    ax.set_ylabel("Frequency")
    ax.set_xlabel("Date")
    ax.yaxis.set_major_formatter(PercentFormatter(1.0))

    # With fewer than 10 periods, put a dated tick on every period
    periods = df_grouped.index
    if len(periods) < 10:
        ax.set_xticks(periods)
        ax.set_xticklabels(periods.strftime('%Y-%m-%d'))

    plt.legend(fontsize=14)
    plt.show()

NameError: name 'pd' is not defined

Hypothesis 1: There will be a bigger overlap in authors that post in AskMen and TooAfraidToAsk than in AskMen and AskWomen since the majority of Reddit users are men and thus a more general subreddit like TooAfraidToAsk will have a bigger overlap with AskMen than AskWomen.

In [None]:
# Import packages
from models.reddit_scraper import RedditScraper
from config.settings import USER_AGENT
from utils.analysis import *
from collections import Counter
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3_unweighted
import numpy as np

ModuleNotFoundError: No module named 'models'

In [None]:
# Get Reddit data
scraper = RedditScraper(USER_AGENT) # one scraper object, reused for every subreddit

# Subreddits to analyze; the DataFrames below are unpacked in this order
subs_of_interest = ["AskMen", "AskWomen","TooAfraidToAsk"]

dfs = [] # one posts DataFrame per subreddit, same order as subs_of_interest

for sub in subs_of_interest:
    # request with limit=1000 and caching enabled, then convert the posts to a DataFrame
    posts = scraper.get_subreddit_posts(sub, limit=1000,cache=True)
    dfs.append(create_posts_dataframe(posts))

AskMen_df, AskWomen_df, TooAfraidToAsk_df = dfs

subs_of_interest_dfs = [AskMen_df, AskWomen_df, TooAfraidToAsk_df]

In [None]:
## Filter DataFrames
def filter_df(df:pd.DataFrame) -> pd.DataFrame:
    """Return the rows of df whose author is neither "[deleted]" nor "AutoModerator"."""
    excluded_authors = ["[deleted]", "AutoModerator"] # deleted authors & bots
    keep_row = ~df["author"].isin(excluded_authors)
    return df[keep_row]

# Apply the author filter to the posts of each subreddit
AskMen_df, AskWomen_df, TooAfraidToAsk_df = (
    filter_df(frame) for frame in (AskMen_df, AskWomen_df, TooAfraidToAsk_df)
)


## Get unique authors

def get_unique_authors(df:pd.DataFrame) -> set:
    """Return the distinct values of the "author" column as a set."""
    return {author for author in df["author"]}

# Unique authors per subreddit
author_men = get_unique_authors(AskMen_df)
author_women = get_unique_authors(AskWomen_df)
author_tooafraidtoask = get_unique_authors(TooAfraidToAsk_df)


# The third count comes from TooAfraidToAsk_df, so it is labelled TooAfraidToAsk
print(f"""
    Number of unique authors in AskMen: {len(author_men)}
    Number of unique authors in AskWomen: {len(author_women)}
    Number of unique authors in TooAfraidToAsk: {len(author_tooafraidtoask)}
    The number of unique authors in all three Subreddits is similar.
""")

## Jacard Similarity
def jaccard_similarity(set1, set2):
    """
    Jaccard similarity of two sets: size of the intersection divided by size of the union.

    Parameters:
    - set1, set2: sets to compare

    Returns:
    - float between 0 and 1; 0.0 when both sets are empty (the ratio is undefined
      there, and 0.0 avoids a ZeroDivisionError)
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0.0
    return len(set1.intersection(set2)) / union_size

# Pairwise author overlap between the three subreddits
similarity_men_women = jaccard_similarity(author_men, author_women)
similarity_women_tooafraid = jaccard_similarity(author_women, author_tooafraidtoask)
similarity_men_tooafraid = jaccard_similarity(author_tooafraidtoask, author_men)

print(f"""
    Jaccard Similarity:
    Men & Women: {similarity_men_women:.04f}
    Women & TooAfraidToAsk: {similarity_women_tooafraid:.04f}
    Men & TooAfraidToAsk: {similarity_men_tooafraid:.04f}""")

# Unweighted three-set Venn diagram of the author sets; labels follow the order of the sets
v=venn3_unweighted(
    subsets = (author_women, author_tooafraidtoask, author_men),
    set_labels = ("AskWomen", "TooAfraidToAsk", "AskMen"),
)
plt.show()

# Interpretation of the numbers and the diagram above
print("""
The Jaccard Similarity shows that the distance between each Subreddit when considering the authors of each SubReddit is similiar but very small since there is barely any overlap between the authors of the Subreddits.
However, in line with hypothesis 1, the overlap between AskMen and TooAfraidToAsk is slightly bigger than the overlap between AskWomen and TooAfraidToAsk.
The reason why the overlap is small may be because the Reddit API only returns the last 1000 posts and hence for AskMen and TooAfraidToAsk, we only have data from the last 2 weeks (for AskWomen we can go back a bit further).
People in these "Question" Subreddits may post less frequently and thus we don't observe a big overlap between the authors of the Subreddits.
""")

In [None]:
def fraction_recurring_authors(df:pd.DataFrame) -> float:
    """
    Calculate the fraction of recurring authors in a Subreddit.

    An author is recurring when they appear more than once in the "author" column.

    Parameters:
    - df: posts dataframe with an "author" column

    Returns:
    - number of recurring authors divided by the number of distinct authors;
      0.0 when df has no rows (avoids a ZeroDivisionError)
    """
    author_counts = Counter(df["author"])
    if not author_counts:
        return 0.0
    recurring_authors_count = sum(1 for count in author_counts.values() if count > 1)
    return recurring_authors_count / len(author_counts)

# Share of authors with more than one post, per subreddit
share_men = fraction_recurring_authors(AskMen_df)
share_women = fraction_recurring_authors(AskWomen_df)
share_tooafraid = fraction_recurring_authors(TooAfraidToAsk_df)

print(f"""
    AskMen:{share_men:.02%}
    AskWomen:{share_women:.02%}
    TooAfraidToAsk:{share_tooafraid:.02%}

    The fraction of reoccuring authors is low in each SubReddit which supports the hypothesis that the reason we don't oberserve big overlaps may be driven by the fact that people post less frequently in "Question" Subreddits.""")

In [None]:
def count_recurring_authors(df:pd.DataFrame) -> list:
    """
    Return the number of posts of every recurring author in a Subreddit.

    An author is recurring when they appear more than once in the "author" column.

    Parameters:
    - df: posts dataframe with an "author" column

    Returns:
    - list with one post count (always > 1) per recurring author, in order of the
      author's first appearance; empty when there are no recurring authors
    """
    author_counts = Counter(df["author"])
    return [count for count in author_counts.values() if count > 1]

In [None]:
# Histogram of the number of posts per recurring author, one panel per subreddit
fig, ax = plt.subplots(3, 1, figsize=(10, 15))

# Post counts of the recurring authors, in the same order as subs_of_interest
list_counts = [count_recurring_authors(sub_df) for sub_df in [AskMen_df, AskWomen_df, TooAfraidToAsk_df]]

for index, number in enumerate(list_counts):
    ax[index].set_xlabel('Number of Posts by Author')
    ax[index].set_ylabel('Count of Authors')
    ax[index].set_title(f'Distribution of Recurring Authors by Number of Posts ({subs_of_interest[index]})')

    if not number:
        # No recurring authors: max() of an empty list would raise, so leave the panel empty
        continue

    most_posts = max(number)
    # Bin edges at 1.5, 2.5, ... so that every integer post count sits in the middle of its bar
    count_bins = np.arange(2, most_posts + 2) - 0.5
    ax[index].hist(number, bins=count_bins, edgecolor='black')
    ax[index].set_xticks(np.arange(2, most_posts + 1))
plt.subplots_adjust(hspace=0.5)
plt.tight_layout()
plt.show()

print("These barplots show that even for reoccuring authors, most authors have only posted twice. This supports the hypothesis that the reason we don't observe big overlaps may be driven by the fact that people may not post frequently in \"Question\" Subreddits.")

Hypothesis 2: There will be a bigger overlap in vocabulary that is used in AskMen and TooAfraidToAsk than in AskMen and AskWomen since the majority of Reddit users are men and thus a more general subreddit like TooAfraidToAsk will have a bigger overlap with AskMen.

Hypothesis 3: In AskMen, "woman" will be more noteworthy than "man" is in AskWomen.

In [None]:
# Code for t-SNE scatterplot
# Transform the data into headline + body text and label
# NOTE(review): posts_df is not created in the visible part of the notebook, and
# preprocess_text_option is defined further down in this cell - confirm both exist before this runs.
# Missing titles/bodies are replaced by '' first: str + NaN is NaN, which astype(str)
# below would turn into the literal text "nan", losing the title of the post.
posts_df['text'] = posts_df['title'].fillna('') + ' ' + posts_df['selftext'].fillna('')

# convert text to string
posts_df['text'] = posts_df['text'].astype(str)

# Tokenisation used below: stopwords removed, lemmatised, words of length <= 1 dropped
posts_df['tokenised_text_both'] = posts_df['text'].map(lambda x: preprocess_text_option(x, option_stopwords="True", option_lemmatise="True", shortword=1))
#posts_df['tokenised_text_lemmatise'] = posts_df['text'].map(lambda x: preprocess_text_option(x, option_stopwords="False", option_lemmatise="True", shortword=2))
#posts_df['tokenised_text_stopwords'] = posts_df['text'].map(lambda x: preprocess_text_option(x, option_stopwords="True", option_lemmatise="False", shortword=2))
#posts_df['tokenised_text_nothing'] = posts_df['text'].map(lambda x: preprocess_text_option(x, option_stopwords="False", option_lemmatise="False", shortword=2))

# Turn columns into a list of lists
def column_to_list(df, column1, column2):
    """Return the two columns as a list of [column1 value, column2 value] rows."""
    selected = df.loc[:, [column1, column2]]
    return selected.values.tolist()

# [tokenised text, subreddit] pair for every post
tokenised_text_both = column_to_list(posts_df, 'tokenised_text_both', 'subreddit')

# This is the tokenisation used:
tokenised_list = tokenised_text_both

# Split the pairs into the documents and their subreddit labels
corpus_text = [text for text, _label in tokenised_list]
corpus_label = [label for _text, label in tokenised_list]

# TF-IDF matrix over the documents, keeping terms that occur in at least 2 documents
vectorizer = TfidfVectorizer(min_df=2)
tfidf_matrix = vectorizer.fit_transform(corpus_text)
feature_names = vectorizer.get_feature_names_out()

# NOTE(review): plot_similarities is defined further down in this same cell, so on a
# fresh kernel this call only works if another definition is already in scope - confirm.
fig_doc, ax_doc = plot_similarities(
    tfidf_matrix,
    corpus_label,
    "Document Similarities (t-SNE of document vectors)",
    label_color=False,
)

# Functions from Bernie changed:

def plot_similarities(tfidf_matrix, labels,
                      title="term document plot",
                        method='tsne', is_documents=True, label_color=False,
                      top_terms=None, figsize=(12, 8)):
    """
    Create projection visualization of document or term similarities

    Parameters:
    - tfidf_matrix: scipy sparse matrix (rows are documents, columns are terms)
    - labels: list of labels (document texts or terms), one per plotted point;
      also used as the hue of the scatterplot
    - title: plot title
    - method: 'tsne' or 'mds' for dimensionality reduction
    - is_documents: if True, plot documents, else plot terms
    - label_color: if True, write each label next to its point, coloured per
      distinct label; if False, no text annotations are drawn
    - top_terms: if int, only annotate the n points with the highest mean TF-IDF weight
    - figsize: tuple for figure size

    Returns:
    - (fig, ax) of the created plot

    Raises:
    - ValueError if method is neither 'tsne' nor 'mds'
    """

    # Convert to dense array and transpose if visualizing terms
    matrix = tfidf_matrix.toarray()
    if not is_documents:
        matrix = matrix.T

    # Dimensionality reduction method
    if method == 'tsne':
        tsne = TSNE(n_components=2,
                    perplexity=min(30, len(labels)-1),
                    random_state=42,
                    metric='cosine')
        coords = tsne.fit_transform(matrix)
    elif method == 'mds':
        mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
        distances = 1 - cosine_similarity(matrix)
        coords = mds.fit_transform(distances)
    else:
        raise ValueError("Method must be 'tsne' or 'mds'")

    # Create visualization (drawn explicitly on the axis created here)
    fig, ax = plt.subplots(figsize=figsize)
    sns.scatterplot(x=coords[:, 0], y=coords[:, 1], alpha=0.6, hue=labels, ax=ax)

    # Select the points to annotate
    if top_terms and isinstance(top_terms, int):
        # One mean weight per plotted point: average over the terms (axis=1) for
        # documents and over the documents (axis=0) for terms
        mean_axis = 1 if is_documents else 0
        mean_tfidf = np.asarray(tfidf_matrix.mean(axis=mean_axis)).ravel()
        top_indices = mean_tfidf.argsort()[-top_terms:][::-1]
        labels_to_annotate = [labels[i] for i in top_indices]
        coords_to_annotate = coords[top_indices]
    else:
        labels_to_annotate = labels
        coords_to_annotate = coords

    # Add labels (only when requested through label_color)
    if label_color:
        # dict.fromkeys keeps first-seen order, so colours are the same on every run
        unique_labels = list(dict.fromkeys(labels_to_annotate))
        palette = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
        color_map = dict(zip(unique_labels, palette))

        for i, label in enumerate(labels_to_annotate):
            # Split long labels for documents
            text = split_label(label, 20) if is_documents else label
            ax.annotate(text, (coords_to_annotate[i, 0], coords_to_annotate[i, 1]),
                        xytext=(5, 5), textcoords='offset points',
                        fontsize=8 if is_documents else 12, alpha=0.7, color=color_map[label])


    ax.set_title(title)
    ax.grid(True, linestyle='--', alpha=0.3)
    return fig, ax


def preprocess_text_option(text, option_stopwords, option_lemmatise, shortword): # Different to Day 2 because also lemmatises
    """
    Clean and normalize text using NLTK.

    Parameters:
    - text: raw text; a missing value (pd.isna) yields ""
    - option_stopwords: remove English stopwords when True or "True"
    - option_lemmatise: lemmatise the tokens when True or "True"
    - shortword: tokens with len(token) <= shortword are dropped

    Returns:
    - the remaining tokens joined by single spaces
    """
    if pd.isna(text):
        return ""

    # The notebook passes the flags as the strings "True"/"False"; real booleans are
    # accepted as well so that passing True does not silently disable a step
    remove_stopwords = option_stopwords in (True, "True")
    lemmatise = option_lemmatise in (True, "True")

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize based on POS tag
    if lemmatise:
        lemmatizer = WordNetLemmatizer()
        tagged_tokens = pos_tag(tokens)
        # Lemmatizes words either as verbs (tag starts with 'V') or with the lemmatizer's default
        tokens = [lemmatizer.lemmatize(word, 'v') if tag.startswith('V')
                  else lemmatizer.lemmatize(word)
                  for word, tag in tagged_tokens
                  ]

    # Remove short words
    tokens = [token for token in tokens if len(token) > shortword]

    return ' '.join(tokens)







NameError: name 'posts_df' is not defined

In [None]:
# k-means

# Colours used for the cluster assignments in the k-means scatterplot
custom_cmap = ListedColormap(['#1f77b4', '#2ca02c', '#ff7f0e'])

def k_means(minimum_occurrences_word: int, corpus_text: list, corpus_label: list, clusters: int, tokenpattern=r"(?u)\b\w\w+\b"):
    """
    Perform K-means clustering on the TF-IDF vectors of a corpus and plot the result.

    Parameters:
    - minimum_occurrences_word: min_df of the TfidfVectorizer
    - corpus_text: list of documents (strings)
    - corpus_label: list with the true label of every document, same order as corpus_text
    - clusters: number of clusters
    - tokenpattern: token_pattern of the TfidfVectorizer

    Returns:
    - kmeans_pred: for every document, the majority true label of its cluster
    - cluster_label_counts: {cluster: {true label: number of documents}}
    - cluster_to_label: {cluster: majority true label}
    - silhouette_avg: silhouette score of the clustering on the TF-IDF matrix
    """

    vectorizer = TfidfVectorizer(min_df=minimum_occurrences_word, token_pattern=tokenpattern)
    tfidf_matrix = vectorizer.fit_transform(corpus_text)

    # Create and fit the k-means model
    kmeans = KMeans(n_clusters=clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(tfidf_matrix)

    # Reduce dimensionality for plotting. The embedded points are the documents, so the
    # perplexity is capped by the number of documents (it must be smaller than that).
    n_documents = tfidf_matrix.shape[0]
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_documents - 1), metric='cosine')
    tfidf_matrix_2d = tsne.fit_transform(tfidf_matrix.toarray())

    # Silhouette score
    silhouette_avg = silhouette_score(tfidf_matrix, cluster_labels)

    # Plot the results
    plt.figure(figsize=(10, 6))

    # Plot the data points, colored by their cluster assignments
    scatter = plt.scatter(tfidf_matrix_2d[:, 0], tfidf_matrix_2d[:, 1], c=cluster_labels, cmap=custom_cmap, alpha=0.7)

    # The cluster centers are not drawn: t-SNE has no transform() for new points, and
    # fitting it again on the centroids alone would give an unrelated embedding.

    plt.title(f'K-means Clustering (k={clusters})')

    # Add a legend
    legend_labels = [f'Cluster {i}' for i in range(clusters)]
    legend_handles = scatter.legend_elements()[0]  # Get the handles for the legend
    plt.legend(legend_handles, legend_labels, title="Clusters", loc='upper right')

    plt.show()

    # Count the true labels inside each cluster
    cluster_label_counts = {}
    for cluster_label, true_label in zip(cluster_labels, corpus_label):
        label_counts = cluster_label_counts.setdefault(cluster_label, {})
        label_counts[true_label] = label_counts.get(true_label, 0) + 1

    # Map clusters to majority labels (ties go to the label seen first)
    cluster_to_label = {
        cluster: max(counts.items(), key=lambda x: x[1])[0]
        for cluster, counts in cluster_label_counts.items()
    }

    # Convert cluster numbers to predicted labels
    kmeans_pred = [cluster_to_label[label] for label in cluster_labels]

    return kmeans_pred, cluster_label_counts, cluster_to_label, silhouette_avg

# Cluster the corpus into 3 groups; the token pattern also accepts a hyphen inside a word
kmeans_pred, cluster_label_counts, cluster_to_label, silhouette_avg = k_means(
    minimum_occurrences_word=2,
    corpus_text=corpus_text,
    corpus_label=corpus_label,
    clusters=3,
    tokenpattern=r"(?u)\b\w+[-]?\w+\b",
)

# Report: per-label metrics of the majority-label prediction, raw counts, silhouette score
print("\nK-means Clustering Results:", classification_report(corpus_label, kmeans_pred), sep="\n")
print("\n Cluster to Label Counts:", cluster_label_counts, sep="\n")
print("\n Shilouette Score:", silhouette_avg, sep="\n")

In [None]:
# Zoom into woman

def plot_word_associations_tsne(tfidf_matrix, feature_names, target_word='women', n_highlight=5, title=None, zoom_factor=2, jitter_strength=0):
    """
    Plot word associations using t-SNE and highlight words closest to the target word after t-SNE.

    Parameters:
    - tfidf_matrix: sparse TF-IDF matrix; its transpose gives one vector per term
    - feature_names: term of every column of tfidf_matrix (array or list)
    - target_word: term to zoom in on
    - n_highlight: number of nearest terms (in the 2D t-SNE space) to highlight
    - title: optional name added to the plot title
    - zoom_factor: margin added around the highlighted points when setting the axis limits
    - jitter_strength: label positions are shifted by a uniform random amount in
      [-jitter_strength, jitter_strength] on both axes

    Returns:
    - (fig, ax), or (None, None) when target_word is not in feature_names
    """
    feature_names = np.asarray(feature_names)  # lets the == comparison below work for lists too

    # Find the index of the target word before running the expensive t-SNE
    matches = np.flatnonzero(feature_names == target_word)
    if matches.size == 0:
        print(f"'{target_word}' not found in feature names.")
        return None, None
    target_index = int(matches[0])

    # Get vectors for all terms
    term_vectors = tfidf_matrix.T.toarray()

    # Calculate t-SNE for all terms
    tsne = TSNE(n_components=2,
                perplexity=min(30, len(feature_names) / 4),
                random_state=42,
                metric='cosine')
    coords = tsne.fit_transform(term_vectors)

    # Calculate distances from the target word in t-SNE space
    distances = np.linalg.norm(coords - coords[target_index], axis=1)

    # Get the indices of the closest words. The target is removed explicitly: with
    # ties at distance 0 it is not guaranteed to be the first entry of the argsort.
    nearest_first = np.argsort(distances)
    closest_indices = nearest_first[nearest_first != target_index][:n_highlight]

    # Target word plus its nearest neighbours
    highlight_indices = [target_index] + closest_indices.tolist()

    # Plot
    fig, ax = plt.subplots(figsize=(10, 10))

    # Plot all points in light gray
    ax.scatter(coords[:, 0], coords[:, 1], c='lightgray', alpha=0.5, s=30)

    # Highlight top terms including the target word
    ax.scatter(coords[highlight_indices, 0], coords[highlight_indices, 1], c='#4682B4', s=100)

    # Add labels for highlighted terms, randomly shifted to reduce overlap
    for i in highlight_indices:
        jitter_x = np.random.uniform(-jitter_strength, jitter_strength)
        jitter_y = np.random.uniform(-jitter_strength, jitter_strength)
        ax.text(coords[i, 0] + jitter_x, coords[i, 1] + jitter_y, feature_names[i], fontsize=14,
                bbox=dict(facecolor='white', edgecolor='gray', alpha=0.7))

    # Set limits to zoom into the area around the target word
    x_min, x_max = coords[highlight_indices, 0].min() - zoom_factor, coords[highlight_indices, 0].max() + zoom_factor
    y_min, y_max = coords[highlight_indices, 1].min() - zoom_factor, coords[highlight_indices, 1].max() + zoom_factor
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

    if title:
        ax.set_title(f'Word Associations with "{target_word}" in {title} (Closest {n_highlight} Words Highlighted)')
    else:
        ax.set_title(f'Word Associations with "{target_word}" (Closest {n_highlight} Words Highlighted)')

    plt.tight_layout()
    return fig, ax






# With what is the word "woman" associated in the different subreddits?
# NOTE(review): askmen_df, askwomen_df and tooafraid_df are not created in the visible
# part of the notebook - confirm they hold the 'tokenised_text_both' column per subreddit.
askmen_text = askmen_df['tokenised_text_both'].tolist()
askwomen_text = askwomen_df['tokenised_text_both'].tolist()
tooafraid_text = tooafraid_df['tokenised_text_both'].tolist()

# (plot title, documents, label jitter) per subreddit; the same steps run for each one
word_association_runs = [
    ("AskMen", askmen_text, 0.75),
    ("AskWomen", askwomen_text, 0.35),
    ("TooAfraidToAsk", tooafraid_text, 0.6),
]

for subreddit_title, subreddit_text, jitter in word_association_runs:
    vectorizer = TfidfVectorizer(min_df=2, token_pattern=r"(?u)\b\w+[-]?\w+\b") # allow for hyphens in words
    tfidf_matrix = vectorizer.fit_transform(subreddit_text)
    feature_names = vectorizer.get_feature_names_out()

    plot_word_associations_tsne(tfidf_matrix, feature_names, target_word='woman', n_highlight=15,
                                title=subreddit_title, zoom_factor=3, jitter_strength=jitter)




