# **Import libraries**

In [67]:
!pip install -q zipfile36 contractions

In [68]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import re
from collections import Counter
from wordcloud import WordCloud
import nltk

import contractions
import kagglehub
from google.colab import files

In [69]:
# Download and load NLTK stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
# Load spaCy English model
import spacy
nlp = spacy.load('en_core_web_sm')

In [71]:
# Set styles
sns.set_style("whitegrid")
sns.set_context("talk")

# **Importing datasets**

## mental_health

In [72]:
# Fetch the Kaggle dataset and load its CSV, using the first column as the index
path = kagglehub.dataset_download("szegeelim/mental-health")
mental_health = pd.read_csv(f"{path}/Combined Data.csv", index_col=0)

KeyboardInterrupt: 

## suicidal_tweet_detection_dataset

In [None]:
# Fetch the Kaggle dataset and load its CSV
path = kagglehub.dataset_download("aunanya875/suicidal-tweet-detection-dataset")
suicidal_tweet_detection_dataset = pd.read_csv(
    f"{path}/Suicide_Ideation_Dataset(Twitter-based).csv"
)

## reddit_mental_health_data

In [None]:
# Numeric class id -> condition name (ids are the positions in this list)
mental_health_target_map = dict(enumerate([
    "Stress",
    "Depression",
    "Bipolar disorder",
    "Personality disorder",
    "Anxiety",
]))

# Fetch the Kaggle dataset, read the CSV (first column as index) and decode
# the numeric target into its condition name in a single chain
path = kagglehub.dataset_download("neelghoshal/reddit-mental-health-data")
reddit_mental_health_data = (
    pd.read_csv(f"{path}/data_to_be_cleansed.csv", index_col=0)
    .assign(target=lambda frame: frame["target"].map(mental_health_target_map))
)

## dreaddit

In [None]:
!curl -o ./dreaddit.zip "http://www.cs.columbia.edu/~eturcan/data/dreaddit.zip"

In [None]:
# Read the train and test splits straight out of the downloaded archive
with zipfile.ZipFile("dreaddit.zip") as archive:
    with archive.open("dreaddit-train.csv") as train_file:
        dreaddit_train = pd.read_csv(train_file)

    with archive.open("dreaddit-test.csv") as test_file:
        dreaddit_test = pd.read_csv(test_file)

# Work on the training split only, with the numeric label decoded to text.
# `.assign` builds a new frame: a plain `dreaddit = dreaddit_train` alias would
# make the decoding rewrite `dreaddit_train` as well, leaving the train and
# test splits with different label encodings.
dreaddit = dreaddit_train.assign(
    label=dreaddit_train['label'].map({0: 'Not stressful', 1: 'Stressful'})
)

# **Descriptive analysis - Raw**

## mental_health

In [None]:
mental_health.dtypes

In [None]:
mental_health.describe()

## suicidal_tweet_detection_dataset

In [None]:
suicidal_tweet_detection_dataset.dtypes

In [None]:
suicidal_tweet_detection_dataset.describe()

## reddit_mental_health_data

In [None]:
reddit_mental_health_data.dtypes

In [None]:
reddit_mental_health_data.describe()

## Dreaddit

In [None]:
dreaddit.dtypes

In [None]:
dreaddit.describe()

In [None]:
dreaddit.describe(include='object')

# **Data quality - Raw**

## mental_health

In [None]:
mental_health.isna().sum()

In [None]:
mental_health[mental_health['statement'].isna()]['status'].value_counts()

## suicidal_tweet_detection_dataset

In [None]:
suicidal_tweet_detection_dataset.isna().sum()

## reddit_mental_health_data

In [None]:
# Missing values per column in the Reddit dataset (this cell previously
# re-inspected `mental_health` by mistake)
reddit_mental_health_data.isna().sum()

## Dreaddit

In [None]:
dreaddit_na = dreaddit.isna().sum()

dreaddit_na.loc[dreaddit_na > 0]

# **Preprocessing**

## mental_health

In [None]:
# Remove rows containing missing values, then keep one row per distinct statement
mental_health = (
    mental_health
    .dropna()
    .drop_duplicates(subset='statement')
)

## suicidal_tweet_detection_dataset

In [None]:
# Clean the Twitter data in one chain:
#   1. remove rows containing missing values
#   2. keep one row per distinct tweet
#   3. record the platform the text was posted on
suicidal_tweet_detection_dataset = (
    suicidal_tweet_detection_dataset
    .dropna()
    .drop_duplicates(subset='Tweet')
    .assign(social='Twitter')
)

## reddit_mental_health_data

In [None]:
# Clean the Reddit data in one chain:
#   1. remove rows containing missing values
#   2. keep one row per distinct post text
#   3. record the platform the text was posted on
reddit_mental_health_data = (
    reddit_mental_health_data
    .dropna()
    .drop_duplicates(subset='text')
    .assign(social='Reddit')
)

## dreaddit

In [None]:
# Clean the Dreaddit data in one chain:
#   1. remove rows whose text is the '#NAME?' placeholder
#   2. keep one row per (subreddit, text) pair
#   3. record the platform the text was posted on
dreaddit = (
    dreaddit
    .loc[lambda frame: frame['text'] != '#NAME?']
    .drop_duplicates(subset=['subreddit', 'text'])
    .assign(social='Reddit')
)

# **Descriptive analysis - Preprocessed**

## mental_health

In [None]:
mental_health.describe()

## suicidal_tweet_detection_dataset

In [None]:
# Summary statistics of the preprocessed Twitter dataset (this cell previously
# described `mental_health` by mistake)
suicidal_tweet_detection_dataset.describe()

## reddit_mental_health_data

In [None]:
reddit_mental_health_data.describe()

## Dreaddit

In [None]:
dreaddit.columns

In [None]:
dreaddit.describe()

In [None]:
dreaddit.describe(include='object')

# **Data quality - Preprocessed**

## mental_health

In [None]:
mental_health.isna().sum()

In [None]:
mental_health['status'].value_counts()

## suicidal_tweet_detection_dataset

In [None]:
suicidal_tweet_detection_dataset.isna().sum()

In [None]:
suicidal_tweet_detection_dataset['Suicide'].value_counts()

## reddit_mental_health_data

In [None]:
reddit_mental_health_data.isna().sum()

In [None]:
reddit_mental_health_data['target'].value_counts()

# **Feature engineering**

## mental_health

In [None]:
# Word count of each statement (number of whitespace-separated tokens)
mental_health['text_length'] = mental_health['statement'].map(
    lambda statement: len(str(statement).split())
)

In [None]:
# Clean statement
def preprocess(text):
    """Normalise a statement for word-frequency analysis.

    The text is lowercased, every character that is neither an ASCII letter
    nor whitespace is removed, and words found in ``nltk_stopwords`` are
    dropped. The surviving words are joined with single spaces.

    Returns an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    kept_words = (
        candidate
        for candidate in letters_only.strip().split()
        if candidate not in nltk_stopwords
    )
    return ' '.join(kept_words)

mental_health['clean_statement'] = mental_health['statement'].apply(preprocess)

## suicidal_tweet_detection_dataset

In [None]:
# Word count of each tweet (number of whitespace-separated tokens)
suicidal_tweet_detection_dataset['text_length'] = suicidal_tweet_detection_dataset['Tweet'].map(
    lambda tweet: len(str(tweet).split())
)

## reddit_mental_health_data

In [None]:
# Word count of each post (number of whitespace-separated tokens)
reddit_mental_health_data['text_length'] = reddit_mental_health_data['text'].map(
    lambda post: len(str(post).split())
)

## dreaddit

In [None]:
# Word count of each post (number of whitespace-separated tokens)
dreaddit['text_length'] = dreaddit['text'].map(
    lambda post: len(str(post).split())
)

In [None]:
# Numeric encoding of the text label: 0 = 'Not stressful', 1 = 'Stressful'
label_to_num = {'Not stressful': 0, 'Stressful': 1}
dreaddit['label_num'] = dreaddit['label'].map(label_to_num)

# **Data visualization**

## mental_health

In [None]:
# Frequency of each status in mental_health, most common first
sns.countplot(
    data=mental_health,
    y='status',
    order=mental_health['status'].value_counts().index,
)

# Title typo fixed ("discorder" -> "disorder")
plt.title("Mental health disorder frequency in mental_health")

plt.show()

In [None]:
# Number of distinct statuses in the dataset
unique_statuses = mental_health['status'].nunique()

# Explicit figure/axes pair so the layout can be tuned after plotting
fig, ax = plt.subplots(figsize=(10, 6))

# One horizontal bar per status, most frequent first, coloured per status
sns.countplot(
    data=mental_health,
    y='status',
    hue='status',
    order=mental_health['status'].value_counts().index,
    legend=False,
    ax=ax,
)

# Title and axis labels
ax.set_title("Frequency of Mental Health Conditions in Dataset", fontsize=16, fontweight='bold')
ax.set_xlabel("Number of Posts", fontsize=14)
ax.set_ylabel("Mental Health Condition", fontsize=14)

# Same tick label size on both axes
ax.tick_params(axis='both', labelsize=12)

# Drop the top and right spines
sns.despine(ax=ax)

# Tighten the layout, then reserve a strip at the bottom for the caption
fig.tight_layout()
fig.subplots_adjust(bottom=0.2)

# Caption centred below the plot
fig.text(0.5, 0.02, 'Figure 6.1.2', fontsize=12, fontstyle='italic', ha='center')

plt.show()

In [None]:
# Cut-off: the word count at the 95th percentile
percentile_95 = mental_health['text_length'].quantile(0.95)

# Statements at or below the cut-off
filtered_data = mental_health.loc[mental_health['text_length'] <= percentile_95]

# Summary statistics of the retained word counts
kept_lengths = filtered_data['text_length']
mean_length, median_length = kept_lengths.mean(), kept_lengths.median()
max_length, min_length = kept_lengths.max(), kept_lengths.min()

# Explicit figure/axes pair so the layout can be tuned after plotting
fig, ax = plt.subplots(figsize=(12, 7))

# Histogram of the retained word counts with a KDE overlay
sns.histplot(
    data=filtered_data, x='text_length',
    bins=50, kde=True,
    color='#4C72B0', edgecolor='white',
    ax=ax,
)

# Restyle every line currently on the axes (the KDE curve)
for kde_line in ax.lines:
    kde_line.set_linewidth(2)
    kde_line.set_color('#333F4B')

# Vertical markers for the mean and the median
ax.axvline(mean_length, color='#FF5733', linestyle='--', linewidth=2,
           label=f'Mean: {mean_length:.0f} words')
ax.axvline(median_length, color='#28B463', linestyle='-.', linewidth=2,
           label=f'Median: {median_length:.0f} words')

# Title and axis labels
ax.set_title('Distribution of Statement Lengths (Filtered at 95th Percentile)',
             fontsize=18, fontweight='bold', pad=15)
ax.set_xlabel('Number of Words', fontsize=14, labelpad=10)
ax.set_ylabel('Frequency', fontsize=14, labelpad=10)

# Frameless legend near the top-right corner
ax.legend(frameon=False, fontsize=12, loc='upper right', bbox_to_anchor=(0.95, 0.95))

# Same tick label size on both axes
ax.tick_params(axis='both', labelsize=12)

# Drop the top and right spines
sns.despine(ax=ax)

# Boxed summary statistics placed in figure coordinates
stats_text = (
    f"\nMin: {min_length:.0f}"
    f"\nMedian: {median_length:.0f}"
    f"\nMean: {mean_length:.0f}"
    f"\nMax: {max_length:.0f}\n"
)
fig.text(
    0.75, 0.5, stats_text,
    fontsize=12,
    bbox=dict(facecolor='white', edgecolor='gray', boxstyle='round,pad=0.5'),
)

# Tighten the layout, then reserve a strip at the bottom for the caption
fig.tight_layout()
fig.subplots_adjust(bottom=0.2)

# Caption centred below the plot
fig.text(0.5, 0.02, 'Figure 6.1.3', fontsize=12, fontstyle='italic', ha='center')

# Start the x axis at zero
ax.set_xlim(0)

plt.show()

In [None]:
# Define a function to generate and display word clouds
def generate_wordcloud(text, title, colormap='viridis'):
    """Render and display a word cloud for ``text``.

    Args:
        text: Whitespace-separated words to draw.
        title: Title shown above the figure.
        colormap: Matplotlib colormap name used to colour the words.

    Returns:
        None. The figure is displayed as a side effect. When ``text`` holds
        no words, a notice is printed and nothing is drawn.
    """
    # WordCloud.generate() raises ValueError when it receives no words, which
    # would abort a loop over groups as soon as one group has no text.
    if not isinstance(text, str) or not text.strip():
        print(f"Skipping word cloud '{title}': no words to display.")
        return

    # At most 100 words on an 800x400 white canvas
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap=colormap,
        max_words=100
    ).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16, fontweight='bold')
    plt.show()

# One word cloud per status, built from every cleaned statement with that status
for status in mental_health['status'].unique():
    status_rows = mental_health['status'] == status
    text = ' '.join(mental_health.loc[status_rows, 'clean_statement'])

    generate_wordcloud(
        text,
        f"Most Common Words in '{status}' Posts",
        colormap='coolwarm',
    )


## suicidal_tweet_detection_dataset

In [None]:
# Frequency of each 'Suicide' label, most common first; the title is set on
# the axes that countplot returns
sns.countplot(
    data=suicidal_tweet_detection_dataset,
    y='Suicide',
    order=suicidal_tweet_detection_dataset['Suicide'].value_counts().index,
).set_title("Suicide frequency in suicidal_tweet_detection_dataset")

plt.show()

In [None]:
# Number of tweets per 'Suicide' label
suicide_counts = suicidal_tweet_detection_dataset['Suicide'].value_counts()

# Two-column table: the label and how many tweets carry it
suicide_counts_df = (
    suicide_counts
    .rename_axis('Suicide Status')
    .reset_index(name='Count')
)

print(suicide_counts_df)

In [None]:
# Number of distinct 'Suicide' labels in the dataset
unique_statuses = suicidal_tweet_detection_dataset['Suicide'].nunique()

# Explicit figure/axes pair so the layout can be tuned after plotting
fig, ax = plt.subplots(figsize=(10, 6))

# One horizontal bar per 'Suicide' label, most frequent first, coloured per label
sns.countplot(
    data=suicidal_tweet_detection_dataset,
    y='Suicide',
    hue='Suicide',
    order=suicidal_tweet_detection_dataset['Suicide'].value_counts().index,
    legend=False,
    ax=ax,
)

# Title and axis labels
ax.set_title("Frequency of Suicide Flag in Dataset", fontsize=16, fontweight='bold')
ax.set_xlabel("Number of Posts", fontsize=14)
ax.set_ylabel("Suicide Flag", fontsize=14)

# Same tick label size on both axes
ax.tick_params(axis='both', labelsize=12)

# Drop the top and right spines
sns.despine(ax=ax)

# Tighten the layout, then reserve a strip at the bottom for the caption
fig.tight_layout()
fig.subplots_adjust(bottom=0.2)

# Caption centred below the plot
fig.text(0.5, 0.02, 'Figure 6.2.2', fontsize=12, fontstyle='italic', ha='center')

plt.show()

In [None]:
# Cut-off: the word count at the 95th percentile
percentile_95 = suicidal_tweet_detection_dataset['text_length'].quantile(0.95)

# Tweets at or below the cut-off (trims the long tail)
filtered_data = suicidal_tweet_detection_dataset.loc[
    suicidal_tweet_detection_dataset['text_length'] <= percentile_95
]

# Summary statistics of the retained word counts
kept_lengths = filtered_data['text_length']
mean_length, median_length = kept_lengths.mean(), kept_lengths.median()
max_length, min_length = kept_lengths.max(), kept_lengths.min()

# Explicit figure/axes pair so the layout can be tuned after plotting
fig, ax = plt.subplots(figsize=(12, 7))

# Histogram of the retained word counts with a KDE overlay
sns.histplot(
    data=filtered_data, x='text_length',
    bins=50, kde=True,
    color='#4C72B0', edgecolor='white',
    ax=ax,
)

# Restyle every line currently on the axes (the KDE curve)
for kde_line in ax.lines:
    kde_line.set_linewidth(2)
    kde_line.set_color('#333F4B')

# Vertical markers for the mean and the median
ax.axvline(mean_length, color='#FF5733', linestyle='--', linewidth=2,
           label=f'Mean: {mean_length:.0f} words')
ax.axvline(median_length, color='#28B463', linestyle='-.', linewidth=2,
           label=f'Median: {median_length:.0f} words')

# Title and axis labels
ax.set_title('Distribution of Tweet Lengths (Filtered at 95th Percentile)',
             fontsize=18, fontweight='bold', pad=15)
ax.set_xlabel('Number of Words', fontsize=14, labelpad=10)
ax.set_ylabel('Frequency', fontsize=14, labelpad=10)

# Frameless legend near the top-right corner
ax.legend(frameon=False, fontsize=12, loc='upper right', bbox_to_anchor=(0.95, 0.95))

# Same tick label size on both axes
ax.tick_params(axis='both', labelsize=12)

# Drop the top and right spines
sns.despine(ax=ax)

# Boxed summary statistics placed in figure coordinates
stats_text = (
    f"\nMin: {min_length:.0f}"
    f"\nMedian: {median_length:.0f}"
    f"\nMean: {mean_length:.0f}"
    f"\nMax: {max_length:.0f}\n"
)
fig.text(
    0.75, 0.5, stats_text,
    fontsize=12,
    bbox=dict(facecolor='white', edgecolor='gray', boxstyle='round,pad=0.5'),
)

# Tighten the layout, then reserve a strip at the bottom for the caption
fig.tight_layout()
fig.subplots_adjust(bottom=0.2)

# Caption centred below the plot
fig.text(0.5, 0.02, 'Figure 6.2.3', fontsize=12, fontstyle='italic', ha='center')

# Start the x axis at zero
ax.set_xlim(0)

plt.show()

In [None]:
def clean_tokenize_spacy(text):
    """Tokenize ``text`` with spaCy after expanding contractions and lowercasing.

    Punctuation and whitespace tokens are discarded; every other token is
    kept as its text. Returns an empty list when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return []

    lowered = contractions.fix(text).lower()
    return [
        token.text
        for token in nlp(lowered)
        if not (token.is_punct or token.is_space)
    ]

def get_top_ngrams(tweets, ngram_range=2, top_n=20, tokenizer=None):
    """Return the most frequent n-grams across ``tweets``.

    Args:
        tweets: Iterable of texts.
        ngram_range: Number of consecutive tokens per n-gram (2 = bigrams).
        top_n: How many of the most common n-grams to return.
        tokenizer: Callable turning one text into a list of tokens. Defaults
            to ``clean_tokenize_spacy``, so existing calls behave as before;
            passing another tokenizer avoids redefining this function.

    Returns:
        List of ``(ngram, count)`` tuples, most frequent first, where each
        n-gram is its tokens joined by single spaces.
    """
    if tokenizer is None:
        tokenizer = clean_tokenize_spacy

    ngram_counts = Counter()
    for tweet in tweets:
        tokens = tokenizer(tweet)
        # Zipping the token list against shifted copies of itself yields the
        # n-grams; texts shorter than n tokens simply produce none.
        shifted = [tokens[offset:] for offset in range(ngram_range)]
        ngram_counts.update(' '.join(gram) for gram in zip(*shifted))

    return ngram_counts.most_common(top_n)

# Strip surrounding whitespace from the labels so the equality filters below match
suicidal_tweet_detection_dataset['Suicide'] = suicidal_tweet_detection_dataset['Suicide'].str.strip()

# Tweets of each class
suicidal_tweets = suicidal_tweet_detection_dataset.loc[
    suicidal_tweet_detection_dataset['Suicide'] == 'Potential Suicide post', 'Tweet'
]
non_suicidal_tweets = suicidal_tweet_detection_dataset.loc[
    suicidal_tweet_detection_dataset['Suicide'] == 'Not Suicide post', 'Tweet'
]

# Top 20 bigrams of each class
top_bigrams_suicidal = get_top_ngrams(suicidal_tweets, ngram_range=2, top_n=20)
top_bigrams_non_suicidal = get_top_ngrams(non_suicidal_tweets, ngram_range=2, top_n=20)

print("Top Bigrams in Suicidal Tweets:")
print(top_bigrams_suicidal)

print("\nTop Bigrams in Non-Suicidal Tweets:")
print(top_bigrams_non_suicidal)


def _plot_top_bigrams(top_bigrams, title, palette, empty_message):
    """Draw a horizontal bar chart of ``(bigram, count)`` pairs.

    Prints ``empty_message`` and draws nothing when ``top_bigrams`` is empty.
    """
    if not top_bigrams:
        print(empty_message)
        return

    bigram_words = [bigram for bigram, _ in top_bigrams]
    bigram_freqs = [freq for _, freq in top_bigrams]

    fig, ax = plt.subplots(figsize=(12, 6))

    # Passing `palette` without `hue` is deprecated in seaborn 0.13+; mapping
    # the y variable to `hue` with the legend off keeps one colour per bar.
    sns.barplot(
        x=bigram_freqs,
        y=bigram_words,
        hue=bigram_words,
        palette=palette,
        legend=False,
        ax=ax,
    )

    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.set_xlabel('Frequency', fontsize=14)
    ax.set_ylabel('Bigrams', fontsize=14)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    sns.despine(ax=ax)

    # Tighten the layout, then reserve a strip at the bottom for the caption
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)

    # TODO: placeholder caption - replace with the final figure number
    fig.text(
        0.5, 0.02,
        'Figure X - Bhavesh to add after we add other figures to report',
        ha='center',
        fontsize=12,
        fontstyle='italic'
    )

    plt.show()


# Plot suicidal tweet bigrams
_plot_top_bigrams(
    top_bigrams_suicidal,
    title='Top 20 Bigrams in Suicidal Tweets',
    palette='Reds_r',
    empty_message="No bigrams found in suicidal tweets.",
)

# Plot non-suicidal tweet bigrams
_plot_top_bigrams(
    top_bigrams_non_suicidal,
    title='Top 20 Bigrams in Non-Suicidal Tweets',
    palette='Blues_r',
    empty_message="No bigrams found in non-suicidal tweets.",
)


Bhavesh

Thoughts following the above plots:

* Bigrams/trigrams might be informative features to use in the model, given how much they differ between the two labels
* Context is important, and bigrams on their own won't pick up on it
* This indicates the need to also look at sentiment polarity


## reddit_mental_health_data

In [None]:
# Frequency of each target condition in reddit_mental_health_data, most common first
sns.countplot(
    data=reddit_mental_health_data,
    y='target',
    order=reddit_mental_health_data['target'].value_counts().index,
)

# Title typo fixed ("discorder" -> "disorder")
plt.title("Mental health disorder frequency in reddit_mental_health_data")

plt.show()

In [None]:
# Create figure and axis for better layout control
fig, ax = plt.subplots(figsize=(10, 6))

# Number of distinct targets (mental health conditions)
unique_targets = reddit_mental_health_data['target'].nunique()

# Plot the distribution of mental health conditions by post count.
# Fixes: the comma after `order=...` was missing (SyntaxError), and `hue` is
# now set so that `legend=False` applies, matching the other captioned
# count plots in this notebook.
sns.countplot(
    data=reddit_mental_health_data,
    y='target',
    hue='target',
    order=reddit_mental_health_data['target'].value_counts().index,
    legend=False,
    ax=ax
)

# Add a descriptive title and axis labels for clarity
ax.set_title("Frequency of Mental Health Conditions in reddit_mental_health_data", fontsize=16, fontweight='bold')
ax.set_xlabel("Number of Posts", fontsize=14)
ax.set_ylabel("Mental Health Condition", fontsize=14)

# Adjust tick label sizes to improve readability
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# Remove top and right spines for a cleaner look
sns.despine(ax=ax)

# Adjust layout and make space at the bottom for the caption
plt.tight_layout()
plt.subplots_adjust(bottom=0.2)

# Add the figure caption below the plot
fig.text(
    0.5, 0.02,
    'Figure 6.3.2',
    fontsize=12,
    fontstyle='italic',
    ha='center'
)

# Display the plot
plt.show()

In [None]:
# Cut-off: the word count at the 95th percentile
percentile_95 = reddit_mental_health_data['text_length'].quantile(0.95)

# Posts at or below the cut-off (trims the long tail)
filtered_data = reddit_mental_health_data.loc[
    reddit_mental_health_data['text_length'] <= percentile_95
]

# Summary statistics of the retained word counts
kept_lengths = filtered_data['text_length']
mean_length, median_length = kept_lengths.mean(), kept_lengths.median()
max_length, min_length = kept_lengths.max(), kept_lengths.min()

# Explicit figure/axes pair so the layout can be tuned after plotting
fig, ax = plt.subplots(figsize=(12, 7))

# Histogram of the retained word counts with a KDE overlay
sns.histplot(
    data=filtered_data, x='text_length',
    bins=50, kde=True,
    color='skyblue', edgecolor='white',
    ax=ax,
)

# Restyle every line currently on the axes (the KDE curve)
for kde_line in ax.lines:
    kde_line.set_linewidth(2)
    kde_line.set_color('#333F4B')

# Vertical markers for the mean and the median
ax.axvline(mean_length, color='#FF5733', linestyle='--', linewidth=2,
           label=f'Mean: {mean_length:.0f} words')
ax.axvline(median_length, color='#28B463', linestyle='-.', linewidth=2,
           label=f'Median: {median_length:.0f} words')

# Title and axis labels
ax.set_title('Distribution of Post Lengths on Reddit (Filtered at 95th Percentile)',
             fontsize=18, fontweight='bold', pad=15)
ax.set_xlabel('Number of Words', fontsize=14, labelpad=10)
ax.set_ylabel('Frequency', fontsize=14, labelpad=10)

# Frameless legend near the top-right corner
ax.legend(frameon=False, fontsize=12, loc='upper right', bbox_to_anchor=(0.95, 0.95))

# Same tick label size on both axes
ax.tick_params(axis='both', labelsize=12)

# Drop the top and right spines
sns.despine(ax=ax)

# Boxed summary statistics placed in figure coordinates
stats_text = (
    f"\nMin: {min_length:.0f}"
    f"\nMedian: {median_length:.0f}"
    f"\nMean: {mean_length:.0f}"
    f"\nMax: {max_length:.0f}\n"
)
fig.text(
    0.75, 0.5, stats_text,
    fontsize=12,
    bbox=dict(facecolor='white', edgecolor='gray', boxstyle='round,pad=0.5'),
)

# Tighten the layout, then reserve a strip at the bottom for the caption
fig.tight_layout()
fig.subplots_adjust(bottom=0.2)

# Caption centred below the plot
fig.text(0.5, 0.02, 'Figure 6.3.3', fontsize=12, fontstyle='italic', ha='center')

# Start the x axis at zero
ax.set_xlim(0)

plt.show()

In [None]:


def clean_tokenize_filtered(text):
    """Return the lemmas of the content-bearing nouns and adjectives in ``text``.

    The text is lowercased and parsed with spaCy. A token is kept only when it
    is alphabetic, is not a spaCy stop word, and is tagged NOUN or ADJ; each
    kept token contributes its lemma. Returns an empty list when ``text`` is
    not a ``str``.
    """
    if not isinstance(text, str):
        return []

    allowed_pos = {'NOUN', 'ADJ'}
    lemmas = []
    for token in nlp(text.lower()):
        if token.is_alpha and not token.is_stop and token.pos_ in allowed_pos:
            lemmas.append(token.lemma_)
    return lemmas

def get_top_ngrams(texts, ngram_range=2, top_n=20, tokenizer=None):
    """Return the most frequent n-grams across ``texts``.

    NOTE: this notebook defines ``get_top_ngrams`` twice. Once this cell runs,
    this version replaces the one defined earlier for the tweet analysis, so
    calls without ``tokenizer`` use ``clean_tokenize_filtered`` from here on.
    Pass ``tokenizer`` explicitly to choose a different tokenization.

    Args:
        texts: Iterable of texts.
        ngram_range: Number of consecutive tokens per n-gram (2 = bigrams).
        top_n: How many of the most common n-grams to return.
        tokenizer: Callable turning one text into a list of tokens. Defaults
            to ``clean_tokenize_filtered``.

    Returns:
        List of ``(ngram, count)`` tuples, most frequent first, where each
        n-gram is its tokens joined by single spaces.
    """
    if tokenizer is None:
        tokenizer = clean_tokenize_filtered

    ngram_counts = Counter()
    for text in texts:
        tokens = tokenizer(text)
        # Zipping the token list against shifted copies of itself yields the
        # n-grams; texts shorter than n tokens simply produce none.
        shifted = [tokens[offset:] for offset in range(ngram_range)]
        ngram_counts.update(' '.join(gram) for gram in zip(*shifted))

    return ngram_counts.most_common(top_n)

# Every distinct mental health condition in the dataset
unique_conditions = reddit_mental_health_data['target'].unique()

# Running number used in the figure captions
figure_num = 1

for condition in unique_conditions:
    print(f"\nTop Bigrams for {condition} Posts (Filtered):\n")

    # Select this condition's posts once, then analyse body and title together
    condition_posts = reddit_mental_health_data[reddit_mental_health_data['target'] == condition]
    condition_texts = condition_posts['text'] + ' ' + condition_posts['title']

    # Get top bigrams for the condition
    top_bigrams = get_top_ngrams(condition_texts, ngram_range=2, top_n=10)

    # Print bigrams and their frequencies
    for bigram, freq in top_bigrams:
        print(f"{bigram}: {freq}")

    # Plot top bigrams if any are found
    if top_bigrams:
        bigram_phrases = [bigram for bigram, freq in top_bigrams]
        bigram_freqs = [freq for bigram, freq in top_bigrams]

        # Create figure and axis
        fig, ax = plt.subplots(figsize=(10, 6))

        # Passing `palette` without `hue` is deprecated in seaborn 0.13+;
        # mapping the y variable to `hue` with the legend off keeps one
        # colour per bar.
        sns.barplot(
            x=bigram_freqs,
            y=bigram_phrases,
            hue=bigram_phrases,
            palette='viridis',
            legend=False,
            ax=ax,
        )

        # Titles and labels
        ax.set_title(f'Top 10 Filtered Bigrams in {condition} Posts', fontsize=16, fontweight='bold')
        ax.set_xlabel('Frequency', fontsize=14)
        ax.set_ylabel('Bigrams', fontsize=14)

        # Adjust tick label sizes
        ax.tick_params(axis='x', labelsize=12)
        ax.tick_params(axis='y', labelsize=12)

        sns.despine(ax=ax)

        # Adjust layout and reserve space for caption
        plt.tight_layout()
        plt.subplots_adjust(bottom=0.2)

        # Add figure caption below the plot
        fig.text(
            0.5, 0.02,  # Centered at the bottom
            f'Figure {figure_num}: Top bigrams in {condition} posts',
            ha='center',
            fontsize=12,
            fontstyle='italic'
        )

        plt.show()

        # Only conditions that produced a plot consume a figure number
        figure_num += 1

    else:
        print(f"No meaningful bigrams found for {condition} posts.")

## dreaddit

In [None]:
# Frequency of each stress label, most common first; the title is set on the
# axes that countplot returns
sns.countplot(
    data=dreaddit,
    y='label',
    order=dreaddit['label'].value_counts().index,
).set_title("Stress label frequency in dreaddit")

plt.show()

In [None]:
# Posts per label, most common first
label_counts = dreaddit['label'].value_counts()

# Basic statistics shown in the summary box
total_labels = len(dreaddit)
num_classes = dreaddit['label'].nunique()
most_common_label = label_counts.idxmax()
most_common_count = label_counts.max()

# Create figure and axis for better layout control
fig, ax = plt.subplots(figsize=(12, 7))

# Plot the frequency of labels.
# Passing `palette` without `hue` is deprecated in seaborn 0.13+, so the label
# is also mapped to `hue` (legend off). `hue_order` follows the bar order so
# each bar keeps the palette colour it had before.
sns.countplot(
    data=dreaddit,
    y='label',
    hue='label',
    order=label_counts.index,
    hue_order=label_counts.index,
    palette='Set2',
    legend=False,
    ax=ax
)

# Add title and axis labels
ax.set_title('Stress Label Frequency in Dreaddit Dataset', fontsize=18, fontweight='bold', pad=15)
ax.set_xlabel('Number of Posts', fontsize=14, labelpad=10)
ax.set_ylabel('Label', fontsize=14, labelpad=10)

# Adjust tick label sizes for readability
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# Remove spines for a cleaner appearance
sns.despine(ax=ax)

# Add summary stats box inside the figure
fig.text(
    0.7, 0.5,
    f'''
Total Posts: {total_labels}
Unique Labels: {num_classes}
Most Common: {most_common_label} ({most_common_count})
''',
    fontsize=12,
    bbox=dict(facecolor='white', edgecolor='gray', boxstyle='round,pad=0.5')
)

# Adjust layout and leave space at the bottom for a caption
plt.tight_layout()
plt.subplots_adjust(bottom=0.2)

# Add figure caption below the plot
fig.text(
    0.5, 0.02,
    'Figure 6.4.2',
    fontsize=12,
    fontstyle='italic',
    ha='center'
)

# Display the plot
plt.show()

In [None]:
# Mean of the 0/1 label per subreddit, i.e. the share of stressful posts,
# sorted from the highest share to the lowest
mean_label_by_subreddit = dreaddit.groupby('subreddit')['label_num'].mean()
subreddit_stress = mean_label_by_subreddit.sort_values(ascending=False)

# Horizontal bar chart: one bar per subreddit
plt.figure(figsize=(10, 6))
stress_axes = sns.barplot(x=subreddit_stress.values, y=subreddit_stress.index)
stress_axes.set_title('Proportion of Stressful Posts by Subreddit')
stress_axes.set_xlabel('Proportion of Stressful Posts')
stress_axes.set_ylabel('Subreddit')
plt.show()

In [None]:
# Box plot of the word count of posts, one box per stress label
plt.figure(figsize=(8, 6))
length_axes = sns.boxplot(data=dreaddit, x='label', y='text_length')
length_axes.set_title('Post Text Length by Stress Label')
length_axes.set_xlabel('Label')
length_axes.set_ylabel('Text Length (Word Count)')
plt.show()

In [None]:
# LIWC emotion columns compared across the two stress labels
liwc_emotion_cols = [
    'lex_liwc_negemo',
    'lex_liwc_anx',
    'lex_liwc_sad',
    'lex_liwc_anger',
    'lex_liwc_death',
]

# Mean of each LIWC column per numeric label, transposed so that each feature
# becomes one group of bars with one bar per label
liwc_means_by_label = dreaddit.groupby('label_num')[liwc_emotion_cols].mean().T
liwc_means_by_label.plot.bar(figsize=(12, 6))

plt.title('Average LIWC Emotional Features by Stress Label')
plt.xlabel('LIWC Emotional Features')
plt.ylabel('Average LIWC Score')
# Legend entries are listed in the order of the numeric labels (0, then 1)
plt.legend(title='Stress Label', labels=['Not Stressful', 'Stressful'])
plt.xticks(rotation=45)
plt.show()

In [None]:
# DAL columns compared across the two stress labels
dal_cols = [
    'lex_dal_avg_pleasantness',
    'lex_dal_avg_activation',
    'lex_dal_avg_imagery',
]

# Mean of each DAL column per numeric label, transposed so that each feature
# becomes one group of bars with one bar per label
dal_means_by_label = dreaddit.groupby('label_num')[dal_cols].mean().T
dal_means_by_label.plot.bar(figsize=(12, 6))

plt.title('Average DAL Features by Stress Label')
plt.xlabel('DAL Features')
plt.ylabel('Average DAL Score')
# Legend entries are listed in the order of the numeric labels (0, then 1)
plt.legend(title='Stress Label', labels=['Not Stressful', 'Stressful'])
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pairwise correlations between the LIWC and DAL columns selected in the two
# preceding cells (this cell needs `liwc_emotion_cols` and `dal_cols`)
psycholinguistic_cols = [*liwc_emotion_cols, *dal_cols]
correlation_matrix = dreaddit.loc[:, psycholinguistic_cols].corr()

# Annotated heatmap of the correlation matrix; the title is set on the axes
# that heatmap returns
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
).set_title('Correlation Heatmap of LIWC and DAL Features')
plt.show()