In [None]:
!pip install gensim

In [None]:
# Importing Libraries

# Data Manipulation
import numpy as np
import pandas as pd
import copy

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Counter for frequency counts
from collections import Counter

# WordCloud Visualization
from wordcloud import WordCloud

# Text Feature Extraction and Statistical Analysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2

# Word and Post Embeddings
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE


### Dataset Download and Loading

Before loading the data, download the following files:  
- [reddit_cleaned_ml.csv](https://github.com/TuliDas/MindScan-NLP/blob/main/data/reddit_cleaned_ml.csv)  
- [reddit_cleaned_bert.csv](https://github.com/TuliDas/MindScan-NLP/blob/main/data/reddit_cleaned_bert.csv)  

and upload them to your **Google Colab** environment.  


In [None]:
# Load the two cleaned datasets: one prepared for classical ML models, one for BERT training
df_cleaned_ml, df_cleaned_bert = (
    pd.read_csv('/content/reddit_cleaned_ml.csv'),
    pd.read_csv('/content/reddit_cleaned_bert.csv'),
)

# **Exploratory Data Analysis (EDA)**

This section explores the mental health Reddit dataset to understand its structure and characteristics before model training.

**Steps included:**
1. **Dataset Structure & Quality**: Inspect dataset size and missing values.  
2. **Label Analysis**: Examine the class distribution (counts and percentages) to identify any imbalance.  
3. **Text Characteristics**: Explore the average number of words and characters per post, overall and per class.
4. **Word-Level Analysis**: Generate word clouds and top words (based on frequency) per class to see common and class-specific words.  
5. **Lexical Diversity**: Compute unique words ratio per class to compare vocabulary richness.  
6. **Statistical Distinctive Keywords**: Identify words most characteristic of each class using chi-square.  
7. **Embedding Visualization**: Visualize semantic structure using Word2Vec + t-SNE and BERT embeddings + t-SNE.  


## **1. Dataset Structure & Quality**





In [None]:
# Report (rows, columns): ML-cleaned frame first, then BERT-cleaned frame
print(df_cleaned_ml.shape, df_cleaned_bert.shape, sep='\n')

In [None]:
# Per-column count of missing values: ML-cleaned frame first, then BERT-cleaned frame
print(df_cleaned_ml.isna().sum(), df_cleaned_bert.isna().sum(), sep='\n')

## **2. Label Analysis**


In [None]:
# Verify the label distribution after cleaning (number of posts per label)
class_counts = df_cleaned_ml.loc[:, 'label'].value_counts()
class_counts

#### **Output:** Class distribution

| label      | count |
|------------|------:|
| ADHD       | 2000  |
| Addiction  | 2000  |
| Anxiety    | 2000  |
| Depression | 2000  |
| Normal     | 2000  |
| OCD        | 2000  |
| PTSD       | 2000  |
| Suicidal   | 1913  |

Total: 15913 rows


In [None]:
# Bar plot of the number of posts in each class

plt.figure(figsize=(8, 6))
# Colour the bars through `hue`: seaborn >= 0.13 deprecates `palette` without
# `hue` (slated for removal in 0.14). `dodge=False` keeps one full-width bar per
# class on older seaborn releases as well.
ax = sns.barplot(x=class_counts.index, y=class_counts.values,
                 hue=class_counts.index, palette="viridis", dodge=False)
# A legend would only repeat the x tick labels; remove it if seaborn drew one
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Number of posts per class')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()


#### Output:
[Barplot-Number of posts per class.png](https://github.com/TuliDas/MindScan-NLP/blob/main/images/eda/barplots-histplots/posts_per_class_barplot.png)

In [None]:
# Share of posts per class, in percent

# Normalized counts are fractions; scale them to percentages
class_percentages = df_cleaned_ml['label'].value_counts(normalize=True).mul(100)

# Two-column table (Class, Percentage) for a more readable display
class_percentages_df = class_percentages.rename_axis('Class').reset_index(name='Percentage')
class_percentages_df


#### **Output :** Class distribution (percentage)

| Class      | Percentage |
|------------|-----------:|
| ADHD       | 12.57%    |
| Addiction  | 12.57%    |
| Anxiety    | 12.57%    |
| Depression | 12.57%    |
| Normal     | 12.57%    |
| OCD        | 12.57%    |
| PTSD       | 12.57%    |
| Suicidal   | 12.02%    |


## **3. Text Characteristics**

In [None]:
# Number of words and characters per post (ML-cleaned text).
# The vectorized .str accessors split on whitespace / measure length exactly like
# str.split() / len(), but yield NaN for a missing text instead of raising on it.
df_cleaned_ml['num_words'] = df_cleaned_ml['text'].str.split().str.len()
df_cleaned_ml['num_chars'] = df_cleaned_ml['text'].str.len()
df_cleaned_ml

#### **Output :** (ml_cleaned_dataset)

| text | label | num_words | num_chars |
|------|-------|-----------|-----------|
| small success finally fill ice cube tray pre... | ADHD | 47 | 330 |
| adderall shiver shakiness anybody experience s... | ADHD | 28 | 192 |
| constant mental exhaustion relate undereate ... | ADHD | 53 | 369 |
| get consistent nighttime route finally hate no... | ADHD | 41 | 255 |
| actual lifestyle advice tired post subreddit s... | ADHD | 27 | 216 |
| ... | ... | ... | ... |
| not leave know lot people suicide selfish hone... | Suicidal | 26 | 155 |
| ahh ahhim freaking world help | Suicidal | 5 | 31 |
| lose girlfriend year lose yesterday hang b... | Suicidal | 58 | 374 |
| lose friend trigger friend recently move text ... | Suicidal | 47 | 289 |
| m feel lose feel discouraged tired comp... | Suicidal | 46 | 288 |


In [None]:
# Histogram (with KDE overlay) of the number of words per post, ML-cleaned dataset

fig, ax = plt.subplots(figsize=(10,8))
sns.histplot(df_cleaned_ml['num_words'], bins=50, kde=True,
             color='skyblue', label='ML', ax=ax)
ax.set_title('Distribution of Number of Words per Post (ML-Cleaned-Dataset)')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
plt.show()


#### **Output :**
[histplot-words-per-post-ml](https://github.com/TuliDas/MindScan-NLP/blob/main/images/eda/barplots-histplots/histplot-words-per-post-ml.png)

In [None]:
# Box plot of words per post, one box per class (ML-cleaned dataset)
plt.figure(figsize=(12,6))
# Colour the boxes through `hue`: seaborn >= 0.13 deprecates `palette` without
# `hue` (slated for removal in 0.14). `dodge=False` keeps each box centred on its
# class on older seaborn releases as well.
ax = sns.boxplot(x='label', y='num_words', hue='label', data=df_cleaned_ml,
                 palette='Set2', dodge=False)
# A legend would only repeat the x tick labels; remove it if seaborn drew one
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Distribution of Word Count Per Class (ML-Cleaned-dataset)')
plt.xlabel('Class')
plt.ylabel('Number of Words')
plt.xticks(rotation=45)
plt.show()


#### **Output:**
[boxplot-word-count-per-class-ml](https://github.com/TuliDas/MindScan-NLP/blob/main/images/eda/barplots-histplots/boxplot-word-count-per-class-ml.png)

In [None]:
# Number of words and characters per post (BERT-cleaned text).
# The vectorized .str accessors split on whitespace / measure length exactly like
# str.split() / len(), but yield NaN for a missing text instead of raising on it.
df_cleaned_bert['num_words'] = df_cleaned_bert['text'].str.split().str.len()
df_cleaned_bert['num_chars'] = df_cleaned_bert['text'].str.len()
df_cleaned_bert

#### **Output :** (Bert_Cleaned_Dataset)

| text | label | num_words | num_chars |
|------|-------|-----------|-----------|
| small success: i finally filled the ice cube t... | ADHD | 113 | 599 |
| adderall shivers/shakiness has anybody experie... | ADHD | 62 | 350 |
| could my constant mental exhaustion be related... | ADHD | 158 | 831 |
| i’ve been getting into a consistent nighttime ... | ADHD | 97 | 495 |
| actual lifestyle advice i’m so tired of every ... | ADHD | 53 | 345 |
| ... | ... | ... | ... |
| why cant i leave i know a lot of people call s... | Suicidal | 70 | 321 |
| ahhhhhhhhhhhhhhhhhhhh ahhhhhhhhhim so freaking... | Suicidal | 11 | 83 |
| just lost my girlfriend of almost 4 years i lo... | Suicidal | 170 | 829 |
| might lose a friend and it’s triggering me my ... | Suicidal | 108 | 542 |
| 30m feeling lost i feel discouraged, tired, an... | Suicidal | 99 | 491 |


In [None]:
# Histogram (with KDE overlay) of the number of words per post, BERT-cleaned dataset
fig, ax = plt.subplots(figsize=(10,8))
sns.histplot(df_cleaned_bert['num_words'], bins=50, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Number of Words per Post (BERT)')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
plt.show()

#### **Output:**
[histplot-word-per-post-bert.png](https://github.com/TuliDas/MindScan-NLP/blob/main/images/eda/barplots-histplots/histplot-word-per-post-bert.png)

In [None]:
# Box plot of words per post, one box per class (BERT-cleaned dataset)
plt.figure(figsize=(12,6))
# Colour the boxes through `hue`: seaborn >= 0.13 deprecates `palette` without
# `hue` (slated for removal in 0.14). `dodge=False` keeps each box centred on its
# class on older seaborn releases as well.
ax = sns.boxplot(x='label', y='num_words', hue='label', data=df_cleaned_bert,
                 palette='Set2', dodge=False)
# A legend would only repeat the x tick labels; remove it if seaborn drew one
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Distribution of Word Count Per Class (BERT)')
plt.xlabel('Class')
plt.ylabel('Number of Words')
plt.xticks(rotation=45)
plt.show()

#### **Output:**
[boxplot-word-count-per-class-bert.png](https://github.com/TuliDas/MindScan-NLP/blob/main/images/eda/barplots-histplots/boxplot-word-count-per-class-bert.png)

In [None]:
# Mean post length per class (ML-cleaned text), in words and in characters

avg_words, avg_chars = (
    df_cleaned_ml.groupby('label')[column].mean()
    for column in ('num_words', 'num_chars')
)
print(f"Average word per class (cleaned_text_ml) : {avg_words}")
print(f"\nAverage char per class (cleaned_text_ml) : {avg_chars}")

#### **Output** :  Text Statistics per Class (ML Cleaned Text)

##### Average Words per Post
| Label       | Avg. Words |
|------------|-----------:|
| ADHD       | 44.63     |
| Addiction  | 35.34     |
| Anxiety    | 37.05     |
| Depression | 35.14     |
| Normal     | 38.80     |
| OCD        | 34.77     |
| PTSD       | 37.09     |
| Suicidal   | 35.58     |

##### Average Characters per Post
| Label       | Avg. Characters |
|------------|----------------:|
| ADHD       | 307.42          |
| Addiction  | 235.16          |
| Anxiety    | 248.78          |
| Depression | 229.51          |
| Normal     | 259.75          |
| OCD        | 232.54          |
| PTSD       | 252.31          |
| Suicidal   | 230.25          |


In [None]:
# Mean post length per class (BERT-cleaned text), in words and in characters
avg_words, avg_chars = (
    df_cleaned_bert.groupby('label')[column].mean()
    for column in ('num_words', 'num_chars')
)
print(f"Average word per class (cleaned_text_bert) : {avg_words}")
print(f"\nAverage char per class (cleaned_text_bert) : {avg_chars}")

## **4. Word-Level Analysis**

In [None]:
# ------- 1. Word Clouds for Each Class -------------

def plot_wordcloud(class_name, text):
    """Draw a word cloud for one class.

    Args:
        class_name: Label shown in the figure title.
        text: Iterable of strings; the items are joined with single spaces and
            the joined string is handed to WordCloud.
    """
    joined_text = " ".join(text)
    cloud_options = dict(width=800, height=400, max_words=100,
                         background_color='white', colormap='viridis')
    cloud = WordCloud(**cloud_options).generate(joined_text)

    plt.figure(figsize=(10, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')  # hide the axes around the rendered image
    plt.title(f'Word Cloud for Class: {class_name}', fontsize = 16)
    plt.show()

In [None]:
# One word cloud per class, in order of first appearance of each label
for label in df_cleaned_ml['label'].unique():
    is_current_class = df_cleaned_ml['label'] == label
    labeled_text = df_cleaned_ml.loc[is_current_class, 'text']
    plot_wordcloud(class_name=label, text=labeled_text)



In [None]:
# ----------- 2. Top N Most Frequent Words per Class --------
def top_words_per_class(df, label, N = 20):
    """Return the N most frequent words among the posts of one class.

    Args:
        df: DataFrame with a 'text' column and a 'label' column.
        label: Class whose posts are counted.
        N: Number of (word, count) pairs to return.

    Returns:
        List of (word, count) tuples, most frequent first. Text is lower-cased
        and split on whitespace before counting.
    """
    class_posts = df.loc[df['label'] == label, 'text']
    tokens = class_posts.str.lower().str.cat(sep=' ').split()
    return Counter(tokens).most_common(N)


In [None]:
# Print the most frequent words of every class (ML-cleaned text)
for label in df_cleaned_ml['label'].drop_duplicates():
    top_words = top_words_per_class(df=df_cleaned_ml, label=label)
    print(f"\nTop {len(top_words)} words for class {label}: {top_words}")

### **Output :** Top 20 Frequent Words per Class (without frequency counts)  

**ADHD:**  
not, adhd, like, feel, know, work, time, day, thing, get, want, help, start, try, take, think, med, medication, go, people  

**Addiction:**  
not, day, feel, year, quit, like, drink, time, go, want, know, smoke, get, today, sober, think, month, life, no, stop  

**Anxiety:**  
not, feel, anxiety, like, panic, attack, know, go, think, people, day, time, get, bad, want, help, start, thing, work, anxious  

**Depression:**  
not, feel, like, want, know, life, go, no, people, think, time, depression, get, day, thing, help, friend, try, year, bad  

**Normal:**  
not, like, lpt, feel, time, day, get, good, work, want, people, go, know, year, think, happy, thing, love, life, today  

**OCD:**  
not, ocd, like, feel, thought, know, think, thing, want, bad, help, intrusive, time, get, go, try, day, tell, people, anxiety  

**PTSD:**  
not, feel, like, ptsd, know, want, time, trauma, think, help, get, year, go, thing, trigger, people, happen, bad, try, day  

**Suicidal:**  
not, feel, want, know, life, like, go, think, no, year, die, day, time, suicide, people, kill, get, love, live, friend  


#### Common Words Removal for clear wordcloud

- For **EDA only**: Remove **common words and stopwords** to focus on class-distinctive words.

In [None]:
# Words shared by nearly every class; dropped for EDA only so that
# class-specific vocabulary stands out
common_words = {
    # generic everyday vocabulary
    'work', 'today', 'year', 'day', 'thing',
    'like', 'feel', 'know', 'want', 'time', 'go',
    'think', 'get', 'take', 'people', 'life', 'start',
    'help', 'try', 'good', 'bad',
    'need', 'tell', 'way',
    # negations
    'no', 'not', 'never', 'nor',
}

def filter_text_for_eda(text):
    """Return `text` without the whitespace-separated tokens listed in `common_words`."""
    return " ".join(token for token in text.split() if token not in common_words)

# Filter a copy, so df_cleaned_ml itself keeps the unfiltered text
eda_texts = df_cleaned_ml.copy()
eda_texts['text'] = eda_texts['text'].map(filter_text_for_eda)


In [None]:
# Word clouds per class, built from the text with the common words filtered out
for label in eda_texts['label'].unique():
    labeled_text = eda_texts.loc[eda_texts['label'] == label, 'text']
    plot_wordcloud(class_name=label, text=labeled_text)

#### **Output**
[word_clouds.png for each class/label](https://github.com/TuliDas/MindScan-NLP/tree/main/images/eda/wordclouds)

In [None]:
# Most frequent words of every class after the common words were filtered out
for label in eda_texts['label'].drop_duplicates():
    top_words = top_words_per_class(df=eda_texts, label=label)
    print(f"\nTop {len(top_words)} words for class {label}: {top_words}")

#### **Top 20 Words per Class (Words Only)**

##### **ADHD**
adhd, med, medication, find, diagnose, mg, experience, adderall, month, week, long, focus, lot, make, say, anxiety, hour, look, struggle, school

##### **Addiction**
quit, drink, smoke, sober, month, stop, week, addiction, long, thank, alcohol, cigarette, hard, nicotine, well, night, ago, clean, friend, drug

##### **Anxiety**
anxiety, panic, attack, anxious, heart, social, have, make, symptom, happen, talk, health, come, friend, stop, look, experience, die, say, week

##### **Depression**
depression, friend, anymore, hate, well, talk, live, depressed, tired, end, love, hard, care, happy, make, find, die, thought, stop, long

##### **Normal**
lpt, happy, love, friend, look, s, find, lot, say, edit, make, new, come, thank, ask, little, old, talk, see, post

##### **OCD**
ocd, thought, intrusive, anxiety, compulsion, stop, make, happen, have, find, say, fear, come, person, look, brain, right, head, experience, lot

##### **PTSD**
ptsd, trauma, trigger, happen, experience, flashback, have, nightmare, therapy, sleep, talk, find, therapist, come, friend, lot, hard, anxiety, stop, make

##### **Suicidal**
die, suicide, kill, love, live, friend, find, lose, end, wish, hate, anymore, well, leave, family, fuck, brother, month, miss, say

## **5. Lexical Diversity**


Common metric: the type–token ratio (TTR), computed per post:

$$\text{Lexical Diversity (TTR)} = \frac{\text{Number of Unique Words}}{\text{Total Number of Words}}$$


In [None]:
def lexical_diversity(text):
    """Type-token ratio of `text`: unique tokens divided by total tokens.

    Tokens are the whitespace-separated pieces of `text`. Text without any
    token yields 0.
    """
    tokens = text.split()
    total = len(tokens)
    return len(set(tokens)) / total if total else 0

In [None]:
# Score every post, then average the scores within each class (ML-cleaned text)
df_cleaned_ml['lexical_diversity'] = df_cleaned_ml['text'].map(lexical_diversity)
lexical_scores_ml = df_cleaned_ml.groupby('label', as_index=False)['lexical_diversity'].mean()
lexical_scores_ml

#### **Output :** for df_cleaned_ml data  

| Label      | Lexical Diversity |
|------------|-------------------|
| ADHD       | 0.799321          |
| Addiction  | 0.837921          |
| Anxiety    | 0.810511          |
| Depression | 0.813238          |
| Normal     | 0.818335          |
| OCD        | 0.799929          |
| PTSD       | 0.823304          |
| Suicidal   | 0.823510          |

In [None]:
# Score every post, then average the scores within each class (BERT-cleaned text)
df_cleaned_bert['lexical_diversity'] = df_cleaned_bert['text'].map(lexical_diversity)
lexical_scores_bert = df_cleaned_bert.groupby('label', as_index=False)['lexical_diversity'].mean()
lexical_scores_bert

#### **Output :** for df_cleaned_bert data  

| Label      | Lexical Diversity |
|------------|-------------------|
| ADHD       | 0.742390          |
| Addiction  | 0.795363          |
| Anxiety    | 0.770159          |
| Depression | 0.753351          |
| Normal     | 0.775004          |
| OCD        | 0.762195          |
| PTSD       | 0.768020          |
| Suicidal   | 0.763578          |


In [None]:
# Compare the lexical diversity of both datasets

# Map class label -> mean lexical diversity, one dict per dataset
ml_scores = dict(zip(lexical_scores_ml['label'], lexical_scores_ml['lexical_diversity']))
bert_scores = dict(zip(lexical_scores_bert['label'], lexical_scores_bert['lexical_diversity']))

# From a dict of dicts, the outer keys become the columns (one per dataset) and
# the inner keys become the row index (one per class). That is already the
# layout a grouped bar plot needs, so no transpose is required.
df_plot = pd.DataFrame({
    "ML_cleaned": ml_scores,
    "BERT_data": bert_scores
})

# Plot: classes on the x-axis, one bar per dataset within each class
ax = df_plot.plot(kind="bar", figsize=(10,6), width=0.8)

plt.title("Lexical Diversity Comparison by Class")
plt.xlabel("Class")
plt.ylabel("Lexical Diversity Score")
plt.xticks(rotation=45)
plt.legend(title="Dataset")
plt.tight_layout()
plt.show()


[lexical-diversity-comparison-by-class](https://github.com/TuliDas/MindScan-NLP/blob/main/images/eda/barplots-histplots/lexical-diversity-comparison-by-class.png)

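## **6. Statistically Distinctive Words (Chi-Square)**
- The Chi-Square test measures the strength of the association between a word and a class.  
- Words with high Chi-Square scores are distributed very differently in that class than in the rest of the data. The score itself has no direction: it is high for words that are unusually frequent in the class and also for words that are unusually rare in it.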


In [None]:
# Bag-of-words counts over the ML-cleaned text (terms found in fewer than 5 posts
# and English stop words are excluded from the vocabulary)
vectorizer = CountVectorizer(min_df=5, stop_words='english')
X = vectorizer.fit_transform(df_cleaned_ml['text'])
y = df_cleaned_ml['label']

feature_names = vectorizer.get_feature_names_out()
# Corpus-wide count of every vocabulary term, as a flat array
total_term_counts = np.asarray(X.sum(axis=0)).ravel()
distinctive_words = {}

for label in df_cleaned_ml['label'].unique():
    # One-vs-rest target: True for posts of the current class
    in_class = (y == label).to_numpy()
    chi2_scores, p_values = chi2(X, in_class)

    # The chi-square score has no direction: a term that is unusually RARE in the
    # class scores as high as one that is unusually frequent. Keep only terms whose
    # count inside the class exceeds the count expected from the class's share of
    # the posts, so the ranking lists words the class actually uses.
    observed_in_class = np.asarray(X[in_class].sum(axis=0)).ravel()
    expected_in_class = total_term_counts * in_class.mean()
    over_represented = observed_in_class > expected_in_class

    scores = [
        (term, score)
        for term, score, keep in zip(feature_names, chi2_scores, over_represented)
        if keep
    ]

    # Highest chi-square first; keep the 15 strongest terms per class
    sorted_scores = sorted(scores, key=lambda pair: pair[1], reverse=True)
    distinctive_words[label] = sorted_scores[:15]

In [None]:
# Show the top distinctive words per class (words only, scores omitted)
for label, words in distinctive_words.items():
    print(f"\nTop distinctive words for {label}:")
    print([pair[0] for pair in words])

#### **Output : Top Distinctive Words per Class**

- **ADHD:**  
  adhd, adderall, med, medication, vyvanse, mg, concerta, diagnose, stimulant, focus, xr, ritalin, task, work, diagnosis  

- **Addiction:**  
  quit, sober, smoke, drink, addiction, nicotine, cigarette, day, alcohol, craving, iwndwyt, smoking, sobriety, addict, relapse  

- **Anxiety:**  
  anxiety, panic, attack, anxious, heart, social, ha, health, symptom, chest, ocd, palpitation, adhd, calm, breath  

- **Depression:**  
  depression, depressed, anymore, don, feel, want, ocd, life, antidepressant, tired, hate, adhd, care, sad, loser  

- **Normal:**  
  lpt, happy, edit, feel, anxiety, ocd, buy, save, adhd, travel, thought, small, item, bad, award  

- **OCD:**  
  ocd, thought, intrusive, compulsion, obsession, pocd, theme, erp, contamination, obsessive, hocd, false, fear, obsess, harm  

- **PTSD:**  
  ptsd, trauma, flashback, trigger, nightmare, abuse, assault, traumatic, emdr, therapy, abuser, therapist, sexual, tw, sexually  

- **Suicidal:**  
  suicide, kill, die, brother, grief, life, commit, miss, death, suicidal, dad, wish, son, anxiety, pain  

In [None]:
def plot_distinctive_words(distinctive_words, top_n=10):
    """Draw one horizontal bar chart of Chi-Square scores per class.

    Args:
        distinctive_words: Mapping of class label to a list of (word, score)
            pairs; the first `top_n` pairs of each list are plotted.
        top_n: Number of words shown per class.
    """
    for label, words_scores in distinctive_words.items():
        top_pairs = words_scores[:top_n]
        words, scores = zip(*top_pairs)

        fig, ax = plt.subplots(figsize=(10,6))
        ax.barh(words, scores, color="skyblue")
        ax.invert_yaxis()  # first (highest-ranked) word at the top
        ax.set_title(f"Top {top_n} Distinctive Words for {label}", fontsize=14)
        ax.set_xlabel("Chi-Square Score", fontsize=12)
        ax.set_ylabel("Words", fontsize=12)
        fig.tight_layout()
        plt.show()


In [None]:
# One bar chart per class, showing its 10 highest-scoring words
plot_distinctive_words(distinctive_words=distinctive_words, top_n=10)


#### **Output :** Barplots
- repo folder link : ['class/label'-distinctive-class-barplots](https://github.com/TuliDas/MindScan-NLP/tree/main/images/eda/distinctive-words)


## **7. Embedding Visualization**

### **Word Embedding (Word2Vec, t-SNE)**
- **Word2Vec:** Converts words into dense numerical vectors such that semantically similar words are closer in vector space.  
- **t-SNE:** A dimensionality reduction technique that projects high-dimensional word vectors into 2D or 3D for visualization.  
- **Purpose in EDA:** Visualize how words from different classes group together and see semantic relationships or overlaps between classes.


In [None]:
# 1. Prepare corpus (whitespace-tokenized texts)
corpus = [text.split() for text in df_cleaned_ml['text']]

# 2. Train Word2Vec
model = Word2Vec(
    sentences=corpus,
    vector_size=100,  # embedding dimensions
    window=5,
    min_count=2,      # ignore words with total freq < 2
    workers=4,
    sg=1              # skip-gram model
)

# 3. Collect embeddings for the distinctive words of every class
word_vecs = []   # one embedding vector per kept word
labels = []      # matching (word, class) pairs, in the same order
for cls, top_words in distinctive_words.items():
    for word, _score in top_words:
        if word in model.wv.key_to_index:   # ensure word exists in vocab
            word_vecs.append(model.wv[word])
            labels.append((word, cls))

# Convert to DataFrame
word_vectors = pd.DataFrame(word_vecs)
word_labels = pd.DataFrame(labels, columns=["word", "class"])

# 4. Reduce dimension with t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=10)
embeddings_2d = tsne.fit_transform(word_vectors)

# 5. Plot with colors per class
plt.figure(figsize=(12, 8))
unique_classes = word_labels['class'].unique()
palette = sns.color_palette("tab10", len(unique_classes))

for i, cls in enumerate(unique_classes):
    # Boolean mask selecting the rows (words) that belong to the current class
    idx = (word_labels['class'] == cls).to_numpy()
    plt.scatter(embeddings_2d[idx, 0],   # x-coordinates of the current class's words
                embeddings_2d[idx, 1],   # y-coordinates of the current class's words
                label=cls,
                color=palette[i],
                alpha=0.7, s=60)
    # Annotate every point with its word. The loop targets are named px/py so they
    # do not overwrite the module-level label Series `y` used by the chi-square cell.
    for (px, py), word in zip(embeddings_2d[idx], word_labels.loc[idx, 'word']):
        plt.text(px+0.02, py+0.02, word, fontsize=9)

plt.legend(title="Class")
plt.title("t-SNE Visualization of Top Words per Class (Word2Vec)")
plt.show()

#### **Output :**
[t-SNE Visualization of Top Distinctive Words per Class (Word2Vec).png](https://github.com/TuliDas/MindScan-NLP/blob/main/images/eda/embeddings/Word2Vec_tSNE_top_distinctive_words_visualization.png)
#### **Observation :**
- Most words from ADHD, Addiction, Suicidal, Depression, Anxiety, and Normal classes cluster together.  
- PTSD and OCD words are closer to each other, with some overlap with Normal class words.  

### **Sentence/Post BERT Embeddings (t-SNE Visualization)**

- Generate sentence embeddings using **BERT** for each post.  
- Apply **t-SNE** to reduce high-dimensional embeddings to 2D for visualization.  
- **Purpose in EDA:** Observe how posts from different mental health classes cluster in semantic space, and identify overlaps or distinct groups.


In [None]:
# Step 1: BERT embeddings (one sentence embedding per post)
model = SentenceTransformer('all-MiniLM-L6-v2')  # light + fast
X_bert = model.encode(df_cleaned_bert['text'].tolist(), show_progress_bar=True)

# Step 2: t-SNE
# The iteration count is left at the library default of 1000, the value that was
# previously passed explicitly. scikit-learn renamed `n_iter` to `max_iter` in 1.5
# and removed `n_iter` in 1.7, so omitting the keyword runs on every version.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_bert)

# Step 3: Plot, one scatter layer (and legend entry) per class
plt.figure(figsize=(10,7))
for label in df_cleaned_bert['label'].unique():
    # Boolean mask selecting the posts of the current class
    idx = (df_cleaned_bert['label'] == label).to_numpy()
    plt.scatter(X_tsne[idx, 0], X_tsne[idx, 1], label=label, alpha=0.5)
plt.legend()
plt.title("BERT + t-SNE: Post Embeddings")
plt.show()


#### **Output:**
[BERT + t-SNE -Post Embeddings.png](https://github.com/TuliDas/MindScan-NLP/blob/main/images/eda/embeddings/BERT_tSNE_Post_Embeddings.png)
#### **Observation:**
- The BERT-based t-SNE visualization of post embeddings reveals clear separations for several classes. ADHD, Addiction, OCD, PTSD, and Normal form distinguishable clusters.
- Anxiety also forms a visible cluster, although some PTSD posts overlap into this region.
- However, Depression and Suicidal classes show significant overlap, suggesting strong semantic similarity in the way users express these two conditions. This overlap may explain why models could struggle to differentiate between them.