Date: 01/06/2025 <br>
Author: Wan Xuen <br>
Notebook01: Text Mining for Mental Health Chatbot <br>
Aim: To conduct text mining and save the lemmatized texts


In [1]:
import os
import pandas as pd
import re
from symspellpy.symspellpy import SymSpell, Verbosity
import spacy
import json
from functools import lru_cache
from tqdm import tqdm  # Progress bar for feedback

# Root folder of the raw CSV exports; passed to read_and_concat_all_csvs, which
# walks two levels of sub-folders beneath it.
FILE_DIR = 'raw data'
# Folder holding the lookup files joined via os.path.join(DIC_DIR, ...):
# the slang dictionary JSON and the SymSpell frequency dictionary.
DIC_DIR = 'dictionary'

### Read all files 

In [2]:
def read_and_concat_all_csvs(base_path):
    """Read every CSV stored two folder levels below ``base_path`` into one frame.

    The expected layout is ``base_path/<year>/<month>/<file>.csv``. Entries are
    visited in sorted order at every level; anything that is not a directory at
    the year or month level, and any file not ending in ``.csv``, is skipped.

    Parameters
    ----------
    base_path : str
        Root directory containing the per-year folders.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all CSVs with a fresh 0..n-1 index, or an
        empty DataFrame (after printing a notice) when no CSV was found.
    """
    def _subdirs(parent):
        # Yield the full paths of the sub-directories of `parent`, sorted by name.
        for entry in sorted(os.listdir(parent)):
            candidate = os.path.join(parent, entry)
            if os.path.isdir(candidate):
                yield candidate

    frames = [
        pd.read_csv(os.path.join(month_dir, csv_name))
        for year_dir in _subdirs(base_path)
        for month_dir in _subdirs(year_dir)
        for csv_name in sorted(os.listdir(month_dir))
        if csv_name.endswith('.csv')
    ]

    if not frames:
        print("No CSV files found.")
        return pd.DataFrame()
    return pd.concat(frames, axis=0, ignore_index=True)

In [3]:
# Load every CSV found under FILE_DIR and report the (rows, columns) of the result.
df_final = read_and_concat_all_csvs(base_path=FILE_DIR)
print(df_final.shape)

(389387, 8)


In [4]:
# Inspect column dtypes and non-null counts of the freshly concatenated frame.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389387 entries, 0 to 389386
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   389387 non-null  int64 
 1   author       389387 non-null  object
 2   created_utc  389387 non-null  int64 
 3   score        389387 non-null  int64 
 4   selftext     378246 non-null  object
 5   subreddit    389387 non-null  object
 6   title        389387 non-null  object
 7   timestamp    389387 non-null  object
dtypes: int64(3), object(5)
memory usage: 23.8+ MB


In [5]:
# Keep only the text columns; the index artefact and post metadata are discarded.
df_final = df_final.drop(
    ["Unnamed: 0", "timestamp", "score", "created_utc", "author"],
    axis="columns",
)

In [6]:
# Count missing values in each remaining column.
df_final.isnull().sum()

selftext     11141
subreddit        0
title            0
dtype: int64

### Impute missing values

In [7]:
# Fill missing post bodies and titles with the literal placeholder 'N/A'.
df_final[['selftext', 'title']] = df_final[['selftext', 'title']].fillna('N/A')

In [8]:
# Build one labelled string per post: "Subreddit: <s>. Title: <t>. Body: <b>".
subreddit_part = "Subreddit: " + df_final['subreddit'].fillna('')
title_part = "Title: " + df_final['title'].fillna('')
body_part = "Body: " + df_final['selftext'].fillna('')
df_final['full_text'] = subreddit_part + ". " + title_part + ". " + body_part

In [9]:
# Re-check dtypes and non-null counts now that the frame has a full_text column.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389387 entries, 0 to 389386
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   selftext   389387 non-null  object
 1   subreddit  389387 non-null  object
 2   title      389387 non-null  object
 3   full_text  389387 non-null  object
dtypes: object(4)
memory usage: 11.9+ MB


### Clean the text
- lowercase the text
- remove URLs, user mentions, HTML entities, punctuation, emojis, and extra spaces
- replace newlines with spaces
- collapse characters repeated three or more times (e.g. "soooo" → "soo")

In [10]:
# Patterns are compiled once at definition time instead of on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_ENTITY_RE = re.compile(r'&#?\w+;')            # &amp;  &gt;  &#x200B; ...
# The look-behind keeps ordinary text such as "you/me" or "a@b" from being
# mistaken for a Reddit (u/name, /u/name) or @name mention.
_MENTION_RE = re.compile(r'(?<!\w)/?u/[\w-]+|(?<!\w)@\w+')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_ELONGATED_RE = re.compile(r'([a-z])\1{2,}')        # letters only, digits untouched
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalise a raw post into lowercase ASCII letters, digits and single spaces.

    Steps, in order:
      1. lowercase;
      2. replace URLs, HTML entities and user mentions with a space. This must
         happen *before* punctuation is stripped, because the patterns rely on
         ``/``, ``@``, ``&`` and ``;`` still being present;
      3. delete every character that is not ``a-z``, ``0-9`` or whitespace
         (this also removes emojis and any other non-ASCII symbol);
      4. collapse a letter repeated three or more times to two
         ("soooo" -> "soo"); digits are left alone so "1000" stays "1000";
      5. collapse whitespace runs (including newlines) to one space and trim.

    Parameters
    ----------
    text : str
        Raw text. Non-string input (e.g. ``None`` or ``NaN``) yields ``''``.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_RE.sub(' ', text)
    text = _HTML_ENTITY_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)
    text = _NON_ALNUM_RE.sub('', text)
    text = _ELONGATED_RE.sub(r'\1\1', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Run the cleaner over every combined post string.
df_final['clean_text'] = df_final['full_text'].map(clean_text)

### Expand short forms and slang using a custom dictionary

In [11]:
# Load the lookup table that expand_slang uses to replace tokens. The encoding is
# pinned to UTF-8 (the standard JSON encoding) so the result does not depend on
# the platform's default text encoding.
with open(os.path.join(DIC_DIR, "slang_dictionary.json"), "r", encoding="utf-8") as f:
    slang_dict = json.load(f)

def expand_slang(text):
    """Replace each whitespace-separated token that is a key of ``slang_dict``.

    Tokens without an entry are kept unchanged; the result is re-joined with
    single spaces.
    """
    expanded_tokens = (slang_dict.get(token, token) for token in text.split())
    return " ".join(expanded_tokens)

# Expand slang / short forms in every cleaned post.
df_final['expanded_text'] = df_final['clean_text'].map(expand_slang)

### Spelling correction

In [12]:
# Build the SymSpell index (suggestions up to edit distance 2) from the word
# frequency list: column 0 holds the term, column 1 its count.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
spell_dict = "frequency_dictionary_en_82_765.txt"
spell_dict_path = os.path.join(DIC_DIR, spell_dict)
# load_dictionary reports a missing file through its boolean return value rather
# than by raising; fail loudly so spelling correction is never silently a no-op.
if not sym_spell.load_dictionary(spell_dict_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell frequency dictionary not found: {spell_dict_path}")

@lru_cache(maxsize=10000)
def cached_correct(word):
    """Return the best SymSpell suggestion for ``word``, memoised per token.

    Tokens that are not purely alphabetic (numbers such as "18", mixed tokens
    such as "2nd") are returned unchanged: they are not spelling mistakes, and a
    fuzzy lookup could only swap them for an unrelated dictionary word.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        The first suggestion returned by the lookup (edit distance <= 2), or
        ``word`` itself when it is skipped or there is no suggestion.
    """
    if not word.isalpha():
        return word
    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
    return suggestions[0].term if suggestions else word

def correct_spelling_fast_cached(text):
    """Spell-correct ``text`` token by token via ``cached_correct``.

    The text is split on whitespace and the corrected tokens are re-joined
    with single spaces.
    """
    corrected_tokens = (cached_correct(token) for token in text.split())
    return " ".join(corrected_tokens)

# Spell-correct every slang-expanded post (values are cast to str first).
df_final['corrected_text'] = df_final['expanded_text'].astype(str).map(correct_spelling_fast_cached)

Note: In the production chatbot, apply SymSpell only when the input looks like gibberish or the model's confidence is low.

### Lemmatization
Lemmatization maps inflected word forms to a common base form, which reduces dimensionality by grouping similar words.

In [13]:
# NER and the dependency parser are disabled when loading the pipeline; only the
# lemma, stop-word and alphabetic flags of each token are read below.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
df_final['corrected_text'] = df_final['corrected_text'].astype(str)

lemmatized_col = []
chunk_size = 100_000  # rows handed to nlp.pipe per progress bar

for i in range(0, len(df_final), chunk_size):
    chunk = df_final['corrected_text'].iloc[i:i+chunk_size].tolist()
    # nlp.pipe is a generator, so tqdm cannot infer its length; passing `total`
    # gives a real progress bar (percentage and ETA) instead of a bare counter.
    docs = tqdm(
        nlp.pipe(chunk, batch_size=1000, n_process=2),
        total=len(chunk),
        desc=f"Chunk {i}",
    )
    # Keep the lemma of every alphabetic, non-stop-word token.
    lemmatized_col.extend(
        " ".join(token.lemma_ for token in doc if not token.is_stop and token.is_alpha)
        for doc in docs
    )

df_final['lemmatized_text'] = lemmatized_col

Chunk 0: 100000it [15:40, 106.36it/s]
Chunk 100000: 100000it [14:34, 114.30it/s]
Chunk 200000: 100000it [14:20, 116.25it/s]
Chunk 300000: 89387it [13:14, 112.56it/s]


### Save the lemmatized text to a Parquet file

In [14]:
# Persist the whole frame (every column of df_final, not just the lemmatized one)
# to a Parquet file at a path relative to the current working directory.
df_final.to_parquet('processing.parquet')