In [7]:
import pandas as pd
import re
import string

# Load dataset: read the source CSV into the working DataFrame
data_path = "../data/aimind_merged_title_body.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Report the frame's dimensions and column names, then preview the first rows
# (the bare trailing expression is what the notebook renders as cell output).
print(" Loaded dataset with shape:", df.shape)
print(" Columns:", list(df.columns))
df.head()

 Loaded dataset with shape: (151377, 3)
 Columns: ['confession', 'target', 'upvote_ratio']


Unnamed: 0,confession,target,upvote_ratio
0,New to Vyvanse... Do others get a heady rush f...,ADHD,1.0
1,why FUCK is a mental illness supposed to stay ...,OCD,1.0
2,PTSD? Hello. Sorry for my bad english but I ju...,ptsd,1.0
3,"Found mould in my home Hi all, contamination O...",OCD,0.78
4,Honestly I wanna die.,depression,1.0


In [8]:
# Define text cleaning function

# A URL must start at a word boundary with an explicit scheme or "www." so
# that ordinary words containing "www"/"http" (e.g. "awwww") are left intact.
_URL_RE = re.compile(r"\b(?:https?://|www\.)\S+", re.IGNORECASE)
# Only strip things that look like real markup: HTML comments, or "<" directly
# followed by an (optionally closing) tag name. A bare "<" as in "<3" or
# "5 < 7" is not treated as the start of a tag.
_HTML_RE = re.compile(r"<!--.*?-->|</?[A-Za-z][^<>]*>", re.DOTALL)
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
_WHITESPACE_RE = re.compile(r"\s+")
# Typographic quotes are mapped to their ASCII forms first so that "can’t"
# and "can't" end up as the same token instead of "can t" vs "cant".
_SMART_QUOTES = str.maketrans({
    "\u2018": "'",
    "\u2019": "'",
    "\u201c": '"',
    "\u201d": '"',
})
# Every ASCII punctuation character except "?" and "!" is deleted.
_PUNCT_TABLE = str.maketrans(
    "", "", string.punctuation.replace("?", "").replace("!", "")
)


def clean_text(text):
    """Normalise one raw text value into lowercase ASCII with minimal punctuation.

    Steps, in order: drop URLs, drop HTML tags/comments, convert typographic
    quotes to ASCII, replace any remaining non-ASCII run (emoji etc.) with a
    space, lowercase, delete all punctuation except ``?`` and ``!``, and
    collapse whitespace.

    Parameters
    ----------
    text : Any
        The raw value. Null-like scalars (``None``, ``NaN``) yield ``""``;
        other non-string values are converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text, stripped of leading/trailing whitespace.
    """
    if pd.isnull(text):
        return ""

    # A column read from CSV may contain numeric cells; the regex calls below
    # only accept strings.
    if not isinstance(text, str):
        text = str(text)

    # Remove URLs
    text = _URL_RE.sub("", text)

    # Remove HTML tags and comments
    text = _HTML_RE.sub("", text)

    # Normalise curly quotes, then remove emojis/non-ASCII
    text = text.translate(_SMART_QUOTES)
    text = _NON_ASCII_RE.sub(" ", text)

    # Lowercase
    text = text.lower()

    # Remove all punctuation except ? and !
    text = text.translate(_PUNCT_TABLE)

    # Remove extra whitespaces
    return _WHITESPACE_RE.sub(" ", text).strip()



In [None]:
if "confession" not in df.columns:
    raise ValueError("❌ The dataset does not have a 'confession' column.")

# Cleaned texts of this many characters or fewer are dropped as too short.
MIN_CLEAN_LENGTH = 10

# Build the cleaned column on a new frame rather than assigning into `df`
# in place: after the filter below `df` is a row subset, and assigning a column
# onto such a subset when this cell is re-run triggers SettingWithCopyWarning.
df = df.assign(clean_text=df["confession"].apply(clean_text))

# Remove empty or very short cleaned entries. `clean_text` already strips
# whitespace, so the length test alone also excludes empty strings.
# `.copy()` makes the result an independent frame for later cells.
df = df.loc[df["clean_text"].str.len() > MIN_CLEAN_LENGTH].copy()

print(" Cleaned dataset shape:", df.shape)
# Fixed seed so the preview is the same on every run; the size is capped so a
# frame with fewer than five surviving rows does not raise.
df[["confession", "clean_text"]].sample(n=min(5, len(df)), random_state=42)


✅ Cleaned dataset shape: (149175, 4)


Unnamed: 0,confession,clean_text
88702,If only I never had to go to the bathroom... M...,if only i never had to go to the bathroom my l...
105388,What do you do when you can't remember if you ...,what do you do when you cant remember if you t...
60546,"Hi Hi, I'm 23 and I'm from Brazil. I've been d...",hi hi im 23 and im from brazil ive been dealin...
57308,What is your clothes routines? For me I have w...,what is your clothes routines? for me i have w...
73441,"TW: sexual assault, TSA nightmare",tw sexual assault tsa nightmare


In [11]:
# Write the cleaned frame to disk; index=False keeps the row index out of the
# CSV so only the real columns are stored.
output_path = "../data/aimind_cleaned.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"✅ Cleaned dataset saved to {output_path}")


✅ Cleaned dataset saved to ../data/aimind_cleaned.csv
