In [1]:
pip install pandas numpy matplotlib seaborn scikit-learn nltk gensim textblob


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from textblob import TextBlob

# Fetch the NLTK resources used in this notebook. 'punkt_tab' is requested in
# addition to 'punkt' because newer NLTK releases load the tokenizer tables
# from that package; on installations that do not need it the extra download
# is harmless.
for _resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_resource)

# Location of the pre-processed dataset. The REDDIT_MH_DATASET environment
# variable overrides the default so the notebook can run on other machines
# without editing this cell; the default is the original local path.
file_path = os.environ.get(
    'REDDIT_MH_DATASET',
    r'C:\Users\User\Downloads\SMA2\reddit_mental_health_dataset_processed_senti.csv',
)
df = pd.read_csv(file_path)

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that emitted a stray "None" line after the report).
df.info()
print(df.head())

def extract_features(text):
    """Compute four simple descriptive features for one document.

    Parameters
    ----------
    text : str or any
        The document text. Anything that is not a ``str`` (NaN from pandas,
        ``None``, numbers, ...) is treated as a missing document.

    Returns
    -------
    tuple
        ``(num_words, avg_word_len, sentiment, lexical_richness)`` where

        * ``num_words`` is the number of tokens from ``word_tokenize``,
        * ``avg_word_len`` is the mean token length in characters,
        * ``sentiment`` is TextBlob polarity shifted by +1 so that it is
          non-negative (neutral text scores 1.0),
        * ``lexical_richness`` is distinct tokens divided by total tokens.

        A missing document yields ``(0, 0.0, 1.0, 0.0)``: no words and a
        *neutral* sentiment, matching what an empty string produces. (Returning
        0 for sentiment would label missing text as maximally negative on the
        shifted scale.)
    """
    # Guard on "is not text" rather than "is float" so None and other
    # non-string values are handled instead of crashing in the tokenizer.
    if not isinstance(text, str):
        return 0, 0.0, 1.0, 0.0

    tokens = word_tokenize(text)
    num_words = len(tokens)
    sentiment = TextBlob(text).sentiment.polarity + 1  # Shift sentiment to positive range

    # Text that tokenizes to nothing: avoid dividing by zero below.
    if num_words == 0:
        return 0, 0.0, sentiment, 0.0

    avg_word_len = sum(len(token) for token in tokens) / num_words
    lexical_richness = len(set(tokens)) / num_words
    return num_words, avg_word_len, sentiment, lexical_richness

# ---------------------------------------------------------------------------
# Feature engineering pipeline
#   1. handcrafted per-document statistics (extract_features)
#   2. mean Word2Vec embedding per document, min-max scaled to [0, 1]
#   3. LDA topic proportions per document
#   4. chi-squared feature selection over the stacked feature matrix
# ---------------------------------------------------------------------------
FEATURE_COLS = ['num_words', 'avg_word_len', 'sentiment', 'lexical_richness']
EMBEDDING_DIM = 100          # Word2Vec vector size
N_TOPICS = 5                 # number of LDA topics
N_SELECTED_FEATURES = 300    # upper bound on features kept by SelectKBest
SEED = 42

print("Extracting text features...")
# Build all four columns in one DataFrame instead of creating a pd.Series per
# row, which is far slower on tens of thousands of documents. The cast to
# float keeps the column dtypes the same as before (all float64).
handcrafted_features = pd.DataFrame(
    df['clean_text'].map(extract_features).tolist(),
    index=df.index,
    columns=FEATURE_COLS,
).astype(float)
df[FEATURE_COLS] = handcrafted_features

print("Generating Word2Vec embeddings...")
# Missing documents (NaN) become empty token lists.
sentences = [word_tokenize(text) if isinstance(text, str) else [] for text in df['clean_text']]
# The seed fixes the initial weights; with workers > 1 thread scheduling can
# still cause small run-to-run differences (use workers=1 for exact repeats).
word2vec_model = Word2Vec(
    sentences,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=2,
    workers=4,
    seed=SEED,
)
word_vectors = word2vec_model.wv


def _mean_embedding(tokens):
    """Average the vectors of in-vocabulary tokens; zeros if there are none."""
    known = [word_vectors[token] for token in tokens if token in word_vectors]
    if not known:
        return np.zeros(word_vectors.vector_size)
    return np.mean(known, axis=0)


embedding_features = np.array([_mean_embedding(tokens) for tokens in sentences])

scaler = MinMaxScaler()
embedding_features = scaler.fit_transform(embedding_features)

print("Performing topic modeling...")
# LDA models word counts, so it is fitted on a document-term count matrix.
# (Fitting it on the scaled embedding matrix treats embedding dimensions as
# if they were words and yields meaningless topics.)
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')
doc_term_matrix = count_vectorizer.fit_transform(df['clean_text'].fillna('').astype(str))
lda_model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=SEED)
lda_features = lda_model.fit_transform(doc_term_matrix)

print("Ensuring non-negative values for feature selection...")
# chi2 rejects negative inputs; clamp everything at zero as a safety net.
embedding_features = np.clip(embedding_features, 0, None)
lda_features = np.clip(lda_features, 0, None)
df[FEATURE_COLS] = df[FEATURE_COLS].clip(lower=0)

combined_features = np.hstack((embedding_features, lda_features, df[FEATURE_COLS].to_numpy()))

print("Performing advanced feature selection...")
# k may not exceed the number of available columns (here
# EMBEDDING_DIM + N_TOPICS + len(FEATURE_COLS)); depending on the scikit-learn
# version a larger k raises or only warns, so cap it explicitly.
k_best = min(N_SELECTED_FEATURES, combined_features.shape[1])
selector = SelectKBest(chi2, k=k_best)
X_selected = selector.fit_transform(combined_features, df['disorder_encoded'])

# NOTE(review): only `df` (original columns plus the four handcrafted columns)
# is written to disk. The embedding, topic and selected-feature matrices
# (`embedding_features`, `lda_features`, `X_selected`) stay in kernel memory
# and are lost on restart -- persist them too if later steps need them.
df.to_csv('reddit_mental_health_dataset_fengg1.csv', index=False)
print("Advanced Feature Engineering complete! Dataset saved.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44559 entries, 0 to 44558
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                44559 non-null  object 
 1   title             44559 non-null  object 
 2   text              44559 non-null  object 
 3   created_utc       44559 non-null  float64
 4   score             44559 non-null  int64  
 5   num_comments      44559 non-null  int64  
 6   subreddit         44559 non-null  object 
 7   category          44559 non-null  object 
 8   disorder          44559 non-null  object 
 9   clean_text        44413 non-null  object 
 10  disorder_encoded  44559 non-null  int64  
 11  negative          44559 non-null  float64
 12  neutral           44559 non-null  float64
 13  positive          44559 non-null  float64
 14  compound          44559 non-null  float64
dtypes: float64(5), int64(3), object(7)
memory usage: 5.1+ MB
None
        id               



Advanced Feature Engineering complete! Dataset saved.
