In [1]:
import pandas as pd

# Read the CSV into a DataFrame
df = pd.read_csv('mentalhealth.csv')

# Preview the leading rows
first_rows = df.head()
print(first_rows)

# Tally how many rows carry each value of the 'target' column
label_counts = df['target'].value_counts()
print(label_counts)


   Unnamed: 0                                               text  \
0           0  Welcome to /r/depression's check-in post - a p...   
1           1  We understand that most people who reply immed...   
2           2  Anyone else just miss physical touch? I crave ...   
3           3  I’m just so ashamed. Everyone and everything f...   
4           4  I really need a friend. I don't even have a si...   

                                               title  target  
0  Regular check-in post, with information about ...       1  
1  Our most-broken and least-understood rules is ...       1  
2  I haven’t been touched, or even hugged, in so ...       1  
3                    Being Depressed is Embarrassing       1  
4  I'm desperate for a friend and to feel loved b...       1  
target
1    1202
3    1201
4    1188
2    1185
0    1181
Name: count, dtype: int64


In [11]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Fetch the NLTK stopword corpus (NLTK reports when it is already up to date)
nltk.download('stopwords')

# Read the dataset and swap missing 'text' entries for empty strings
df = pd.read_csv('mentalhealth.csv').assign(
    text=lambda frame: frame['text'].fillna('')
)

# Preprocessing: lowercase the text and drop stopword tokens
def preprocess(text, stop_words=None):
    """Lowercase ``text`` and remove stopword tokens.

    The text is split on whitespace only, so punctuation stays attached to
    its token (``"touch?"`` is compared to the stopword list as ``"touch?"``).

    Parameters
    ----------
    text : object
        Raw document; coerced with ``str()`` so non-string values are accepted.
    stop_words : collection of str, optional
        Tokens to drop. When omitted, NLTK's English stopword list is used.
        That list is loaded on the first call and cached on the function
        rather than being rebuilt for every document.

    Returns
    -------
    str
        The surviving lowercase tokens joined by single spaces.
    """
    if stop_words is None:
        # Load the default list once; this function is applied row by row, so
        # rebuilding the set on each call would repeat identical work.
        cached = getattr(preprocess, "_english_stop_words", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            preprocess._english_stop_words = cached
        stop_words = cached
    tokens = str(text).lower().split()
    return " ".join(token for token in tokens if token not in stop_words)

# Store the preprocessed version of every document in a new column
df['cleaned_text'] = df['text'].apply(preprocess)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
documents = df['cleaned_text']
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    documents,
    labels,
    test_size=0.2,
    random_state=42,
)

# Show the leading rows, now including the 'cleaned_text' column
preview = df.head()
print(preview)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rathi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Unnamed: 0                                               text  \
0           0  Welcome to /r/depression's check-in post - a p...   
1           1  We understand that most people who reply immed...   
2           2  Anyone else just miss physical touch? I crave ...   
3           3  I’m just so ashamed. Everyone and everything f...   
4           4  I really need a friend. I don't even have a si...   

                                               title  target  \
0  Regular check-in post, with information about ...       1   
1  Our most-broken and least-understood rules is ...       1   
2  I haven’t been touched, or even hugged, in so ...       1   
3                    Being Depressed is Embarrassing       1   
4  I'm desperate for a friend and to feel loved b...       1   

                                        cleaned_text  
0  welcome /r/depression's check-in post - place ...  
1  understand people reply immediately op invitat...  
2      anyone else miss physical touch? c

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build TF-IDF features (at most 5000 terms): the vectorizer is fitted on the
# training split only and then reused to transform the test split
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit a logistic-regression model with library defaults on the training features
classifier = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict the held-out split and print the per-class report
y_pred = classifier.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.82      0.71      0.76       236
           1       0.70      0.70      0.70       265
           2       0.82      0.68      0.74       211
           3       0.56      0.78      0.65       229
           4       0.80      0.73      0.76       251

    accuracy                           0.72      1192
   macro avg       0.74      0.72      0.72      1192
weighted avg       0.74      0.72      0.72      1192



In [16]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

# Fetch the NLTK stopword corpus (NLTK reports when it is already up to date)
nltk.download('stopwords')

# Read the dataset and swap missing 'text' entries for empty strings
df = pd.read_csv('mentalhealth.csv').assign(
    text=lambda frame: frame['text'].fillna('')
)

# Preprocessing: lowercase the text and drop stopword tokens
def preprocess(text, stop_words=None):
    """Lowercase ``text`` and remove stopword tokens.

    The text is split on whitespace only, so punctuation stays attached to
    its token (``"touch?"`` is compared to the stopword list as ``"touch?"``).

    Parameters
    ----------
    text : object
        Raw document; coerced with ``str()`` so non-string values are accepted.
    stop_words : collection of str, optional
        Tokens to drop. When omitted, NLTK's English stopword list is used.
        That list is loaded on the first call and cached on the function
        rather than being rebuilt for every document.

    Returns
    -------
    str
        The surviving lowercase tokens joined by single spaces.
    """
    if stop_words is None:
        # Load the default list once; this function is applied row by row, so
        # rebuilding the set on each call would repeat identical work.
        cached = getattr(preprocess, "_english_stop_words", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            preprocess._english_stop_words = cached
        stop_words = cached
    tokens = str(text).lower().split()
    return " ".join(token for token in tokens if token not in stop_words)

# Store the preprocessed version of every document in a new column
df['cleaned_text'] = df['text'].apply(preprocess)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
documents = df['cleaned_text']
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    documents,
    labels,
    test_size=0.2,
    random_state=42,
)

# Build TF-IDF features (at most 5000 terms): fitted on the training split,
# then reused to transform the test split
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit a logistic-regression model with library defaults
classifier = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the held-out split and report accuracy as a percentage
y_pred = classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Persist the fitted vectorizer and classifier, one pickle file per object
for filename, artifact in (
    ('tfidf_model.pkl', tfidf),
    ('classifier_model.pkl', classifier),
):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rathi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 72.06%


In [19]:
# Preprocess new text input with the same steps used on the training data
def preprocess_input(text, stop_words=None):
    """Lowercase ``text`` and remove stopword tokens before prediction.

    Performs the same steps as ``preprocess``: coerce to ``str``, lowercase,
    split on whitespace, drop stopwords, re-join with single spaces.

    Parameters
    ----------
    text : object
        Raw input; coerced with ``str()``.
    stop_words : collection of str, optional
        Tokens to drop. When omitted, NLTK's English stopword list is used.
        That list is loaded on the first call and cached on the function
        rather than being rebuilt for every input.

    Returns
    -------
    str
        The surviving lowercase tokens joined by single spaces.
    """
    if stop_words is None:
        # Load the default list once and reuse it across calls.
        cached = getattr(preprocess_input, "_english_stop_words", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            preprocess_input._english_stop_words = cached
        stop_words = cached
    tokens = str(text).lower().split()
    return " ".join(token for token in tokens if token not in stop_words)

# Example sentence to classify (swap in any other text to try it out)
new_text = "I have been feeling very stressed and anxious lately."

# Display name for each numeric class label
label_mapping = {
    0: "Stress",
    1: "Depression",
    2: "Bipolar disorder",
    3: "Personality disorder",
    4: "Anxiety",
}

# Clean the sentence, then vectorize it with the already-fitted `tfidf`
# object (the vectorizer expects an iterable of documents, hence the list)
cleaned_new_text = preprocess_input(new_text)
new_text_tfidf = tfidf.transform([cleaned_new_text])

# Classify with the in-memory `classifier` and translate the label to its name
predicted_label = classifier.predict(new_text_tfidf)
predicted_condition = label_mapping[predicted_label[0]]

print(f"Predicted Mental Health Condition: {predicted_condition}")



Predicted Mental Health Condition: Stress


In [21]:
# Extra sentences to run through the fitted vectorizer and classifier
test_texts = [
    "I feel extremely hopeless and sad.",
    "I'm struggling with mood swings and extreme irritability.",
    "I have been really anxious about everything recently.",
    "I'm finding it hard to control my emotions.",
    "I've been feeling a lot better lately, much happier.",
]

# Classify each sentence on its own and print the input next to its prediction
for text in test_texts:
    features = tfidf.transform([preprocess_input(text)])
    label_id = classifier.predict(features)[0]
    predicted_condition = label_mapping[label_id]
    print(f"Input: {text}\nPredicted Mental Health Condition: {predicted_condition}\n")
/

Input: I feel extremely hopeless and sad.
Predicted Mental Health Condition: Depression

Input: I'm struggling with mood swings and extreme irritability.
Predicted Mental Health Condition: Bipolar disorder

Input: I have been really anxious about everything recently.
Predicted Mental Health Condition: Anxiety

Input: I'm finding it hard to control my emotions.
Predicted Mental Health Condition: Bipolar disorder

Input: I've been feeling a lot better lately, much happier.
Predicted Mental Health Condition: Personality disorder



()

In [22]:
from sklearn.metrics import classification_report

# Evaluate on the test set
y_pred = classifier.predict(tfidf.transform(X_test))

# Pair every class id with its display name explicitly. Passing only
# `target_names=label_mapping.values()` matches names to classes by position,
# which silently depends on the dict's insertion order equalling the sorted
# label order and on every class appearing in y_test / y_pred.
report_labels = sorted(label_mapping)
report_names = [label_mapping[label] for label in report_labels]

# Print detailed classification report
print(
    classification_report(
        y_test,
        y_pred,
        labels=report_labels,
        target_names=report_names,
    )
)


                      precision    recall  f1-score   support

              Stress       0.82      0.71      0.76       236
          Depression       0.70      0.70      0.70       265
    Bipolar disorder       0.82      0.68      0.74       211
Personality disorder       0.56      0.78      0.65       229
             Anxiety       0.80      0.73      0.76       251

            accuracy                           0.72      1192
           macro avg       0.74      0.72      0.72      1192
        weighted avg       0.74      0.72      0.72      1192

