In [1]:
! pip install praw pandas nltk


Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


In [3]:
import praw
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

# Reddit API Credentials (Replace with yours)
CLIENT_ID = "7tcVq4CBDQGv08QGJe7Npw"
CLIENT_SECRET = "SHPlS_TQacTG4xGWZ8MlqfSGxzP3Og"
USER_AGENT = "mental_health_analysis"

# Initialize Reddit API
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT
)

# Function to clean text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

# Fetch posts from r/depression
subreddit = reddit.subreddit("depression")
posts = subreddit.hot(limit=50)

data = []
for post in posts:
    cleaned_text = clean_text(post.title + " " + post.selftext)
    data.append([post.title, cleaned_text])

df = pd.DataFrame(data, columns=["Original Text", "Cleaned Text"])
df.to_csv("mental_health_reddit.csv", index=False)

print("✅ Reddit comments fetched and saved successfully!")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



✅ Reddit comments fetched and saved successfully!


## Step 2: Label the Data for Sentiment Analysis

In [5]:
pip install nltk pandas




In [6]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER if not already downloaded
nltk.download('vader_lexicon')

# Load the data
df = pd.read_csv("mental_health_reddit.csv")

# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Function to label sentiment
def get_sentiment(text):
    score = sia.polarity_scores(text)["compound"]
    if score >= 0.05:
        return "Positive"
    elif score <= -0.05:
        return "Negative"
    else:
        return "Neutral"

# Apply sentiment analysis
df["Sentiment"] = df["Cleaned Text"].apply(get_sentiment)

# Save labeled data
df.to_csv("labeled_mental_health_data.csv", index=False)

print("✅ Data labeled successfully! Check labeled_mental_health_data.csv")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


✅ Data labeled successfully! Check labeled_mental_health_data.csv


## Step 3: Train the Sentiment Analysis Model

In [7]:
pip install scikit-learn pandas nltk joblib




In [9]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Download necessary NLTK data
nltk.download('punkt')

# Load labeled dataset
df = pd.read_csv("labeled_mental_health_data.csv")

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(stop_words="english")  # Removed tokenizer
X = vectorizer.fit_transform(df["Cleaned Text"])
y = df["Sentiment"]

# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ Model trained with {accuracy:.2f} accuracy!")
print(classification_report(y_test, y_pred))

# Save the model & vectorizer
joblib.dump(model, "sentiment_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

print("✅ Model saved as sentiment_model.pkl & vectorizer.pkl")


✅ Model trained with 0.80 accuracy!
              precision    recall  f1-score   support

    Negative       0.80      1.00      0.89         8
    Positive       0.00      0.00      0.00         2

    accuracy                           0.80        10
   macro avg       0.40      0.50      0.44        10
weighted avg       0.64      0.80      0.71        10

✅ Model saved as sentiment_model.pkl & vectorizer.pkl


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
import numpy
import sklearn
import joblib

print("numpy version:", numpy.__version__)
print("scikit-learn version:", sklearn.__version__)
print("joblib version:", joblib.__version__)


numpy version: 2.0.2
scikit-learn version: 1.6.1
joblib version: 1.4.2
