# In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk
import re

nltk.download('stopwords')
from nltk.corpus import stopwords

# Step 2: Load dataset
# The rest of the script reads the 'sentiment' and 'content' columns of this CSV.
df = pd.read_csv("tweet_emotions.csv")
print(df.head())

# Step 3: Label mental health related vs not
# Sentiment values treated as mental-health related (label 1); every other
# value, including a missing sentiment, is labelled 0.
mental_labels = ['sadness', 'anger', 'fear', 'worry', 'lonely', 'boredom', 'hate', 'empty']
# Vectorised membership test instead of a per-row Python lambda; the explicit
# int64 cast keeps the 0/1 integer column the row-wise version produced.
df['mental_health'] = df['sentiment'].isin(mental_labels).astype('int64')

# Step 4: Text preprocessing
# Fetch the NLTK English stop-word list, then turn it into a set so the
# per-word membership test used during cleaning is a hash lookup.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def clean_text(text, *, stopword_set=None):
    """Normalise a raw tweet into a space-separated string of lowercase words.

    Processing steps, in order:
      1. Missing values (``None``, ``NaN``, ``pd.NA``) become ``""``.
      2. The text is stringified and lower-cased.
      3. URLs (``http...`` / ``www....``), ``@mentions`` and ``#hashtags``
         are removed.
      4. Every character that is not an ASCII letter or whitespace is removed.
      5. Words found in the stop-word set are dropped.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``.
        stopword_set: Optional collection of words to drop. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        The cleaned text, with the remaining words joined by single spaces.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" / "na" and leak into the vocabulary.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    # The word boundary (and the literal dot after "www") keeps elongated
    # words such as "awwww" from being mistaken for a link and truncated.
    text = re.sub(r"\bhttp\S+|\bwww\.\S+|@\w+|#\w+", '', text)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    return " ".join(word for word in text.split() if word not in stopword_set)

# Clean every tweet; the raw text stays untouched in the 'content' column.
df['clean_text'] = df['content'].apply(clean_text)

# Step 5: Split data
X = df['clean_text']
y = df['mental_health']
# stratify=y keeps the 0/1 label ratio the same in the train and test splits,
# so the evaluation is not skewed by an unlucky random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: TF-IDF Vectorization
# The vectorizer (max_features=5000) is fitted on the training split only;
# the test split is transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 7: Train model
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Step 8: Predict and evaluate
y_pred = model.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


# Distribution of mental-health vs. non-mental-health tweets (bar chart of
# how many rows carry each 'mental_health' label).
ax = sns.countplot(x='mental_health', data=df)
ax.set_title("Distribution of Mental Health vs Not")
ax.set_xlabel("Label (0 = Not MH, 1 = MH)")
ax.set_ylabel("Tweet Count")
plt.show()

# Plot confusion matrix to visualize true vs false predictions
class_ids = [0, 1]
class_names = ['Not MH', 'MH']

# Passing labels explicitly pins the matrix to 2x2 in the same order as the
# tick labels, even if one class is absent from y_test / y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Purples', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


# Save the trained model (for use in a web app later)
#import pickle

# Save vectorizer and model
#with open("tfidf_vectorizer.pkl", "wb") as f:
    #pickle.dump(vectorizer, f)

#with open("mental_health_model.pkl", "wb") as f:
    #pickle.dump(model, f)

