In [1]:
import pandas as pd

# Load the dataset
file_path = r"E:\My Office Work\IXD Labs\Workspace\Neuro Symbolic AI\\dataset.csv"
df = pd.read_csv(file_path)

# Display the first few rows and the columns
print(df.head())
print(df.columns)

                                                text  category  \
0  in the end we all die, nothing we do means any...       5.0   
1  Today would have been my best friend's 18th bi...       4.0   
2  So I couldn't feel any worse, and I've had eno...       5.0   
3  I am 22 and have never had a girlfriend and ha...       4.0   
4  I am almost certain that my depression is caus...       4.0   

                                         explanation  
0  feel more and more empty,fear lonliness,get ti...  
1   best friend's birthday, he's gone, never supp...  
2  absolute minimal contact with,  be more direct...  
3   never had a girlfriend, rejected countless times  
4  eating out with family, what after that? Paren...  
Index(['text', 'category', 'explanation'], dtype='object')


In [2]:
# Text cleaning

import re
from nltk.corpus import stopwords
import nltk

# Download stopwords
nltk.download('stopwords')

def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove special characters, digits, and punctuation
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

df['cleaned_text'] = df['text'].apply(lambda x: clean_text(x) if isinstance(x, str) else '')

df[['text', 'cleaned_text']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,cleaned_text
0,"in the end we all die, nothing we do means any...",end die nothing means anything anything throug...
1,Today would have been my best friend's 18th bi...,today would best friends th birthday wed going...
2,"So I couldn't feel any worse, and I've had eno...",couldnt feel worse ive enough tonight midnight...
3,I am 22 and have never had a girlfriend and ha...,never girlfriend rejected countless times see ...
4,I am almost certain that my depression is caus...,almost certain depression caused bullshit shap...


In [3]:
# EDA (Exploratory Data Analysiz)

df['cleaned_text_length'] = df['cleaned_text'].apply(len)
print(df['cleaned_text_length'].describe())

count     5052.000000
mean       567.680325
std        775.536087
min          0.000000
25%        157.000000
50%        339.500000
75%        688.250000
max      12615.000000
Name: cleaned_text_length, dtype: float64


In [4]:
# Data Processing

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])

In [5]:
# Model Training

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer


# Vectorize the cleaned text data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])  # Transform the text data into feature vectors
y = df['category']  # Adjust to your actual target column

# Convert sparse matrix to DataFrame for easier NaN handling
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Check for NaN values in features (X)
print("Checking for NaN values in features (X):")
print(X_df.isnull().sum().sum())  # Check for NaN values in features

# Check for NaN values in target (y)
print("Checking for NaN values in target (y):")
print(pd.isnull(y).sum())  # Check for NaN values in target

# Remove rows with NaN in the target variable
df = df[~pd.isnull(df['category'])]

# Re-apply vectorization after removing NaN rows
X = vectorizer.fit_transform(df['cleaned_text'])  # Transform the cleaned text again
y = df['category']  # Update target variable after dropping NaN rows

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values in features (if any)
imputer = SimpleImputer(strategy='mean')  # Using 'mean' for numerical data; adjust accordingly
X_train_imputed = imputer.fit_transform(X_train.toarray())  # .toarray() converts sparse to dense
X_test_imputed = imputer.transform(X_test.toarray())

# Train the model
model = MultinomialNB()
model.fit(X_train_imputed, y_train)

# Make predictions
y_pred = model.predict(X_test_imputed)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Checking for NaN values in features (X):
0
Checking for NaN values in target (y):
9
Accuracy: 0.45391476709613476


In [6]:
# Evaluate the data model

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.14      0.25       147
         1.0       0.00      0.00      0.00        63
         2.0       0.64      0.31      0.42       119
         3.0       0.57      0.24      0.34       124
         4.0       0.44      0.75      0.56       283
         5.0       0.41      0.58      0.48       273

    accuracy                           0.45      1009
   macro avg       0.49      0.34      0.34      1009
weighted avg       0.51      0.45      0.41      1009



In [7]:
df.to_csv("cleaned_dataset.csv", index=False)

In [None]:
# Define symbolic depression keywords
depressive_keywords = [
    "hopeless", "tired", "exhausted", "sad", "depressed", "alone", "worthless",
    "guilty", "failure", "empty", "helpless", "useless"
]

# Function to flag depressive content based on keywords
def flag_depression(text):
    return int(any(word in text for word in depressive_keywords))

# Apply symbolic flagging
df['depression_flag'] = df['cleaned_text'].apply(flag_depression)

print(df[['cleaned_text', 'depression_flag']].head())


                                        cleaned_text  depression_flag
0  end die nothing means anything anything throug...                1
1  today would best friends th birthday wed going...                0
2  couldnt feel worse ive enough tonight midnight...                1
3  never girlfriend rejected countless times see ...                1
4  almost certain depression caused bullshit shap...                0


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the cleaned text data
vectorizer = CountVectorizer()
X_text_features = vectorizer.fit_transform(df['cleaned_text'])

# Add the depression_flag as an additional feature
import scipy.sparse

X_combined = scipy.sparse.hstack((X_text_features, df[['depression_flag']].values))

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Define the target variable (e.g., category for depressive messages)
y = df['category']  # Replace 'category' with the actual column name for labels

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train the Naive Bayes model on the combined features
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.45391476709613476
