## **Classification of Mental Health Disorder with Emotion Features from social media Text Using Machine Learning Methods**

*  Import Libraries
*  Dataset Upload
*  Data Cleaning and Preprocessing
*  EDA


# **Import Python Libraries**

In [5]:
%%capture
!pip install texthero
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install langdetect
!pip install contractions
!pip install beautifulsoup4
!pip install unidecode
!pip install transformers
!pip install nltk

In [6]:
%%capture
import re
import warnings
# Disable warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import nltk
import pandas as pd
import spacy
#import texthero as hero
from collections import Counter
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
from gensim import corpora
import contractions
from bs4 import BeautifulSoup
from textblob import TextBlob
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Download required nltk data
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# **Dataset Upload**

In [7]:
# Location of the dataset: a Google Sheets document published to the web as CSV

data_path = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTJ5zPd_9pGT5V97N5O0-jkNIhj2h3WaIObTKps4P8KmBoE0o6F9cMCb8XuCW0CjkCAMuGOG3LJPdPu/pub?gid=1387298660&single=true&output=csv'


In [None]:
# Read the CSV at data_path into a DataFrame and preview five random rows
dataset = pd.read_csv(data_path)
dataset.sample(5)

## **Data Preprocessing**

**Data Cleaning**

In [None]:
# Dimensions of the freshly loaded dataset as (rows, columns)
dataset.shape

In [None]:
# Count the missing values in each column
dataset.isnull().sum()

In [None]:
# Drop every row that has at least one null value, then show the new (rows, columns)
dataset = dataset.dropna(how='any')
dataset.shape

In [None]:
# Number of posts per target label (the 'subreddit' column)
dataset['subreddit'].value_counts()

In [None]:
# Keep only the six mental-health subreddits used as target labels
labels_to_keep = ['depression', 'Anxiety', 'bipolar', 'BPD', 'ADHD', 'autism']

# Filter rows based on labels. Take an explicit copy: later cells assign new
# columns into `dataset`, and assigning into a boolean-filtered frame is the
# classic trigger for pandas' SettingWithCopyWarning.
dataset = dataset[dataset['subreddit'].isin(labels_to_keep)].copy()
dataset['subreddit'].value_counts()

In [None]:
# (rows, columns) of the dataset at this point in the notebook
dataset.shape

In [None]:
# Drop rows whose title, subreddit or body is exactly a "removed" placeholder
# (both the plain and the backslash-escaped spelling are checked).
removed_markers = ['\\[removed\\]', '[removed]']
is_removed = dataset[['title', 'subreddit', 'body']].isin(removed_markers).any(axis=1)
dataset.drop(dataset[is_removed].index, inplace=True)
dataset.shape


In [16]:
# Capitalise the lowercase labels; labels missing from the mapping are left unchanged
label_renames = {
    'depression': 'Depression',
    'autism': 'Autism',
    'bipolar': 'Bipolar'
}
dataset['subreddit'] = dataset['subreddit'].replace(label_renames)

In [None]:
# View the per-label post counts after renaming
dataset['subreddit'].value_counts()

# **Exploratory Data Analysis (EDA)**

In [None]:
# Distribution of subreddit categories.
# Labels and counts are read from the data (value_counts sorts most -> least
# frequent) instead of being typed in by hand, so the chart cannot drift from
# the filtered dataset or carry misspelt labels.
label_counts = dataset['subreddit'].value_counts()
labels = label_counts.index.tolist()
sizes = label_counts.tolist()

base_colours = ['g', 'b', 'r', 'c', 'm', 'y']
# Cycle through the palette in case there are more classes than base colours
custom_colours = [base_colours[i % len(base_colours)] for i in range(len(labels))]
# Pull the smallest (last) slice slightly out of the pie
explode = ([0] * (len(sizes) - 1) + [0.05]) if sizes else []

fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(20, 6), dpi=227)

ax_pie.pie(sizes, labels=labels, textprops={'fontsize': 10}, startangle=140,
           autopct='%1.0f%%', colors=custom_colours, explode=explode)

sns.barplot(x=labels, y=sizes, ax=ax_bar)
ax_bar.set_ylabel('Counts')  # Add y-axis label

# Add label values on top of the bars; the gap scales with the tallest bar
label_gap = 0.02 * max(sizes, default=0)
for i, v in enumerate(sizes):
    ax_bar.text(i, v + label_gap, str(v), ha='center', va='bottom', fontsize=10)

plt.show()

In [None]:
# Join the title and body columns, separated by a single space, into a new 'post' column
dataset['post'] = dataset['title'] + ' ' + dataset['body']
dataset.sample(5)

In [None]:
# 'title' and 'body' now live in 'post', so drop the two source columns
columns_to_drop = ['title', 'body']
dataset = dataset.drop(columns=columns_to_drop)
dataset.sample(5)

In [21]:
# Length of each post: number of whitespace-separated words
dataset['total_words'] = dataset['post'].apply(lambda x: len(x.split()))

def count_total_words(text):
    """Return the number of non-whitespace characters in `text`.

    Despite its name, this adds up the lengths of the whitespace-separated
    tokens, so the result is a character count, not a word count.
    """
    return sum(len(token) for token in text.split())

# Per-post character count, whitespace excluded (computed by count_total_words)
dataset['total_char'] = dataset["post"].apply(count_total_words)


In [None]:
# Preview five random rows, including the new total_words / total_char columns
dataset.sample(5)

### **Word Count**

In [23]:
# Build one corpus string from every post (cast to str first)
dataset['post'] = dataset['post'].astype(str)
total_words = ' '.join(dataset['post'].values)
# Strip URLs, then @mentions, then #hashtags. The patterns are applied one
# after another on purpose: merging them into a single regex would change the
# result for inputs such as "#http://...".
for noise_pattern in (r'http\S+', r'@\S+', r'#\S+'):
    total_words = re.sub(noise_pattern, '', total_words)
# Whitespace-split the corpus into individual words
words = total_words.split()

In [24]:
# Remove stop words. NLTK's English list is all lowercase and the corpus has
# not been lowercased yet, so compare on the lowercased token; otherwise
# capitalised stop words ("I", "The", "And", ...) slip through the filter.
stop_words = set(stopwords.words('english'))
words = [word for word in words if word.lower() not in stop_words]

In [None]:
# Count the frequency of each word and list the 25 most common (word, count) pairs
word_counts = Counter(words)
top_words = word_counts.most_common(25)
top_words

In [None]:
# Bar chart of the most frequent words
top_words = word_counts.most_common(10)  # Change the number to show more/less words
x_values = [token for token, _count in top_words]
y_values = [count for _token, count in top_words]

fig, ax = plt.subplots()
bars = ax.bar(x_values, y_values)
ax.set_xlabel('Word')
ax.set_ylabel('Frequency')
ax.set_title('Top 10 Commonly Used Words')

# Write each bar's height just above it, centred horizontally
for rect in bars:
    height = rect.get_height()
    centre_x = rect.get_x() + rect.get_width()/2
    ax.text(centre_x, height, round(height, 2), ha='center', va='bottom', color='black')

plt.show()

In [27]:
# Report corpus size: every whitespace-separated token vs. distinct lowercased tokens
unique_words = set(total_words.lower().split())
total_word_count = len(total_words.split())
print(f"Total words: {total_word_count} | Unique words: {len(unique_words)}")

Total words: 14398634 | Unique words: 225273


In [None]:
# KDE plots of post length (words and characters), one curve per subreddit
fig, ax = plt.subplots(1, 2, figsize=(20, 6))

# (column to plot, label passed to kdeplot, panel title) for each of the two panels
kde_panels = [
    ('total_words', 'post lenght', 'Distribution of word count by Subreddit'),
    ('total_char', 'text word', 'Distribution of character count by Subreddit'),
]
for panel_ax, (length_column, curve_label, panel_title) in zip(ax, kde_panels):
    sns.kdeplot(x = dataset[length_column], hue= dataset['subreddit'],  ax=panel_ax, label=curve_label)
    panel_ax.set_title(panel_title)
plt.show()


# **Data Pre-processing**

In [29]:
# Load spaCy's small English pipeline with the "parser" and "ner" components disabled
load_model = spacy.load('en_core_web_sm', disable=["parser", "ner"])

In [30]:
#Create functions for data cleaning and preprocessing

def lemmatization(text):
    """Return `text` with every spaCy token replaced by its lemma, joined by spaces."""
    lemmas = (token.lemma_ for token in load_model(text))
    return " ".join(lemmas)

def remove_url(text):
    """Delete http(s):// links and www. links from `text` (coerced to str first)."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', str(text))

def remove_newline(text):
    """Collapse every whitespace run (newlines included) to one space and trim the ends."""
    tokens = text.split()
    return ' '.join(tokens)

def convert_lowercase(text):
    """Return the lowercased form of `text` as a str."""
    return str(text.lower())

def remove_html(text):
    """Parse `text` with BeautifulSoup's html.parser and return only its text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.text

def remove_whitespaces(text):
    """Squeeze each run of space characters into a single space (tabs/newlines untouched)."""
    squeezed = re.sub(r' +', ' ', text)
    return str(squeezed)

def remove_brackets(text):
    """Delete bracket characters ([], (), {}, <>) as well as '-' and '_' from `text`."""
    bracket_chars = re.compile(r'[\[\](){}<>\-_]')
    return bracket_chars.sub('', text)

def remove_quotes(text):
    """Strip straight single and double quote characters from `text`."""
    without_single = text.replace("'", "")
    return without_single.replace('"', '')

def remove_digits(text):
    """Drop every character for which str.isdigit() is true."""
    kept_chars = filter(lambda ch: not ch.isdigit(), text)
    return ''.join(kept_chars)

def fix_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

def remove_mf_digits(text):
    """Delete standalone tokens made of m/f/M/F/T letters next to digits (e.g. "25f", "m30")."""
    tag_pattern = re.compile(r'\b[mfMFT]+\d+\b|\b\d+[mfMFT]+\b')
    return tag_pattern.sub('', text)

def remove_special_chars(text):
    """Delete every character that is neither a word character nor whitespace."""
    non_word_pattern = re.compile(r'[^\w\s]')
    return non_word_pattern.sub('', text)

def remove_chars_with_digits(text):
    """Delete each three-character run of word-char, digit, word-char.

    NOTE(review): only the three matched characters are removed, not the whole
    token ("covid19" becomes "covi"); confirm this is the intended behaviour.
    """
    digit_sandwich = re.compile(r'\w\d\w')
    return digit_sandwich.sub('', text)

In [31]:
# Lazily built stopword set shared by every remove_stopwords call
_STOPWORDS_CACHE = None


def _get_stopwords():
    """Return the stopword set, building it from NLTK on first use only."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        # 'not' is kept in the text (it is taken out of the stopword list)
        stopwrds = set(stopwords.words('english')) - {'not'}
        # Extra single- and two-letter fragments to drop as well
        stopwrds.update(['d', 'm', 's', 're', 've', 'll'])
        _STOPWORDS_CACHE = frozenset(stopwrds)
    return _STOPWORDS_CACHE


def remove_stopwords(text):
    """Tokenize `text` with NLTK and drop English stopwords (keeping 'not').

    Args:
        text: the string to filter.

    Returns:
        The remaining tokens joined by single spaces, or "" when `text` is
        not a str.
    """
    if not isinstance(text, str):
        return ""
    # Reuse the cached set instead of reloading NLTK's list for every row
    stopwrds = _get_stopwords()
    return ' '.join(word for word in nltk.word_tokenize(text) if word not in stopwrds)


# Character ranges deleted by remove_emojis, compiled once into one character class.
# NOTE(review): \U000024C2-\U0001F251 and \U00010000-\U0010ffff are very wide
# and also cover non-emoji characters (e.g. CJK ideographs).
_EMOJI_RANGES = (
    u"\U0001F600-\U0001F64F",  # emoticons
    u"\U0001F300-\U0001F5FF",  # symbols & pictographs
    u"\U0001F680-\U0001F6FF",  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
    u"\U00002500-\U00002BEF",  # box drawing through miscellaneous symbols and arrows
    u"\U00002702-\U000027B0",  # dingbats
    u"\U00002702-\U000027B0",
    u"\U000024C2-\U0001F251",
    u"\U0001f926-\U0001f937",
    u"\U00010000-\U0010ffff",
    u"\u2640-\u2642",
    u"\u2600-\u2B55",
    u"\u200d",  # zero-width joiner
    u"\u23cf",
    u"\u23e9",
    u"\u231a",
    u"\ufe0f",  # variation selector-16
    u"\u3030",
)
_EMOJI_PATTERN = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", re.UNICODE)


def remove_emojis(text):
    """Return `text` with every run of characters from _EMOJI_RANGES deleted."""
    return _EMOJI_PATTERN.sub('', text)

# Define the misspelled abbreviations dictionary (lowercase key -> expanded form)
abbr_dict = {
    "'cause": "because",
    "ain't": "am not",
    "can't": "can not",
    "cannot": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesnt": "does not",
    "dont": "do not",
    "gimme": "give me",
    "gotta": "got to",
    "hadn't": "had not",
    "hadnt": "had not",
    "hasn't": "has not",
    "hasnt": "has not",
    "haven't": "have not",
    "havent": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "here's": "here is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'm": "i am",
    "i'll": "i will",
    "i've": "i have",
    "i ve": "i have",
    "imma": "i am going to",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "lemme": "let me",
    "let's": "let us",
    "not've": "not have",
    "shouldn't": "should not",
    # Previously expanded only by accident, through the "he'd" substring match
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "that's": "that is",
    "there's": "there is",
    "there're": "there are",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "wasnt": "was not",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "werent": "were not",
    "what's": "what is",
    "what're": "what are",
    "when's": "when is",
    "when're": "when are",
    "where's": "where is",
    "where're": "where are",
    "who's": "who is",
    "who're": "who are",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# One alternation over all keys, matched only as whole tokens.
# - Lookarounds are used instead of \b because some keys start with an apostrophe.
# - Keys are sorted longest first so "y'all'd've" is tried before "y'all".
_ABBR_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(re.escape(abbr) for abbr in sorted(abbr_dict, key=len, reverse=True))
    + r")(?!\w)"
)


def misspelled_abbreviations(text):
    """Expand the abbreviations listed in `abbr_dict` inside `text`.

    Matching is case-sensitive and restricted to whole tokens, so a key is
    never replaced inside a longer word ("immature" and "i very" are left
    alone).

    Args:
        text: the string to process (expected to be lowercased already).

    Returns:
        `text` with curly apostrophes normalised to "'" and every matched
        abbreviation replaced by its expanded form.
    """
    # Replace '’' with '\'' so the dictionary keys (straight apostrophes) match
    text = text.replace('’', '\'')

    # Replace abbreviations with their full form in a single pass
    return _ABBR_PATTERN.sub(lambda match: abbr_dict[match.group(0)], text)

In [32]:
# Apply unidecode to every entry of the 'post' column.
# The texthero call below is kept commented out (its import is disabled too):
#hero.preprocessing.remove_diacritics(dataset.Sentence)

from unidecode import unidecode
dataset['post'] = dataset['post'].apply(unidecode)


In [None]:
# Ordered list of cleaning functions applied to every entry of the 'post' column.
# remove_url and remove_html run right after lowercasing, for two reasons:
#   * remove_brackets deletes '<' and '>', so if it ran first remove_html would
#     no longer find any tags to strip;
#   * the steps that rewrite text (contraction / abbreviation expansion) can
#     insert spaces into a URL, leaving fragments that remove_url cannot match.
functions_list = [
    convert_lowercase,
    remove_url,
    remove_html,
    remove_brackets,
    remove_chars_with_digits,
    remove_mf_digits,
    fix_contractions,
    misspelled_abbreviations,
    remove_quotes,
    remove_newline,
    remove_emojis,
    remove_special_chars,
    remove_stopwords,
    remove_digits,
    remove_whitespaces,
    lemmatization
]

# Run each post through the pipeline and write the result back by index label.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for i, line in tqdm(dataset['post'].items(), total=dataset.shape[0]):
    for func in functions_list:
        line = func(line)
    dataset.at[i, 'post'] = line

In [None]:
# Spot-check five random rows after the cleaning loop
dataset.sample(5)

In [None]:
# Re-check the per-column null counts after cleaning
dataset.isnull().sum()

**Word cloud showing each word's importance per subreddit**

In [None]:
# One word cloud per subreddit label, laid out on a three-column grid
subreddit = list(dataset['subreddit'].unique())

n_cols = 3
# Ceiling division: just enough rows to fit every label (at least one row)
n_rows = max(1, -(-len(subreddit) // n_cols))

# 7.5 inches per row keeps the original 15x15 figure for six labels;
# squeeze=False guarantees a 2-D axes array even with a single row.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(15, 7.5 * n_rows), squeeze=False)
flat_axes = axes.flatten()

for ax, label in zip(flat_axes, subreddit):
    # Join every cleaned post of this subreddit into one text for the cloud
    text = " ".join(dataset[dataset['subreddit'] == label]['post'])
    cloud = WordCloud(width=800, height=800, background_color='black', min_font_size=10).generate(text)
    ax.imshow(cloud)
    ax.set_title(label)
    ax.axis("off")

# Hide grid cells that have no label to show
for ax in flat_axes[len(subreddit):]:
    ax.axis("off")

plt.tight_layout()
plt.show()

In [None]:
# Save the processed data to Google Drive (uses google.colab, so this cell is Colab-specific)
from google.colab import drive
import os

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

# Write the cleaned frame as CSV, leaving out the index column
dataset.to_csv('/content/drive/MyDrive/Colab Notebooks/Main Dissertation/cleaned_full_dataset.csv', index=False)

## **References**

https://www.kaggle.com/code/anubhavgoyal10/spam-classifier-nlp-98-accuracy

https://www.kaggle.com/code/maeshi/text-classification-with-neural-networks

https://www.kaggle.com/code/mohamedabdelmohsen/emotion-analysis-and-classification-using-lstm-93
