In [14]:
# Imports
import re
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Pandas display configuration: passing None removes the limit on cell
# width, on the number of columns and on the number of rows rendered.
for _display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Enable pandas' copy-on-write mode for the rest of the session.
pd.options.mode.copy_on_write = True

# Download necessary NLTK data
nltk.download('punkt')
# NLTK releases that use the '*_eng' tagger name requested below load the
# Punkt sentence models from 'punkt_tab' rather than 'punkt'; without it
# word_tokenize raises LookupError on a machine that has never fetched it.
# 'punkt' is still requested above so older NLTK versions keep working.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
# Read the source CSV, loading only the `text` and `class` columns.
df = pd.read_csv(
    r"../Input/Suicide_Detection.csv",
    usecols=["text", "class"],
)

In [16]:
# Initialize
lemmatizer = WordNetLemmatizer()
# Look the corpus reader up through the package path instead of the bare
# name `stopwords`: this assignment rebinds that name to a plain set, so
# `stopwords.words(...)` would raise AttributeError the second time the cell
# is executed. Going through `nltk.corpus` keeps the cell safe to re-run
# while preserving the global name the preprocessing code reads.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [17]:
# Negation cues as they are conventionally spelled. preprocess_text prefixes
# the token that follows one of these with "not_".
_NEGATION_CUES = (
    "not", "no", "never", "nothing", "nobody", "neither", "nowhere", "cannot",
    "can't", "won't", "don't", "doesn't", "didn't", "hasn't", "haven't",
    "hadn't", "isn't", "aren't", "wasn't", "weren't", "without", "none",
    "naught", "naughtiness", "less",
)

# preprocess_text removes every punctuation character before it looks tokens
# up in this list, so a contraction only ever arrives without its apostrophe
# ("don't" -> "dont") and the apostrophe spelling alone can never match.
# Register both spellings of each cue. dict.fromkeys drops repeated entries
# while keeping first-seen order, and the result remains a list.
negation_words = list(dict.fromkeys(
    variant
    for cue in _NEGATION_CUES
    for variant in (cue, cue.replace("'", ""))
))

@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to lemmatize `word` with.

    The word is tagged on its own (a one-element list is passed to
    ``nltk.pos_tag``), so the tag reflects no sentence context. Only the
    first character of the returned tag is inspected.

    Results are memoized with an unbounded cache: the tagger is invoked once
    per distinct word instead of once per occurrence, which is where almost
    all the time goes when this is applied token by token to a large corpus.

    Args:
        word: The token to tag. Must be hashable (e.g. ``str``) because the
            result is cached by argument.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV`` for tags starting with J, N, V or R respectively;
        ``wordnet.NOUN`` for any other tag.
    """
    tag_initial = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag_initial, wordnet.NOUN)

def _mark_negations(tokens, negation_lookup):
    """Fold each negation cue into the token that follows it.

    Args:
        tokens: Iterable of token strings, in text order.
        negation_lookup: Container of negation cues supporting ``in``.

    Returns:
        A new list where a cue followed by a token becomes ``not_<token>``
        (or the cue and the token verbatim when the token is digit-only),
        and every other token is passed through unchanged.
    """
    marked = []
    pending_cue = ''  # negation cue still waiting for the token it applies to
    for token in tokens:
        if token in negation_lookup:
            # A cue directly after another cue replaces it.
            pending_cue = token
        elif pending_cue:
            if token.isdigit():
                # Numbers are not negated: keep the cue and the number as-is.
                marked.append(pending_cue)
                marked.append(token)
            else:
                marked.append(f'not_{token}')
            pending_cue = ''
        else:
            marked.append(token)
    if pending_cue:
        # The text ended on a cue ("... have nothing"). Emit it verbatim
        # instead of silently discarding it, since there is nothing to prefix.
        marked.append(pending_cue)
    return marked


def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmas.

    Steps, in order:
      1. insert a space at every lowercase-to-uppercase boundary
         (splits run-together words such as ``fooBar``),
      2. lowercase, drop every character that is neither a word character
         nor whitespace, and drop digit runs,
      3. tokenize with ``word_tokenize``,
      4. merge negation cues from ``negation_words`` into the next token as
         ``not_<token>``,
      5. remove tokens found in the global ``stopwords`` collection,
      6. lemmatize each remaining token using the POS from
         ``get_wordnet_pos``.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as an empty document.

    Returns:
        The processed tokens joined by single spaces; ``''`` for missing or
        blank input.

    Raises:
        TypeError: If ``text`` is any other non-string value (raised by
            ``re.sub``).
    """
    # Missing values would otherwise crash re.sub with a TypeError and abort
    # a whole DataFrame.apply run. NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Blank text yields no tokens; skip the pipeline.
    if isinstance(text, str) and not text.strip():
        return ''

    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Set membership keeps the per-token cue check constant-time.
    tokens = _mark_negations(word_tokenize(text), frozenset(negation_words))

    # Stop words are filtered after negation marking, so a negated stop word
    # survives in its "not_" form.
    content_tokens = [token for token in tokens if token not in stopwords]
    return ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in content_tokens
    )

# Run every entry of the `text` column through preprocess_text and store the
# results in the `processed_text` column.
df['processed_text'] = df['text'].apply(preprocess_text)

In [18]:
# Write the frame, including the processed_text column, to CSV without the
# row index.
df.to_csv(
    r'../Input/preprocessed.csv',
    index=False,
)